diff options
Diffstat (limited to 'fs')
348 files changed, 63099 insertions, 7220 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index ff0e8198020..51307b0fdf0 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -189,6 +189,8 @@ config OCFS2_FS select CONFIGFS_FS select JBD2 select CRC32 + select QUOTA + select QUOTA_TREE help OCFS2 is a general purpose extent based shared disk cluster file system with many similarities to ext3. It supports 64 bit inode @@ -258,15 +260,33 @@ config OCFS2_DEBUG_FS this option for debugging only as it is likely to decrease performance of the filesystem. -config OCFS2_COMPAT_JBD - bool "Use JBD for compatibility" +config OCFS2_FS_POSIX_ACL + bool "OCFS2 POSIX Access Control Lists" depends on OCFS2_FS + select FS_POSIX_ACL default n - select JBD help - The ocfs2 filesystem now uses JBD2 for its journalling. JBD2 - is backwards compatible with JBD. It is safe to say N here. - However, if you really want to use the original JBD, say Y here. + Posix Access Control Lists (ACLs) support permissions for users and + groups beyond the owner/group/world scheme. + +config BTRFS_FS + tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" + depends on EXPERIMENTAL + select LIBCRC32C + select ZLIB_INFLATE + select ZLIB_DEFLATE + help + Btrfs is a new filesystem with extents, writable snapshotting, + support for multiple devices and many more features. + + Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET + FINALIZED. You should say N here unless you are interested in + testing Btrfs with non-critical data. + + To compile this file system support as a module, choose M here. The + module will be called btrfs. + + If unsure, say N. endif # BLOCK @@ -303,6 +323,10 @@ config PRINT_QUOTA_WARNING Note that this behavior is currently deprecated and may go away in future. Please use notification via netlink socket instead. +# Generic support for tree structured quota files. Seleted when needed. +config QUOTA_TREE + tristate + config QFMT_V1 tristate "Old quota format support" depends on QUOTA @@ -314,6 +338,7 @@ config QFMT_V1 config QFMT_V2 tristate "Quota format v2 support" depends on QUOTA + select QUOTA_TREE help This quota format allows using quotas with 32-bit UIDs/GIDs. If you need this functionality say Y here. @@ -715,7 +740,20 @@ config CONFIGFS_FS endmenu -menu "Miscellaneous filesystems" +menuconfig MISC_FILESYSTEMS + bool "Miscellaneous filesystems" + default y + ---help--- + Say Y here to get to see options for various miscellaneous + filesystems, such as filesystems that came from other + operating systems. + + This option alone does not add any kernel code. + + If you say N, all options in this submenu will be skipped and + disabled; if unsure, say Y here. + +if MISC_FILESYSTEMS config ADFS_FS tristate "ADFS file system support (EXPERIMENTAL)" @@ -894,6 +932,58 @@ config CRAMFS If unsure, say N. +config SQUASHFS + tristate "SquashFS 4.0 - Squashed file system support" + depends on BLOCK + select ZLIB_INFLATE + help + Saying Y here includes support for SquashFS 4.0 (a Compressed + Read-Only File System). Squashfs is a highly compressed read-only + filesystem for Linux. It uses zlib compression to compress both + files, inodes and directories. Inodes in the system are very small + and all blocks are packed to minimise data overhead. Block sizes + greater than 4K are supported up to a maximum of 1 Mbytes (default + block size 128K). SquashFS 4.0 supports 64 bit filesystems and files + (larger than 4GB), full uid/gid information, hard links and + timestamps. + + Squashfs is intended for general read-only filesystem use, for + archival use (i.e. in cases where a .tar.gz file may be used), and in + embedded systems where low overhead is needed. Further information + and tools are available from http://squashfs.sourceforge.net. + + If you want to compile this as a module ( = code which can be + inserted in and removed from the running kernel whenever you want), + say M here and read <file:Documentation/modules.txt>. The module + will be called squashfs. Note that the root file system (the one + containing the directory /) cannot be compiled as a module. + + If unsure, say N. + +config SQUASHFS_EMBEDDED + + bool "Additional option for memory-constrained systems" + depends on SQUASHFS + default n + help + Saying Y here allows you to specify cache size. + + If unsure, say N. + +config SQUASHFS_FRAGMENT_CACHE_SIZE + int "Number of fragments cached" if SQUASHFS_EMBEDDED + depends on SQUASHFS + default "3" + help + By default SquashFS caches the last 3 fragments read from + the filesystem. Increasing this amount may mean SquashFS + has to re-read fragments less often from disk, at the expense + of extra system memory. Decreasing this amount will mean + SquashFS uses less memory at the expense of extra reads from disk. + + Note there must be at least one cached fragment. Anything + much more than three will probably not make much difference. + config VXFS_FS tristate "FreeVxFS file system support (VERITAS VxFS(TM) compatible)" depends on BLOCK @@ -1085,7 +1175,7 @@ config UFS_DEBUG Y here. This will result in _many_ additional debugging messages to be written to the system log. -endmenu +endif # MISC_FILESYSTEMS menuconfig NETWORK_FILESYSTEMS bool "Network File Systems" diff --git a/fs/Makefile b/fs/Makefile index e6f423d1d22..38bc735c67a 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -54,6 +54,7 @@ obj-$(CONFIG_GENERIC_ACL) += generic_acl.o obj-$(CONFIG_QUOTA) += dquot.o obj-$(CONFIG_QFMT_V1) += quota_v1.o obj-$(CONFIG_QFMT_V2) += quota_v2.o +obj-$(CONFIG_QUOTA_TREE) += quota_tree.o obj-$(CONFIG_QUOTACTL) += quota.o obj-$(CONFIG_PROC_FS) += proc/ @@ -73,6 +74,7 @@ obj-$(CONFIG_JBD) += jbd/ obj-$(CONFIG_JBD2) += jbd2/ obj-$(CONFIG_EXT2_FS) += ext2/ obj-$(CONFIG_CRAMFS) += cramfs/ +obj-$(CONFIG_SQUASHFS) += squashfs/ obj-y += ramfs/ obj-$(CONFIG_HUGETLBFS) += hugetlbfs/ obj-$(CONFIG_CODA_FS) += coda/ @@ -118,4 +120,5 @@ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ +obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ diff --git a/fs/affs/file.c b/fs/affs/file.c index 1377b1240b6..9246cb4aa01 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -628,7 +628,7 @@ static int affs_write_begin_ofs(struct file *file, struct address_space *mapping } index = pos >> PAGE_CACHE_SHIFT; - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 415d9c67ac1..3c4ec7d864c 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -119,8 +119,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) goto bad_inode; #else inode->i_mode |= S_IFDIR; - inode->i_op = NULL; - inode->i_fop = NULL; + /* ... and leave ->i_op and ->i_fop pointing to empty */ break; #endif case ST_LINKFILE: diff --git a/fs/afs/write.c b/fs/afs/write.c index d6b85dab35f..3fb36d43362 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -144,7 +144,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, candidate->state = AFS_WBACK_PENDING; init_waitqueue_head(&candidate->waitq); - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { kfree(candidate); return -ENOMEM; diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index c773680d5c6..e1734f2d6e2 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -251,13 +251,11 @@ struct inode *autofs_iget(struct super_block *sb, unsigned long ino) inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; inode->i_nlink = 2; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_blocks = 0; if (ino == AUTOFS_ROOT_INO) { inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; inode->i_op = &autofs_root_inode_operations; inode->i_fop = &autofs_root_operations; - inode->i_uid = inode->i_gid = 0; /* Changed in read_super */ goto done; } diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index e0f16da00e5..a76803108d0 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -25,8 +25,6 @@ #define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION) #define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11) -#define AUTOFS_TYPE_TRIGGER (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET) - #include <linux/kernel.h> #include <linux/slab.h> #include <linux/time.h> diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 63b7c7afe8d..025e105bffe 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -124,7 +124,7 @@ static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) /* * Check sanity of parameter control fields and if a path is present - * check that it has a "/" and is terminated. + * check that it is terminated and contains at least one "/". */ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) { @@ -138,15 +138,16 @@ static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) } if (param->size > sizeof(*param)) { - err = check_name(param->path); + err = invalid_str(param->path, + (void *) ((size_t) param + param->size)); if (err) { - AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", - cmd); + AUTOFS_WARN( + "path string terminator missing for cmd(0x%08x)", + cmd); goto out; } - err = invalid_str(param->path, - (void *) ((size_t) param + param->size)); + err = check_name(param->path); if (err) { AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", cmd); @@ -180,7 +181,7 @@ static int autofs_dev_ioctl_protover(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - param->arg1 = sbi->version; + param->protover.version = sbi->version; return 0; } @@ -189,7 +190,7 @@ static int autofs_dev_ioctl_protosubver(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - param->arg1 = sbi->sub_version; + param->protosubver.sub_version = sbi->sub_version; return 0; } @@ -335,13 +336,13 @@ static int autofs_dev_ioctl_openmount(struct file *fp, int err, fd; /* param->path has already been checked */ - if (!param->arg1) + if (!param->openmount.devid) return -EINVAL; param->ioctlfd = -1; path = param->path; - devid = param->arg1; + devid = param->openmount.devid; err = 0; fd = autofs_dev_ioctl_open_mountpoint(path, devid); @@ -373,7 +374,7 @@ static int autofs_dev_ioctl_ready(struct file *fp, { autofs_wqt_t token; - token = (autofs_wqt_t) param->arg1; + token = (autofs_wqt_t) param->ready.token; return autofs4_wait_release(sbi, token, 0); } @@ -388,8 +389,8 @@ static int autofs_dev_ioctl_fail(struct file *fp, autofs_wqt_t token; int status; - token = (autofs_wqt_t) param->arg1; - status = param->arg2 ? param->arg2 : -ENOENT; + token = (autofs_wqt_t) param->fail.token; + status = param->fail.status ? param->fail.status : -ENOENT; return autofs4_wait_release(sbi, token, status); } @@ -412,10 +413,10 @@ static int autofs_dev_ioctl_setpipefd(struct file *fp, int pipefd; int err = 0; - if (param->arg1 == -1) + if (param->setpipefd.pipefd == -1) return -EINVAL; - pipefd = param->arg1; + pipefd = param->setpipefd.pipefd; mutex_lock(&sbi->wq_mutex); if (!sbi->catatonic) { @@ -457,8 +458,8 @@ static int autofs_dev_ioctl_timeout(struct file *fp, { unsigned long timeout; - timeout = param->arg1; - param->arg1 = sbi->exp_timeout / HZ; + timeout = param->timeout.timeout; + param->timeout.timeout = sbi->exp_timeout / HZ; sbi->exp_timeout = timeout * HZ; return 0; } @@ -489,7 +490,7 @@ static int autofs_dev_ioctl_requester(struct file *fp, path = param->path; devid = sbi->sb->s_dev; - param->arg1 = param->arg2 = -1; + param->requester.uid = param->requester.gid = -1; /* Get nameidata of the parent directory */ err = path_lookup(path, LOOKUP_PARENT, &nd); @@ -505,8 +506,8 @@ static int autofs_dev_ioctl_requester(struct file *fp, err = 0; autofs4_expire_wait(nd.path.dentry); spin_lock(&sbi->fs_lock); - param->arg1 = ino->uid; - param->arg2 = ino->gid; + param->requester.uid = ino->uid; + param->requester.gid = ino->gid; spin_unlock(&sbi->fs_lock); } @@ -529,10 +530,10 @@ static int autofs_dev_ioctl_expire(struct file *fp, int err = -EAGAIN; int how; - how = param->arg1; + how = param->expire.how; mnt = fp->f_path.mnt; - if (sbi->type & AUTOFS_TYPE_TRIGGER) + if (autofs_type_trigger(sbi->type)) dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how); else dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how); @@ -565,9 +566,9 @@ static int autofs_dev_ioctl_askumount(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - param->arg1 = 0; + param->askumount.may_umount = 0; if (may_umount(fp->f_path.mnt)) - param->arg1 = 1; + param->askumount.may_umount = 1; return 0; } @@ -600,6 +601,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, struct nameidata nd; const char *path; unsigned int type; + unsigned int devid, magic; int err = -ENOENT; if (param->size <= sizeof(*param)) { @@ -608,13 +610,13 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, } path = param->path; - type = param->arg1; + type = param->ismountpoint.in.type; - param->arg1 = 0; - param->arg2 = 0; + param->ismountpoint.out.devid = devid = 0; + param->ismountpoint.out.magic = magic = 0; if (!fp || param->ioctlfd == -1) { - if (type == AUTOFS_TYPE_ANY) { + if (autofs_type_any(type)) { struct super_block *sb; err = path_lookup(path, LOOKUP_FOLLOW, &nd); @@ -622,7 +624,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, goto out; sb = nd.path.dentry->d_sb; - param->arg1 = new_encode_dev(sb->s_dev); + devid = new_encode_dev(sb->s_dev); } else { struct autofs_info *ino; @@ -635,38 +637,41 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp, goto out_release; ino = autofs4_dentry_ino(nd.path.dentry); - param->arg1 = autofs4_get_dev(ino->sbi); + devid = autofs4_get_dev(ino->sbi); } err = 0; if (nd.path.dentry->d_inode && nd.path.mnt->mnt_root == nd.path.dentry) { err = 1; - param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic; + magic = nd.path.dentry->d_inode->i_sb->s_magic; } } else { - dev_t devid = new_encode_dev(sbi->sb->s_dev); + dev_t dev = autofs4_get_dev(sbi); err = path_lookup(path, LOOKUP_PARENT, &nd); if (err) goto out; - err = autofs_dev_ioctl_find_super(&nd, devid); + err = autofs_dev_ioctl_find_super(&nd, dev); if (err) goto out_release; - param->arg1 = autofs4_get_dev(sbi); + devid = dev; err = have_submounts(nd.path.dentry); if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) { if (follow_down(&nd.path.mnt, &nd.path.dentry)) { struct inode *inode = nd.path.dentry->d_inode; - param->arg2 = inode->i_sb->s_magic; + magic = inode->i_sb->s_magic; } } } + param->ismountpoint.out.devid = devid; + param->ismountpoint.out.magic = magic; + out_release: path_put(&nd.path); out: diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 4b6fb3f628c..e3bd50776f9 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -63,7 +63,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); /* This is an autofs submount, we can't expire it */ - if (sbi->type == AUTOFS_TYPE_INDIRECT) + if (autofs_type_indirect(sbi->type)) goto done; /* @@ -490,7 +490,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, if (arg && get_user(do_now, arg)) return -EFAULT; - if (sbi->type & AUTOFS_TYPE_TRIGGER) + if (autofs_type_trigger(sbi->type)) dentry = autofs4_expire_direct(sb, mnt, sbi, do_now); else dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 7b19802cfef..716e12b627b 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -197,9 +197,9 @@ static int autofs4_show_options(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, ",minproto=%d", sbi->min_proto); seq_printf(m, ",maxproto=%d", sbi->max_proto); - if (sbi->type & AUTOFS_TYPE_OFFSET) + if (autofs_type_offset(sbi->type)) seq_printf(m, ",offset"); - else if (sbi->type & AUTOFS_TYPE_DIRECT) + else if (autofs_type_direct(sbi->type)) seq_printf(m, ",direct"); else seq_printf(m, ",indirect"); @@ -284,13 +284,13 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, *maxproto = option; break; case Opt_indirect: - *type = AUTOFS_TYPE_INDIRECT; + set_autofs_type_indirect(type); break; case Opt_direct: - *type = AUTOFS_TYPE_DIRECT; + set_autofs_type_direct(type); break; case Opt_offset: - *type = AUTOFS_TYPE_OFFSET; + set_autofs_type_offset(type); break; default: return 1; @@ -338,7 +338,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; - sbi->type = AUTOFS_TYPE_INDIRECT; + set_autofs_type_indirect(&sbi->type); sbi->min_proto = 0; sbi->max_proto = 0; mutex_init(&sbi->wq_mutex); @@ -380,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) } root_inode->i_fop = &autofs4_root_operations; - root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ? + root_inode->i_op = autofs_type_trigger(sbi->type) ? &autofs4_direct_root_inode_operations : &autofs4_indirect_root_inode_operations; @@ -455,11 +455,7 @@ struct inode *autofs4_get_inode(struct super_block *sb, if (sb->s_root) { inode->i_uid = sb->s_root->d_inode->i_uid; inode->i_gid = sb->s_root->d_inode->i_gid; - } else { - inode->i_uid = 0; - inode->i_gid = 0; } - inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; if (S_ISDIR(inf->mode)) { diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index e02cc8ae5eb..eeb24684590 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -337,7 +337,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, * is very similar for indirect mounts except only dentrys * in the root of the autofs file system may be negative. */ - if (sbi->type & AUTOFS_TYPE_TRIGGER) + if (autofs_type_trigger(sbi->type)) return -ENOENT; else if (!IS_ROOT(dentry->d_parent)) return -ENOENT; @@ -348,7 +348,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, return -ENOMEM; /* If this is a direct mount request create a dummy name */ - if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER) + if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) qstr.len = sprintf(name, "%p", dentry); else { qstr.len = autofs4_getpath(sbi, dentry, &name); @@ -406,11 +406,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, type = autofs_ptype_expire_multi; } else { if (notify == NFY_MOUNT) - type = (sbi->type & AUTOFS_TYPE_TRIGGER) ? + type = autofs_type_trigger(sbi->type) ? autofs_ptype_missing_direct : autofs_ptype_missing_indirect; else - type = (sbi->type & AUTOFS_TYPE_TRIGGER) ? + type = autofs_type_trigger(sbi->type) ? autofs_ptype_expire_direct : autofs_ptype_expire_indirect; } diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index 0ed57b5ee01..cc4062d12ca 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -213,6 +213,9 @@ static void bfs_put_super(struct super_block *s) { struct bfs_sb_info *info = BFS_SB(s); + if (!info) + return; + brelse(info->si_sbh); mutex_destroy(&info->bfs_lock); kfree(info->si_imap); @@ -327,6 +330,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) unsigned i, imap_len; struct bfs_sb_info *info; long ret = -EINVAL; + unsigned long i_sblock, i_eblock, i_eoff, s_size; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) @@ -350,6 +354,12 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) s->s_magic = BFS_MAGIC; info->si_sbh = bh; + + if (le32_to_cpu(bfs_sb->s_start) > le32_to_cpu(bfs_sb->s_end)) { + printf("Superblock is corrupted\n"); + goto out; + } + info->si_lasti = (le32_to_cpu(bfs_sb->s_start) - BFS_BSIZE) / sizeof(struct bfs_inode) + BFS_ROOT_INO - 1; @@ -380,6 +390,18 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) - le32_to_cpu(bfs_sb->s_start)) >> BFS_BSIZE_BITS; info->si_freei = 0; info->si_lf_eblk = 0; + + /* can we read the last block? */ + bh = sb_bread(s, info->si_blocks - 1); + if (!bh) { + printf("Last block not available: %lu\n", info->si_blocks - 1); + iput(inode); + ret = -EIO; + kfree(info->si_imap); + goto out; + } + brelse(bh); + bh = NULL; for (i = BFS_ROOT_INO; i <= info->si_lasti; i++) { struct bfs_inode *di; @@ -397,6 +419,29 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) di = (struct bfs_inode *)bh->b_data + off; + /* test if filesystem is not corrupted */ + + i_eoff = le32_to_cpu(di->i_eoffset); + i_sblock = le32_to_cpu(di->i_sblock); + i_eblock = le32_to_cpu(di->i_eblock); + s_size = le32_to_cpu(bfs_sb->s_end); + + if (i_sblock > info->si_blocks || + i_eblock > info->si_blocks || + i_sblock > i_eblock || + i_eoff > s_size || + i_sblock * BFS_BSIZE > i_eoff) { + + printf("Inode 0x%08x corrupted\n", i); + + brelse(bh); + s->s_root = NULL; + kfree(info->si_imap); + kfree(info); + s->s_fs_info = NULL; + return -EIO; + } + if (!di->i_ino) { info->si_freei++; continue; diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index c41fa2af767..e3ff2b9e602 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -152,8 +152,10 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, elf_addr_t __user *sp; elf_addr_t __user *u_platform; elf_addr_t __user *u_base_platform; + elf_addr_t __user *u_rand_bytes; const char *k_platform = ELF_PLATFORM; const char *k_base_platform = ELF_BASE_PLATFORM; + unsigned char k_rand_bytes[16]; int items; elf_addr_t *elf_info; int ei_index = 0; @@ -196,6 +198,15 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, return -EFAULT; } + /* + * Generate 16 random bytes for userspace PRNG seeding. + */ + get_random_bytes(k_rand_bytes, sizeof(k_rand_bytes)); + u_rand_bytes = (elf_addr_t __user *) + STACK_ALLOC(p, sizeof(k_rand_bytes)); + if (__copy_to_user(u_rand_bytes, k_rand_bytes, sizeof(k_rand_bytes))) + return -EFAULT; + /* Create the ELF interpreter info */ elf_info = (elf_addr_t *)current->mm->saved_auxv; /* update AT_VECTOR_SIZE_BASE if the number of NEW_AUX_ENT() changes */ @@ -228,6 +239,7 @@ create_elf_tables(struct linux_binprm *bprm, struct elfhdr *exec, NEW_AUX_ENT(AT_GID, cred->gid); NEW_AUX_ENT(AT_EGID, cred->egid); NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_RANDOM, (elf_addr_t)(unsigned long)u_rand_bytes); NEW_AUX_ENT(AT_EXECFN, bprm->exec); if (k_platform) { NEW_AUX_ENT(AT_PLATFORM, diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index aa5b43205e3..f3e72c5c19f 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -168,9 +168,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, struct elf_fdpic_params exec_params, interp_params; struct elf_phdr *phdr; unsigned long stack_size, entryaddr; -#ifndef CONFIG_MMU - unsigned long fullsize; -#endif #ifdef ELF_FDPIC_PLAT_INIT unsigned long dynaddr; #endif @@ -390,11 +387,6 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, goto error_kill; } - /* expand the stack mapping to use up the entire allocation granule */ - fullsize = kobjsize((char *) current->mm->start_brk); - if (!IS_ERR_VALUE(do_mremap(current->mm->start_brk, stack_size, - fullsize, 0, 0))) - stack_size = fullsize; up_write(¤t->mm->mmap_sem); current->mm->brk = current->mm->start_brk; @@ -1567,11 +1559,9 @@ end_coredump: static int elf_fdpic_dump_segments(struct file *file, size_t *size, unsigned long *limit, unsigned long mm_flags) { - struct vm_list_struct *vml; - - for (vml = current->mm->context.vmlist; vml; vml = vml->next) { - struct vm_area_struct *vma = vml->vma; + struct vm_area_struct *vma; + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { if (!maydump(vma, mm_flags)) continue; @@ -1617,9 +1607,6 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, elf_fpxregset_t *xfpu = NULL; #endif int thread_status_size = 0; -#ifndef CONFIG_MMU - struct vm_list_struct *vml; -#endif elf_addr_t *auxv; unsigned long mm_flags; @@ -1685,13 +1672,7 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, fill_prstatus(prstatus, current, signr); elf_core_copy_regs(&prstatus->pr_reg, regs); -#ifdef CONFIG_MMU segs = current->mm->map_count; -#else - segs = 0; - for (vml = current->mm->context.vmlist; vml; vml = vml->next) - segs++; -#endif #ifdef ELF_CORE_EXTRA_PHDRS segs += ELF_CORE_EXTRA_PHDRS; #endif @@ -1766,20 +1747,10 @@ static int elf_fdpic_core_dump(long signr, struct pt_regs *regs, mm_flags = current->mm->flags; /* write program headers for segments dump */ - for ( -#ifdef CONFIG_MMU - vma = current->mm->mmap; vma; vma = vma->vm_next -#else - vml = current->mm->context.vmlist; vml; vml = vml->next -#endif - ) { + for (vma = current->mm->mmap; vma; vma = vma->vm_next) { struct elf_phdr phdr; size_t sz; -#ifndef CONFIG_MMU - vma = vml->vma; -#endif - sz = vma->vm_end - vma->vm_start; phdr.p_type = PT_LOAD; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 7bbd5c6b372..5cebf0b3779 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -417,8 +417,8 @@ static int load_flat_file(struct linux_binprm * bprm, unsigned long textpos = 0, datapos = 0, result; unsigned long realdatastart = 0; unsigned long text_len, data_len, bss_len, stack_len, flags; - unsigned long len, reallen, memp = 0; - unsigned long extra, rlim; + unsigned long len, memp = 0; + unsigned long memp_size, extra, rlim; unsigned long *reloc = 0, *rp; struct inode *inode; int i, rev, relocs = 0; @@ -543,17 +543,10 @@ static int load_flat_file(struct linux_binprm * bprm, } len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); + len = PAGE_ALIGN(len); down_write(¤t->mm->mmap_sem); realdatastart = do_mmap(0, 0, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); - /* Remap to use all availabe slack region space */ - if (realdatastart && (realdatastart < (unsigned long)-4096)) { - reallen = kobjsize((void *)realdatastart); - if (reallen > len) { - realdatastart = do_mremap(realdatastart, len, - reallen, MREMAP_FIXED, realdatastart); - } - } up_write(¤t->mm->mmap_sem); if (realdatastart == 0 || realdatastart >= (unsigned long)-4096) { @@ -591,21 +584,14 @@ static int load_flat_file(struct linux_binprm * bprm, reloc = (unsigned long *) (datapos+(ntohl(hdr->reloc_start)-text_len)); memp = realdatastart; - + memp_size = len; } else { len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); + len = PAGE_ALIGN(len); down_write(¤t->mm->mmap_sem); textpos = do_mmap(0, 0, len, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); - /* Remap to use all availabe slack region space */ - if (textpos && (textpos < (unsigned long) -4096)) { - reallen = kobjsize((void *)textpos); - if (reallen > len) { - textpos = do_mremap(textpos, len, reallen, - MREMAP_FIXED, textpos); - } - } up_write(¤t->mm->mmap_sem); if (!textpos || textpos >= (unsigned long) -4096) { @@ -622,7 +608,7 @@ static int load_flat_file(struct linux_binprm * bprm, reloc = (unsigned long *) (textpos + ntohl(hdr->reloc_start) + MAX_SHARED_LIBS * sizeof(unsigned long)); memp = textpos; - + memp_size = len; #ifdef CONFIG_BINFMT_ZFLAT /* * load it all in and treat it like a RAM load from now on @@ -680,10 +666,12 @@ static int load_flat_file(struct linux_binprm * bprm, * set up the brk stuff, uses any slack left in data/bss/stack * allocation. We put the brk after the bss (between the bss * and stack) like other platforms. + * Userspace code relies on the stack pointer starting out at + * an address right at the end of a page. */ current->mm->start_brk = datapos + data_len + bss_len; current->mm->brk = (current->mm->start_brk + 3) & ~3; - current->mm->context.end_brk = memp + kobjsize((void *) memp) - stack_len; + current->mm->context.end_brk = memp + memp_size - stack_len; } if (flags & FLAT_FLAG_KTRACE) @@ -790,8 +778,8 @@ static int load_flat_file(struct linux_binprm * bprm, /* zero the BSS, BRK and stack areas */ memset((void*)(datapos + data_len), 0, bss_len + - (memp + kobjsize((void *) memp) - stack_len - /* end brk */ - libinfo->lib_list[id].start_brk) + /* start brk */ + (memp + memp_size - stack_len - /* end brk */ + libinfo->lib_list[id].start_brk) + /* start brk */ stack_len); return 0; diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index f2744ab4e5b..c4e83537ead 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -496,9 +496,6 @@ static struct inode *bm_get_inode(struct super_block *sb, int mode) if (inode) { inode->i_mode = mode; - inode->i_uid = 0; - inode->i_gid = 0; - inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = current_fs_time(inode->i_sb); } @@ -652,7 +649,7 @@ static const struct file_operations bm_register_operations = { static ssize_t bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos) { - char *s = enabled ? "enabled" : "disabled"; + char *s = enabled ? "enabled\n" : "disabled\n"; return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s)); } @@ -788,6 +788,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, int i, ret; int nr_pages = 0; unsigned int len = 0; + unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0; for (i = 0; i < iov_count; i++) { unsigned long uaddr; @@ -814,35 +815,42 @@ struct bio *bio_copy_user_iov(struct request_queue *q, bio->bi_rw |= (!write_to_vm << BIO_RW); ret = 0; - i = 0; + + if (map_data) { + nr_pages = 1 << map_data->page_order; + i = map_data->offset / PAGE_SIZE; + } while (len) { - unsigned int bytes; + unsigned int bytes = PAGE_SIZE; - if (map_data) - bytes = 1U << (PAGE_SHIFT + map_data->page_order); - else - bytes = PAGE_SIZE; + bytes -= offset; if (bytes > len) bytes = len; if (map_data) { - if (i == map_data->nr_entries) { + if (i == map_data->nr_entries * nr_pages) { ret = -ENOMEM; break; } - page = map_data->pages[i++]; - } else + + page = map_data->pages[i / nr_pages]; + page += (i % nr_pages); + + i++; + } else { page = alloc_page(q->bounce_gfp | gfp_mask); - if (!page) { - ret = -ENOMEM; - break; + if (!page) { + ret = -ENOMEM; + break; + } } - if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) + if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) break; len -= bytes; + offset = 0; } if (ret) @@ -851,7 +859,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, /* * success */ - if (!write_to_vm) { + if (!write_to_vm && (!map_data || !map_data->null_mapped)) { ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0); if (ret) goto cleanup; diff --git a/fs/block_dev.c b/fs/block_dev.c index 349a26c1000..ac7031f12ea 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1005,6 +1005,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } lock_kernel(); + restart: ret = -ENXIO; disk = get_gendisk(bdev->bd_dev, &partno); @@ -1025,6 +1026,19 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (disk->fops->open) { ret = disk->fops->open(bdev, mode); + if (ret == -ERESTARTSYS) { + /* Lost a race with 'disk' being + * deleted, try again. + * See md.c + */ + disk_put_part(bdev->bd_part); + bdev->bd_part = NULL; + module_put(disk->fops->owner); + put_disk(disk); + bdev->bd_disk = NULL; + mutex_unlock(&bdev->bd_mutex); + goto restart; + } if (ret) goto out_clear; } @@ -1220,6 +1234,20 @@ static long block_ioctl(struct file *file, unsigned cmd, unsigned long arg) return blkdev_ioctl(bdev, mode, cmd, arg); } +/* + * Try to release a page associated with block device when the system + * is under memory pressure. + */ +static int blkdev_releasepage(struct page *page, gfp_t wait) +{ + struct super_block *super = BDEV_I(page->mapping->host)->bdev.bd_super; + + if (super && super->s_op->bdev_try_to_free_page) + return super->s_op->bdev_try_to_free_page(super, page, wait); + + return try_to_free_buffers(page); +} + static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, .writepage = blkdev_writepage, @@ -1227,6 +1255,7 @@ static const struct address_space_operations def_blk_aops = { .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .writepages = generic_writepages, + .releasepage = blkdev_releasepage, .direct_IO = blkdev_direct_IO, }; @@ -1262,7 +1291,7 @@ EXPORT_SYMBOL(ioctl_by_bdev); /** * lookup_bdev - lookup a struct block_device by name - * @path: special file representing the block device + * @pathname: special file representing the block device * * Get a reference to the blockdevice at @pathname in the current * namespace if possible and return it. Return ERR_PTR(error) diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile new file mode 100644 index 00000000000..d2cf5a54a4b --- /dev/null +++ b/fs/btrfs/Makefile @@ -0,0 +1,25 @@ +ifneq ($(KERNELRELEASE),) +# kbuild part of makefile + +obj-$(CONFIG_BTRFS_FS) := btrfs.o +btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + file-item.o inode-item.o inode-map.o disk-io.o \ + transaction.o inode.o file.o tree-defrag.o \ + extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ + extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ + ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ + compression.o +else + +# Normal Makefile + +KERNELDIR := /lib/modules/`uname -r`/build +all: + $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules + +modules_install: + $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install +clean: + $(MAKE) -C $(KERNELDIR) M=`pwd` clean + +endif diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c new file mode 100644 index 00000000000..1d53b62dbba --- /dev/null +++ b/fs/btrfs/acl.c @@ -0,0 +1,351 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> +#include <linux/posix_acl.h> +#include <linux/sched.h> + +#include "ctree.h" +#include "btrfs_inode.h" +#include "xattr.h" + +#ifdef CONFIG_FS_POSIX_ACL + +static void btrfs_update_cached_acl(struct inode *inode, + struct posix_acl **p_acl, + struct posix_acl *acl) +{ + spin_lock(&inode->i_lock); + if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED) + posix_acl_release(*p_acl); + *p_acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); +} + +static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) +{ + int size; + const char *name; + char *value = NULL; + struct posix_acl *acl = NULL, **p_acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + p_acl = &BTRFS_I(inode)->i_acl; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + p_acl = &BTRFS_I(inode)->i_default_acl; + break; + default: + return ERR_PTR(-EINVAL); + } + + spin_lock(&inode->i_lock); + if (*p_acl != BTRFS_ACL_NOT_CACHED) + acl = posix_acl_dup(*p_acl); + spin_unlock(&inode->i_lock); + + if (acl) + return acl; + + + size = __btrfs_getxattr(inode, name, "", 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = __btrfs_getxattr(inode, name, value, size); + if (size > 0) { + acl = posix_acl_from_xattr(value, size); + btrfs_update_cached_acl(inode, p_acl, acl); + } + kfree(value); + } else if (size == -ENOENT) { + acl = NULL; + btrfs_update_cached_acl(inode, p_acl, acl); + } + + return acl; +} + +static int btrfs_xattr_get_acl(struct inode *inode, int type, + void *value, size_t size) +{ + struct posix_acl *acl; + int ret = 0; + + acl = btrfs_get_acl(inode, type); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + ret = posix_acl_to_xattr(acl, value, size); + posix_acl_release(acl); + + return ret; +} + +/* + * Needs to be called with fs_mutex held + */ +static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int ret, size = 0; + const char *name; + struct posix_acl **p_acl; + char *value = NULL; + mode_t mode; + + if (acl) { + ret = posix_acl_valid(acl); + if (ret < 0) + return ret; + ret = 0; + } + + switch (type) { + case ACL_TYPE_ACCESS: + mode = inode->i_mode; + ret = posix_acl_equiv_mode(acl, &mode); + if (ret < 0) + return ret; + ret = 0; + inode->i_mode = mode; + name = POSIX_ACL_XATTR_ACCESS; + p_acl = &BTRFS_I(inode)->i_acl; + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? -EINVAL : 0; + name = POSIX_ACL_XATTR_DEFAULT; + p_acl = &BTRFS_I(inode)->i_default_acl; + break; + default: + return -EINVAL; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) { + ret = -ENOMEM; + goto out; + } + + ret = posix_acl_to_xattr(acl, value, size); + if (ret < 0) + goto out; + } + + ret = __btrfs_setxattr(inode, name, value, size, 0); + +out: + kfree(value); + + if (!ret) + btrfs_update_cached_acl(inode, p_acl, acl); + + return ret; +} + +static int btrfs_xattr_set_acl(struct inode *inode, int type, + const void *value, size_t size) +{ + int ret = 0; + struct posix_acl *acl = NULL; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (acl == NULL) { + value = NULL; + size = 0; + } else if (IS_ERR(acl)) { + return PTR_ERR(acl); + } + } + + ret = btrfs_set_acl(inode, acl, type); + + posix_acl_release(acl); + + return ret; +} + + +static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name, + void *value, size_t size) +{ + return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size); +} + +static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); +} + +static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name, + void *value, size_t size) +{ + return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size); +} + +static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); +} + +int btrfs_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl; + int error = -EAGAIN; + + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + } + + return error; +} + +/* + * btrfs_init_acl is already generally called under fs_mutex, so the locking + * stuff has been fixed to work with that. If the locking stuff changes, we + * need to re-evaluate the acl locking stuff. + */ +int btrfs_init_acl(struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int ret = 0; + + /* this happens with subvols */ + if (!dir) + return 0; + + if (!S_ISLNK(inode->i_mode)) { + if (IS_POSIXACL(dir)) { + acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + + if (!acl) + inode->i_mode &= ~current->fs->umask; + } + + if (IS_POSIXACL(dir) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT); + if (ret) + goto failed; + } + clone = posix_acl_clone(acl, GFP_NOFS); + ret = -ENOMEM; + if (!clone) + goto failed; + + mode = inode->i_mode; + ret = posix_acl_create_masq(clone, &mode); + if (ret >= 0) { + inode->i_mode = mode; + if (ret > 0) { + /* we need an acl */ + ret = btrfs_set_acl(inode, clone, + ACL_TYPE_ACCESS); + } + } + } +failed: + posix_acl_release(acl); + + return ret; +} + +int btrfs_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int ret = 0; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!IS_POSIXACL(inode)) + return 0; + + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + + ret = posix_acl_chmod_masq(clone, inode->i_mode); + if (!ret) + ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS); + + posix_acl_release(clone); + + return ret; +} + +struct xattr_handler btrfs_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .get = btrfs_xattr_acl_default_get, + .set = btrfs_xattr_acl_default_set, +}; + +struct xattr_handler btrfs_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .get = btrfs_xattr_acl_access_get, + .set = btrfs_xattr_acl_access_set, +}; + +#else /* CONFIG_FS_POSIX_ACL */ + +int btrfs_acl_chmod(struct inode *inode) +{ + return 0; +} + +int btrfs_init_acl(struct inode *inode, struct inode *dir) +{ + return 0; +} + +int btrfs_check_acl(struct inode *inode, int mask) +{ + return 0; +} + +#endif /* CONFIG_FS_POSIX_ACL */ diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c new file mode 100644 index 00000000000..8e2fec05dbe --- /dev/null +++ b/fs/btrfs/async-thread.c @@ -0,0 +1,419 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/version.h> +#include <linux/kthread.h> +#include <linux/list.h> +#include <linux/spinlock.h> +# include <linux/freezer.h> +#include "async-thread.h" + +#define WORK_QUEUED_BIT 0 +#define WORK_DONE_BIT 1 +#define WORK_ORDER_DONE_BIT 2 + +/* + * container for the kthread task pointer and the list of pending work + * One of these is allocated per thread. + */ +struct btrfs_worker_thread { + /* pool we belong to */ + struct btrfs_workers *workers; + + /* list of struct btrfs_work that are waiting for service */ + struct list_head pending; + + /* list of worker threads from struct btrfs_workers */ + struct list_head worker_list; + + /* kthread */ + struct task_struct *task; + + /* number of things on the pending list */ + atomic_t num_pending; + + unsigned long sequence; + + /* protects the pending list. */ + spinlock_t lock; + + /* set to non-zero when this thread is already awake and kicking */ + int working; + + /* are we currently idle */ + int idle; +}; + +/* + * helper function to move a thread onto the idle list after it + * has finished some requests. + */ +static void check_idle_worker(struct btrfs_worker_thread *worker) +{ + if (!worker->idle && atomic_read(&worker->num_pending) < + worker->workers->idle_thresh / 2) { + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 1; + list_move(&worker->worker_list, &worker->workers->idle_list); + spin_unlock_irqrestore(&worker->workers->lock, flags); + } +} + +/* + * helper function to move a thread off the idle list after new + * pending work is added. + */ +static void check_busy_worker(struct btrfs_worker_thread *worker) +{ + if (worker->idle && atomic_read(&worker->num_pending) >= + worker->workers->idle_thresh) { + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + spin_unlock_irqrestore(&worker->workers->lock, flags); + } +} + +static noinline int run_ordered_completions(struct btrfs_workers *workers, + struct btrfs_work *work) +{ + unsigned long flags; + + if (!workers->ordered) + return 0; + + set_bit(WORK_DONE_BIT, &work->flags); + + spin_lock_irqsave(&workers->lock, flags); + + while (!list_empty(&workers->order_list)) { + work = list_entry(workers->order_list.next, + struct btrfs_work, order_list); + + if (!test_bit(WORK_DONE_BIT, &work->flags)) + break; + + /* we are going to call the ordered done function, but + * we leave the work item on the list as a barrier so + * that later work items that are done don't have their + * functions called before this one returns + */ + if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) + break; + + spin_unlock_irqrestore(&workers->lock, flags); + + work->ordered_func(work); + + /* now take the lock again and call the freeing code */ + spin_lock_irqsave(&workers->lock, flags); + list_del(&work->order_list); + work->ordered_free(work); + } + + spin_unlock_irqrestore(&workers->lock, flags); + return 0; +} + +/* + * main loop for servicing work items + */ +static int worker_loop(void *arg) +{ + struct btrfs_worker_thread *worker = arg; + struct list_head *cur; + struct btrfs_work *work; + do { + spin_lock_irq(&worker->lock); + while (!list_empty(&worker->pending)) { + cur = worker->pending.next; + work = list_entry(cur, struct btrfs_work, list); + list_del(&work->list); + clear_bit(WORK_QUEUED_BIT, &work->flags); + + work->worker = worker; + spin_unlock_irq(&worker->lock); + + work->func(work); + + atomic_dec(&worker->num_pending); + /* + * unless this is an ordered work queue, + * 'work' was probably freed by func above. + */ + run_ordered_completions(worker->workers, work); + + spin_lock_irq(&worker->lock); + check_idle_worker(worker); + + } + worker->working = 0; + if (freezing(current)) { + refrigerator(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&worker->lock); + if (!kthread_should_stop()) + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +/* + * this will wait for all the worker threads to shutdown + */ +int btrfs_stop_workers(struct btrfs_workers *workers) +{ + struct list_head *cur; + struct btrfs_worker_thread *worker; + + list_splice_init(&workers->idle_list, &workers->worker_list); + while (!list_empty(&workers->worker_list)) { + cur = workers->worker_list.next; + worker = list_entry(cur, struct btrfs_worker_thread, + worker_list); + kthread_stop(worker->task); + list_del(&worker->worker_list); + kfree(worker); + } + return 0; +} + +/* + * simple init on struct btrfs_workers + */ +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) +{ + workers->num_workers = 0; + INIT_LIST_HEAD(&workers->worker_list); + INIT_LIST_HEAD(&workers->idle_list); + INIT_LIST_HEAD(&workers->order_list); + spin_lock_init(&workers->lock); + workers->max_workers = max; + workers->idle_thresh = 32; + workers->name = name; + workers->ordered = 0; +} + +/* + * starts new worker threads. This does not enforce the max worker + * count in case you need to temporarily go past it. + */ +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +{ + struct btrfs_worker_thread *worker; + int ret = 0; + int i; + + for (i = 0; i < num_workers; i++) { + worker = kzalloc(sizeof(*worker), GFP_NOFS); + if (!worker) { + ret = -ENOMEM; + goto fail; + } + + INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->worker_list); + spin_lock_init(&worker->lock); + atomic_set(&worker->num_pending, 0); + worker->task = kthread_run(worker_loop, worker, + "btrfs-%s-%d", workers->name, + workers->num_workers + i); + worker->workers = workers; + if (IS_ERR(worker->task)) { + kfree(worker); + ret = PTR_ERR(worker->task); + goto fail; + } + + spin_lock_irq(&workers->lock); + list_add_tail(&worker->worker_list, &workers->idle_list); + worker->idle = 1; + workers->num_workers++; + spin_unlock_irq(&workers->lock); + } + return 0; +fail: + btrfs_stop_workers(workers); + return ret; +} + +/* + * run through the list and find a worker thread that doesn't have a lot + * to do right now. This can return null if we aren't yet at the thread + * count limit and all of the threads are busy. + */ +static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + struct list_head *next; + int enforce_min = workers->num_workers < workers->max_workers; + + /* + * if we find an idle thread, don't move it to the end of the + * idle list. This improves the chance that the next submission + * will reuse the same thread, and maybe catch it while it is still + * working + */ + if (!list_empty(&workers->idle_list)) { + next = workers->idle_list.next; + worker = list_entry(next, struct btrfs_worker_thread, + worker_list); + return worker; + } + if (enforce_min || list_empty(&workers->worker_list)) + return NULL; + + /* + * if we pick a busy task, move the task to the end of the list. + * hopefully this will keep things somewhat evenly balanced. + * Do the move in batches based on the sequence number. This groups + * requests submitted at roughly the same time onto the same worker. + */ + next = workers->worker_list.next; + worker = list_entry(next, struct btrfs_worker_thread, worker_list); + atomic_inc(&worker->num_pending); + worker->sequence++; + + if (worker->sequence % workers->idle_thresh == 0) + list_move_tail(next, &workers->worker_list); + return worker; +} + +/* + * selects a worker thread to take the next job. This will either find + * an idle worker, start a new worker up to the max count, or just return + * one of the existing busy workers. + */ +static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + +again: + spin_lock_irqsave(&workers->lock, flags); + worker = next_worker(workers); + spin_unlock_irqrestore(&workers->lock, flags); + + if (!worker) { + spin_lock_irqsave(&workers->lock, flags); + if (workers->num_workers >= workers->max_workers) { + struct list_head *fallback = NULL; + /* + * we have failed to find any workers, just + * return the force one + */ + if (!list_empty(&workers->worker_list)) + fallback = workers->worker_list.next; + if (!list_empty(&workers->idle_list)) + fallback = workers->idle_list.next; + BUG_ON(!fallback); + worker = list_entry(fallback, + struct btrfs_worker_thread, worker_list); + spin_unlock_irqrestore(&workers->lock, flags); + } else { + spin_unlock_irqrestore(&workers->lock, flags); + /* we're below the limit, start another worker */ + btrfs_start_workers(workers, 1); + goto again; + } + } + return worker; +} + +/* + * btrfs_requeue_work just puts the work item back on the tail of the list + * it was taken from. It is intended for use with long running work functions + * that make some progress and want to give the cpu up for others. + */ +int btrfs_requeue_work(struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker = work->worker; + unsigned long flags; + + if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + goto out; + + spin_lock_irqsave(&worker->lock, flags); + atomic_inc(&worker->num_pending); + list_add_tail(&work->list, &worker->pending); + + /* by definition we're busy, take ourselves off the idle + * list + */ + if (worker->idle) { + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + spin_unlock_irqrestore(&worker->workers->lock, flags); + } + + spin_unlock_irqrestore(&worker->lock, flags); + +out: + return 0; +} + +/* + * places a struct btrfs_work into the pending queue of one of the kthreads + */ +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + int wake = 0; + + /* don't requeue something already on a list */ + if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + goto out; + + worker = find_worker(workers); + if (workers->ordered) { + spin_lock_irqsave(&workers->lock, flags); + list_add_tail(&work->order_list, &workers->order_list); + spin_unlock_irqrestore(&workers->lock, flags); + } else { + INIT_LIST_HEAD(&work->order_list); + } + + spin_lock_irqsave(&worker->lock, flags); + atomic_inc(&worker->num_pending); + check_busy_worker(worker); + list_add_tail(&work->list, &worker->pending); + + /* + * avoid calling into wake_up_process if this thread has already + * been kicked + */ + if (!worker->working) + wake = 1; + worker->working = 1; + + spin_unlock_irqrestore(&worker->lock, flags); + + if (wake) + wake_up_process(worker->task); +out: + return 0; +} diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h new file mode 100644 index 00000000000..31be4ed8b63 --- /dev/null +++ b/fs/btrfs/async-thread.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ASYNC_THREAD_ +#define __BTRFS_ASYNC_THREAD_ + +struct btrfs_worker_thread; + +/* + * This is similar to a workqueue, but it is meant to spread the operations + * across all available cpus instead of just the CPU that was used to + * queue the work. There is also some batching introduced to try and + * cut down on context switches. + * + * By default threads are added on demand up to 2 * the number of cpus. + * Changing struct btrfs_workers->max_workers is one way to prevent + * demand creation of kthreads. + * + * the basic model of these worker threads is to embed a btrfs_work + * structure in your own data struct, and use container_of in a + * work function to get back to your data struct. + */ +struct btrfs_work { + /* + * func should be set to the function you want called + * your work struct is passed as the only arg + * + * ordered_func must be set for work sent to an ordered work queue, + * and it is called to complete a given work item in the same + * order they were sent to the queue. + */ + void (*func)(struct btrfs_work *work); + void (*ordered_func)(struct btrfs_work *work); + void (*ordered_free)(struct btrfs_work *work); + + /* + * flags should be set to zero. It is used to make sure the + * struct is only inserted once into the list. + */ + unsigned long flags; + + /* don't touch these */ + struct btrfs_worker_thread *worker; + struct list_head list; + struct list_head order_list; +}; + +struct btrfs_workers { + /* current number of running workers */ + int num_workers; + + /* max number of workers allowed. changed by btrfs_start_workers */ + int max_workers; + + /* once a worker has this many requests or fewer, it is idle */ + int idle_thresh; + + /* force completions in the order they were queued */ + int ordered; + + /* list with all the work threads. The workers on the idle thread + * may be actively servicing jobs, but they haven't yet hit the + * idle thresh limit above. + */ + struct list_head worker_list; + struct list_head idle_list; + + /* + * when operating in ordered mode, this maintains the list + * of work items waiting for completion + */ + struct list_head order_list; + + /* lock for finding the next worker thread to queue on */ + spinlock_t lock; + + /* extra name for this worker, used for current->name */ + char *name; +}; + +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); +int btrfs_stop_workers(struct btrfs_workers *workers); +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); +int btrfs_requeue_work(struct btrfs_work *work); +#endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h new file mode 100644 index 00000000000..a8c9693b75a --- /dev/null +++ b/fs/btrfs/btrfs_inode.h @@ -0,0 +1,131 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_I__ +#define __BTRFS_I__ + +#include "extent_map.h" +#include "extent_io.h" +#include "ordered-data.h" + +/* in memory btrfs inode */ +struct btrfs_inode { + /* which subvolume this inode belongs to */ + struct btrfs_root *root; + + /* key used to find this inode on disk. This is used by the code + * to read in roots of subvolumes + */ + struct btrfs_key location; + + /* the extent_tree has caches of all the extent mappings to disk */ + struct extent_map_tree extent_tree; + + /* the io_tree does range state (DIRTY, LOCKED etc) */ + struct extent_io_tree io_tree; + + /* special utility tree used to record which mirrors have already been + * tried when checksums fail for a given block + */ + struct extent_io_tree io_failure_tree; + + /* held while inesrting or deleting extents from files */ + struct mutex extent_mutex; + + /* held while logging the inode in tree-log.c */ + struct mutex log_mutex; + + /* used to order data wrt metadata */ + struct btrfs_ordered_inode_tree ordered_tree; + + /* standard acl pointers */ + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; + + /* for keeping track of orphaned inodes */ + struct list_head i_orphan; + + /* list of all the delalloc inodes in the FS. There are times we need + * to write all the delalloc pages to disk, and this list is used + * to walk them all. + */ + struct list_head delalloc_inodes; + + /* full 64 bit generation number, struct vfs_inode doesn't have a big + * enough field for this. + */ + u64 generation; + + /* sequence number for NFS changes */ + u64 sequence; + + /* + * transid of the trans_handle that last modified this inode + */ + u64 last_trans; + /* + * transid that last logged this inode + */ + u64 logged_trans; + + /* + * trans that last made a change that should be fully fsync'd. This + * gets reset to zero each time the inode is logged + */ + u64 log_dirty_trans; + + /* total number of bytes pending delalloc, used by stat to calc the + * real block usage of the file + */ + u64 delalloc_bytes; + + /* + * the size of the file stored in the metadata on disk. data=ordered + * means the in-memory i_size might be larger than the size on disk + * because not all the blocks are written yet. + */ + u64 disk_i_size; + + /* flags field from the on disk inode */ + u32 flags; + + /* + * if this is a directory then index_cnt is the counter for the index + * number for new files that are created + */ + u64 index_cnt; + + /* the start of block group preferred for allocations. */ + u64 block_group; + + struct inode vfs_inode; +}; + +static inline struct btrfs_inode *BTRFS_I(struct inode *inode) +{ + return container_of(inode, struct btrfs_inode, vfs_inode); +} + +static inline void btrfs_i_size_write(struct inode *inode, u64 size) +{ + inode->i_size = size; + BTRFS_I(inode)->disk_i_size = size; +} + + +#endif diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h new file mode 100644 index 00000000000..7c4503ef6ef --- /dev/null +++ b/fs/btrfs/compat.h @@ -0,0 +1,7 @@ +#ifndef _COMPAT_H_ +#define _COMPAT_H_ + +#define btrfs_drop_nlink(inode) drop_nlink(inode) +#define btrfs_inc_nlink(inode) inc_nlink(inode) + +#endif /* _COMPAT_H_ */ diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c new file mode 100644 index 00000000000..ee848d8585d --- /dev/null +++ b/fs/btrfs/compression.c @@ -0,0 +1,709 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/bit_spinlock.h> +#include <linux/version.h> +#include <linux/pagevec.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "ordered-data.h" +#include "compression.h" +#include "extent_io.h" +#include "extent_map.h" + +struct compressed_bio { + /* number of bios pending for this compressed extent */ + atomic_t pending_bios; + + /* the pages with the compressed data on them */ + struct page **compressed_pages; + + /* inode that owns this data */ + struct inode *inode; + + /* starting offset in the inode for our pages */ + u64 start; + + /* number of bytes in the inode we're working on */ + unsigned long len; + + /* number of bytes on disk */ + unsigned long compressed_len; + + /* number of compressed pages in the array */ + unsigned long nr_pages; + + /* IO errors */ + int errors; + int mirror_num; + + /* for reads, this is the bio we are copying the data into */ + struct bio *orig_bio; + + /* + * the start of a variable length array of checksums only + * used by reads + */ + u32 sums; +}; + +static inline int compressed_bio_size(struct btrfs_root *root, + unsigned long disk_size) +{ + u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + return sizeof(struct compressed_bio) + + ((disk_size + root->sectorsize - 1) / root->sectorsize) * + csum_size; +} + +static struct bio *compressed_bio_alloc(struct block_device *bdev, + u64 first_byte, gfp_t gfp_flags) +{ + struct bio *bio; + int nr_vecs; + + nr_vecs = bio_get_nr_vecs(bdev); + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_size = 0; + bio->bi_bdev = bdev; + bio->bi_sector = first_byte >> 9; + } + return bio; +} + +static int check_compressed_csum(struct inode *inode, + struct compressed_bio *cb, + u64 disk_start) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page *page; + unsigned long i; + char *kaddr; + u32 csum; + u32 *cb_sum = &cb->sums; + + if (btrfs_test_flag(inode, NODATASUM)) + return 0; + + for (i = 0; i < cb->nr_pages; i++) { + page = cb->compressed_pages[i]; + csum = ~(u32)0; + + kaddr = kmap_atomic(page, KM_USER0); + csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE); + btrfs_csum_final(csum, (char *)&csum); + kunmap_atomic(kaddr, KM_USER0); + + if (csum != *cb_sum) { + printk(KERN_INFO "btrfs csum failed ino %lu " + "extent %llu csum %u " + "wanted %u mirror %d\n", inode->i_ino, + (unsigned long long)disk_start, + csum, *cb_sum, cb->mirror_num); + ret = -EIO; + goto fail; + } + cb_sum++; + + } + ret = 0; +fail: + return ret; +} + +/* when we finish reading compressed pages from the disk, we + * decompress them and then run the bio end_io routines on the + * decompressed pages (in the inode address space). + * + * This allows the checksumming and other IO error handling routines + * to work normally + * + * The compressed pages are freed here, and it must be run + * in process context + */ +static void end_compressed_bio_read(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + int ret; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + inode = cb->inode; + ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9); + if (ret) + goto csum_failed; + + /* ok, we're the last bio for this extent, lets start + * the decompression. + */ + tree = &BTRFS_I(inode)->io_tree; + ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, + cb->start, + cb->orig_bio->bi_io_vec, + cb->orig_bio->bi_vcnt, + cb->compressed_len); +csum_failed: + if (ret) + cb->errors = 1; + + /* release the compressed pages */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* do io completion on the original bio */ + if (cb->errors) { + bio_io_error(cb->orig_bio); + } else { + int bio_index = 0; + struct bio_vec *bvec = cb->orig_bio->bi_io_vec; + + /* + * we have verified the checksum already, set page + * checked so the end_io handlers know about it + */ + while (bio_index < cb->orig_bio->bi_vcnt) { + SetPageChecked(bvec->bv_page); + bvec++; + bio_index++; + } + bio_endio(cb->orig_bio, 0); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * Clear the writeback bits on all of the file + * pages for a compressed write + */ +static noinline int end_compressed_writeback(struct inode *inode, u64 start, + unsigned long ram_size) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; + struct page *pages[16]; + unsigned long nr_pages = end_index - index + 1; + int i; + int ret; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + nr_pages -= 1; + index += 1; + continue; + } + for (i = 0; i < ret; i++) { + end_page_writeback(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + } + /* the inode may be gone now */ + return 0; +} + +/* + * do the cleanup once all the compressed pages hit the disk. + * This will clear writeback on the file pages and free the compressed + * pages. + * + * This also calls the writeback end hooks for the file pages so that + * metadata and checksums can be updated in the file. + */ +static void end_compressed_bio_write(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + /* ok, we're the last bio for this extent, step one is to + * call back into the FS and do all the end_io operations + */ + inode = cb->inode; + tree = &BTRFS_I(inode)->io_tree; + cb->compressed_pages[0]->mapping = cb->inode->i_mapping; + tree->ops->writepage_end_io_hook(cb->compressed_pages[0], + cb->start, + cb->start + cb->len - 1, + NULL, 1); + cb->compressed_pages[0]->mapping = NULL; + + end_compressed_writeback(inode, cb->start, cb->len); + /* note, our inode could be gone now */ + + /* + * release the compressed pages, these came from alloc_page and + * are not attached to the inode at all + */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * worker function to build and submit bios for previously compressed pages. + * The corresponding pages in the inode should be marked for writeback + * and the compressed pages should have a reference on them for dropping + * when the IO is complete. + * + * This also checksums the file bytes and gets things ready for + * the end io hooks. + */ +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages) +{ + struct bio *bio = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct compressed_bio *cb; + unsigned long bytes_left; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int page_index = 0; + struct page *page; + u64 first_byte = disk_start; + struct block_device *bdev; + int ret; + + WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->start = start; + cb->len = len; + cb->mirror_num = 0; + cb->compressed_pages = compressed_pages; + cb->compressed_len = compressed_len; + cb->orig_bio = NULL; + cb->nr_pages = nr_pages; + + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + atomic_inc(&cb->pending_bios); + + /* create and submit bios for the compressed pages */ + bytes_left = compressed_len; + for (page_index = 0; page_index < cb->nr_pages; page_index++) { + page = compressed_pages[page_index]; + page->mapping = inode->i_mapping; + if (bio->bi_size) + ret = io_tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + bio, 0); + else + ret = 0; + + page->mapping = NULL; + if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(bio); + + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the cb might get + * freed before we're done setting it up + */ + atomic_inc(&cb->pending_bios); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + } + if (bytes_left < PAGE_CACHE_SIZE) { + printk("bytes left %lu compress len %lu nr %lu\n", + bytes_left, cb->compressed_len, cb->nr_pages); + } + bytes_left -= PAGE_CACHE_SIZE; + first_byte += PAGE_CACHE_SIZE; + cond_resched(); + } + bio_get(bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + return 0; +} + +static noinline int add_ra_bio_pages(struct inode *inode, + u64 compressed_end, + struct compressed_bio *cb) +{ + unsigned long end_index; + unsigned long page_index; + u64 last_offset; + u64 isize = i_size_read(inode); + int ret; + struct page *page; + unsigned long nr_pages = 0; + struct extent_map *em; + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + struct extent_map_tree *em_tree; + struct extent_io_tree *tree; + u64 end; + int misses = 0; + + page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page; + last_offset = (page_offset(page) + PAGE_CACHE_SIZE); + em_tree = &BTRFS_I(inode)->extent_tree; + tree = &BTRFS_I(inode)->io_tree; + + if (isize == 0) + return 0; + + end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + while (last_offset < compressed_end) { + page_index = last_offset >> PAGE_CACHE_SHIFT; + + if (page_index > end_index) + break; + + rcu_read_lock(); + page = radix_tree_lookup(&mapping->page_tree, page_index); + rcu_read_unlock(); + if (page) { + misses++; + if (misses > 4) + break; + goto next; + } + + page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS); + if (!page) + break; + + page->index = page_index; + /* + * what we want to do here is call add_to_page_cache_lru, + * but that isn't exported, so we reproduce it here + */ + if (add_to_page_cache(page, mapping, + page->index, GFP_NOFS)) { + page_cache_release(page); + goto next; + } + + /* open coding of lru_cache_add, also not exported */ + page_cache_get(page); + if (!pagevec_add(&pvec, page)) + __pagevec_lru_add_file(&pvec); + + end = last_offset + PAGE_CACHE_SIZE - 1; + /* + * at this point, we have a locked page in the page cache + * for these bytes in the file. But, we have to make + * sure they map to this compressed extent on disk. + */ + set_page_extent_mapped(page); + lock_extent(tree, last_offset, end, GFP_NOFS); + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, last_offset, + PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + + if (!em || last_offset < em->start || + (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || + (em->block_start >> 9) != cb->orig_bio->bi_sector) { + free_extent_map(em); + unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + break; + } + free_extent_map(em); + + if (page->index == end_index) { + char *userpage; + size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + int zeros; + zeros = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + zero_offset, 0, zeros); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + } + } + + ret = bio_add_page(cb->orig_bio, page, + PAGE_CACHE_SIZE, 0); + + if (ret == PAGE_CACHE_SIZE) { + nr_pages++; + page_cache_release(page); + } else { + unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + break; + } +next: + last_offset += PAGE_CACHE_SIZE; + } + if (pagevec_count(&pvec)) + __pagevec_lru_add_file(&pvec); + return 0; +} + +/* + * for a compressed read, the bio we get passed has all the inode pages + * in it. We don't actually do IO on those pages but allocate new ones + * to hold the compressed pages on disk. + * + * bio->bi_sector points to the compressed extent on disk + * bio->bi_io_vec points to all of the inode pages + * bio->bi_vcnt is a count of pages + * + * After the compressed pages are read, we copy the bytes into the + * bio we were passed and then call the bio end_io calls + */ +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *em_tree; + struct compressed_bio *cb; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + unsigned long compressed_len; + unsigned long nr_pages; + unsigned long page_index; + struct page *page; + struct block_device *bdev; + struct bio *comp_bio; + u64 cur_disk_byte = (u64)bio->bi_sector << 9; + u64 em_len; + u64 em_start; + struct extent_map *em; + int ret; + u32 *sums; + + tree = &BTRFS_I(inode)->io_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + + /* we need the actual starting offset of this extent in the file */ + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, + page_offset(bio->bi_io_vec->bv_page), + PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + + compressed_len = em->block_len; + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->mirror_num = mirror_num; + sums = &cb->sums; + + cb->start = em->orig_start; + em_len = em->len; + em_start = em->start; + + free_extent_map(em); + em = NULL; + + cb->len = uncompressed_len; + cb->compressed_len = compressed_len; + cb->orig_bio = bio; + + nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, + GFP_NOFS); + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + for (page_index = 0; page_index < nr_pages; page_index++) { + cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | + __GFP_HIGHMEM); + } + cb->nr_pages = nr_pages; + + add_ra_bio_pages(inode, em_start + em_len, cb); + + /* include any pages we added in add_ra-bio_pages */ + uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + cb->len = uncompressed_len; + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + atomic_inc(&cb->pending_bios); + + for (page_index = 0; page_index < nr_pages; page_index++) { + page = cb->compressed_pages[page_index]; + page->mapping = inode->i_mapping; + page->index = em_start >> PAGE_CACHE_SHIFT; + + if (comp_bio->bi_size) + ret = tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + comp_bio, 0); + else + ret = 0; + + page->mapping = NULL; + if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the cb might get + * freed before we're done setting it up + */ + atomic_inc(&cb->pending_bios); + + if (!btrfs_test_flag(inode, NODATASUM)) { + btrfs_lookup_bio_sums(root, inode, comp_bio, + sums); + } + sums += (comp_bio->bi_size + root->sectorsize - 1) / + root->sectorsize; + + ret = btrfs_map_bio(root, READ, comp_bio, + mirror_num, 0); + BUG_ON(ret); + + bio_put(comp_bio); + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, + GFP_NOFS); + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + + bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0); + } + cur_disk_byte += PAGE_CACHE_SIZE; + } + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + if (!btrfs_test_flag(inode, NODATASUM)) + btrfs_lookup_bio_sums(root, inode, comp_bio, sums); + + ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); + BUG_ON(ret); + + bio_put(comp_bio); + return 0; +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h new file mode 100644 index 00000000000..421f5b4aa71 --- /dev/null +++ b/fs/btrfs/compression.h @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_COMPRESSION_ +#define __BTRFS_COMPRESSION_ + +int btrfs_zlib_decompress(unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen); +int btrfs_zlib_compress_pages(struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); +int btrfs_zlib_decompress_biovec(struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen); +void btrfs_zlib_exit(void); +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages); +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); +#endif diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h new file mode 100644 index 00000000000..6e1b3de3670 --- /dev/null +++ b/fs/btrfs/crc32c.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_CRC32C__ +#define __BTRFS_CRC32C__ +#include <linux/crc32c.h> + +/* + * this file used to do more for selecting the HW version of crc32c, + * perhaps it will one day again soon. + */ +#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length) +#endif + diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c new file mode 100644 index 00000000000..9e46c077681 --- /dev/null +++ b/fs/btrfs/ctree.c @@ -0,0 +1,3953 @@ +/* + * Copyright (C) 2007,2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "locking.h" + +static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int level); +static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, int extend); +static int push_node_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *dst, + struct extent_buffer *src, int empty); +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst_buf, + struct extent_buffer *src_buf); +static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot); + +inline void btrfs_init_path(struct btrfs_path *p) +{ + memset(p, 0, sizeof(*p)); +} + +struct btrfs_path *btrfs_alloc_path(void) +{ + struct btrfs_path *path; + path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS); + if (path) { + btrfs_init_path(path); + path->reada = 1; + } + return path; +} + +/* this also releases the path */ +void btrfs_free_path(struct btrfs_path *p) +{ + btrfs_release_path(NULL, p); + kmem_cache_free(btrfs_path_cachep, p); +} + +/* + * path release drops references on the extent buffers in the path + * and it drops any locks held by this path + * + * It is safe to call this on paths that no locks or extent buffers held. + */ +noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) +{ + int i; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + p->slots[i] = 0; + if (!p->nodes[i]) + continue; + if (p->locks[i]) { + btrfs_tree_unlock(p->nodes[i]); + p->locks[i] = 0; + } + free_extent_buffer(p->nodes[i]); + p->nodes[i] = NULL; + } +} + +/* + * safely gets a reference on the root node of a tree. A lock + * is not taken, so a concurrent writer may put a different node + * at the root of the tree. See btrfs_lock_root_node for the + * looping required. + * + * The extent buffer returned by this has a reference taken, so + * it won't disappear. It may stop being the root of the tree + * at any time because there are no locks held. + */ +struct extent_buffer *btrfs_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + spin_lock(&root->node_lock); + eb = root->node; + extent_buffer_get(eb); + spin_unlock(&root->node_lock); + return eb; +} + +/* loop around taking references on and locking the root node of the + * tree until you end up with a lock on the root. A locked buffer + * is returned, with a reference held. + */ +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_lock(eb); + + spin_lock(&root->node_lock); + if (eb == root->node) { + spin_unlock(&root->node_lock); + break; + } + spin_unlock(&root->node_lock); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* cowonly root (everything not a reference counted cow subvolume), just get + * put onto a simple dirty list. transaction.c walks this to make sure they + * get properly updated on disk. + */ +static void add_root_to_dirty_list(struct btrfs_root *root) +{ + if (root->track_dirty && list_empty(&root->dirty_list)) { + list_add(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); + } +} + +/* + * used by snapshot creation to make a copy of a root for a tree with + * a given objectid. The buffer with the new root node is returned in + * cow_ret, and this func returns zero on success or a negative error code. + */ +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid) +{ + struct extent_buffer *cow; + u32 nritems; + int ret = 0; + int level; + struct btrfs_root *new_root; + + new_root = kmalloc(sizeof(*new_root), GFP_NOFS); + if (!new_root) + return -ENOMEM; + + memcpy(new_root, root, sizeof(*new_root)); + new_root->root_key.objectid = new_root_objectid; + + WARN_ON(root->ref_cows && trans->transid != + root->fs_info->running_transaction->transid); + WARN_ON(root->ref_cows && trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + nritems = btrfs_header_nritems(buf); + + cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0, + new_root_objectid, trans->transid, + level, buf->start, 0); + if (IS_ERR(cow)) { + kfree(new_root); + return PTR_ERR(cow); + } + + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_owner(cow, new_root_objectid); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); + + write_extent_buffer(cow, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(cow), + BTRFS_FSID_SIZE); + + WARN_ON(btrfs_header_generation(buf) > trans->transid); + ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL); + kfree(new_root); + + if (ret) + return ret; + + btrfs_mark_buffer_dirty(cow); + *cow_ret = cow; + return 0; +} + +/* + * does the dirty work in cow of a single block. The parent block (if + * supplied) is updated to point to the new cow copy. The new buffer is marked + * dirty and returned locked. If you modify the block it needs to be marked + * dirty again. + * + * search_start -- an allocation hint for the new block + * + * empty_size -- a hint that you plan on doing more cow. This is the size in + * bytes the allocator should try to find free next to the block it returns. + * This is just a hint and may be ignored by the allocator. + * + * prealloc_dest -- if you have already reserved a destination for the cow, + * this uses that block instead of allocating a new one. + * btrfs_alloc_reserved_extent is used to finish the allocation. + */ +static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size, + u64 prealloc_dest) +{ + u64 parent_start; + struct extent_buffer *cow; + u32 nritems; + int ret = 0; + int level; + int unlock_orig = 0; + + if (*cow_ret == buf) + unlock_orig = 1; + + WARN_ON(!btrfs_tree_locked(buf)); + + if (parent) + parent_start = parent->start; + else + parent_start = 0; + + WARN_ON(root->ref_cows && trans->transid != + root->fs_info->running_transaction->transid); + WARN_ON(root->ref_cows && trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + nritems = btrfs_header_nritems(buf); + + if (prealloc_dest) { + struct btrfs_key ins; + + ins.objectid = prealloc_dest; + ins.offset = buf->len; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ret = btrfs_alloc_reserved_extent(trans, root, parent_start, + root->root_key.objectid, + trans->transid, level, &ins); + BUG_ON(ret); + cow = btrfs_init_new_buffer(trans, root, prealloc_dest, + buf->len); + } else { + cow = btrfs_alloc_free_block(trans, root, buf->len, + parent_start, + root->root_key.objectid, + trans->transid, level, + search_start, empty_size); + } + if (IS_ERR(cow)) + return PTR_ERR(cow); + + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_owner(cow, root->root_key.objectid); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); + + write_extent_buffer(cow, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(cow), + BTRFS_FSID_SIZE); + + WARN_ON(btrfs_header_generation(buf) > trans->transid); + if (btrfs_header_generation(buf) != trans->transid) { + u32 nr_extents; + ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents); + if (ret) + return ret; + + ret = btrfs_cache_ref(trans, root, buf, nr_extents); + WARN_ON(ret); + } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) { + /* + * There are only two places that can drop reference to + * tree blocks owned by living reloc trees, one is here, + * the other place is btrfs_drop_subtree. In both places, + * we check reference count while tree block is locked. + * Furthermore, if reference count is one, it won't get + * increased by someone else. + */ + u32 refs; + ret = btrfs_lookup_extent_ref(trans, root, buf->start, + buf->len, &refs); + BUG_ON(ret); + if (refs == 1) { + ret = btrfs_update_ref(trans, root, buf, cow, + 0, nritems); + clean_tree_block(trans, root, buf); + } else { + ret = btrfs_inc_ref(trans, root, buf, cow, NULL); + } + BUG_ON(ret); + } else { + ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems); + if (ret) + return ret; + clean_tree_block(trans, root, buf); + } + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start); + WARN_ON(ret); + } + + if (buf == root->node) { + WARN_ON(parent && parent != buf); + + spin_lock(&root->node_lock); + root->node = cow; + extent_buffer_get(cow); + spin_unlock(&root->node_lock); + + if (buf != root->commit_root) { + btrfs_free_extent(trans, root, buf->start, + buf->len, buf->start, + root->root_key.objectid, + btrfs_header_generation(buf), + level, 1); + } + free_extent_buffer(buf); + add_root_to_dirty_list(root); + } else { + btrfs_set_node_blockptr(parent, parent_slot, + cow->start); + WARN_ON(trans->transid == 0); + btrfs_set_node_ptr_generation(parent, parent_slot, + trans->transid); + btrfs_mark_buffer_dirty(parent); + WARN_ON(btrfs_header_generation(parent) != trans->transid); + btrfs_free_extent(trans, root, buf->start, buf->len, + parent_start, btrfs_header_owner(parent), + btrfs_header_generation(parent), level, 1); + } + if (unlock_orig) + btrfs_tree_unlock(buf); + free_extent_buffer(buf); + btrfs_mark_buffer_dirty(cow); + *cow_ret = cow; + return 0; +} + +/* + * cows a single block, see __btrfs_cow_block for the real work. + * This version of it has extra checks so that a block isn't cow'd more than + * once per transaction, as long as it hasn't been written yet + */ +noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, u64 prealloc_dest) +{ + u64 search_start; + int ret; + + if (trans->transaction != root->fs_info->running_transaction) { + printk(KERN_CRIT "trans %llu running %llu\n", + (unsigned long long)trans->transid, + (unsigned long long) + root->fs_info->running_transaction->transid); + WARN_ON(1); + } + if (trans->transid != root->fs_info->generation) { + printk(KERN_CRIT "trans %llu running %llu\n", + (unsigned long long)trans->transid, + (unsigned long long)root->fs_info->generation); + WARN_ON(1); + } + + spin_lock(&root->fs_info->hash_lock); + if (btrfs_header_generation(buf) == trans->transid && + btrfs_header_owner(buf) == root->root_key.objectid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + *cow_ret = buf; + spin_unlock(&root->fs_info->hash_lock); + WARN_ON(prealloc_dest); + return 0; + } + spin_unlock(&root->fs_info->hash_lock); + search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); + ret = __btrfs_cow_block(trans, root, buf, parent, + parent_slot, cow_ret, search_start, 0, + prealloc_dest); + return ret; +} + +/* + * helper function for defrag to decide if two blocks pointed to by a + * node are actually close by + */ +static int close_blocks(u64 blocknr, u64 other, u32 blocksize) +{ + if (blocknr < other && other - (blocknr + blocksize) < 32768) + return 1; + if (blocknr > other && blocknr - (other + blocksize) < 32768) + return 1; + return 0; +} + +/* + * compare two keys in a memcmp fashion + */ +static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) +{ + struct btrfs_key k1; + + btrfs_disk_key_to_cpu(&k1, disk); + + if (k1.objectid > k2->objectid) + return 1; + if (k1.objectid < k2->objectid) + return -1; + if (k1.type > k2->type) + return 1; + if (k1.type < k2->type) + return -1; + if (k1.offset > k2->offset) + return 1; + if (k1.offset < k2->offset) + return -1; + return 0; +} + +/* + * same as comp_keys only with two btrfs_key's + */ +static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) +{ + if (k1->objectid > k2->objectid) + return 1; + if (k1->objectid < k2->objectid) + return -1; + if (k1->type > k2->type) + return 1; + if (k1->type < k2->type) + return -1; + if (k1->offset > k2->offset) + return 1; + if (k1->offset < k2->offset) + return -1; + return 0; +} + +/* + * this is used by the defrag code to go through all the + * leaves pointed to by a node and reallocate them so that + * disk order is close to key order + */ +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, int cache_only, u64 *last_ret, + struct btrfs_key *progress) +{ + struct extent_buffer *cur; + u64 blocknr; + u64 gen; + u64 search_start = *last_ret; + u64 last_block = 0; + u64 other; + u32 parent_nritems; + int end_slot; + int i; + int err = 0; + int parent_level; + int uptodate; + u32 blocksize; + int progress_passed = 0; + struct btrfs_disk_key disk_key; + + parent_level = btrfs_header_level(parent); + if (cache_only && parent_level != 1) + return 0; + + if (trans->transaction != root->fs_info->running_transaction) + WARN_ON(1); + if (trans->transid != root->fs_info->generation) + WARN_ON(1); + + parent_nritems = btrfs_header_nritems(parent); + blocksize = btrfs_level_size(root, parent_level - 1); + end_slot = parent_nritems; + + if (parent_nritems == 1) + return 0; + + for (i = start_slot; i < end_slot; i++) { + int close = 1; + + if (!parent->map_token) { + map_extent_buffer(parent, + btrfs_node_key_ptr_offset(i), + sizeof(struct btrfs_key_ptr), + &parent->map_token, &parent->kaddr, + &parent->map_start, &parent->map_len, + KM_USER1); + } + btrfs_node_key(parent, &disk_key, i); + if (!progress_passed && comp_keys(&disk_key, progress) < 0) + continue; + + progress_passed = 1; + blocknr = btrfs_node_blockptr(parent, i); + gen = btrfs_node_ptr_generation(parent, i); + if (last_block == 0) + last_block = blocknr; + + if (i > 0) { + other = btrfs_node_blockptr(parent, i - 1); + close = close_blocks(blocknr, other, blocksize); + } + if (!close && i < end_slot - 2) { + other = btrfs_node_blockptr(parent, i + 1); + close = close_blocks(blocknr, other, blocksize); + } + if (close) { + last_block = blocknr; + continue; + } + if (parent->map_token) { + unmap_extent_buffer(parent, parent->map_token, + KM_USER1); + parent->map_token = NULL; + } + + cur = btrfs_find_tree_block(root, blocknr, blocksize); + if (cur) + uptodate = btrfs_buffer_uptodate(cur, gen); + else + uptodate = 0; + if (!cur || !uptodate) { + if (cache_only) { + free_extent_buffer(cur); + continue; + } + if (!cur) { + cur = read_tree_block(root, blocknr, + blocksize, gen); + } else if (!uptodate) { + btrfs_read_buffer(cur, gen); + } + } + if (search_start == 0) + search_start = last_block; + + btrfs_tree_lock(cur); + err = __btrfs_cow_block(trans, root, cur, parent, i, + &cur, search_start, + min(16 * blocksize, + (end_slot - i) * blocksize), 0); + if (err) { + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + break; + } + search_start = cur->start; + last_block = cur->start; + *last_ret = search_start; + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + } + if (parent->map_token) { + unmap_extent_buffer(parent, parent->map_token, + KM_USER1); + parent->map_token = NULL; + } + return err; +} + +/* + * The leaf data grows from end-to-front in the node. + * this returns the address of the start of the last item, + * which is the stop of the leaf data stack + */ +static inline unsigned int leaf_data_end(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(root); + return btrfs_item_offset_nr(leaf, nr - 1); +} + +/* + * extra debugging checks to make sure all the items in a key are + * well formed and in the proper order + */ +static int check_node(struct btrfs_root *root, struct btrfs_path *path, + int level) +{ + struct extent_buffer *parent = NULL; + struct extent_buffer *node = path->nodes[level]; + struct btrfs_disk_key parent_key; + struct btrfs_disk_key node_key; + int parent_slot; + int slot; + struct btrfs_key cpukey; + u32 nritems = btrfs_header_nritems(node); + + if (path->nodes[level + 1]) + parent = path->nodes[level + 1]; + + slot = path->slots[level]; + BUG_ON(nritems == 0); + if (parent) { + parent_slot = path->slots[level + 1]; + btrfs_node_key(parent, &parent_key, parent_slot); + btrfs_node_key(node, &node_key, 0); + BUG_ON(memcmp(&parent_key, &node_key, + sizeof(struct btrfs_disk_key))); + BUG_ON(btrfs_node_blockptr(parent, parent_slot) != + btrfs_header_bytenr(node)); + } + BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root)); + if (slot != 0) { + btrfs_node_key_to_cpu(node, &cpukey, slot - 1); + btrfs_node_key(node, &node_key, slot); + BUG_ON(comp_keys(&node_key, &cpukey) <= 0); + } + if (slot < nritems - 1) { + btrfs_node_key_to_cpu(node, &cpukey, slot + 1); + btrfs_node_key(node, &node_key, slot); + BUG_ON(comp_keys(&node_key, &cpukey) >= 0); + } + return 0; +} + +/* + * extra checking to make sure all the items in a leaf are + * well formed and in the proper order + */ +static int check_leaf(struct btrfs_root *root, struct btrfs_path *path, + int level) +{ + struct extent_buffer *leaf = path->nodes[level]; + struct extent_buffer *parent = NULL; + int parent_slot; + struct btrfs_key cpukey; + struct btrfs_disk_key parent_key; + struct btrfs_disk_key leaf_key; + int slot = path->slots[0]; + + u32 nritems = btrfs_header_nritems(leaf); + + if (path->nodes[level + 1]) + parent = path->nodes[level + 1]; + + if (nritems == 0) + return 0; + + if (parent) { + parent_slot = path->slots[level + 1]; + btrfs_node_key(parent, &parent_key, parent_slot); + btrfs_item_key(leaf, &leaf_key, 0); + + BUG_ON(memcmp(&parent_key, &leaf_key, + sizeof(struct btrfs_disk_key))); + BUG_ON(btrfs_node_blockptr(parent, parent_slot) != + btrfs_header_bytenr(leaf)); + } + if (slot != 0 && slot < nritems - 1) { + btrfs_item_key(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1); + if (comp_keys(&leaf_key, &cpukey) <= 0) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d offset bad key\n", slot); + BUG_ON(1); + } + if (btrfs_item_offset_nr(leaf, slot - 1) != + btrfs_item_end_nr(leaf, slot)) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d offset bad\n", slot); + BUG_ON(1); + } + } + if (slot < nritems - 1) { + btrfs_item_key(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1); + BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0); + if (btrfs_item_offset_nr(leaf, slot) != + btrfs_item_end_nr(leaf, slot + 1)) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d offset bad\n", slot); + BUG_ON(1); + } + } + BUG_ON(btrfs_item_offset_nr(leaf, 0) + + btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root)); + return 0; +} + +static noinline int check_block(struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + return 0; + if (level == 0) + return check_leaf(root, path, level); + return check_node(root, path, level); +} + +/* + * search for key in the extent_buffer. The items start at offset p, + * and they are item_size apart. There are 'max' items in p. + * + * the slot in the array is returned via slot, and it points to + * the place where you would insert key if it is not found in + * the array. + * + * slot may point to max if the key is bigger than all of the keys + */ +static noinline int generic_bin_search(struct extent_buffer *eb, + unsigned long p, + int item_size, struct btrfs_key *key, + int max, int *slot) +{ + int low = 0; + int high = max; + int mid; + int ret; + struct btrfs_disk_key *tmp = NULL; + struct btrfs_disk_key unaligned; + unsigned long offset; + char *map_token = NULL; + char *kaddr = NULL; + unsigned long map_start = 0; + unsigned long map_len = 0; + int err; + + while (low < high) { + mid = (low + high) / 2; + offset = p + mid * item_size; + + if (!map_token || offset < map_start || + (offset + sizeof(struct btrfs_disk_key)) > + map_start + map_len) { + if (map_token) { + unmap_extent_buffer(eb, map_token, KM_USER0); + map_token = NULL; + } + + err = map_private_extent_buffer(eb, offset, + sizeof(struct btrfs_disk_key), + &map_token, &kaddr, + &map_start, &map_len, KM_USER0); + + if (!err) { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } else { + read_extent_buffer(eb, &unaligned, + offset, sizeof(unaligned)); + tmp = &unaligned; + } + + } else { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } + ret = comp_keys(tmp, key); + + if (ret < 0) + low = mid + 1; + else if (ret > 0) + high = mid; + else { + *slot = mid; + if (map_token) + unmap_extent_buffer(eb, map_token, KM_USER0); + return 0; + } + } + *slot = low; + if (map_token) + unmap_extent_buffer(eb, map_token, KM_USER0); + return 1; +} + +/* + * simple bin_search frontend that does the right thing for + * leaves vs nodes + */ +static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot) +{ + if (level == 0) { + return generic_bin_search(eb, + offsetof(struct btrfs_leaf, items), + sizeof(struct btrfs_item), + key, btrfs_header_nritems(eb), + slot); + } else { + return generic_bin_search(eb, + offsetof(struct btrfs_node, ptrs), + sizeof(struct btrfs_key_ptr), + key, btrfs_header_nritems(eb), + slot); + } + return -1; +} + +/* given a node and slot number, this reads the blocks it points to. The + * extent buffer is returned with a reference taken (but unlocked). + * NULL is returned on error. + */ +static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, + struct extent_buffer *parent, int slot) +{ + int level = btrfs_header_level(parent); + if (slot < 0) + return NULL; + if (slot >= btrfs_header_nritems(parent)) + return NULL; + + BUG_ON(level == 0); + + return read_tree_block(root, btrfs_node_blockptr(parent, slot), + btrfs_level_size(root, level - 1), + btrfs_node_ptr_generation(parent, slot)); +} + +/* + * node level balancing, used to make sure nodes are in proper order for + * item deletion. We balance from the top down, so we have to make sure + * that a deletion won't leave an node completely empty later on. + */ +static noinline int balance_level(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + int err_on_enospc = 0; + u64 orig_ptr; + + if (level == 0) + return 0; + + mid = path->nodes[level]; + WARN_ON(!path->locks[level]); + WARN_ON(btrfs_header_generation(mid) != trans->transid); + + orig_ptr = btrfs_node_blockptr(mid, orig_slot); + + if (level < BTRFS_MAX_LEVEL - 1) + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + + /* + * deal with the case where there is only one pointer in the root + * by promoting the node below to a root + */ + if (!parent) { + struct extent_buffer *child; + + if (btrfs_header_nritems(mid) != 1) + return 0; + + /* promote the child to a root */ + child = read_node_slot(root, mid, 0); + btrfs_tree_lock(child); + BUG_ON(!child); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); + BUG_ON(ret); + + spin_lock(&root->node_lock); + root->node = child; + spin_unlock(&root->node_lock); + + ret = btrfs_update_extent_ref(trans, root, child->start, + mid->start, child->start, + root->root_key.objectid, + trans->transid, level - 1); + BUG_ON(ret); + + add_root_to_dirty_list(root); + btrfs_tree_unlock(child); + path->locks[level] = 0; + path->nodes[level] = NULL; + clean_tree_block(trans, root, mid); + btrfs_tree_unlock(mid); + /* once for the path */ + free_extent_buffer(mid); + ret = btrfs_free_extent(trans, root, mid->start, mid->len, + mid->start, root->root_key.objectid, + btrfs_header_generation(mid), + level, 1); + /* once for the root ptr */ + free_extent_buffer(mid); + return ret; + } + if (btrfs_header_nritems(mid) > + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) + return 0; + + if (btrfs_header_nritems(mid) < 2) + err_on_enospc = 1; + + left = read_node_slot(root, parent, pslot - 1); + if (left) { + btrfs_tree_lock(left); + wret = btrfs_cow_block(trans, root, left, + parent, pslot - 1, &left, 0); + if (wret) { + ret = wret; + goto enospc; + } + } + right = read_node_slot(root, parent, pslot + 1); + if (right) { + btrfs_tree_lock(right); + wret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, &right, 0); + if (wret) { + ret = wret; + goto enospc; + } + } + + /* first, try to make some room in the middle buffer */ + if (left) { + orig_slot += btrfs_header_nritems(left); + wret = push_node_left(trans, root, left, mid, 1); + if (wret < 0) + ret = wret; + if (btrfs_header_nritems(mid) < 2) + err_on_enospc = 1; + } + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + wret = push_node_left(trans, root, mid, right, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + if (btrfs_header_nritems(right) == 0) { + u64 bytenr = right->start; + u64 generation = btrfs_header_generation(parent); + u32 blocksize = right->len; + + clean_tree_block(trans, root, right); + btrfs_tree_unlock(right); + free_extent_buffer(right); + right = NULL; + wret = del_ptr(trans, root, path, level + 1, pslot + + 1); + if (wret) + ret = wret; + wret = btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + btrfs_header_owner(parent), + generation, level, 1); + if (wret) + ret = wret; + } else { + struct btrfs_disk_key right_key; + btrfs_node_key(right, &right_key, 0); + btrfs_set_node_key(parent, &right_key, pslot + 1); + btrfs_mark_buffer_dirty(parent); + } + } + if (btrfs_header_nritems(mid) == 1) { + /* + * we're not allowed to leave a node with one item in the + * tree during a delete. A deletion from lower in the tree + * could try to delete the only pointer in this node. + * So, pull some keys from the left. + * There has to be a left pointer at this point because + * otherwise we would have pulled some pointers from the + * right + */ + BUG_ON(!left); + wret = balance_node_right(trans, root, mid, left); + if (wret < 0) { + ret = wret; + goto enospc; + } + if (wret == 1) { + wret = push_node_left(trans, root, left, mid, 1); + if (wret < 0) + ret = wret; + } + BUG_ON(wret == 1); + } + if (btrfs_header_nritems(mid) == 0) { + /* we've managed to empty the middle node, drop it */ + u64 root_gen = btrfs_header_generation(parent); + u64 bytenr = mid->start; + u32 blocksize = mid->len; + + clean_tree_block(trans, root, mid); + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + mid = NULL; + wret = del_ptr(trans, root, path, level + 1, pslot); + if (wret) + ret = wret; + wret = btrfs_free_extent(trans, root, bytenr, blocksize, + parent->start, + btrfs_header_owner(parent), + root_gen, level, 1); + if (wret) + ret = wret; + } else { + /* update the parent key to reflect our changes */ + struct btrfs_disk_key mid_key; + btrfs_node_key(mid, &mid_key, 0); + btrfs_set_node_key(parent, &mid_key, pslot); + btrfs_mark_buffer_dirty(parent); + } + + /* update the path */ + if (left) { + if (btrfs_header_nritems(left) > orig_slot) { + extent_buffer_get(left); + /* left was locked after cow */ + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + if (mid) { + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } + } else { + orig_slot -= btrfs_header_nritems(left); + path->slots[level] = orig_slot; + } + } + /* double check we haven't messed things up */ + check_block(root, path, level); + if (orig_ptr != + btrfs_node_blockptr(path->nodes[level], path->slots[level])) + BUG(); +enospc: + if (right) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + if (left) { + if (path->nodes[level] != left) + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return ret; +} + +/* Node balancing for insertion. Here we only split or push nodes around + * when they are completely full. This is also done top down, so we + * have to be pessimistic. + */ +static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + u64 orig_ptr; + + if (level == 0) + return 1; + + mid = path->nodes[level]; + WARN_ON(btrfs_header_generation(mid) != trans->transid); + orig_ptr = btrfs_node_blockptr(mid, orig_slot); + + if (level < BTRFS_MAX_LEVEL - 1) + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + + if (!parent) + return 1; + + left = read_node_slot(root, parent, pslot - 1); + + /* first, try to make some room in the middle buffer */ + if (left) { + u32 left_nr; + + btrfs_tree_lock(left); + left_nr = btrfs_header_nritems(left); + if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, left, parent, + pslot - 1, &left, 0); + if (ret) + wret = 1; + else { + wret = push_node_left(trans, root, + left, mid, 0); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + orig_slot += left_nr; + btrfs_node_key(mid, &disk_key, 0); + btrfs_set_node_key(parent, &disk_key, pslot); + btrfs_mark_buffer_dirty(parent); + if (btrfs_header_nritems(left) > orig_slot) { + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + orig_slot -= + btrfs_header_nritems(left); + path->slots[level] = orig_slot; + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return 0; + } + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + right = read_node_slot(root, parent, pslot + 1); + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + u32 right_nr; + btrfs_tree_lock(right); + right_nr = btrfs_header_nritems(right); + if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, + &right, 0); + if (ret) + wret = 1; + else { + wret = balance_node_right(trans, root, + right, mid); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(right, &disk_key, 0); + btrfs_set_node_key(parent, &disk_key, pslot + 1); + btrfs_mark_buffer_dirty(parent); + + if (btrfs_header_nritems(mid) <= orig_slot) { + path->nodes[level] = right; + path->slots[level + 1] += 1; + path->slots[level] = orig_slot - + btrfs_header_nritems(mid); + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + } + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 1; +} + +/* + * readahead one full node of leaves, finding things that are close + * to the block in 'slot', and triggering ra on them. + */ +static noinline void reada_for_search(struct btrfs_root *root, + struct btrfs_path *path, + int level, int slot, u64 objectid) +{ + struct extent_buffer *node; + struct btrfs_disk_key disk_key; + u32 nritems; + u64 search; + u64 lowest_read; + u64 highest_read; + u64 nread = 0; + int direction = path->reada; + struct extent_buffer *eb; + u32 nr; + u32 blocksize; + u32 nscan = 0; + + if (level != 1) + return; + + if (!path->nodes[level]) + return; + + node = path->nodes[level]; + + search = btrfs_node_blockptr(node, slot); + blocksize = btrfs_level_size(root, level - 1); + eb = btrfs_find_tree_block(root, search, blocksize); + if (eb) { + free_extent_buffer(eb); + return; + } + + highest_read = search; + lowest_read = search; + + nritems = btrfs_header_nritems(node); + nr = slot; + while (1) { + if (direction < 0) { + if (nr == 0) + break; + nr--; + } else if (direction > 0) { + nr++; + if (nr >= nritems) + break; + } + if (path->reada < 0 && objectid) { + btrfs_node_key(node, &disk_key, nr); + if (btrfs_disk_key_objectid(&disk_key) != objectid) + break; + } + search = btrfs_node_blockptr(node, nr); + if ((search >= lowest_read && search <= highest_read) || + (search < lowest_read && lowest_read - search <= 16384) || + (search > highest_read && search - highest_read <= 16384)) { + readahead_tree_block(root, search, blocksize, + btrfs_node_ptr_generation(node, nr)); + nread += blocksize; + } + nscan++; + if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32)) + break; + + if (nread > (256 * 1024) || nscan > 128) + break; + + if (search < lowest_read) + lowest_read = search; + if (search > highest_read) + highest_read = search; + } +} + +/* + * when we walk down the tree, it is usually safe to unlock the higher layers + * in the tree. The exceptions are when our path goes through slot 0, because + * operations on the tree might require changing key pointers higher up in the + * tree. + * + * callers might also have set path->keep_locks, which tells this code to keep + * the lock if the path points to the last slot in the block. This is part of + * walking through the tree, and selecting the next slot in the higher block. + * + * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so + * if lowest_unlock is 1, level 0 won't be unlocked + */ +static noinline void unlock_up(struct btrfs_path *path, int level, + int lowest_unlock) +{ + int i; + int skip_level = level; + int no_skips = 0; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + break; + if (!path->locks[i]) + break; + if (!no_skips && path->slots[i] == 0) { + skip_level = i + 1; + continue; + } + if (!no_skips && path->keep_locks) { + u32 nritems; + t = path->nodes[i]; + nritems = btrfs_header_nritems(t); + if (nritems < 1 || path->slots[i] >= nritems - 1) { + skip_level = i + 1; + continue; + } + } + if (skip_level < i && i >= lowest_unlock) + no_skips = 1; + + t = path->nodes[i]; + if (i >= lowest_unlock && i > skip_level && path->locks[i]) { + btrfs_tree_unlock(t); + path->locks[i] = 0; + } + } +} + +/* + * look for key in the tree. path is filled in with nodes along the way + * if key is found, we return zero and you can find the item in the leaf + * level of the path (level 0) + * + * If the key isn't found, the path points to the slot where it should + * be inserted, and 1 is returned. If there are other errors during the + * search a negative error number is returned. + * + * if ins_len > 0, nodes and leaves will be split as we walk down the + * tree. if ins_len < 0, nodes will be merged as we walk down the tree (if + * possible) + */ +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow) +{ + struct extent_buffer *b; + struct extent_buffer *tmp; + int slot; + int ret; + int level; + int should_reada = p->reada; + int lowest_unlock = 1; + int blocksize; + u8 lowest_level = 0; + u64 blocknr; + u64 gen; + struct btrfs_key prealloc_block; + + lowest_level = p->lowest_level; + WARN_ON(lowest_level && ins_len > 0); + WARN_ON(p->nodes[0] != NULL); + + if (ins_len < 0) + lowest_unlock = 2; + + prealloc_block.objectid = 0; + +again: + if (p->skip_locking) + b = btrfs_root_node(root); + else + b = btrfs_lock_root_node(root); + + while (b) { + level = btrfs_header_level(b); + + /* + * setup the path here so we can release it under lock + * contention with the cow code + */ + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = 1; + + if (cow) { + int wret; + + /* is a cow on this block not required */ + spin_lock(&root->fs_info->hash_lock); + if (btrfs_header_generation(b) == trans->transid && + btrfs_header_owner(b) == root->root_key.objectid && + !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { + spin_unlock(&root->fs_info->hash_lock); + goto cow_done; + } + spin_unlock(&root->fs_info->hash_lock); + + /* ok, we have to cow, is our old prealloc the right + * size? + */ + if (prealloc_block.objectid && + prealloc_block.offset != b->len) { + btrfs_free_reserved_extent(root, + prealloc_block.objectid, + prealloc_block.offset); + prealloc_block.objectid = 0; + } + + /* + * for higher level blocks, try not to allocate blocks + * with the block and the parent locks held. + */ + if (level > 1 && !prealloc_block.objectid && + btrfs_path_lock_waiting(p, level)) { + u32 size = b->len; + u64 hint = b->start; + + btrfs_release_path(root, p); + ret = btrfs_reserve_extent(trans, root, + size, size, 0, + hint, (u64)-1, + &prealloc_block, 0); + BUG_ON(ret); + goto again; + } + + wret = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], + &b, prealloc_block.objectid); + prealloc_block.objectid = 0; + if (wret) { + free_extent_buffer(b); + ret = wret; + goto done; + } + } +cow_done: + BUG_ON(!cow && ins_len); + if (level != btrfs_header_level(b)) + WARN_ON(1); + level = btrfs_header_level(b); + + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = 1; + + ret = check_block(root, p, level); + if (ret) { + ret = -1; + goto done; + } + + ret = bin_search(b, key, level, &slot); + if (level != 0) { + if (ret && slot > 0) + slot -= 1; + p->slots[level] = slot; + if ((p->search_for_split || ins_len > 0) && + btrfs_header_nritems(b) >= + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { + int sret = split_node(trans, root, p, level); + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + slot = p->slots[level]; + } else if (ins_len < 0) { + int sret = balance_level(trans, root, p, + level); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + if (!b) { + btrfs_release_path(NULL, p); + goto again; + } + slot = p->slots[level]; + BUG_ON(btrfs_header_nritems(b) == 1); + } + unlock_up(p, level, lowest_unlock); + + /* this is only true while dropping a snapshot */ + if (level == lowest_level) { + ret = 0; + goto done; + } + + blocknr = btrfs_node_blockptr(b, slot); + gen = btrfs_node_ptr_generation(b, slot); + blocksize = btrfs_level_size(root, level - 1); + + tmp = btrfs_find_tree_block(root, blocknr, blocksize); + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + b = tmp; + } else { + /* + * reduce lock contention at high levels + * of the btree by dropping locks before + * we read. + */ + if (level > 1) { + btrfs_release_path(NULL, p); + if (tmp) + free_extent_buffer(tmp); + if (should_reada) + reada_for_search(root, p, + level, slot, + key->objectid); + + tmp = read_tree_block(root, blocknr, + blocksize, gen); + if (tmp) + free_extent_buffer(tmp); + goto again; + } else { + if (tmp) + free_extent_buffer(tmp); + if (should_reada) + reada_for_search(root, p, + level, slot, + key->objectid); + b = read_node_slot(root, b, slot); + } + } + if (!p->skip_locking) + btrfs_tree_lock(b); + } else { + p->slots[level] = slot; + if (ins_len > 0 && + btrfs_leaf_free_space(root, b) < ins_len) { + int sret = split_leaf(trans, root, key, + p, ins_len, ret == 0); + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + } + if (!p->search_for_split) + unlock_up(p, level, lowest_unlock); + goto done; + } + } + ret = 1; +done: + if (prealloc_block.objectid) { + btrfs_free_reserved_extent(root, + prealloc_block.objectid, + prealloc_block.offset); + } + + return ret; +} + +int btrfs_merge_path(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *node_keys, + u64 *nodes, int lowest_level) +{ + struct extent_buffer *eb; + struct extent_buffer *parent; + struct btrfs_key key; + u64 bytenr; + u64 generation; + u32 blocksize; + int level; + int slot; + int key_match; + int ret; + + eb = btrfs_lock_root_node(root); + ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); + BUG_ON(ret); + + parent = eb; + while (1) { + level = btrfs_header_level(parent); + if (level == 0 || level <= lowest_level) + break; + + ret = bin_search(parent, &node_keys[lowest_level], level, + &slot); + if (ret && slot > 0) + slot--; + + bytenr = btrfs_node_blockptr(parent, slot); + if (nodes[level - 1] == bytenr) + break; + + blocksize = btrfs_level_size(root, level - 1); + generation = btrfs_node_ptr_generation(parent, slot); + btrfs_node_key_to_cpu(eb, &key, slot); + key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key)); + + if (generation == trans->transid) { + eb = read_tree_block(root, bytenr, blocksize, + generation); + btrfs_tree_lock(eb); + } + + /* + * if node keys match and node pointer hasn't been modified + * in the running transaction, we can merge the path. for + * blocks owened by reloc trees, the node pointer check is + * skipped, this is because these blocks are fully controlled + * by the space balance code, no one else can modify them. + */ + if (!nodes[level - 1] || !key_match || + (generation == trans->transid && + btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) { + if (level == 1 || level == lowest_level + 1) { + if (generation == trans->transid) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + break; + } + + if (generation != trans->transid) { + eb = read_tree_block(root, bytenr, blocksize, + generation); + btrfs_tree_lock(eb); + } + + ret = btrfs_cow_block(trans, root, eb, parent, slot, + &eb, 0); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) { + if (!nodes[level - 1]) { + nodes[level - 1] = eb->start; + memcpy(&node_keys[level - 1], &key, + sizeof(node_keys[0])); + } else { + WARN_ON(1); + } + } + + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + parent = eb; + continue; + } + + btrfs_set_node_blockptr(parent, slot, nodes[level - 1]); + btrfs_set_node_ptr_generation(parent, slot, trans->transid); + btrfs_mark_buffer_dirty(parent); + + ret = btrfs_inc_extent_ref(trans, root, + nodes[level - 1], + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + level - 1); + BUG_ON(ret); + + /* + * If the block was created in the running transaction, + * it's possible this is the last reference to it, so we + * should drop the subtree. + */ + if (generation == trans->transid) { + ret = btrfs_drop_subtree(trans, root, eb, parent); + BUG_ON(ret); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } else { + ret = btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + level - 1, 1); + BUG_ON(ret); + } + break; + } + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + return 0; +} + +/* + * adjust the pointers going up the tree, starting at level + * making sure the right key of each node is points to 'key'. + * This is used after shifting pointers to the left, so it stops + * fixing up pointers when a given leaf/node is not in slot 0 of the + * higher levels + * + * If this fails to write a tree block, it returns -1, but continues + * fixing up the blocks in ram so the tree is consistent. + */ +static int fixup_low_keys(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, int level) +{ + int i; + int ret = 0; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + int tslot = path->slots[i]; + if (!path->nodes[i]) + break; + t = path->nodes[i]; + btrfs_set_node_key(t, key, tslot); + btrfs_mark_buffer_dirty(path->nodes[i]); + if (tslot != 0) + break; + } + return ret; +} + +/* + * update item key. + * + * This function isn't completely safe. It's the caller's responsibility + * that the new key won't break the order + */ +int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *eb; + int slot; + + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot > 0) { + btrfs_item_key(eb, &disk_key, slot - 1); + if (comp_keys(&disk_key, new_key) >= 0) + return -1; + } + if (slot < btrfs_header_nritems(eb) - 1) { + btrfs_item_key(eb, &disk_key, slot + 1); + if (comp_keys(&disk_key, new_key) <= 0) + return -1; + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(eb, &disk_key, slot); + btrfs_mark_buffer_dirty(eb); + if (slot == 0) + fixup_low_keys(trans, root, path, &disk_key, 1); + return 0; +} + +/* + * try to push data from one node into the next node left in the + * tree. + * + * returns 0 if some ptrs were pushed left, < 0 if there was some horrible + * error, and > 0 if there was no room in the left hand block. + */ +static int push_node_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *dst, + struct extent_buffer *src, int empty) +{ + int push_items = 0; + int src_nritems; + int dst_nritems; + int ret = 0; + + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); + push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + WARN_ON(btrfs_header_generation(src) != trans->transid); + WARN_ON(btrfs_header_generation(dst) != trans->transid); + + if (!empty && src_nritems <= 8) + return 1; + + if (push_items <= 0) + return 1; + + if (empty) { + push_items = min(src_nritems, push_items); + if (push_items < src_nritems) { + /* leave at least 8 pointers in the node if + * we aren't going to empty it + */ + if (src_nritems - push_items < 8) { + if (push_items <= 8) + return 1; + push_items -= 8; + } + } + } else + push_items = min(src_nritems - 8, push_items); + + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(dst_nritems), + btrfs_node_key_ptr_offset(0), + push_items * sizeof(struct btrfs_key_ptr)); + + if (push_items < src_nritems) { + memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(push_items), + (src_nritems - push_items) * + sizeof(struct btrfs_key_ptr)); + } + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + btrfs_mark_buffer_dirty(src); + btrfs_mark_buffer_dirty(dst); + + ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items); + BUG_ON(ret); + + return ret; +} + +/* + * try to push data from one node into the next node right in the + * tree. + * + * returns 0 if some ptrs were pushed, < 0 if there was some horrible + * error, and > 0 if there was no room in the right hand block. + * + * this will only push up to 1/2 the contents of the left node over + */ +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst, + struct extent_buffer *src) +{ + int push_items = 0; + int max_push; + int src_nritems; + int dst_nritems; + int ret = 0; + + WARN_ON(btrfs_header_generation(src) != trans->transid); + WARN_ON(btrfs_header_generation(dst) != trans->transid); + + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); + push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + if (push_items <= 0) + return 1; + + if (src_nritems < 4) + return 1; + + max_push = src_nritems / 2 + 1; + /* don't try to empty the node */ + if (max_push >= src_nritems) + return 1; + + if (max_push < push_items) + push_items = max_push; + + memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), + btrfs_node_key_ptr_offset(0), + (dst_nritems) * + sizeof(struct btrfs_key_ptr)); + + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(src_nritems - push_items), + push_items * sizeof(struct btrfs_key_ptr)); + + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + + btrfs_mark_buffer_dirty(src); + btrfs_mark_buffer_dirty(dst); + + ret = btrfs_update_ref(trans, root, src, dst, 0, push_items); + BUG_ON(ret); + + return ret; +} + +/* + * helper function to insert a new root level in the tree. + * A new node is allocated, and a single item is inserted to + * point to the existing root + * + * returns zero on success or < 0 on failure. + */ +static noinline int insert_new_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + u64 lower_gen; + struct extent_buffer *lower; + struct extent_buffer *c; + struct extent_buffer *old; + struct btrfs_disk_key lower_key; + int ret; + + BUG_ON(path->nodes[level]); + BUG_ON(path->nodes[level-1] != root->node); + + lower = path->nodes[level-1]; + if (level == 1) + btrfs_item_key(lower, &lower_key, 0); + else + btrfs_node_key(lower, &lower_key, 0); + + c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, + root->root_key.objectid, trans->transid, + level, root->node->start, 0); + if (IS_ERR(c)) + return PTR_ERR(c); + + memset_extent_buffer(c, 0, 0, root->nodesize); + btrfs_set_header_nritems(c, 1); + btrfs_set_header_level(c, level); + btrfs_set_header_bytenr(c, c->start); + btrfs_set_header_generation(c, trans->transid); + btrfs_set_header_owner(c, root->root_key.objectid); + + write_extent_buffer(c, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(c), + BTRFS_FSID_SIZE); + + write_extent_buffer(c, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(c), + BTRFS_UUID_SIZE); + + btrfs_set_node_key(c, &lower_key, 0); + btrfs_set_node_blockptr(c, 0, lower->start); + lower_gen = btrfs_header_generation(lower); + WARN_ON(lower_gen != trans->transid); + + btrfs_set_node_ptr_generation(c, 0, lower_gen); + + btrfs_mark_buffer_dirty(c); + + spin_lock(&root->node_lock); + old = root->node; + root->node = c; + spin_unlock(&root->node_lock); + + ret = btrfs_update_extent_ref(trans, root, lower->start, + lower->start, c->start, + root->root_key.objectid, + trans->transid, level - 1); + BUG_ON(ret); + + /* the super has an extra ref to root->node */ + free_extent_buffer(old); + + add_root_to_dirty_list(root); + extent_buffer_get(c); + path->nodes[level] = c; + path->locks[level] = 1; + path->slots[level] = 0; + return 0; +} + +/* + * worker function to insert a single pointer in a node. + * the node should have enough room for the pointer already + * + * slot and level indicate where you want the key to go, and + * blocknr is the block the key points to. + * + * returns zero on success and < 0 on any error + */ +static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, struct btrfs_disk_key + *key, u64 bytenr, int slot, int level) +{ + struct extent_buffer *lower; + int nritems; + + BUG_ON(!path->nodes[level]); + lower = path->nodes[level]; + nritems = btrfs_header_nritems(lower); + if (slot > nritems) + BUG(); + if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) + BUG(); + if (slot != nritems) { + memmove_extent_buffer(lower, + btrfs_node_key_ptr_offset(slot + 1), + btrfs_node_key_ptr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_key_ptr)); + } + btrfs_set_node_key(lower, key, slot); + btrfs_set_node_blockptr(lower, slot, bytenr); + WARN_ON(trans->transid == 0); + btrfs_set_node_ptr_generation(lower, slot, trans->transid); + btrfs_set_header_nritems(lower, nritems + 1); + btrfs_mark_buffer_dirty(lower); + return 0; +} + +/* + * split the node at the specified level in path in two. + * The path is corrected to point to the appropriate node after the split + * + * Before splitting this tries to make some room in the node by pushing + * left and right, if either one works, it returns right away. + * + * returns 0 on success and < 0 on failure + */ +static noinline int split_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *c; + struct extent_buffer *split; + struct btrfs_disk_key disk_key; + int mid; + int ret; + int wret; + u32 c_nritems; + + c = path->nodes[level]; + WARN_ON(btrfs_header_generation(c) != trans->transid); + if (c == root->node) { + /* trying to split the root, lets make a new one */ + ret = insert_new_root(trans, root, path, level + 1); + if (ret) + return ret; + } else { + ret = push_nodes_for_insert(trans, root, path, level); + c = path->nodes[level]; + if (!ret && btrfs_header_nritems(c) < + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) + return 0; + if (ret < 0) + return ret; + } + + c_nritems = btrfs_header_nritems(c); + + split = btrfs_alloc_free_block(trans, root, root->nodesize, + path->nodes[level + 1]->start, + root->root_key.objectid, + trans->transid, level, c->start, 0); + if (IS_ERR(split)) + return PTR_ERR(split); + + btrfs_set_header_flags(split, btrfs_header_flags(c)); + btrfs_set_header_level(split, btrfs_header_level(c)); + btrfs_set_header_bytenr(split, split->start); + btrfs_set_header_generation(split, trans->transid); + btrfs_set_header_owner(split, root->root_key.objectid); + btrfs_set_header_flags(split, 0); + write_extent_buffer(split, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(split), + BTRFS_FSID_SIZE); + write_extent_buffer(split, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(split), + BTRFS_UUID_SIZE); + + mid = (c_nritems + 1) / 2; + + copy_extent_buffer(split, c, + btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(mid), + (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); + btrfs_set_header_nritems(split, c_nritems - mid); + btrfs_set_header_nritems(c, mid); + ret = 0; + + btrfs_mark_buffer_dirty(c); + btrfs_mark_buffer_dirty(split); + + btrfs_node_key(split, &disk_key, 0); + wret = insert_ptr(trans, root, path, &disk_key, split->start, + path->slots[level + 1] + 1, + level + 1); + if (wret) + ret = wret; + + ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid); + BUG_ON(ret); + + if (path->slots[level] >= mid) { + path->slots[level] -= mid; + btrfs_tree_unlock(c); + free_extent_buffer(c); + path->nodes[level] = split; + path->slots[level + 1] += 1; + } else { + btrfs_tree_unlock(split); + free_extent_buffer(split); + } + return ret; +} + +/* + * how many bytes are required to store the items in a leaf. start + * and nr indicate which items in the leaf to check. This totals up the + * space used both by the item structs and the item data + */ +static int leaf_space_used(struct extent_buffer *l, int start, int nr) +{ + int data_len; + int nritems = btrfs_header_nritems(l); + int end = min(nritems, start + nr) - 1; + + if (!nr) + return 0; + data_len = btrfs_item_end_nr(l, start); + data_len = data_len - btrfs_item_offset_nr(l, end); + data_len += sizeof(struct btrfs_item) * nr; + WARN_ON(data_len < 0); + return data_len; +} + +/* + * The space between the end of the leaf items and + * the start of the leaf data. IOW, how much room + * the leaf has left for both items and data + */ +noinline int btrfs_leaf_free_space(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + int nritems = btrfs_header_nritems(leaf); + int ret; + ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); + if (ret < 0) { + printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " + "used %d nritems %d\n", + ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), + leaf_space_used(leaf, 0, nritems), nritems); + } + return ret; +} + +/* + * push some data in the path leaf to the right, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * returns 1 if the push failed because the other node didn't have enough + * room, 0 if everything worked out and < 0 if there were major errors. + */ +static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) +{ + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer *right; + struct extent_buffer *upper; + struct btrfs_disk_key disk_key; + int slot; + u32 i; + int free_space; + int push_space = 0; + int push_items = 0; + struct btrfs_item *item; + u32 left_nritems; + u32 nr; + u32 right_nritems; + u32 data_end; + u32 this_item_size; + int ret; + + slot = path->slots[1]; + if (!path->nodes[1]) + return 1; + + upper = path->nodes[1]; + if (slot >= btrfs_header_nritems(upper) - 1) + return 1; + + WARN_ON(!btrfs_tree_locked(path->nodes[1])); + + right = read_node_slot(root, upper, slot + 1); + btrfs_tree_lock(right); + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, right, upper, + slot + 1, &right, 0); + if (ret) + goto out_unlock; + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + left_nritems = btrfs_header_nritems(left); + if (left_nritems == 0) + goto out_unlock; + + if (empty) + nr = 0; + else + nr = 1; + + if (path->slots[0] >= left_nritems) + push_space += data_size; + + i = left_nritems - 1; + while (i >= nr) { + item = btrfs_item_nr(left, i); + + if (!empty && push_items > 0) { + if (path->slots[0] > i) + break; + if (path->slots[0] == i) { + int space = btrfs_leaf_free_space(root, left); + if (space + push_space * 2 > free_space) + break; + } + } + + if (path->slots[0] == i) + push_space += data_size; + + if (!left->map_token) { + map_extent_buffer(left, (unsigned long)item, + sizeof(struct btrfs_item), + &left->map_token, &left->kaddr, + &left->map_start, &left->map_len, + KM_USER1); + } + + this_item_size = btrfs_item_size(left, item); + if (this_item_size + sizeof(*item) + push_space > free_space) + break; + + push_items++; + push_space += this_item_size + sizeof(*item); + if (i == 0) + break; + i--; + } + if (left->map_token) { + unmap_extent_buffer(left, left->map_token, KM_USER1); + left->map_token = NULL; + } + + if (push_items == 0) + goto out_unlock; + + if (!empty && push_items == left_nritems) + WARN_ON(1); + + /* push left to right */ + right_nritems = btrfs_header_nritems(right); + + push_space = btrfs_item_end_nr(left, left_nritems - push_items); + push_space -= leaf_data_end(root, left); + + /* make room in the right data area */ + data_end = leaf_data_end(root, right); + memmove_extent_buffer(right, + btrfs_leaf_data(right) + data_end - push_space, + btrfs_leaf_data(right) + data_end, + BTRFS_LEAF_DATA_SIZE(root) - data_end); + + /* copy from the left data area */ + copy_extent_buffer(right, left, btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_SIZE(root) - push_space, + btrfs_leaf_data(left) + leaf_data_end(root, left), + push_space); + + memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), + btrfs_item_nr_offset(0), + right_nritems * sizeof(struct btrfs_item)); + + /* copy the items from left to right */ + copy_extent_buffer(right, left, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(left_nritems - push_items), + push_items * sizeof(struct btrfs_item)); + + /* update the item pointers */ + right_nritems += push_items; + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(root); + for (i = 0; i < right_nritems; i++) { + item = btrfs_item_nr(right, i); + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + push_space -= btrfs_item_size(right, item); + btrfs_set_item_offset(right, item, push_space); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + left_nritems -= push_items; + btrfs_set_header_nritems(left, left_nritems); + + if (left_nritems) + btrfs_mark_buffer_dirty(left); + btrfs_mark_buffer_dirty(right); + + ret = btrfs_update_ref(trans, root, left, right, 0, push_items); + BUG_ON(ret); + + btrfs_item_key(right, &disk_key, 0); + btrfs_set_node_key(upper, &disk_key, slot + 1); + btrfs_mark_buffer_dirty(upper); + + /* then fixup the leaf pointer in the path */ + if (path->slots[0] >= left_nritems) { + path->slots[0] -= left_nritems; + if (btrfs_header_nritems(path->nodes[0]) == 0) + clean_tree_block(trans, root, path->nodes[0]); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; +} + +/* + * push some data in the path leaf to the left, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + */ +static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *right = path->nodes[0]; + struct extent_buffer *left; + int slot; + int i; + int free_space; + int push_space = 0; + int push_items = 0; + struct btrfs_item *item; + u32 old_left_nritems; + u32 right_nritems; + u32 nr; + int ret = 0; + int wret; + u32 this_item_size; + u32 old_left_item_size; + + slot = path->slots[1]; + if (slot == 0) + return 1; + if (!path->nodes[1]) + return 1; + + right_nritems = btrfs_header_nritems(right); + if (right_nritems == 0) + return 1; + + WARN_ON(!btrfs_tree_locked(path->nodes[1])); + + left = read_node_slot(root, path->nodes[1], slot - 1); + btrfs_tree_lock(left); + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, left, + path->nodes[1], slot - 1, &left, 0); + if (ret) { + /* we hit -ENOSPC, but it isn't fatal here */ + ret = 1; + goto out; + } + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + if (empty) + nr = right_nritems; + else + nr = right_nritems - 1; + + for (i = 0; i < nr; i++) { + item = btrfs_item_nr(right, i); + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + if (!empty && push_items > 0) { + if (path->slots[0] < i) + break; + if (path->slots[0] == i) { + int space = btrfs_leaf_free_space(root, right); + if (space + push_space * 2 > free_space) + break; + } + } + + if (path->slots[0] == i) + push_space += data_size; + + this_item_size = btrfs_item_size(right, item); + if (this_item_size + sizeof(*item) + push_space > free_space) + break; + + push_items++; + push_space += this_item_size + sizeof(*item); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + if (push_items == 0) { + ret = 1; + goto out; + } + if (!empty && push_items == btrfs_header_nritems(right)) + WARN_ON(1); + + /* push data from right to left */ + copy_extent_buffer(left, right, + btrfs_item_nr_offset(btrfs_header_nritems(left)), + btrfs_item_nr_offset(0), + push_items * sizeof(struct btrfs_item)); + + push_space = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_offset_nr(right, push_items - 1); + + copy_extent_buffer(left, right, btrfs_leaf_data(left) + + leaf_data_end(root, left) - push_space, + btrfs_leaf_data(right) + + btrfs_item_offset_nr(right, push_items - 1), + push_space); + old_left_nritems = btrfs_header_nritems(left); + BUG_ON(old_left_nritems <= 0); + + old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); + for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { + u32 ioff; + + item = btrfs_item_nr(left, i); + if (!left->map_token) { + map_extent_buffer(left, (unsigned long)item, + sizeof(struct btrfs_item), + &left->map_token, &left->kaddr, + &left->map_start, &left->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(left, item); + btrfs_set_item_offset(left, item, + ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); + } + btrfs_set_header_nritems(left, old_left_nritems + push_items); + if (left->map_token) { + unmap_extent_buffer(left, left->map_token, KM_USER1); + left->map_token = NULL; + } + + /* fixup right node */ + if (push_items > right_nritems) { + printk(KERN_CRIT "push items %d nr %u\n", push_items, + right_nritems); + WARN_ON(1); + } + + if (push_items < right_nritems) { + push_space = btrfs_item_offset_nr(right, push_items - 1) - + leaf_data_end(root, right); + memmove_extent_buffer(right, btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_SIZE(root) - push_space, + btrfs_leaf_data(right) + + leaf_data_end(root, right), push_space); + + memmove_extent_buffer(right, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(push_items), + (btrfs_header_nritems(right) - push_items) * + sizeof(struct btrfs_item)); + } + right_nritems -= push_items; + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(root); + for (i = 0; i < right_nritems; i++) { + item = btrfs_item_nr(right, i); + + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + push_space = push_space - btrfs_item_size(right, item); + btrfs_set_item_offset(right, item, push_space); + } + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + btrfs_mark_buffer_dirty(left); + if (right_nritems) + btrfs_mark_buffer_dirty(right); + + ret = btrfs_update_ref(trans, root, right, left, + old_left_nritems, push_items); + BUG_ON(ret); + + btrfs_item_key(right, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, &disk_key, 1); + if (wret) + ret = wret; + + /* then fixup the leaf pointer in the path */ + if (path->slots[0] < push_items) { + path->slots[0] += old_left_nritems; + if (btrfs_header_nritems(path->nodes[0]) == 0) + clean_tree_block(trans, root, path->nodes[0]); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = left; + path->slots[1] -= 1; + } else { + btrfs_tree_unlock(left); + free_extent_buffer(left); + path->slots[0] -= push_items; + } + BUG_ON(path->slots[0] < 0); + return ret; +out: + btrfs_tree_unlock(left); + free_extent_buffer(left); + return ret; +} + +/* + * split the path's leaf in two, making sure there is at least data_size + * available for the resulting leaf level of the path. + * + * returns 0 if all went well and < 0 on failure. + */ +static noinline int split_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, + int extend) +{ + struct extent_buffer *l; + u32 nritems; + int mid; + int slot; + struct extent_buffer *right; + int data_copy_size; + int rt_data_off; + int i; + int ret = 0; + int wret; + int double_split; + int num_doubles = 0; + struct btrfs_disk_key disk_key; + + /* first try to make some room by pushing left and right */ + if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { + wret = push_leaf_right(trans, root, path, data_size, 0); + if (wret < 0) + return wret; + if (wret) { + wret = push_leaf_left(trans, root, path, data_size, 0); + if (wret < 0) + return wret; + } + l = path->nodes[0]; + + /* did the pushes work? */ + if (btrfs_leaf_free_space(root, l) >= data_size) + return 0; + } + + if (!path->nodes[1]) { + ret = insert_new_root(trans, root, path, 1); + if (ret) + return ret; + } +again: + double_split = 0; + l = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(l); + mid = (nritems + 1) / 2; + + right = btrfs_alloc_free_block(trans, root, root->leafsize, + path->nodes[1]->start, + root->root_key.objectid, + trans->transid, 0, l->start, 0); + if (IS_ERR(right)) { + BUG_ON(1); + return PTR_ERR(right); + } + + memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(right, right->start); + btrfs_set_header_generation(right, trans->transid); + btrfs_set_header_owner(right, root->root_key.objectid); + btrfs_set_header_level(right, 0); + write_extent_buffer(right, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(right), + BTRFS_FSID_SIZE); + + write_extent_buffer(right, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(right), + BTRFS_UUID_SIZE); + if (mid <= slot) { + if (nritems == 1 || + leaf_space_used(l, mid, nritems - mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (slot >= nritems) { + btrfs_cpu_key_to_disk(&disk_key, ins_key); + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1] += 1; + btrfs_mark_buffer_dirty(right); + return ret; + } + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + double_split = 1; + } + } + } else { + if (leaf_space_used(l, 0, mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (!extend && data_size && slot == 0) { + btrfs_cpu_key_to_disk(&disk_key, ins_key); + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, + right->start, + path->slots[1], 1); + if (wret) + ret = wret; + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + if (path->slots[1] == 0) { + wret = fixup_low_keys(trans, root, + path, &disk_key, 1); + if (wret) + ret = wret; + } + btrfs_mark_buffer_dirty(right); + return ret; + } else if ((extend || !data_size) && slot == 0) { + mid = 1; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + double_split = 1; + } + } + } + } + nritems = nritems - mid; + btrfs_set_header_nritems(right, nritems); + data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); + + copy_extent_buffer(right, l, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(mid), + nritems * sizeof(struct btrfs_item)); + + copy_extent_buffer(right, l, + btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - + data_copy_size, btrfs_leaf_data(l) + + leaf_data_end(root, l), data_copy_size); + + rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_end_nr(l, mid); + + for (i = 0; i < nritems; i++) { + struct btrfs_item *item = btrfs_item_nr(right, i); + u32 ioff; + + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(right, item); + btrfs_set_item_offset(right, item, ioff + rt_data_off); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + btrfs_set_header_nritems(l, mid); + ret = 0; + btrfs_item_key(right, &disk_key, 0); + wret = insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(l); + BUG_ON(path->slots[0] != slot); + + ret = btrfs_update_ref(trans, root, l, right, 0, nritems); + BUG_ON(ret); + + if (mid <= slot) { + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] -= mid; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + + BUG_ON(path->slots[0] < 0); + + if (double_split) { + BUG_ON(num_doubles != 0); + num_doubles++; + goto again; + } + return ret; +} + +/* + * This function splits a single item into two items, + * giving 'new_key' to the new item and splitting the + * old one at split_offset (from the start of the item). + * + * The path may be released by this operation. After + * the split, the path is pointing to the old item. The + * new item is going to be in the same node as the old one. + * + * Note, the item being split must be smaller enough to live alone on + * a tree block with room for one extra struct btrfs_item + * + * This allows us to split the item in place, keeping a lock on the + * leaf the entire time. + */ +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset) +{ + u32 item_size; + struct extent_buffer *leaf; + struct btrfs_key orig_key; + struct btrfs_item *item; + struct btrfs_item *new_item; + int ret = 0; + int slot; + u32 nritems; + u32 orig_offset; + struct btrfs_disk_key disk_key; + char *buf; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]); + if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item)) + goto split; + + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + btrfs_release_path(root, path); + + path->search_for_split = 1; + path->keep_locks = 1; + + ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1); + path->search_for_split = 0; + + /* if our item isn't there or got smaller, return now */ + if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0], + path->slots[0])) { + path->keep_locks = 0; + return -EAGAIN; + } + + ret = split_leaf(trans, root, &orig_key, path, + sizeof(struct btrfs_item), 1); + path->keep_locks = 0; + BUG_ON(ret); + + leaf = path->nodes[0]; + BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); + +split: + item = btrfs_item_nr(leaf, path->slots[0]); + orig_offset = btrfs_item_offset(leaf, item); + item_size = btrfs_item_size(leaf, item); + + + buf = kmalloc(item_size, GFP_NOFS); + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, + path->slots[0]), item_size); + slot = path->slots[0] + 1; + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + + if (slot != nritems) { + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(leaf, &disk_key, slot); + + new_item = btrfs_item_nr(leaf, slot); + + btrfs_set_item_offset(leaf, new_item, orig_offset); + btrfs_set_item_size(leaf, new_item, item_size - split_offset); + + btrfs_set_item_offset(leaf, item, + orig_offset + item_size - split_offset); + btrfs_set_item_size(leaf, item, split_offset); + + btrfs_set_header_nritems(leaf, nritems + 1); + + /* write the data for the start of the original item */ + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + split_offset); + + /* write the data for the new item */ + write_extent_buffer(leaf, buf + split_offset, + btrfs_item_ptr_offset(leaf, slot), + item_size - split_offset); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + kfree(buf); + return ret; +} + +/* + * make the item pointed to by the path smaller. new_size indicates + * how small to make it, and from_end tells us if we just chop bytes + * off the end of the item or if we shift the item to chop bytes off + * the front. + */ +int btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end) +{ + int ret = 0; + int slot; + int slot_orig; + struct extent_buffer *leaf; + struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data_start; + unsigned int old_size; + unsigned int size_diff; + int i; + + slot_orig = path->slots[0]; + leaf = path->nodes[0]; + slot = path->slots[0]; + + old_size = btrfs_item_size_nr(leaf, slot); + if (old_size == new_size) + return 0; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + old_data_start = btrfs_item_offset_nr(leaf, slot); + + size_diff = old_size - new_size; + + BUG_ON(slot < 0); + BUG_ON(slot >= nritems); + + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + for (i = slot; i < nritems; i++) { + u32 ioff; + item = btrfs_item_nr(leaf, i); + + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff + size_diff); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the data */ + if (from_end) { + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + size_diff, btrfs_leaf_data(leaf) + + data_end, old_data_start + new_size - data_end); + } else { + struct btrfs_disk_key disk_key; + u64 offset; + + btrfs_item_key(leaf, &disk_key, slot); + + if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { + unsigned long ptr; + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + fi = (struct btrfs_file_extent_item *)( + (unsigned long)fi - size_diff); + + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) { + ptr = btrfs_item_ptr_offset(leaf, slot); + memmove_extent_buffer(leaf, ptr, + (unsigned long)fi, + offsetof(struct btrfs_file_extent_item, + disk_bytenr)); + } + } + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + size_diff, btrfs_leaf_data(leaf) + + data_end, old_data_start - data_end); + + offset = btrfs_disk_key_offset(&disk_key); + btrfs_set_disk_key_offset(&disk_key, offset + size_diff); + btrfs_set_item_key(leaf, &disk_key, slot); + if (slot == 0) + fixup_low_keys(trans, root, path, &disk_key, 1); + } + + item = btrfs_item_nr(leaf, slot); + btrfs_set_item_size(leaf, item, new_size); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + return ret; +} + +/* + * make the item pointed to by the path bigger, data_size is the new size. + */ +int btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u32 data_size) +{ + int ret = 0; + int slot; + int slot_orig; + struct extent_buffer *leaf; + struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data; + unsigned int old_size; + int i; + + slot_orig = path->slots[0]; + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < data_size) { + btrfs_print_leaf(root, leaf); + BUG(); + } + slot = path->slots[0]; + old_data = btrfs_item_end_nr(leaf, slot); + + BUG_ON(slot < 0); + if (slot >= nritems) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d too large, nritems %d\n", + slot, nritems); + BUG_ON(1); + } + + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + for (i = slot; i < nritems; i++) { + u32 ioff; + item = btrfs_item_nr(leaf, i); + + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - data_size); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - data_size, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + + data_end = old_data; + old_size = btrfs_item_size_nr(leaf, slot); + item = btrfs_item_nr(leaf, slot); + btrfs_set_item_size(leaf, item, old_size + data_size); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + return ret; +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + * Returns the number of keys that were inserted. + */ +int btrfs_insert_some_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int ret = 0; + int slot; + int i; + u32 nritems; + u32 total_data = 0; + u32 total_size = 0; + unsigned int data_end; + struct btrfs_disk_key disk_key; + struct btrfs_key found_key; + + for (i = 0; i < nr; i++) { + if (total_size + data_size[i] + sizeof(struct btrfs_item) > + BTRFS_LEAF_DATA_SIZE(root)) { + break; + nr = i; + } + total_data += data_size[i]; + total_size += data_size[i] + sizeof(struct btrfs_item); + } + BUG_ON(nr == 0); + + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < total_size) { + for (i = nr; i >= 0; i--) { + total_data -= data_size[i]; + total_size -= data_size[i] + sizeof(struct btrfs_item); + if (total_size < btrfs_leaf_free_space(root, leaf)) + break; + } + nr = i; + } + + slot = path->slots[0]; + BUG_ON(slot < 0); + + if (slot != nritems) { + unsigned int old_data = btrfs_item_end_nr(leaf, slot); + + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* figure out how many keys we can insert in here */ + total_data = data_size[0]; + for (i = 1; i < nr; i++) { + if (comp_cpu_keys(&found_key, cpu_key + i) <= 0) + break; + total_data += data_size[i]; + } + nr = i; + + if (old_data < data_end) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + slot, old_data, data_end); + BUG_ON(1); + } + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + WARN_ON(leaf->map_token); + for (i = slot; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - total_data); + } + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - total_data, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + data_end = old_data; + } else { + /* + * this sucks but it has to be done, if we are inserting at + * the end of the leaf only insert 1 of the items, since we + * have no way of knowing whats on the next leaf and we'd have + * to drop our current locks to figure it out + */ + nr = 1; + } + + /* setup the item for the new data */ + for (i = 0; i < nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + btrfs_set_item_key(leaf, &disk_key, slot + i); + item = btrfs_item_nr(leaf, slot + i); + btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + data_end -= data_size[i]; + btrfs_set_item_size(leaf, item, data_size[i]); + } + btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (slot == 0) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + ret = fixup_low_keys(trans, root, path, &disk_key, 1); + } + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } +out: + if (!ret) + ret = nr; + return ret; +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int ret = 0; + int slot; + int slot_orig; + int i; + u32 nritems; + u32 total_size = 0; + u32 total_data = 0; + unsigned int data_end; + struct btrfs_disk_key disk_key; + + for (i = 0; i < nr; i++) + total_data += data_size[i]; + + total_size = total_data + (nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + slot_orig = path->slots[0]; + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < total_size) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "not enough freespace need %u have %d\n", + total_size, btrfs_leaf_free_space(root, leaf)); + BUG(); + } + + slot = path->slots[0]; + BUG_ON(slot < 0); + + if (slot != nritems) { + unsigned int old_data = btrfs_item_end_nr(leaf, slot); + + if (old_data < data_end) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + slot, old_data, data_end); + BUG_ON(1); + } + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + WARN_ON(leaf->map_token); + for (i = slot; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - total_data); + } + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - total_data, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + data_end = old_data; + } + + /* setup the item for the new data */ + for (i = 0; i < nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + btrfs_set_item_key(leaf, &disk_key, slot + i); + item = btrfs_item_nr(leaf, slot + i); + btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + data_end -= data_size[i]; + btrfs_set_item_size(leaf, item, data_size[i]); + } + btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (slot == 0) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + ret = fixup_low_keys(trans, root, path, &disk_key, 1); + } + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } +out: + return ret; +} + +/* + * Given a key and some data, insert an item into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *cpu_key, void *data, u32 + data_size) +{ + int ret = 0; + struct btrfs_path *path; + struct extent_buffer *leaf; + unsigned long ptr; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); + if (!ret) { + leaf = path->nodes[0]; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, data, ptr, data_size); + btrfs_mark_buffer_dirty(leaf); + } + btrfs_free_path(path); + return ret; +} + +/* + * delete the pointer from a given node. + * + * the tree should have been previously balanced so the deletion does not + * empty a node. + */ +static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot) +{ + struct extent_buffer *parent = path->nodes[level]; + u32 nritems; + int ret = 0; + int wret; + + nritems = btrfs_header_nritems(parent); + if (slot != nritems - 1) { + memmove_extent_buffer(parent, + btrfs_node_key_ptr_offset(slot), + btrfs_node_key_ptr_offset(slot + 1), + sizeof(struct btrfs_key_ptr) * + (nritems - slot - 1)); + } + nritems--; + btrfs_set_header_nritems(parent, nritems); + if (nritems == 0 && parent == root->node) { + BUG_ON(btrfs_header_level(root->node) != 1); + /* just turn the root into a leaf and break */ + btrfs_set_header_level(root->node, 0); + } else if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(parent, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, &disk_key, level + 1); + if (wret) + ret = wret; + } + btrfs_mark_buffer_dirty(parent); + return ret; +} + +/* + * a helper function to delete the leaf pointed to by path->slots[1] and + * path->nodes[1]. bytenr is the node block pointer, but since the callers + * already know it, it is faster to have them pass it down than to + * read it out of the node again. + * + * This deletes the pointer in path->nodes[1] and frees the leaf + * block extent. zero is returned if it all worked out, < 0 otherwise. + * + * The path must have already been setup for deleting the leaf, including + * all the proper balancing. path->nodes[1] must be locked. + */ +noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 bytenr) +{ + int ret; + u64 root_gen = btrfs_header_generation(path->nodes[1]); + + ret = del_ptr(trans, root, path, 1, path->slots[1]); + if (ret) + return ret; + + ret = btrfs_free_extent(trans, root, bytenr, + btrfs_level_size(root, 0), + path->nodes[1]->start, + btrfs_header_owner(path->nodes[1]), + root_gen, 0, 1); + return ret; +} +/* + * delete the item at the leaf level in path. If that empties + * the leaf, remove it from the tree + */ +int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int slot, int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int last_off; + int dsize = 0; + int ret = 0; + int wret; + int i; + u32 nritems; + + leaf = path->nodes[0]; + last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); + + for (i = 0; i < nr; i++) + dsize += btrfs_item_size_nr(leaf, slot + i); + + nritems = btrfs_header_nritems(leaf); + + if (slot + nr != nritems) { + int data_end = leaf_data_end(root, leaf); + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + dsize, + btrfs_leaf_data(leaf) + data_end, + last_off - data_end); + + for (i = slot + nr; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff + dsize); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), + btrfs_item_nr_offset(slot + nr), + sizeof(struct btrfs_item) * + (nritems - slot - nr)); + } + btrfs_set_header_nritems(leaf, nritems - nr); + nritems -= nr; + + /* delete the leaf if we've emptied it */ + if (nritems == 0) { + if (leaf == root->node) { + btrfs_set_header_level(leaf, 0); + } else { + ret = btrfs_del_leaf(trans, root, path, leaf->start); + BUG_ON(ret); + } + } else { + int used = leaf_space_used(leaf, 0, nritems); + if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_item_key(leaf, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, + &disk_key, 1); + if (wret) + ret = wret; + } + + /* delete the leaf if it is mostly empty */ + if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) { + /* push_leaf_left fixes the path. + * make sure the path still points to our leaf + * for possible call to del_ptr below + */ + slot = path->slots[1]; + extent_buffer_get(leaf); + + wret = push_leaf_left(trans, root, path, 1, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + + if (path->nodes[0] == leaf && + btrfs_header_nritems(leaf)) { + wret = push_leaf_right(trans, root, path, 1, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + } + + if (btrfs_header_nritems(leaf) == 0) { + path->slots[1] = slot; + ret = btrfs_del_leaf(trans, root, path, + leaf->start); + BUG_ON(ret); + free_extent_buffer(leaf); + } else { + /* if we're still in the path, make sure + * we're dirty. Otherwise, one of the + * push_leaf functions must have already + * dirtied this buffer + */ + if (path->nodes[0] == leaf) + btrfs_mark_buffer_dirty(leaf); + free_extent_buffer(leaf); + } + } else { + btrfs_mark_buffer_dirty(leaf); + } + } + return ret; +} + +/* + * search the tree again to find a leaf with lesser keys + * returns 0 if it found something or 1 if there are no lesser leaves. + * returns < 0 on io errors. + * + * This may release the path, and so you may lose any locks held at the + * time you call it. + */ +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + struct btrfs_key key; + struct btrfs_disk_key found_key; + int ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, 0); + + if (key.offset > 0) + key.offset--; + else if (key.type > 0) + key.type--; + else if (key.objectid > 0) + key.objectid--; + else + return 1; + + btrfs_release_path(root, path); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + btrfs_item_key(path->nodes[0], &found_key, 0); + ret = comp_keys(&found_key, &key); + if (ret < 0) + return 0; + return 1; +} + +/* + * A helper function to walk down the tree starting at min_key, and looking + * for nodes or leaves that are either in cache or have a minimum + * transaction id. This is used by the btree defrag code, and tree logging + * + * This does not cow, but it does stuff the starting key it finds back + * into min_key, so you can call btrfs_search_slot with cow=1 on the + * key and get a writable path. + * + * This does lock as it descends, and path->keep_locks should be set + * to 1 by the caller. + * + * This honors path->lowest_level to prevent descent past a given level + * of the tree. + * + * min_trans indicates the oldest transaction that you are interested + * in walking through. Any nodes or leaves older than min_trans are + * skipped over (without reading them). + * + * returns zero if something useful was found, < 0 on error and 1 if there + * was nothing in the tree that matched the search criteria. + */ +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_key *max_key, + struct btrfs_path *path, int cache_only, + u64 min_trans) +{ + struct extent_buffer *cur; + struct btrfs_key found_key; + int slot; + int sret; + u32 nritems; + int level; + int ret = 1; + + WARN_ON(!path->keep_locks); +again: + cur = btrfs_lock_root_node(root); + level = btrfs_header_level(cur); + WARN_ON(path->nodes[level]); + path->nodes[level] = cur; + path->locks[level] = 1; + + if (btrfs_header_generation(cur) < min_trans) { + ret = 1; + goto out; + } + while (1) { + nritems = btrfs_header_nritems(cur); + level = btrfs_header_level(cur); + sret = bin_search(cur, min_key, level, &slot); + + /* at the lowest level, we're done, setup the path and exit */ + if (level == path->lowest_level) { + if (slot >= nritems) + goto find_next_key; + ret = 0; + path->slots[level] = slot; + btrfs_item_key_to_cpu(cur, &found_key, slot); + goto out; + } + if (sret && slot > 0) + slot--; + /* + * check this node pointer against the cache_only and + * min_trans parameters. If it isn't in cache or is too + * old, skip to the next one. + */ + while (slot < nritems) { + u64 blockptr; + u64 gen; + struct extent_buffer *tmp; + struct btrfs_disk_key disk_key; + + blockptr = btrfs_node_blockptr(cur, slot); + gen = btrfs_node_ptr_generation(cur, slot); + if (gen < min_trans) { + slot++; + continue; + } + if (!cache_only) + break; + + if (max_key) { + btrfs_node_key(cur, &disk_key, slot); + if (comp_keys(&disk_key, max_key) >= 0) { + ret = 1; + goto out; + } + } + + tmp = btrfs_find_tree_block(root, blockptr, + btrfs_level_size(root, level - 1)); + + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + free_extent_buffer(tmp); + break; + } + if (tmp) + free_extent_buffer(tmp); + slot++; + } +find_next_key: + /* + * we didn't find a candidate key in this node, walk forward + * and find another one + */ + if (slot >= nritems) { + path->slots[level] = slot; + sret = btrfs_find_next_key(root, path, min_key, level, + cache_only, min_trans); + if (sret == 0) { + btrfs_release_path(root, path); + goto again; + } else { + goto out; + } + } + /* save our key for returning back */ + btrfs_node_key_to_cpu(cur, &found_key, slot); + path->slots[level] = slot; + if (level == path->lowest_level) { + ret = 0; + unlock_up(path, level, 1); + goto out; + } + cur = read_node_slot(root, cur, slot); + + btrfs_tree_lock(cur); + path->locks[level - 1] = 1; + path->nodes[level - 1] = cur; + unlock_up(path, level, 1); + } +out: + if (ret == 0) + memcpy(min_key, &found_key, sizeof(found_key)); + return ret; +} + +/* + * this is similar to btrfs_next_leaf, but does not try to preserve + * and fixup the path. It looks for and returns the next key in the + * tree based on the current path and the cache_only and min_trans + * parameters. + * + * 0 is returned if another key is found, < 0 if there are any errors + * and 1 is returned if there are no higher keys in the tree + * + * path->keep_locks should be set to 1 on the search made before + * calling this function. + */ +int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, int lowest_level, + int cache_only, u64 min_trans) +{ + int level = lowest_level; + int slot; + struct extent_buffer *c; + + WARN_ON(!path->keep_locks); + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + return 1; + + slot = path->slots[level] + 1; + c = path->nodes[level]; +next: + if (slot >= btrfs_header_nritems(c)) { + level++; + if (level == BTRFS_MAX_LEVEL) + return 1; + continue; + } + if (level == 0) + btrfs_item_key_to_cpu(c, key, slot); + else { + u64 blockptr = btrfs_node_blockptr(c, slot); + u64 gen = btrfs_node_ptr_generation(c, slot); + + if (cache_only) { + struct extent_buffer *cur; + cur = btrfs_find_tree_block(root, blockptr, + btrfs_level_size(root, level - 1)); + if (!cur || !btrfs_buffer_uptodate(cur, gen)) { + slot++; + if (cur) + free_extent_buffer(cur); + goto next; + } + free_extent_buffer(cur); + } + if (gen < min_trans) { + slot++; + goto next; + } + btrfs_node_key_to_cpu(c, key, slot); + } + return 0; + } + return 1; +} + +/* + * search the tree again to find a leaf with greater keys + * returns 0 if it found something or 1 if there are no greater leaves. + * returns < 0 on io errors. + */ +int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + int slot; + int level = 1; + struct extent_buffer *c; + struct extent_buffer *next = NULL; + struct btrfs_key key; + u32 nritems; + int ret; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (nritems == 0) + return 1; + + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); + + btrfs_release_path(root, path); + path->keep_locks = 1; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + path->keep_locks = 0; + + if (ret < 0) + return ret; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * by releasing the path above we dropped all our locks. A balance + * could have added more items next to the key that used to be + * at the very end of the block. So, check again here and + * advance the path if there are now more items available. + */ + if (nritems > 0 && path->slots[0] < nritems - 1) { + path->slots[0]++; + goto done; + } + + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + return 1; + + slot = path->slots[level] + 1; + c = path->nodes[level]; + if (slot >= btrfs_header_nritems(c)) { + level++; + if (level == BTRFS_MAX_LEVEL) + return 1; + continue; + } + + if (next) { + btrfs_tree_unlock(next); + free_extent_buffer(next); + } + + if (level == 1 && (path->locks[1] || path->skip_locking) && + path->reada) + reada_for_search(root, path, level, slot, 0); + + next = read_node_slot(root, c, slot); + if (!path->skip_locking) { + WARN_ON(!btrfs_tree_locked(c)); + btrfs_tree_lock(next); + } + break; + } + path->slots[level] = slot; + while (1) { + level--; + c = path->nodes[level]; + if (path->locks[level]) + btrfs_tree_unlock(c); + free_extent_buffer(c); + path->nodes[level] = next; + path->slots[level] = 0; + if (!path->skip_locking) + path->locks[level] = 1; + if (!level) + break; + if (level == 1 && path->locks[1] && path->reada) + reada_for_search(root, path, level, slot, 0); + next = read_node_slot(root, next, 0); + if (!path->skip_locking) { + WARN_ON(!btrfs_tree_locked(path->nodes[level])); + btrfs_tree_lock(next); + } + } +done: + unlock_up(path, 0, 1); + return 0; +} + +/* + * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps + * searching until it gets past min_objectid or finds an item of 'type' + * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + u32 nritems; + int ret; + + while (1) { + if (path->slots[0] == 0) { + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.type == type) + return 0; + if (found_key.objectid < min_objectid) + break; + if (found_key.objectid == min_objectid && + found_key.type < type) + break; + } + return 1; +} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h new file mode 100644 index 00000000000..eee060f8811 --- /dev/null +++ b/fs/btrfs/ctree.h @@ -0,0 +1,2129 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_CTREE__ +#define __BTRFS_CTREE__ + +#include <linux/version.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/fs.h> +#include <linux/completion.h> +#include <linux/backing-dev.h> +#include <linux/wait.h> +#include <asm/kmap_types.h> +#include "extent_io.h" +#include "extent_map.h" +#include "async-thread.h" + +struct btrfs_trans_handle; +struct btrfs_transaction; +extern struct kmem_cache *btrfs_trans_handle_cachep; +extern struct kmem_cache *btrfs_transaction_cachep; +extern struct kmem_cache *btrfs_bit_radix_cachep; +extern struct kmem_cache *btrfs_path_cachep; +struct btrfs_ordered_sum; + +#define BTRFS_MAGIC "_BHRfS_M" + +#define BTRFS_ACL_NOT_CACHED ((void *)-1) + +#ifdef CONFIG_LOCKDEP +# define BTRFS_MAX_LEVEL 7 +#else +# define BTRFS_MAX_LEVEL 8 +#endif + +/* holds pointers to all of the tree roots */ +#define BTRFS_ROOT_TREE_OBJECTID 1ULL + +/* stores information about which extents are in use, and reference counts */ +#define BTRFS_EXTENT_TREE_OBJECTID 2ULL + +/* + * chunk tree stores translations from logical -> physical block numbering + * the super block points to the chunk tree + */ +#define BTRFS_CHUNK_TREE_OBJECTID 3ULL + +/* + * stores information about which areas of a given device are in use. + * one per device. The tree of tree roots points to the device tree + */ +#define BTRFS_DEV_TREE_OBJECTID 4ULL + +/* one per subvolume, storing files and directories */ +#define BTRFS_FS_TREE_OBJECTID 5ULL + +/* directory objectid inside the root tree */ +#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL + +/* holds checksums of all the data extents */ +#define BTRFS_CSUM_TREE_OBJECTID 7ULL + +/* orhpan objectid for tracking unlinked/truncated files */ +#define BTRFS_ORPHAN_OBJECTID -5ULL + +/* does write ahead logging to speed up fsyncs */ +#define BTRFS_TREE_LOG_OBJECTID -6ULL +#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL + +/* for space balancing */ +#define BTRFS_TREE_RELOC_OBJECTID -8ULL +#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL + +/* + * extent checksums all have this objectid + * this allows them to share the logging tree + * for fsyncs + */ +#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL + +/* dummy objectid represents multiple objectids */ +#define BTRFS_MULTIPLE_OBJECTIDS -255ULL + +/* + * All files have objectids in this range. + */ +#define BTRFS_FIRST_FREE_OBJECTID 256ULL +#define BTRFS_LAST_FREE_OBJECTID -256ULL +#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL + + +/* + * the device items go into the chunk tree. The key is in the form + * [ 1 BTRFS_DEV_ITEM_KEY device_id ] + */ +#define BTRFS_DEV_ITEMS_OBJECTID 1ULL + +/* + * we can actually store much bigger names, but lets not confuse the rest + * of linux + */ +#define BTRFS_NAME_LEN 255 + +/* 32 bytes in various csum fields */ +#define BTRFS_CSUM_SIZE 32 + +/* csum types */ +#define BTRFS_CSUM_TYPE_CRC32 0 + +static int btrfs_csum_sizes[] = { 4, 0 }; + +/* four bytes for CRC32 */ +#define BTRFS_EMPTY_DIR_SIZE 0 + +#define BTRFS_FT_UNKNOWN 0 +#define BTRFS_FT_REG_FILE 1 +#define BTRFS_FT_DIR 2 +#define BTRFS_FT_CHRDEV 3 +#define BTRFS_FT_BLKDEV 4 +#define BTRFS_FT_FIFO 5 +#define BTRFS_FT_SOCK 6 +#define BTRFS_FT_SYMLINK 7 +#define BTRFS_FT_XATTR 8 +#define BTRFS_FT_MAX 9 + +/* + * the key defines the order in the tree, and so it also defines (optimal) + * block layout. objectid corresonds to the inode number. The flags + * tells us things about the object, and is a kind of stream selector. + * so for a given inode, keys with flags of 1 might refer to the inode + * data, flags of 2 may point to file data in the btree and flags == 3 + * may point to extents. + * + * offset is the starting byte offset for this key in the stream. + * + * btrfs_disk_key is in disk byte order. struct btrfs_key is always + * in cpu native order. Otherwise they are identical and their sizes + * should be the same (ie both packed) + */ +struct btrfs_disk_key { + __le64 objectid; + u8 type; + __le64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_key { + u64 objectid; + u8 type; + u64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_mapping_tree { + struct extent_map_tree map_tree; +}; + +#define BTRFS_UUID_SIZE 16 +struct btrfs_dev_item { + /* the internal btrfs device id */ + __le64 devid; + + /* size of the device */ + __le64 total_bytes; + + /* bytes used */ + __le64 bytes_used; + + /* optimal io alignment for this device */ + __le32 io_align; + + /* optimal io width for this device */ + __le32 io_width; + + /* minimal io size for this device */ + __le32 sector_size; + + /* type and info about this device */ + __le64 type; + + /* expected generation for this device */ + __le64 generation; + + /* + * starting byte of this partition on the device, + * to allowr for stripe alignment in the future + */ + __le64 start_offset; + + /* grouping information for allocation decisions */ + __le32 dev_group; + + /* seek speed 0-100 where 100 is fastest */ + u8 seek_speed; + + /* bandwidth 0-100 where 100 is fastest */ + u8 bandwidth; + + /* btrfs generated uuid for this device */ + u8 uuid[BTRFS_UUID_SIZE]; + + /* uuid of FS who owns this device */ + u8 fsid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_stripe { + __le64 devid; + __le64 offset; + u8 dev_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_chunk { + /* size of this chunk in bytes */ + __le64 length; + + /* objectid of the root referencing this chunk */ + __le64 owner; + + __le64 stripe_len; + __le64 type; + + /* optimal io alignment for this chunk */ + __le32 io_align; + + /* optimal io width for this chunk */ + __le32 io_width; + + /* minimal io size for this chunk */ + __le32 sector_size; + + /* 2^16 stripes is quite a lot, a second limit is the size of a single + * item in the btree + */ + __le16 num_stripes; + + /* sub stripes only matter for raid10 */ + __le16 sub_stripes; + struct btrfs_stripe stripe; + /* additional stripes go here */ +} __attribute__ ((__packed__)); + +static inline unsigned long btrfs_chunk_item_size(int num_stripes) +{ + BUG_ON(num_stripes == 0); + return sizeof(struct btrfs_chunk) + + sizeof(struct btrfs_stripe) * (num_stripes - 1); +} + +#define BTRFS_FSID_SIZE 16 +#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0) + +/* + * every tree block (leaf or node) starts with this header. + */ +struct btrfs_header { + /* these first four must match the super block */ + u8 csum[BTRFS_CSUM_SIZE]; + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + __le64 bytenr; /* which block this node is supposed to live in */ + __le64 flags; + + /* allowed to be different from the super from here on down */ + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + __le64 generation; + __le64 owner; + __le32 nritems; + u8 level; +} __attribute__ ((__packed__)); + +#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \ + sizeof(struct btrfs_header)) / \ + sizeof(struct btrfs_key_ptr)) +#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) +#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize)) +#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) - \ + sizeof(struct btrfs_file_extent_item)) + +#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) + +/* + * this is a very generous portion of the super block, giving us + * room to translate 14 chunks with 3 stripes each. + */ +#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 +#define BTRFS_LABEL_SIZE 256 + +/* + * the super block basically lists the main trees of the FS + * it currently lacks any block count etc etc + */ +struct btrfs_super_block { + u8 csum[BTRFS_CSUM_SIZE]; + /* the first 4 fields must match struct btrfs_header */ + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + __le64 bytenr; /* this block number */ + __le64 flags; + + /* allowed to be different from the btrfs_header from here own down */ + __le64 magic; + __le64 generation; + __le64 root; + __le64 chunk_root; + __le64 log_root; + + /* this will help find the new super based on the log root */ + __le64 log_root_transid; + __le64 total_bytes; + __le64 bytes_used; + __le64 root_dir_objectid; + __le64 num_devices; + __le32 sectorsize; + __le32 nodesize; + __le32 leafsize; + __le32 stripesize; + __le32 sys_chunk_array_size; + __le64 chunk_root_generation; + __le64 compat_flags; + __le64 compat_ro_flags; + __le64 incompat_flags; + __le16 csum_type; + u8 root_level; + u8 chunk_root_level; + u8 log_root_level; + struct btrfs_dev_item dev_item; + + char label[BTRFS_LABEL_SIZE]; + + /* future expansion */ + __le64 reserved[32]; + u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; +} __attribute__ ((__packed__)); + +/* + * Compat flags that we support. If any incompat flags are set other than the + * ones specified below then we will fail to mount + */ +#define BTRFS_FEATURE_COMPAT_SUPP 0x0 +#define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0 +#define BTRFS_FEATURE_INCOMPAT_SUPP 0x0 + +/* + * A leaf is full of items. offset and size tell us where to find + * the item in the leaf (relative to the start of the data area) + */ +struct btrfs_item { + struct btrfs_disk_key key; + __le32 offset; + __le32 size; +} __attribute__ ((__packed__)); + +/* + * leaves have an item area and a data area: + * [item0, item1....itemN] [free space] [dataN...data1, data0] + * + * The data is separate from the items to get the keys closer together + * during searches. + */ +struct btrfs_leaf { + struct btrfs_header header; + struct btrfs_item items[]; +} __attribute__ ((__packed__)); + +/* + * all non-leaf blocks are nodes, they hold only keys and pointers to + * other blocks + */ +struct btrfs_key_ptr { + struct btrfs_disk_key key; + __le64 blockptr; + __le64 generation; +} __attribute__ ((__packed__)); + +struct btrfs_node { + struct btrfs_header header; + struct btrfs_key_ptr ptrs[]; +} __attribute__ ((__packed__)); + +/* + * btrfs_paths remember the path taken from the root down to the leaf. + * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point + * to any other levels that are present. + * + * The slots array records the index of the item or block pointer + * used while walking the tree. + */ +struct btrfs_path { + struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; + int slots[BTRFS_MAX_LEVEL]; + /* if there is real range locking, this locks field will change */ + int locks[BTRFS_MAX_LEVEL]; + int reada; + /* keep some upper locks as we walk down */ + int keep_locks; + int skip_locking; + int lowest_level; + + /* + * set by btrfs_split_item, tells search_slot to keep all locks + * and to force calls to keep space in the nodes + */ + int search_for_split; +}; + +/* + * items in the extent btree are used to record the objectid of the + * owner of the block and the number of references + */ +struct btrfs_extent_item { + __le32 refs; +} __attribute__ ((__packed__)); + +struct btrfs_extent_ref { + __le64 root; + __le64 generation; + __le64 objectid; + __le32 num_refs; +} __attribute__ ((__packed__)); + +/* dev extents record free space on individual devices. The owner + * field points back to the chunk allocation mapping tree that allocated + * the extent. The chunk tree uuid field is a way to double check the owner + */ +struct btrfs_dev_extent { + __le64 chunk_tree; + __le64 chunk_objectid; + __le64 chunk_offset; + __le64 length; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_inode_ref { + __le64 index; + __le16 name_len; + /* name goes here */ +} __attribute__ ((__packed__)); + +struct btrfs_timespec { + __le64 sec; + __le32 nsec; +} __attribute__ ((__packed__)); + +typedef enum { + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LAST = 2, +} btrfs_compression_type; + +/* we don't understand any encryption methods right now */ +typedef enum { + BTRFS_ENCRYPTION_NONE = 0, + BTRFS_ENCRYPTION_LAST = 1, +} btrfs_encryption_type; + +struct btrfs_inode_item { + /* nfs style generation number */ + __le64 generation; + /* transid that last touched this inode */ + __le64 transid; + __le64 size; + __le64 nbytes; + __le64 block_group; + __le32 nlink; + __le32 uid; + __le32 gid; + __le32 mode; + __le64 rdev; + __le64 flags; + + /* modification sequence number for NFS */ + __le64 sequence; + + /* + * a little future expansion, for more than this we can + * just grow the inode item and version it + */ + __le64 reserved[4]; + struct btrfs_timespec atime; + struct btrfs_timespec ctime; + struct btrfs_timespec mtime; + struct btrfs_timespec otime; +} __attribute__ ((__packed__)); + +struct btrfs_dir_log_item { + __le64 end; +} __attribute__ ((__packed__)); + +struct btrfs_dir_item { + struct btrfs_disk_key location; + __le64 transid; + __le16 data_len; + __le16 name_len; + u8 type; +} __attribute__ ((__packed__)); + +struct btrfs_root_item { + struct btrfs_inode_item inode; + __le64 generation; + __le64 root_dirid; + __le64 bytenr; + __le64 byte_limit; + __le64 bytes_used; + __le64 last_snapshot; + __le64 flags; + __le32 refs; + struct btrfs_disk_key drop_progress; + u8 drop_level; + u8 level; +} __attribute__ ((__packed__)); + +/* + * this is used for both forward and backward root refs + */ +struct btrfs_root_ref { + __le64 dirid; + __le64 sequence; + __le16 name_len; +} __attribute__ ((__packed__)); + +#define BTRFS_FILE_EXTENT_INLINE 0 +#define BTRFS_FILE_EXTENT_REG 1 +#define BTRFS_FILE_EXTENT_PREALLOC 2 + +struct btrfs_file_extent_item { + /* + * transaction id that created this extent + */ + __le64 generation; + /* + * max number of bytes to hold this extent in ram + * when we split a compressed extent we can't know how big + * each of the resulting pieces will be. So, this is + * an upper limit on the size of the extent in ram instead of + * an exact limit. + */ + __le64 ram_bytes; + + /* + * 32 bits for the various ways we might encode the data, + * including compression and encryption. If any of these + * are set to something a given disk format doesn't understand + * it is treated like an incompat flag for reading and writing, + * but not for stat. + */ + u8 compression; + u8 encryption; + __le16 other_encoding; /* spare for later use */ + + /* are we inline data or a real extent? */ + u8 type; + + /* + * disk space consumed by the extent, checksum blocks are included + * in these numbers + */ + __le64 disk_bytenr; + __le64 disk_num_bytes; + /* + * the logical offset in file blocks (no csums) + * this extent record is for. This allows a file extent to point + * into the middle of an existing extent on disk, sharing it + * between two snapshots (useful if some bytes in the middle of the + * extent have changed + */ + __le64 offset; + /* + * the logical number of file blocks (no csums included). This + * always reflects the size uncompressed and without encoding. + */ + __le64 num_bytes; + +} __attribute__ ((__packed__)); + +struct btrfs_csum_item { + u8 csum; +} __attribute__ ((__packed__)); + +/* different types of block groups (and chunks) */ +#define BTRFS_BLOCK_GROUP_DATA (1 << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) +#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) +#define BTRFS_BLOCK_GROUP_DUP (1 << 5) +#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) + +struct btrfs_block_group_item { + __le64 used; + __le64 chunk_objectid; + __le64 flags; +} __attribute__ ((__packed__)); + +struct btrfs_space_info { + u64 flags; + u64 total_bytes; + u64 bytes_used; + u64 bytes_pinned; + u64 bytes_reserved; + u64 bytes_readonly; + int full; + int force_alloc; + struct list_head list; + + /* for block groups in our same type */ + struct list_head block_groups; + spinlock_t lock; + struct rw_semaphore groups_sem; +}; + +struct btrfs_free_space { + struct rb_node bytes_index; + struct rb_node offset_index; + u64 offset; + u64 bytes; +}; + +struct btrfs_block_group_cache { + struct btrfs_key key; + struct btrfs_block_group_item item; + spinlock_t lock; + struct mutex alloc_mutex; + struct mutex cache_mutex; + u64 pinned; + u64 reserved; + u64 flags; + int cached; + int ro; + int dirty; + + struct btrfs_space_info *space_info; + + /* free space cache stuff */ + struct rb_root free_space_bytes; + struct rb_root free_space_offset; + + /* block group cache stuff */ + struct rb_node cache_node; + + /* for block groups in the same raid type */ + struct list_head list; + + /* usage count */ + atomic_t count; +}; + +struct btrfs_leaf_ref_tree { + struct rb_root root; + struct list_head list; + spinlock_t lock; +}; + +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_fs_info { + u8 fsid[BTRFS_FSID_SIZE]; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + struct btrfs_root *extent_root; + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; + struct btrfs_root *fs_root; + struct btrfs_root *csum_root; + + /* the log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; + struct radix_tree_root fs_roots_radix; + + /* block group cache stuff */ + spinlock_t block_group_cache_lock; + struct rb_root block_group_cache_tree; + + struct extent_io_tree pinned_extents; + struct extent_io_tree pending_del; + struct extent_io_tree extent_ins; + + /* logical->physical extent mapping */ + struct btrfs_mapping_tree mapping_tree; + + u64 generation; + u64 last_trans_committed; + u64 last_trans_new_blockgroup; + u64 open_ioctl_trans; + unsigned long mount_opt; + u64 max_extent; + u64 max_inline; + u64 alloc_start; + struct btrfs_transaction *running_transaction; + wait_queue_head_t transaction_throttle; + wait_queue_head_t transaction_wait; + + wait_queue_head_t async_submit_wait; + wait_queue_head_t tree_log_wait; + + struct btrfs_super_block super_copy; + struct btrfs_super_block super_for_commit; + struct block_device *__bdev; + struct super_block *sb; + struct inode *btree_inode; + struct backing_dev_info bdi; + spinlock_t hash_lock; + struct mutex trans_mutex; + struct mutex tree_log_mutex; + struct mutex transaction_kthread_mutex; + struct mutex cleaner_mutex; + struct mutex extent_ins_mutex; + struct mutex pinned_mutex; + struct mutex chunk_mutex; + struct mutex drop_mutex; + struct mutex volume_mutex; + struct mutex tree_reloc_mutex; + struct list_head trans_list; + struct list_head hashers; + struct list_head dead_roots; + + atomic_t nr_async_submits; + atomic_t async_submit_draining; + atomic_t nr_async_bios; + atomic_t async_delalloc_pages; + atomic_t tree_log_writers; + atomic_t tree_log_commit; + unsigned long tree_log_batch; + u64 tree_log_transid; + + /* + * this is used by the balancing code to wait for all the pending + * ordered extents + */ + spinlock_t ordered_extent_lock; + struct list_head ordered_extents; + struct list_head delalloc_inodes; + + /* + * there is a pool of worker threads for checksumming during writes + * and a pool for checksumming after reads. This is because readers + * can run with FS locks held, and the writers may be waiting for + * those locks. We don't want ordering in the pending list to cause + * deadlocks, and so the two are serviced separately. + * + * A third pool does submit_bio to avoid deadlocking with the other + * two + */ + struct btrfs_workers workers; + struct btrfs_workers delalloc_workers; + struct btrfs_workers endio_workers; + struct btrfs_workers endio_meta_workers; + struct btrfs_workers endio_meta_write_workers; + struct btrfs_workers endio_write_workers; + struct btrfs_workers submit_workers; + /* + * fixup workers take dirty pages that didn't properly go through + * the cow mechanism and make them safe to write. It happens + * for the sys_munmap function call path + */ + struct btrfs_workers fixup_workers; + struct task_struct *transaction_kthread; + struct task_struct *cleaner_kthread; + int thread_pool_size; + + /* tree relocation relocated fields */ + struct list_head dead_reloc_roots; + struct btrfs_leaf_ref_tree reloc_ref_tree; + struct btrfs_leaf_ref_tree shared_ref_tree; + + struct kobject super_kobj; + struct completion kobj_unregister; + int do_barriers; + int closing; + int log_root_recovering; + atomic_t throttles; + atomic_t throttle_gen; + + u64 total_pinned; + struct list_head dirty_cowonly_roots; + + struct btrfs_fs_devices *fs_devices; + struct list_head space_info; + spinlock_t delalloc_lock; + spinlock_t new_trans_lock; + u64 delalloc_bytes; + u64 last_alloc; + u64 last_data_alloc; + + spinlock_t ref_cache_lock; + u64 total_ref_cache_size; + + u64 avail_data_alloc_bits; + u64 avail_metadata_alloc_bits; + u64 avail_system_alloc_bits; + u64 data_alloc_profile; + u64 metadata_alloc_profile; + u64 system_alloc_profile; + + void *bdev_holder; +}; + +/* + * in ram representation of the tree. extent_root is used for all allocations + * and for the extent tree extent_root root. + */ +struct btrfs_dirty_root; +struct btrfs_root { + struct extent_buffer *node; + + /* the node lock is held while changing the node pointer */ + spinlock_t node_lock; + + struct extent_buffer *commit_root; + struct btrfs_leaf_ref_tree *ref_tree; + struct btrfs_leaf_ref_tree ref_tree_struct; + struct btrfs_dirty_root *dirty_root; + struct btrfs_root *log_root; + struct btrfs_root *reloc_root; + + struct btrfs_root_item root_item; + struct btrfs_key root_key; + struct btrfs_fs_info *fs_info; + struct extent_io_tree dirty_log_pages; + + struct kobject root_kobj; + struct completion kobj_unregister; + struct mutex objectid_mutex; + struct mutex log_mutex; + + u64 objectid; + u64 last_trans; + + /* data allocations are done in sectorsize units */ + u32 sectorsize; + + /* node allocations are done in nodesize units */ + u32 nodesize; + + /* leaf allocations are done in leafsize units */ + u32 leafsize; + + u32 stripesize; + + u32 type; + u64 highest_inode; + u64 last_inode_alloc; + int ref_cows; + int track_dirty; + u64 defrag_trans_start; + struct btrfs_key defrag_progress; + struct btrfs_key defrag_max; + int defrag_running; + int defrag_level; + char *name; + int in_sysfs; + + /* the dirty list is only used by non-reference counted roots */ + struct list_head dirty_list; + + spinlock_t list_lock; + struct list_head dead_list; + struct list_head orphan_list; + + /* + * right now this just gets used so that a root has its own devid + * for stat. It may be used for more later + */ + struct super_block anon_super; +}; + +/* + + * inode items have the data typically returned from stat and store other + * info about object characteristics. There is one for every file and dir in + * the FS + */ +#define BTRFS_INODE_ITEM_KEY 1 +#define BTRFS_INODE_REF_KEY 12 +#define BTRFS_XATTR_ITEM_KEY 24 +#define BTRFS_ORPHAN_ITEM_KEY 48 +/* reserve 2-15 close to the inode for later flexibility */ + +/* + * dir items are the name -> inode pointers in a directory. There is one + * for every name in a directory. + */ +#define BTRFS_DIR_LOG_ITEM_KEY 60 +#define BTRFS_DIR_LOG_INDEX_KEY 72 +#define BTRFS_DIR_ITEM_KEY 84 +#define BTRFS_DIR_INDEX_KEY 96 +/* + * extent data is for file data + */ +#define BTRFS_EXTENT_DATA_KEY 108 + +/* + * extent csums are stored in a separate tree and hold csums for + * an entire extent on disk. + */ +#define BTRFS_EXTENT_CSUM_KEY 128 + +/* + * root items point to tree roots. There are typically in the root + * tree used by the super block to find all the other trees + */ +#define BTRFS_ROOT_ITEM_KEY 132 + +/* + * root backrefs tie subvols and snapshots to the directory entries that + * reference them + */ +#define BTRFS_ROOT_BACKREF_KEY 144 + +/* + * root refs make a fast index for listing all of the snapshots and + * subvolumes referenced by a given root. They point directly to the + * directory item in the root that references the subvol + */ +#define BTRFS_ROOT_REF_KEY 156 + +/* + * extent items are in the extent map tree. These record which blocks + * are used, and how many references there are to each block + */ +#define BTRFS_EXTENT_ITEM_KEY 168 +#define BTRFS_EXTENT_REF_KEY 180 + +/* + * block groups give us hints into the extent allocation trees. Which + * blocks are free etc etc + */ +#define BTRFS_BLOCK_GROUP_ITEM_KEY 192 + +#define BTRFS_DEV_EXTENT_KEY 204 +#define BTRFS_DEV_ITEM_KEY 216 +#define BTRFS_CHUNK_ITEM_KEY 228 + +/* + * string items are for debugging. They just store a short string of + * data in the FS + */ +#define BTRFS_STRING_ITEM_KEY 253 + +#define BTRFS_MOUNT_NODATASUM (1 << 0) +#define BTRFS_MOUNT_NODATACOW (1 << 1) +#define BTRFS_MOUNT_NOBARRIER (1 << 2) +#define BTRFS_MOUNT_SSD (1 << 3) +#define BTRFS_MOUNT_DEGRADED (1 << 4) +#define BTRFS_MOUNT_COMPRESS (1 << 5) + +#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) +#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ + BTRFS_MOUNT_##opt) +/* + * Inode flags + */ +#define BTRFS_INODE_NODATASUM (1 << 0) +#define BTRFS_INODE_NODATACOW (1 << 1) +#define BTRFS_INODE_READONLY (1 << 2) +#define BTRFS_INODE_NOCOMPRESS (1 << 3) +#define BTRFS_INODE_PREALLOC (1 << 4) +#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \ + ~BTRFS_INODE_##flag) +#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \ + BTRFS_INODE_##flag) +#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \ + BTRFS_INODE_##flag) +/* some macros to generate set/get funcs for the struct fields. This + * assumes there is a lefoo_to_cpu for every type, so lets make a simple + * one for u8: + */ +#define le8_to_cpu(v) (v) +#define cpu_to_le8(v) (v) +#define __le8 u8 + +#define read_eb_member(eb, ptr, type, member, result) ( \ + read_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define write_eb_member(eb, ptr, type, member, result) ( \ + write_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#ifndef BTRFS_SETGET_FUNCS +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); +#endif + +#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(struct extent_buffer *eb) \ +{ \ + type *p = kmap_atomic(eb->first_page, KM_USER0); \ + u##bits res = le##bits##_to_cpu(p->member); \ + kunmap_atomic(p, KM_USER0); \ + return res; \ +} \ +static inline void btrfs_set_##name(struct extent_buffer *eb, \ + u##bits val) \ +{ \ + type *p = kmap_atomic(eb->first_page, KM_USER0); \ + p->member = cpu_to_le##bits(val); \ + kunmap_atomic(p, KM_USER0); \ +} + +#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(type *s) \ +{ \ + return le##bits##_to_cpu(s->member); \ +} \ +static inline void btrfs_set_##name(type *s, u##bits val) \ +{ \ + s->member = cpu_to_le##bits(val); \ +} + +BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64); +BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); +BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); +BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); +BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, + start_offset, 64); +BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); +BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); +BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); +BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, generation, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, + dev_group, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, + seek_speed, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, + bandwidth, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, + generation, 64); + +static inline char *btrfs_device_uuid(struct btrfs_dev_item *d) +{ + return (char *)d + offsetof(struct btrfs_dev_item, uuid); +} + +static inline char *btrfs_device_fsid(struct btrfs_dev_item *d) +{ + return (char *)d + offsetof(struct btrfs_dev_item, fsid); +} + +BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); +BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); +BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); +BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); +BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); +BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); + +static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) +{ + return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); +} + +BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, + stripe_len, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, + num_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, + sub_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); + +static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, + int nr) +{ + unsigned long offset = (unsigned long)c; + offset += offsetof(struct btrfs_chunk, stripe); + offset += nr * sizeof(struct btrfs_stripe); + return (struct btrfs_stripe *)offset; +} + +static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); +} + +static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); +} + +static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val); +} + +static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); +} + +static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val); +} + +/* struct btrfs_block_group_item */ +BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); + +BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(disk_block_group_flags, + struct btrfs_block_group_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(block_group_flags, + struct btrfs_block_group_item, flags, 64); + +/* struct btrfs_inode_ref */ +BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); + +/* struct btrfs_inode_item */ +BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); +BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); +BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); +BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); +BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); + +static inline struct btrfs_timespec * +btrfs_inode_atime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, atime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_mtime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, mtime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_ctime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, ctime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_otime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, otime); + return (struct btrfs_timespec *)ptr; +} + +BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); + +/* struct btrfs_dev_extent */ +BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent, + chunk_tree, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); + +static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) +{ + unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid); + return (u8 *)((unsigned long)dev + ptr); +} + +/* struct btrfs_extent_ref */ +BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64); +BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64); +BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64); +BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32); + +BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64); +BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref, + objectid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref, + num_refs, 32); + +/* struct btrfs_extent_item */ +BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item, + refs, 32); + +/* struct btrfs_node */ +BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); + +static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_blockptr(struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline unsigned long btrfs_node_key_ptr_offset(int nr) +{ + return offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; +} + +void btrfs_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr); + +static inline void btrfs_set_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr; + ptr = btrfs_node_key_ptr_offset(nr); + write_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} + +/* struct btrfs_item */ +BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); + +static inline unsigned long btrfs_item_nr_offset(int nr) +{ + return offsetof(struct btrfs_leaf, items) + + sizeof(struct btrfs_item) * nr; +} + +static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb, + int nr) +{ + return (struct btrfs_item *)btrfs_item_nr_offset(nr); +} + +static inline u32 btrfs_item_end(struct extent_buffer *eb, + struct btrfs_item *item) +{ + return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); +} + +static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_end(eb, btrfs_item_nr(eb, nr)); +} + +static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_offset(eb, btrfs_item_nr(eb, nr)); +} + +static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_size(eb, btrfs_item_nr(eb, nr)); +} + +static inline void btrfs_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + read_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +static inline void btrfs_set_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + write_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); + +/* + * struct btrfs_root_ref + */ +BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); + +/* struct btrfs_dir_item */ +BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); + +static inline void btrfs_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +/* struct btrfs_disk_key */ +BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, + objectid, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, + struct btrfs_disk_key *disk) +{ + cpu->offset = le64_to_cpu(disk->offset); + cpu->type = disk->type; + cpu->objectid = le64_to_cpu(disk->objectid); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, + struct btrfs_key *cpu) +{ + disk->offset = cpu_to_le64(cpu->offset); + disk->type = cpu->type; + disk->objectid = cpu_to_le64(cpu->objectid); +} + +static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + btrfs_node_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + btrfs_item_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_key *key) +{ + struct btrfs_disk_key disk_key; + btrfs_dir_item_key(eb, item, &disk_key); + btrfs_disk_key_to_cpu(key, &disk_key); +} + + +static inline u8 btrfs_key_type(struct btrfs_key *key) +{ + return key->type; +} + +static inline void btrfs_set_key_type(struct btrfs_key *key, u8 val) +{ + key->type = val; +} + +/* struct btrfs_header */ +BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); +BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); +BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); + +static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag) +{ + return (btrfs_header_flags(eb) & flag) == flag; +} + +static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + btrfs_set_header_flags(eb, flags | flag); + return (flags & flag) == flag; +} + +static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + btrfs_set_header_flags(eb, flags & ~flag); + return (flags & flag) == flag; +} + +static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, fsid); + return (u8 *)ptr; +} + +static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid); + return (u8 *)ptr; +} + +static inline u8 *btrfs_super_fsid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_super_block, fsid); + return (u8 *)ptr; +} + +static inline u8 *btrfs_header_csum(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, csum); + return (u8 *)ptr; +} + +static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb) +{ + return NULL; +} + +static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb) +{ + return NULL; +} + +static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb) +{ + return NULL; +} + +static inline int btrfs_is_leaf(struct extent_buffer *eb) +{ + return btrfs_header_level(eb) == 0; +} + +/* struct btrfs_root_item */ +BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, + generation, 64); +BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); + +BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); +BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); +BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); +BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, + last_snapshot, 64); + +/* struct btrfs_super_block */ + +BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); +BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, + struct btrfs_super_block, sys_chunk_array_size, 32); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, + struct btrfs_super_block, chunk_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, + root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, + chunk_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, + log_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block, + log_root_transid, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, + log_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, + sectorsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, + nodesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block, + leafsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, + stripesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, + root_dir_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, + num_devices, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, + incompat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, + csum_type, 16); + +static inline int btrfs_super_csum_size(struct btrfs_super_block *s) +{ + int t = btrfs_super_csum_type(s); + BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes)); + return btrfs_csum_sizes[t]; +} + +static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) +{ + return offsetof(struct btrfs_leaf, items); +} + +/* struct btrfs_file_extent_item */ +BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); + +static inline unsigned long +btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) +{ + unsigned long offset = (unsigned long)e; + offset += offsetof(struct btrfs_file_extent_item, disk_bytenr); + return offset; +} + +static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) +{ + return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize; +} + +BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, + disk_bytenr, 64); +BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, + disk_num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, + offset, 64); +BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, + num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, + ram_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, + compression, 8); +BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, + encryption, 8); +BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, + other_encoding, 16); + +/* this returns the number of file bytes represented by the inline item. + * If an item is compressed, this is the uncompressed size + */ +static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, + struct btrfs_file_extent_item *e) +{ + return btrfs_file_extent_ram_bytes(eb, e); +} + +/* + * this returns the number of bytes used by the item on disk, minus the + * size of any extent headers. If a file is compressed on disk, this is + * the compressed size + */ +static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, + struct btrfs_item *e) +{ + unsigned long offset; + offset = offsetof(struct btrfs_file_extent_item, disk_bytenr); + return btrfs_item_size(eb, e) - offset; +} + +static inline struct btrfs_root *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline int btrfs_set_root_name(struct btrfs_root *root, + const char *name, int len) +{ + /* if we already have a name just free it */ + kfree(root->name); + + root->name = kmalloc(len+1, GFP_KERNEL); + if (!root->name) + return -ENOMEM; + + memcpy(root->name, name, len); + root->name[len] = '\0'; + + return 0; +} + +static inline u32 btrfs_level_size(struct btrfs_root *root, int level) +{ + if (level == 0) + return root->leafsize; + return root->nodesize; +} + +/* helper function to cast into the data area of the leaf. */ +#define btrfs_item_ptr(leaf, slot, type) \ + ((type *)(btrfs_leaf_data(leaf) + \ + btrfs_item_offset_nr(leaf, slot))) + +#define btrfs_item_ptr_offset(leaf, slot) \ + ((unsigned long)(btrfs_leaf_data(leaf) + \ + btrfs_item_offset_nr(leaf, slot))) + +static inline struct dentry *fdentry(struct file *file) +{ + return file->f_path.dentry; +} + +/* extent-tree.c */ +int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u32 *refs); +int btrfs_update_pinned_extents(struct btrfs_root *root, + u64 bytenr, u64 num, int pin); +int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 bytenr); +int btrfs_extent_post_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, + u64 bytenr); +u64 btrfs_find_block_group(struct btrfs_root *root, + u64 search_start, u64 search_hint, int owner); +struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u32 blocksize, u64 parent, + u64 root_objectid, + u64 ref_generation, + int level, + u64 hint, + u64 empty_size); +struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize); +int btrfs_alloc_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 parent, u64 min_bytes, + u64 root_objectid, u64 ref_generation, + u64 owner, u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, u64 data); +int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins); +int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins); +int btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data); +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *orig_buf, struct extent_buffer *buf, + u32 *nr_extents); +int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, u32 nr_extents); +int btrfs_update_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *orig_buf, + struct extent_buffer *buf, int start_slot, int nr); +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin); +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_io_tree *unpin); +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid); +int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 orig_parent, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid); +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); +int btrfs_free_block_groups(struct btrfs_fs_info *info); +int btrfs_read_block_groups(struct btrfs_root *root); +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, + u64 size); +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 group_start); +int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); +int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_drop_dead_reloc_roots(struct btrfs_root *root); +int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, u64 orig_start); +int btrfs_add_dead_reloc_root(struct btrfs_root *root); +int btrfs_cleanup_reloc_trees(struct btrfs_root *root); +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); +u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); +/* ctree.c */ +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type); +int btrfs_merge_path(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *node_keys, + u64 *nodes, int lowest_level); +int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key); +struct extent_buffer *btrfs_root_node(struct btrfs_root *root); +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, int lowest_level, + int cache_only, u64 min_trans); +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_key *max_key, + struct btrfs_path *path, int cache_only, + u64 min_trans); +int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, u64 prealloc_dest); +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid); +int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, u32 data_size); +int btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end); +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset); +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow); +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, int cache_only, u64 *last_ret, + struct btrfs_key *progress); +void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); +struct btrfs_path *btrfs_alloc_path(void); +void btrfs_free_path(struct btrfs_path *p); +void btrfs_init_path(struct btrfs_path *p); +int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int slot, int nr); +int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 bytenr); +static inline int btrfs_del_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + return btrfs_del_items(trans, root, path, path->slots[0], 1); +} + +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, void *data, u32 data_size); +int btrfs_insert_some_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr); +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, int nr); + +static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u32 data_size) +{ + return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); +} + +int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root + *root); +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent); +/* root-item.c */ +int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, + u64 root_id, u64 ref_id); +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u8 type, u64 ref_id, + u64 dirid, u64 sequence, + const char *name, int name_len); +int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key); +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item); +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item); +int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct + btrfs_root_item *item, struct btrfs_key *key); +int btrfs_search_root(struct btrfs_root *root, u64 search_start, + u64 *found_objectid); +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, + struct btrfs_root *latest_root); +/* dir-item.c */ +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + int name_len, u64 dir, + struct btrfs_key *location, u8 type, u64 index); +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, int name_len, + int mod); +struct btrfs_dir_item * +btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod); +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len); +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di); +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + u16 name_len, const void *data, u16 data_len, + u64 dir); +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod); + +/* orphan.c */ +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); + +/* inode-map.c */ +int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, + struct btrfs_root *fs_root, + u64 dirid, u64 *objectid); +int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid); + +/* inode-item.c */ +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index); +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index); +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod); + +/* file-item.c */ +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len); +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst); +int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding); +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 bytenr, int mod); +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums); +int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 file_start, int contig); +int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode, + u64 start, unsigned long len); +struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow); +int btrfs_csum_truncate(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u64 isize); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, + u64 end, struct list_head *list); +/* inode.c */ + +/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ +#if defined(ClearPageFsMisc) && !defined(ClearPageChecked) +#define ClearPageChecked ClearPageFsMisc +#define SetPageChecked SetPageFsMisc +#define PageChecked PageFsMisc +#endif + +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); +int btrfs_set_inode_index(struct inode *dir, u64 *index); +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len); +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index); +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 new_size, + u32 min_type); + +int btrfs_start_delalloc_inodes(struct btrfs_root *root); +int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); +int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc); +int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + struct btrfs_root *new_root, struct dentry *dentry, + u64 new_dirid, u64 alloc_hint); +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio, unsigned long bio_flags); + +unsigned long btrfs_force_ra(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + pgoff_t offset, pgoff_t last_index); +int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, + int for_del); +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); +int btrfs_readpage(struct file *file, struct page *page); +void btrfs_delete_inode(struct inode *inode); +void btrfs_put_inode(struct inode *inode); +void btrfs_read_locked_inode(struct inode *inode); +int btrfs_write_inode(struct inode *inode, int wait); +void btrfs_dirty_inode(struct inode *inode); +struct inode *btrfs_alloc_inode(struct super_block *sb); +void btrfs_destroy_inode(struct inode *inode); +int btrfs_init_cachep(void); +void btrfs_destroy_cachep(void); +long btrfs_ioctl_trans_end(struct file *file); +struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, + struct btrfs_root *root, int wait); +struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, + struct btrfs_root *root); +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *is_new); +int btrfs_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to); +struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + size_t page_offset, u64 start, u64 end, + int create); +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode); +int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); +void btrfs_orphan_cleanup(struct btrfs_root *root); +int btrfs_cont_expand(struct inode *inode, loff_t size); + +/* ioctl.c */ +long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + +/* file.c */ +int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); +int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned); +int btrfs_check_file(struct btrfs_root *root, struct inode *inode); +extern struct file_operations btrfs_file_operations; +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, u64 end, u64 inline_limit, u64 *hint_block); +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end); +int btrfs_release_file(struct inode *inode, struct file *file); + +/* tree-defrag.c */ +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int cache_only); + +/* sysfs.c */ +int btrfs_init_sysfs(void); +void btrfs_exit_sysfs(void); +int btrfs_sysfs_add_super(struct btrfs_fs_info *fs); +int btrfs_sysfs_add_root(struct btrfs_root *root); +void btrfs_sysfs_del_root(struct btrfs_root *root); +void btrfs_sysfs_del_super(struct btrfs_fs_info *root); + +/* xattr.c */ +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); + +/* super.c */ +u64 btrfs_parse_size(char *str); +int btrfs_parse_options(struct btrfs_root *root, char *options); +int btrfs_sync_fs(struct super_block *sb, int wait); + +/* acl.c */ +int btrfs_check_acl(struct inode *inode, int mask); +int btrfs_init_acl(struct inode *inode, struct inode *dir); +int btrfs_acl_chmod(struct inode *inode); + +/* free-space-cache.c */ +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes); +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes); +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache + *block_group); +struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes); +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes); +u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); +#endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c new file mode 100644 index 00000000000..926a0b287a7 --- /dev/null +++ b/fs/btrfs/dir-item.c @@ -0,0 +1,386 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "hash.h" +#include "transaction.h" + +/* + * insert a name into a directory, doing overflow properly if there is a hash + * collision. data_size indicates how big the item inserted should be. On + * success a struct btrfs_dir_item pointer is returned, otherwise it is + * an ERR_PTR. + * + * The name is not copied into the dir item, you have to do that yourself. + */ +static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle + *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, + u32 data_size, + const char *name, + int name_len) +{ + int ret; + char *ptr; + struct btrfs_item *item; + struct extent_buffer *leaf; + + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); + if (ret == -EEXIST) { + struct btrfs_dir_item *di; + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) + return ERR_PTR(-EEXIST); + ret = btrfs_extend_item(trans, root, path, data_size); + WARN_ON(ret > 0); + } + if (ret < 0) + return ERR_PTR(ret); + WARN_ON(ret > 0); + leaf = path->nodes[0]; + item = btrfs_item_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr(leaf, path->slots[0], char); + BUG_ON(data_size > btrfs_item_size(leaf, item)); + ptr += btrfs_item_size(leaf, item) - data_size; + return (struct btrfs_dir_item *)ptr; +} + +/* + * xattrs work a lot like directories, this inserts an xattr item + * into the tree + */ +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + u16 name_len, const void *data, u16 data_len, + u64 dir) +{ + int ret = 0; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; + unsigned long name_ptr, data_ptr; + struct btrfs_key key, location; + struct btrfs_disk_key disk_key; + struct extent_buffer *leaf; + u32 data_size; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + if (name_len + data_len + sizeof(struct btrfs_dir_item) > + BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item)) + return -ENOSPC; + + data_size = sizeof(*dir_item) + name_len + data_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + /* + * FIXME: at some point we should handle xattr's that are larger than + * what we can fit in our leaf. We set location to NULL b/c we arent + * pointing at anything else, that will change if we store the xattr + * data in a separate inode. + */ + BUG_ON(IS_ERR(dir_item)); + memset(&location, 0, sizeof(location)); + + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, &location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + btrfs_set_dir_data_len(leaf, dir_item, data_len); + name_ptr = (unsigned long)(dir_item + 1); + data_ptr = (unsigned long)((char *)name_ptr + name_len); + + write_extent_buffer(leaf, name, name_ptr, name_len); + write_extent_buffer(leaf, data, data_ptr, data_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + + btrfs_free_path(path); + return ret; +} + +/* + * insert a directory item in the tree, doing all the magic for + * both indexes. 'dir' indicates which objectid to insert it into, + * 'location' is the key to stuff into the directory item, 'type' is the + * type of the inode we're pointing to, and 'index' is the sequence number + * to use for the second index (if one is created). + */ +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, const char *name, int name_len, u64 dir, + struct btrfs_key *location, u8 type, u64 index) +{ + int ret = 0; + int ret2 = 0; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; + struct extent_buffer *leaf; + unsigned long name_ptr; + struct btrfs_key key; + struct btrfs_disk_key disk_key; + u32 data_size; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + path = btrfs_alloc_path(); + data_size = sizeof(*dir_item) + name_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + if (IS_ERR(dir_item)) { + ret = PTR_ERR(dir_item); + if (ret == -EEXIST) + goto second_insert; + goto out; + } + + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_data_len(leaf, dir_item, 0); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + name_ptr = (unsigned long)(dir_item + 1); + + write_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + +second_insert: + /* FIXME, use some real flag for selecting the extra index */ + if (root == root->fs_info->tree_root) { + ret = 0; + goto out; + } + btrfs_release_path(root, path); + + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = index; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + if (IS_ERR(dir_item)) { + ret2 = PTR_ERR(dir_item); + goto out; + } + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_data_len(leaf, dir_item, 0); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + name_ptr = (unsigned long)(dir_item + 1); + write_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + if (ret) + return ret; + if (ret2) + return ret2; + return 0; +} + +/* + * lookup a directory item based on name. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + */ +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, int name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + + key.offset = btrfs_name_hash(name, name_len); + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + if (path->slots[0] == 0) + return NULL; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != dir || + btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY || + found_key.offset != key.offset) + return NULL; + + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +/* + * lookup a directory item based on index. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + * + * The name is used to make sure the index really points to the name you were + * looking for. + */ +struct btrfs_dir_item * +btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = objectid; + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return ERR_PTR(-ENOENT); + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + if (path->slots[0] == 0) + return NULL; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != dir || + btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY || + found_key.offset != key.offset) + return NULL; + + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +/* + * helper function to look at the directory item pointed to by 'path' + * this walks through all the entries in a dir item and finds one + * for a specific name. + */ +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len) +{ + struct btrfs_dir_item *dir_item; + unsigned long name_ptr; + u32 total_len; + u32 cur = 0; + u32 this_len; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + total_len = btrfs_item_size_nr(leaf, path->slots[0]); + while (cur < total_len) { + this_len = sizeof(*dir_item) + + btrfs_dir_name_len(leaf, dir_item) + + btrfs_dir_data_len(leaf, dir_item); + name_ptr = (unsigned long)(dir_item + 1); + + if (btrfs_dir_name_len(leaf, dir_item) == name_len && + memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) + return dir_item; + + cur += this_len; + dir_item = (struct btrfs_dir_item *)((char *)dir_item + + this_len); + } + return NULL; +} + +/* + * given a pointer into a directory item, delete it. This + * handles items that have more than one entry in them. + */ +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di) +{ + + struct extent_buffer *leaf; + u32 sub_item_len; + u32 item_len; + int ret = 0; + + leaf = path->nodes[0]; + sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di); + item_len = btrfs_item_size_nr(leaf, path->slots[0]); + if (sub_item_len == item_len) { + ret = btrfs_del_item(trans, root, path); + } else { + /* MARKER */ + unsigned long ptr = (unsigned long)di; + unsigned long start; + + start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, + item_len - (ptr + sub_item_len - start)); + ret = btrfs_truncate_item(trans, root, path, + item_len - sub_item_len, 1); + } + return 0; +} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c new file mode 100644 index 00000000000..81a313874ae --- /dev/null +++ b/fs/btrfs/disk-io.c @@ -0,0 +1,2343 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/version.h> +#include <linux/fs.h> +#include <linux/blkdev.h> +#include <linux/scatterlist.h> +#include <linux/swap.h> +#include <linux/radix-tree.h> +#include <linux/writeback.h> +#include <linux/buffer_head.h> +#include <linux/workqueue.h> +#include <linux/kthread.h> +#include <linux/freezer.h> +#include "compat.h" +#include "crc32c.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "print-tree.h" +#include "async-thread.h" +#include "locking.h" +#include "ref-cache.h" +#include "tree-log.h" + +static struct extent_io_ops btree_extent_io_ops; +static void end_workqueue_fn(struct btrfs_work *work); + +/* + * end_io_wq structs are used to do processing in task context when an IO is + * complete. This is used during reads to verify checksums, and it is used + * by writes to insert metadata for new file extents after IO is complete. + */ +struct end_io_wq { + struct bio *bio; + bio_end_io_t *end_io; + void *private; + struct btrfs_fs_info *info; + int error; + int metadata; + struct list_head list; + struct btrfs_work work; +}; + +/* + * async submit bios are used to offload expensive checksumming + * onto the worker threads. They checksum file and metadata bios + * just before they are sent down the IO stack. + */ +struct async_submit_bio { + struct inode *inode; + struct bio *bio; + struct list_head list; + extent_submit_bio_hook_t *submit_bio_start; + extent_submit_bio_hook_t *submit_bio_done; + int rw; + int mirror_num; + unsigned long bio_flags; + struct btrfs_work work; +}; + +/* + * extents on the btree inode are pretty simple, there's one extent + * that covers the entire device + */ +static struct extent_map *btree_get_extent(struct inode *inode, + struct page *page, size_t page_offset, u64 start, u64 len, + int create) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret; + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + em->bdev = + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + spin_unlock(&em_tree->lock); + goto out; + } + spin_unlock(&em_tree->lock); + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } + em->start = 0; + em->len = (u64)-1; + em->block_len = (u64)-1; + em->block_start = 0; + em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + if (ret == -EEXIST) { + u64 failed_start = em->start; + u64 failed_len = em->len; + + free_extent_map(em); + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + ret = 0; + } else { + em = lookup_extent_mapping(em_tree, failed_start, + failed_len); + ret = -EIO; + } + } else if (ret) { + free_extent_map(em); + em = NULL; + } + spin_unlock(&em_tree->lock); + + if (ret) + em = ERR_PTR(ret); +out: + return em; +} + +u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) +{ + return btrfs_crc32c(seed, data, len); +} + +void btrfs_csum_final(u32 crc, char *result) +{ + *(__le32 *)result = ~cpu_to_le32(crc); +} + +/* + * compute the csum for a btree block, and either verify it or write it + * into the csum field of the block. + */ +static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, + int verify) +{ + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + char *result = NULL; + unsigned long len; + unsigned long cur_len; + unsigned long offset = BTRFS_CSUM_SIZE; + char *map_token = NULL; + char *kaddr; + unsigned long map_start; + unsigned long map_len; + int err; + u32 crc = ~(u32)0; + unsigned long inline_result; + + len = buf->len - offset; + while (len > 0) { + err = map_private_extent_buffer(buf, offset, 32, + &map_token, &kaddr, + &map_start, &map_len, KM_USER0); + if (err) + return 1; + cur_len = min(len, map_len - (offset - map_start)); + crc = btrfs_csum_data(root, kaddr + offset - map_start, + crc, cur_len); + len -= cur_len; + offset += cur_len; + unmap_extent_buffer(buf, map_token, KM_USER0); + } + if (csum_size > sizeof(inline_result)) { + result = kzalloc(csum_size * sizeof(char), GFP_NOFS); + if (!result) + return 1; + } else { + result = (char *)&inline_result; + } + + btrfs_csum_final(crc, result); + + if (verify) { + if (memcmp_extent_buffer(buf, result, 0, csum_size)) { + u32 val; + u32 found = 0; + memcpy(&found, result, csum_size); + + read_extent_buffer(buf, &val, 0, csum_size); + printk(KERN_INFO "btrfs: %s checksum verify failed " + "on %llu wanted %X found %X level %d\n", + root->fs_info->sb->s_id, + buf->start, val, found, btrfs_header_level(buf)); + if (result != (char *)&inline_result) + kfree(result); + return 1; + } + } else { + write_extent_buffer(buf, result, 0, csum_size); + } + if (result != (char *)&inline_result) + kfree(result); + return 0; +} + +/* + * we can't consider a given block up to date unless the transid of the + * block matches the transid in the parent node's pointer. This is how we + * detect blocks that either didn't get written at all or got written + * in the wrong place. + */ +static int verify_parent_transid(struct extent_io_tree *io_tree, + struct extent_buffer *eb, u64 parent_transid) +{ + int ret; + + if (!parent_transid || btrfs_header_generation(eb) == parent_transid) + return 0; + + lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); + if (extent_buffer_uptodate(io_tree, eb) && + btrfs_header_generation(eb) == parent_transid) { + ret = 0; + goto out; + } + printk("parent transid verify failed on %llu wanted %llu found %llu\n", + (unsigned long long)eb->start, + (unsigned long long)parent_transid, + (unsigned long long)btrfs_header_generation(eb)); + ret = 1; + clear_extent_buffer_uptodate(io_tree, eb); +out: + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + return ret; +} + +/* + * helper to read a given tree block, doing retries as required when + * the checksums don't match and we have alternate mirrors to try. + */ +static int btree_read_extent_buffer_pages(struct btrfs_root *root, + struct extent_buffer *eb, + u64 start, u64 parent_transid) +{ + struct extent_io_tree *io_tree; + int ret; + int num_copies = 0; + int mirror_num = 0; + + io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; + while (1) { + ret = read_extent_buffer_pages(io_tree, eb, start, 1, + btree_get_extent, mirror_num); + if (!ret && + !verify_parent_transid(io_tree, eb, parent_transid)) + return ret; + + num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, + eb->start, eb->len); + if (num_copies == 1) + return ret; + + mirror_num++; + if (mirror_num > num_copies) + return ret; + } + return -EIO; +} + +/* + * checksum a dirty tree block before IO. This has extra checks to make sure + * we only fill in the checksum field in the first page of a multi-page block + */ + +static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) +{ + struct extent_io_tree *tree; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 found_start; + int found_level; + unsigned long len; + struct extent_buffer *eb; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, + btrfs_header_generation(eb)); + BUG_ON(ret); + found_start = btrfs_header_bytenr(eb); + if (found_start != start) { + WARN_ON(1); + goto err; + } + if (eb->first_page != page) { + WARN_ON(1); + goto err; + } + if (!PageUptodate(page)) { + WARN_ON(1); + goto err; + } + found_level = btrfs_header_level(eb); + + csum_tree_block(root, eb, 0); +err: + free_extent_buffer(eb); +out: + return 0; +} + +static int check_tree_block_fsid(struct btrfs_root *root, + struct extent_buffer *eb) +{ + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + u8 fsid[BTRFS_UUID_SIZE]; + int ret = 1; + + read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb), + BTRFS_FSID_SIZE); + while (fs_devices) { + if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { + ret = 0; + break; + } + fs_devices = fs_devices->seed; + } + return ret; +} + +static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct extent_io_tree *tree; + u64 found_start; + int found_level; + unsigned long len; + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + int ret = 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + + found_start = btrfs_header_bytenr(eb); + if (found_start != start) { + printk(KERN_INFO "btrfs bad tree block start %llu %llu\n", + (unsigned long long)found_start, + (unsigned long long)eb->start); + ret = -EIO; + goto err; + } + if (eb->first_page != page) { + printk(KERN_INFO "btrfs bad first page %lu %lu\n", + eb->first_page->index, page->index); + WARN_ON(1); + ret = -EIO; + goto err; + } + if (check_tree_block_fsid(root, eb)) { + printk(KERN_INFO "btrfs bad fsid on block %llu\n", + (unsigned long long)eb->start); + ret = -EIO; + goto err; + } + found_level = btrfs_header_level(eb); + + ret = csum_tree_block(root, eb, 1); + if (ret) + ret = -EIO; + + end = min_t(u64, eb->len, PAGE_CACHE_SIZE); + end = eb->start + end - 1; +err: + free_extent_buffer(eb); +out: + return ret; +} + +static void end_workqueue_bio(struct bio *bio, int err) +{ + struct end_io_wq *end_io_wq = bio->bi_private; + struct btrfs_fs_info *fs_info; + + fs_info = end_io_wq->info; + end_io_wq->error = err; + end_io_wq->work.func = end_workqueue_fn; + end_io_wq->work.flags = 0; + + if (bio->bi_rw & (1 << BIO_RW)) { + if (end_io_wq->metadata) + btrfs_queue_worker(&fs_info->endio_meta_write_workers, + &end_io_wq->work); + else + btrfs_queue_worker(&fs_info->endio_write_workers, + &end_io_wq->work); + } else { + if (end_io_wq->metadata) + btrfs_queue_worker(&fs_info->endio_meta_workers, + &end_io_wq->work); + else + btrfs_queue_worker(&fs_info->endio_workers, + &end_io_wq->work); + } +} + +int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, + int metadata) +{ + struct end_io_wq *end_io_wq; + end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); + if (!end_io_wq) + return -ENOMEM; + + end_io_wq->private = bio->bi_private; + end_io_wq->end_io = bio->bi_end_io; + end_io_wq->info = info; + end_io_wq->error = 0; + end_io_wq->bio = bio; + end_io_wq->metadata = metadata; + + bio->bi_private = end_io_wq; + bio->bi_end_io = end_workqueue_bio; + return 0; +} + +unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) +{ + unsigned long limit = min_t(unsigned long, + info->workers.max_workers, + info->fs_devices->open_devices); + return 256 * limit; +} + +int btrfs_congested_async(struct btrfs_fs_info *info, int iodone) +{ + return atomic_read(&info->nr_async_bios) > + btrfs_async_submit_limit(info); +} + +static void run_one_async_start(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info; + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + fs_info = BTRFS_I(async->inode)->root->fs_info; + async->submit_bio_start(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags); +} + +static void run_one_async_done(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info; + struct async_submit_bio *async; + int limit; + + async = container_of(work, struct async_submit_bio, work); + fs_info = BTRFS_I(async->inode)->root->fs_info; + + limit = btrfs_async_submit_limit(fs_info); + limit = limit * 2 / 3; + + atomic_dec(&fs_info->nr_async_submits); + + if (atomic_read(&fs_info->nr_async_submits) < limit && + waitqueue_active(&fs_info->async_submit_wait)) + wake_up(&fs_info->async_submit_wait); + + async->submit_bio_done(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags); +} + +static void run_one_async_free(struct btrfs_work *work) +{ + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + kfree(async); +} + +int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, + int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done) +{ + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return -ENOMEM; + + async->inode = inode; + async->rw = rw; + async->bio = bio; + async->mirror_num = mirror_num; + async->submit_bio_start = submit_bio_start; + async->submit_bio_done = submit_bio_done; + + async->work.func = run_one_async_start; + async->work.ordered_func = run_one_async_done; + async->work.ordered_free = run_one_async_free; + + async->work.flags = 0; + async->bio_flags = bio_flags; + + atomic_inc(&fs_info->nr_async_submits); + btrfs_queue_worker(&fs_info->workers, &async->work); +#if 0 + int limit = btrfs_async_submit_limit(fs_info); + if (atomic_read(&fs_info->nr_async_submits) > limit) { + wait_event_timeout(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) < limit), + HZ/10); + + wait_event_timeout(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_bios) < limit), + HZ/10); + } +#endif + while (atomic_read(&fs_info->async_submit_draining) && + atomic_read(&fs_info->nr_async_submits)) { + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) == 0)); + } + + return 0; +} + +static int btree_csum_one_bio(struct bio *bio) +{ + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + struct btrfs_root *root; + + WARN_ON(bio->bi_vcnt <= 0); + while (bio_index < bio->bi_vcnt) { + root = BTRFS_I(bvec->bv_page->mapping->host)->root; + csum_dirty_buffer(root, bvec->bv_page); + bio_index++; + bvec++; + } + return 0; +} + +static int __btree_submit_bio_start(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + /* + * when we're called for a write, we're already in the async + * submission context. Just jump into btrfs_map_bio + */ + btree_csum_one_bio(bio); + return 0; +} + +static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + /* + * when we're called for a write, we're already in the async + * submission context. Just jump into btrfs_map_bio + */ + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); +} + +static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + int ret; + + ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, + bio, 1); + BUG_ON(ret); + + if (!(rw & (1 << BIO_RW))) { + /* + * called for a read, do the setup so that checksum validation + * can happen in the async kernel threads + */ + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); + } + /* + * kthread helpers are used to submit writes so that checksumming + * can happen in parallel across all CPUs + */ + return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, 0, + __btree_submit_bio_start, + __btree_submit_bio_done); +} + +static int btree_writepage(struct page *page, struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + if (current->flags & PF_MEMALLOC) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + return extent_write_full_page(tree, page, btree_get_extent, wbc); +} + +static int btree_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; + if (wbc->sync_mode == WB_SYNC_NONE) { + u64 num_dirty; + u64 start = 0; + unsigned long thresh = 32 * 1024 * 1024; + + if (wbc->for_kupdate) + return 0; + + num_dirty = count_range_bits(tree, &start, (u64)-1, + thresh, EXTENT_DIRTY); + if (num_dirty < thresh) + return 0; + } + return extent_writepages(tree, mapping, btree_get_extent, wbc); +} + +static int btree_readpage(struct file *file, struct page *page) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_read_full_page(tree, page, btree_get_extent); +} + +static int btree_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *map; + int ret; + + if (PageWriteback(page) || PageDirty(page)) + return 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + + ret = try_release_extent_state(map, tree, page, gfp_flags); + if (!ret) + return 0; + + ret = try_release_extent_buffer(tree, page); + if (ret == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } + + return ret; +} + +static void btree_invalidatepage(struct page *page, unsigned long offset) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + extent_invalidatepage(tree, page, offset); + btree_releasepage(page, GFP_NOFS); + if (PagePrivate(page)) { + printk(KERN_WARNING "btrfs warning page private not zero " + "on page %llu\n", (unsigned long long)page_offset(page)); + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } +} + +#if 0 +static int btree_writepage(struct page *page, struct writeback_control *wbc) +{ + struct buffer_head *bh; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct buffer_head *head; + if (!page_has_buffers(page)) { + create_empty_buffers(page, root->fs_info->sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + head = page_buffers(page); + bh = head; + do { + if (buffer_dirty(bh)) + csum_tree_block(root, bh, 0); + bh = bh->b_this_page; + } while (bh != head); + return block_write_full_page(page, btree_get_block, wbc); +} +#endif + +static struct address_space_operations btree_aops = { + .readpage = btree_readpage, + .writepage = btree_writepage, + .writepages = btree_writepages, + .releasepage = btree_releasepage, + .invalidatepage = btree_invalidatepage, + .sync_page = block_sync_page, +}; + +int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, + u64 parent_transid) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + int ret = 0; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return 0; + read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, + buf, 0, 0, btree_get_extent, 0); + free_extent_buffer(buf); + return ret; +} + +struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, + bytenr, blocksize, GFP_NOFS); + return eb; +} + +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + + eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, + bytenr, blocksize, NULL, GFP_NOFS); + return eb; +} + + +int btrfs_write_tree_block(struct extent_buffer *buf) +{ + return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start, + buf->start + buf->len - 1, WB_SYNC_ALL); +} + +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +{ + return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, + buf->start, buf->start + buf->len - 1); +} + +struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, + u32 blocksize, u64 parent_transid) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_io_tree *io_tree; + int ret; + + io_tree = &BTRFS_I(btree_inode)->io_tree; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return NULL; + + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + + if (ret == 0) + buf->flags |= EXTENT_UPTODATE; + else + WARN_ON(1); + return buf; + +} + +int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + if (btrfs_header_generation(buf) == + root->fs_info->running_transaction->transid) { + WARN_ON(!btrfs_tree_locked(buf)); + clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, + buf); + } + return 0; +} + +static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + u32 stripesize, struct btrfs_root *root, + struct btrfs_fs_info *fs_info, + u64 objectid) +{ + root->node = NULL; + root->commit_root = NULL; + root->ref_tree = NULL; + root->sectorsize = sectorsize; + root->nodesize = nodesize; + root->leafsize = leafsize; + root->stripesize = stripesize; + root->ref_cows = 0; + root->track_dirty = 0; + + root->fs_info = fs_info; + root->objectid = objectid; + root->last_trans = 0; + root->highest_inode = 0; + root->last_inode_alloc = 0; + root->name = NULL; + root->in_sysfs = 0; + + INIT_LIST_HEAD(&root->dirty_list); + INIT_LIST_HEAD(&root->orphan_list); + INIT_LIST_HEAD(&root->dead_list); + spin_lock_init(&root->node_lock); + spin_lock_init(&root->list_lock); + mutex_init(&root->objectid_mutex); + mutex_init(&root->log_mutex); + extent_io_tree_init(&root->dirty_log_pages, + fs_info->btree_inode->i_mapping, GFP_NOFS); + + btrfs_leaf_ref_tree_init(&root->ref_tree_struct); + root->ref_tree = &root->ref_tree_struct; + + memset(&root->root_key, 0, sizeof(root->root_key)); + memset(&root->root_item, 0, sizeof(root->root_item)); + memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); + memset(&root->root_kobj, 0, sizeof(root->root_kobj)); + root->defrag_trans_start = fs_info->generation; + init_completion(&root->kobj_unregister); + root->defrag_running = 0; + root->defrag_level = 0; + root->root_key.objectid = objectid; + root->anon_super.s_root = NULL; + root->anon_super.s_dev = 0; + INIT_LIST_HEAD(&root->anon_super.s_list); + INIT_LIST_HEAD(&root->anon_super.s_instances); + init_rwsem(&root->anon_super.s_umount); + + return 0; +} + +static int find_and_setup_root(struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info, + u64 objectid, + struct btrfs_root *root) +{ + int ret; + u32 blocksize; + u64 generation; + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, objectid); + ret = btrfs_find_last_root(tree_root, objectid, + &root->root_item, &root->root_key); + BUG_ON(ret); + + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); + BUG_ON(!root->node); + return 0; +} + +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct extent_buffer *eb; + struct btrfs_root *log_root_tree = fs_info->log_root_tree; + u64 start = 0; + u64 end = 0; + int ret; + + if (!log_root_tree) + return 0; + + while (1) { + ret = find_first_extent_bit(&log_root_tree->dirty_log_pages, + 0, &start, &end, EXTENT_DIRTY); + if (ret) + break; + + clear_extent_dirty(&log_root_tree->dirty_log_pages, + start, end, GFP_NOFS); + } + eb = fs_info->log_root_tree->node; + + WARN_ON(btrfs_header_level(eb) != 0); + WARN_ON(btrfs_header_nritems(eb) != 0); + + ret = btrfs_free_reserved_extent(fs_info->tree_root, + eb->start, eb->len); + BUG_ON(ret); + + free_extent_buffer(eb); + kfree(fs_info->log_root_tree); + fs_info->log_root_tree = NULL; + return 0; +} + +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct btrfs_root *tree_root = fs_info->tree_root; + + root = kzalloc(sizeof(*root), GFP_NOFS); + if (!root) + return -ENOMEM; + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, BTRFS_TREE_LOG_OBJECTID); + + root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + root->ref_cows = 0; + + root->node = btrfs_alloc_free_block(trans, root, root->leafsize, + 0, BTRFS_TREE_LOG_OBJECTID, + trans->transid, 0, 0, 0); + + btrfs_set_header_nritems(root->node, 0); + btrfs_set_header_level(root->node, 0); + btrfs_set_header_bytenr(root->node, root->node->start); + btrfs_set_header_generation(root->node, trans->transid); + btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID); + + write_extent_buffer(root->node, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(root->node), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(root->node); + btrfs_tree_unlock(root->node); + fs_info->log_root_tree = root; + return 0; +} + +struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + struct btrfs_key *location) +{ + struct btrfs_root *root; + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_path *path; + struct extent_buffer *l; + u64 highest_inode; + u64 generation; + u32 blocksize; + int ret = 0; + + root = kzalloc(sizeof(*root), GFP_NOFS); + if (!root) + return ERR_PTR(-ENOMEM); + if (location->offset == (u64)-1) { + ret = find_and_setup_root(tree_root, fs_info, + location->objectid, root); + if (ret) { + kfree(root); + return ERR_PTR(ret); + } + goto insert; + } + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, location->objectid); + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); + if (ret != 0) { + if (ret > 0) + ret = -ENOENT; + goto out; + } + l = path->nodes[0]; + read_extent_buffer(l, &root->root_item, + btrfs_item_ptr_offset(l, path->slots[0]), + sizeof(root->root_item)); + memcpy(&root->root_key, location, sizeof(*location)); + ret = 0; +out: + btrfs_release_path(root, path); + btrfs_free_path(path); + if (ret) { + kfree(root); + return ERR_PTR(ret); + } + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); + BUG_ON(!root->node); +insert: + if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { + root->ref_cows = 1; + ret = btrfs_find_highest_inode(root, &highest_inode); + if (ret == 0) { + root->highest_inode = highest_inode; + root->last_inode_alloc = highest_inode; + } + } + return root; +} + +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid) +{ + struct btrfs_root *root; + + if (root_objectid == BTRFS_ROOT_TREE_OBJECTID) + return fs_info->tree_root; + if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID) + return fs_info->extent_root; + + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)root_objectid); + return root; +} + +struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location) +{ + struct btrfs_root *root; + int ret; + + if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) + return fs_info->tree_root; + if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) + return fs_info->extent_root; + if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) + return fs_info->chunk_root; + if (location->objectid == BTRFS_DEV_TREE_OBJECTID) + return fs_info->dev_root; + if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) + return fs_info->csum_root; + + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)location->objectid); + if (root) + return root; + + root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); + if (IS_ERR(root)) + return root; + + set_anon_super(&root->anon_super, NULL); + + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); + if (ret) { + free_extent_buffer(root->node); + kfree(root); + return ERR_PTR(ret); + } + if (!(fs_info->sb->s_flags & MS_RDONLY)) { + ret = btrfs_find_dead_roots(fs_info->tree_root, + root->root_key.objectid, root); + BUG_ON(ret); + btrfs_orphan_cleanup(root); + } + return root; +} + +struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + const char *name, int namelen) +{ + struct btrfs_root *root; + int ret; + + root = btrfs_read_fs_root_no_name(fs_info, location); + if (!root) + return NULL; + + if (root->in_sysfs) + return root; + + ret = btrfs_set_root_name(root, name, namelen); + if (ret) { + free_extent_buffer(root->node); + kfree(root); + return ERR_PTR(ret); + } +#if 0 + ret = btrfs_sysfs_add_root(root); + if (ret) { + free_extent_buffer(root->node); + kfree(root->name); + kfree(root); + return ERR_PTR(ret); + } +#endif + root->in_sysfs = 1; + return root; +} + +static int btrfs_congested_fn(void *congested_data, int bdi_bits) +{ + struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; + int ret = 0; + struct list_head *cur; + struct btrfs_device *device; + struct backing_dev_info *bdi; +#if 0 + if ((bdi_bits & (1 << BDI_write_congested)) && + btrfs_congested_async(info, 0)) + return 1; +#endif + list_for_each(cur, &info->fs_devices->devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (!device->bdev) + continue; + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi && bdi_congested(bdi, bdi_bits)) { + ret = 1; + break; + } + } + return ret; +} + +/* + * this unplugs every device on the box, and it is only used when page + * is null + */ +static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ + struct list_head *cur; + struct btrfs_device *device; + struct btrfs_fs_info *info; + + info = (struct btrfs_fs_info *)bdi->unplug_io_data; + list_for_each(cur, &info->fs_devices->devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (!device->bdev) + continue; + + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi->unplug_io_fn) + bdi->unplug_io_fn(bdi, page); + } +} + +static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ + struct inode *inode; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct address_space *mapping; + u64 offset; + + /* the generic O_DIRECT read code does this */ + if (1 || !page) { + __unplug_io_fn(bdi, page); + return; + } + + /* + * page->mapping may change at any time. Get a consistent copy + * and use that for everything below + */ + smp_mb(); + mapping = page->mapping; + if (!mapping) + return; + + inode = mapping->host; + + /* + * don't do the expensive searching for a small number of + * devices + */ + if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) { + __unplug_io_fn(bdi, page); + return; + } + + offset = page_offset(page); + + em_tree = &BTRFS_I(inode)->extent_tree; + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + if (!em) { + __unplug_io_fn(bdi, page); + return; + } + + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + __unplug_io_fn(bdi, page); + return; + } + offset = offset - em->start; + btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree, + em->block_start + offset, page); + free_extent_map(em); +} + +static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) +{ + bdi_init(bdi); + bdi->ra_pages = default_backing_dev_info.ra_pages; + bdi->state = 0; + bdi->capabilities = default_backing_dev_info.capabilities; + bdi->unplug_io_fn = btrfs_unplug_io_fn; + bdi->unplug_io_data = info; + bdi->congested_fn = btrfs_congested_fn; + bdi->congested_data = info; + return 0; +} + +static int bio_ready_for_csum(struct bio *bio) +{ + u64 length = 0; + u64 buf_len = 0; + u64 start = 0; + struct page *page; + struct extent_io_tree *io_tree = NULL; + struct btrfs_fs_info *info = NULL; + struct bio_vec *bvec; + int i; + int ret; + + bio_for_each_segment(bvec, bio, i) { + page = bvec->bv_page; + if (page->private == EXTENT_PAGE_PRIVATE) { + length += bvec->bv_len; + continue; + } + if (!page->private) { + length += bvec->bv_len; + continue; + } + length = bvec->bv_len; + buf_len = page->private >> 2; + start = page_offset(page) + bvec->bv_offset; + io_tree = &BTRFS_I(page->mapping->host)->io_tree; + info = BTRFS_I(page->mapping->host)->root->fs_info; + } + /* are we fully contained in this bio? */ + if (buf_len <= length) + return 1; + + ret = extent_range_uptodate(io_tree, start + length, + start + buf_len - 1); + if (ret == 1) + return ret; + return ret; +} + +/* + * called by the kthread helper functions to finally call the bio end_io + * functions. This is where read checksum verification actually happens + */ +static void end_workqueue_fn(struct btrfs_work *work) +{ + struct bio *bio; + struct end_io_wq *end_io_wq; + struct btrfs_fs_info *fs_info; + int error; + + end_io_wq = container_of(work, struct end_io_wq, work); + bio = end_io_wq->bio; + fs_info = end_io_wq->info; + + /* metadata bio reads are special because the whole tree block must + * be checksummed at once. This makes sure the entire block is in + * ram and up to date before trying to verify things. For + * blocksize <= pagesize, it is basically a noop + */ + if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata && + !bio_ready_for_csum(bio)) { + btrfs_queue_worker(&fs_info->endio_meta_workers, + &end_io_wq->work); + return; + } + error = end_io_wq->error; + bio->bi_private = end_io_wq->private; + bio->bi_end_io = end_io_wq->end_io; + kfree(end_io_wq); + bio_endio(bio, error); +} + +static int cleaner_kthread(void *arg) +{ + struct btrfs_root *root = arg; + + do { + smp_mb(); + if (root->fs_info->closing) + break; + + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + if (freezing(current)) { + refrigerator(); + } else { + smp_mb(); + if (root->fs_info->closing) + break; + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +static int transaction_kthread(void *arg) +{ + struct btrfs_root *root = arg; + struct btrfs_trans_handle *trans; + struct btrfs_transaction *cur; + unsigned long now; + unsigned long delay; + int ret; + + do { + smp_mb(); + if (root->fs_info->closing) + break; + + delay = HZ * 30; + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + + if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) { + printk(KERN_INFO "btrfs: total reference cache " + "size %llu\n", + root->fs_info->total_ref_cache_size); + } + + mutex_lock(&root->fs_info->trans_mutex); + cur = root->fs_info->running_transaction; + if (!cur) { + mutex_unlock(&root->fs_info->trans_mutex); + goto sleep; + } + + now = get_seconds(); + if (now < cur->start_time || now - cur->start_time < 30) { + mutex_unlock(&root->fs_info->trans_mutex); + delay = HZ * 5; + goto sleep; + } + mutex_unlock(&root->fs_info->trans_mutex); + trans = btrfs_start_transaction(root, 1); + ret = btrfs_commit_transaction(trans, root); +sleep: + wake_up_process(root->fs_info->cleaner_kthread); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + + if (freezing(current)) { + refrigerator(); + } else { + if (root->fs_info->closing) + break; + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(delay); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options) +{ + u32 sectorsize; + u32 nodesize; + u32 leafsize; + u32 blocksize; + u32 stripesize; + u64 generation; + u64 features; + struct btrfs_key location; + struct buffer_head *bh; + struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), + GFP_NOFS); + struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *log_tree_root; + + int ret; + int err = -EINVAL; + + struct btrfs_super_block *disk_super; + + if (!extent_root || !tree_root || !fs_info || + !chunk_root || !dev_root || !csum_root) { + err = -ENOMEM; + goto fail; + } + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS); + INIT_LIST_HEAD(&fs_info->trans_list); + INIT_LIST_HEAD(&fs_info->dead_roots); + INIT_LIST_HEAD(&fs_info->hashers); + INIT_LIST_HEAD(&fs_info->delalloc_inodes); + spin_lock_init(&fs_info->hash_lock); + spin_lock_init(&fs_info->delalloc_lock); + spin_lock_init(&fs_info->new_trans_lock); + spin_lock_init(&fs_info->ref_cache_lock); + + init_completion(&fs_info->kobj_unregister); + fs_info->tree_root = tree_root; + fs_info->extent_root = extent_root; + fs_info->csum_root = csum_root; + fs_info->chunk_root = chunk_root; + fs_info->dev_root = dev_root; + fs_info->fs_devices = fs_devices; + INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); + INIT_LIST_HEAD(&fs_info->space_info); + btrfs_mapping_init(&fs_info->mapping_tree); + atomic_set(&fs_info->nr_async_submits, 0); + atomic_set(&fs_info->async_delalloc_pages, 0); + atomic_set(&fs_info->async_submit_draining, 0); + atomic_set(&fs_info->nr_async_bios, 0); + atomic_set(&fs_info->throttles, 0); + atomic_set(&fs_info->throttle_gen, 0); + fs_info->sb = sb; + fs_info->max_extent = (u64)-1; + fs_info->max_inline = 8192 * 1024; + setup_bdi(fs_info, &fs_info->bdi); + fs_info->btree_inode = new_inode(sb); + fs_info->btree_inode->i_ino = 1; + fs_info->btree_inode->i_nlink = 1; + + fs_info->thread_pool_size = min_t(unsigned long, + num_online_cpus() + 2, 8); + + INIT_LIST_HEAD(&fs_info->ordered_extents); + spin_lock_init(&fs_info->ordered_extent_lock); + + sb->s_blocksize = 4096; + sb->s_blocksize_bits = blksize_bits(4096); + + /* + * we set the i_size on the btree inode to the max possible int. + * the real end of the address space is determined by all of + * the devices in the system + */ + fs_info->btree_inode->i_size = OFFSET_MAX; + fs_info->btree_inode->i_mapping->a_ops = &btree_aops; + fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; + + extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, + fs_info->btree_inode->i_mapping, + GFP_NOFS); + extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree, + GFP_NOFS); + + BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; + + spin_lock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree.rb_node = NULL; + + extent_io_tree_init(&fs_info->pinned_extents, + fs_info->btree_inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&fs_info->pending_del, + fs_info->btree_inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&fs_info->extent_ins, + fs_info->btree_inode->i_mapping, GFP_NOFS); + fs_info->do_barriers = 1; + + INIT_LIST_HEAD(&fs_info->dead_reloc_roots); + btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree); + btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree); + + BTRFS_I(fs_info->btree_inode)->root = tree_root; + memset(&BTRFS_I(fs_info->btree_inode)->location, 0, + sizeof(struct btrfs_key)); + insert_inode_hash(fs_info->btree_inode); + + mutex_init(&fs_info->trans_mutex); + mutex_init(&fs_info->tree_log_mutex); + mutex_init(&fs_info->drop_mutex); + mutex_init(&fs_info->extent_ins_mutex); + mutex_init(&fs_info->pinned_mutex); + mutex_init(&fs_info->chunk_mutex); + mutex_init(&fs_info->transaction_kthread_mutex); + mutex_init(&fs_info->cleaner_mutex); + mutex_init(&fs_info->volume_mutex); + mutex_init(&fs_info->tree_reloc_mutex); + init_waitqueue_head(&fs_info->transaction_throttle); + init_waitqueue_head(&fs_info->transaction_wait); + init_waitqueue_head(&fs_info->async_submit_wait); + init_waitqueue_head(&fs_info->tree_log_wait); + atomic_set(&fs_info->tree_log_commit, 0); + atomic_set(&fs_info->tree_log_writers, 0); + fs_info->tree_log_transid = 0; + + __setup_root(4096, 4096, 4096, 4096, tree_root, + fs_info, BTRFS_ROOT_TREE_OBJECTID); + + + bh = btrfs_read_dev_super(fs_devices->latest_bdev); + if (!bh) + goto fail_iput; + + memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); + memcpy(&fs_info->super_for_commit, &fs_info->super_copy, + sizeof(fs_info->super_for_commit)); + brelse(bh); + + memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); + + disk_super = &fs_info->super_copy; + if (!btrfs_super_root(disk_super)) + goto fail_iput; + + ret = btrfs_parse_options(tree_root, options); + if (ret) { + err = ret; + goto fail_iput; + } + + features = btrfs_super_incompat_flags(disk_super) & + ~BTRFS_FEATURE_INCOMPAT_SUPP; + if (features) { + printk(KERN_ERR "BTRFS: couldn't mount because of " + "unsupported optional features (%Lx).\n", + features); + err = -EINVAL; + goto fail_iput; + } + + features = btrfs_super_compat_ro_flags(disk_super) & + ~BTRFS_FEATURE_COMPAT_RO_SUPP; + if (!(sb->s_flags & MS_RDONLY) && features) { + printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " + "unsupported option features (%Lx).\n", + features); + err = -EINVAL; + goto fail_iput; + } + + /* + * we need to start all the end_io workers up front because the + * queue work function gets called at interrupt time, and so it + * cannot dynamically grow. + */ + btrfs_init_workers(&fs_info->workers, "worker", + fs_info->thread_pool_size); + + btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", + fs_info->thread_pool_size); + + btrfs_init_workers(&fs_info->submit_workers, "submit", + min_t(u64, fs_devices->num_devices, + fs_info->thread_pool_size)); + + /* a higher idle thresh on the submit workers makes it much more + * likely that bios will be send down in a sane order to the + * devices + */ + fs_info->submit_workers.idle_thresh = 64; + + fs_info->workers.idle_thresh = 16; + fs_info->workers.ordered = 1; + + fs_info->delalloc_workers.idle_thresh = 2; + fs_info->delalloc_workers.ordered = 1; + + btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); + btrfs_init_workers(&fs_info->endio_workers, "endio", + fs_info->thread_pool_size); + btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", + fs_info->thread_pool_size); + btrfs_init_workers(&fs_info->endio_meta_write_workers, + "endio-meta-write", fs_info->thread_pool_size); + btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", + fs_info->thread_pool_size); + + /* + * endios are largely parallel and should have a very + * low idle thresh + */ + fs_info->endio_workers.idle_thresh = 4; + fs_info->endio_write_workers.idle_thresh = 64; + fs_info->endio_meta_write_workers.idle_thresh = 64; + + btrfs_start_workers(&fs_info->workers, 1); + btrfs_start_workers(&fs_info->submit_workers, 1); + btrfs_start_workers(&fs_info->delalloc_workers, 1); + btrfs_start_workers(&fs_info->fixup_workers, 1); + btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); + btrfs_start_workers(&fs_info->endio_meta_workers, + fs_info->thread_pool_size); + btrfs_start_workers(&fs_info->endio_meta_write_workers, + fs_info->thread_pool_size); + btrfs_start_workers(&fs_info->endio_write_workers, + fs_info->thread_pool_size); + + fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); + fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, + 4 * 1024 * 1024 / PAGE_CACHE_SIZE); + + nodesize = btrfs_super_nodesize(disk_super); + leafsize = btrfs_super_leafsize(disk_super); + sectorsize = btrfs_super_sectorsize(disk_super); + stripesize = btrfs_super_stripesize(disk_super); + tree_root->nodesize = nodesize; + tree_root->leafsize = leafsize; + tree_root->sectorsize = sectorsize; + tree_root->stripesize = stripesize; + + sb->s_blocksize = sectorsize; + sb->s_blocksize_bits = blksize_bits(sectorsize); + + if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, + sizeof(disk_super->magic))) { + printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); + goto fail_sb_buffer; + } + + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_sys_array(tree_root); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + printk(KERN_WARNING "btrfs: failed to read the system " + "array on %s\n", sb->s_id); + goto fail_sys_array; + } + + blocksize = btrfs_level_size(tree_root, + btrfs_super_chunk_root_level(disk_super)); + generation = btrfs_super_chunk_root_generation(disk_super); + + __setup_root(nodesize, leafsize, sectorsize, stripesize, + chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); + + chunk_root->node = read_tree_block(chunk_root, + btrfs_super_chunk_root(disk_super), + blocksize, generation); + BUG_ON(!chunk_root->node); + + read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), + BTRFS_UUID_SIZE); + + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_chunk_tree(chunk_root); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", + sb->s_id); + goto fail_chunk_root; + } + + btrfs_close_extra_devices(fs_devices); + + blocksize = btrfs_level_size(tree_root, + btrfs_super_root_level(disk_super)); + generation = btrfs_super_generation(disk_super); + + tree_root->node = read_tree_block(tree_root, + btrfs_super_root(disk_super), + blocksize, generation); + if (!tree_root->node) + goto fail_chunk_root; + + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_EXTENT_TREE_OBJECTID, extent_root); + if (ret) + goto fail_tree_root; + extent_root->track_dirty = 1; + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_DEV_TREE_OBJECTID, dev_root); + dev_root->track_dirty = 1; + + if (ret) + goto fail_extent_root; + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_CSUM_TREE_OBJECTID, csum_root); + if (ret) + goto fail_extent_root; + + csum_root->track_dirty = 1; + + btrfs_read_block_groups(extent_root); + + fs_info->generation = generation; + fs_info->last_trans_committed = generation; + fs_info->data_alloc_profile = (u64)-1; + fs_info->metadata_alloc_profile = (u64)-1; + fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; + fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, + "btrfs-cleaner"); + if (!fs_info->cleaner_kthread) + goto fail_csum_root; + + fs_info->transaction_kthread = kthread_run(transaction_kthread, + tree_root, + "btrfs-transaction"); + if (!fs_info->transaction_kthread) + goto fail_cleaner; + + if (btrfs_super_log_root(disk_super) != 0) { + u64 bytenr = btrfs_super_log_root(disk_super); + + if (fs_devices->rw_devices == 0) { + printk(KERN_WARNING "Btrfs log replay required " + "on RO media\n"); + err = -EIO; + goto fail_trans_kthread; + } + blocksize = + btrfs_level_size(tree_root, + btrfs_super_log_root_level(disk_super)); + + log_tree_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + + __setup_root(nodesize, leafsize, sectorsize, stripesize, + log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); + + log_tree_root->node = read_tree_block(tree_root, bytenr, + blocksize, + generation + 1); + ret = btrfs_recover_log_trees(log_tree_root); + BUG_ON(ret); + + if (sb->s_flags & MS_RDONLY) { + ret = btrfs_commit_super(tree_root); + BUG_ON(ret); + } + } + + if (!(sb->s_flags & MS_RDONLY)) { + ret = btrfs_cleanup_reloc_trees(tree_root); + BUG_ON(ret); + } + + location.objectid = BTRFS_FS_TREE_OBJECTID; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = (u64)-1; + + fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); + if (!fs_info->fs_root) + goto fail_trans_kthread; + return tree_root; + +fail_trans_kthread: + kthread_stop(fs_info->transaction_kthread); +fail_cleaner: + kthread_stop(fs_info->cleaner_kthread); + + /* + * make sure we're done with the btree inode before we stop our + * kthreads + */ + filemap_write_and_wait(fs_info->btree_inode->i_mapping); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + +fail_csum_root: + free_extent_buffer(csum_root->node); +fail_extent_root: + free_extent_buffer(extent_root->node); +fail_tree_root: + free_extent_buffer(tree_root->node); +fail_chunk_root: + free_extent_buffer(chunk_root->node); +fail_sys_array: + free_extent_buffer(dev_root->node); +fail_sb_buffer: + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->submit_workers); +fail_iput: + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + iput(fs_info->btree_inode); +fail: + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + + kfree(extent_root); + kfree(tree_root); + bdi_destroy(&fs_info->bdi); + kfree(fs_info); + kfree(chunk_root); + kfree(dev_root); + kfree(csum_root); + return ERR_PTR(err); +} + +static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (uptodate) { + set_buffer_uptodate(bh); + } else { + if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { + printk(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + bdevname(bh->b_bdev, b)); + } + /* note, we dont' set_buffer_write_io_error because we have + * our own ways of dealing with the IO errors + */ + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) +{ + struct buffer_head *bh; + struct buffer_head *latest = NULL; + struct btrfs_super_block *super; + int i; + u64 transid = 0; + u64 bytenr; + + /* we would like to check all the supers, but that would make + * a btrfs mount succeed after a mkfs from a different FS. + * So, we need to add a special mount option to scan for + * later supers, using BTRFS_SUPER_MIRROR_MAX instead + */ + for (i = 0; i < 1; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + 4096 >= i_size_read(bdev->bd_inode)) + break; + bh = __bread(bdev, bytenr / 4096, 4096); + if (!bh) + continue; + + super = (struct btrfs_super_block *)bh->b_data; + if (btrfs_super_bytenr(super) != bytenr || + strncmp((char *)(&super->magic), BTRFS_MAGIC, + sizeof(super->magic))) { + brelse(bh); + continue; + } + + if (!latest || btrfs_super_generation(super) > transid) { + brelse(latest); + latest = bh; + transid = btrfs_super_generation(super); + } else { + brelse(bh); + } + } + return latest; +} + +static int write_dev_supers(struct btrfs_device *device, + struct btrfs_super_block *sb, + int do_barriers, int wait, int max_mirrors) +{ + struct buffer_head *bh; + int i; + int ret; + int errors = 0; + u32 crc; + u64 bytenr; + int last_barrier = 0; + + if (max_mirrors == 0) + max_mirrors = BTRFS_SUPER_MIRROR_MAX; + + /* make sure only the last submit_bh does a barrier */ + if (do_barriers) { + for (i = 0; i < max_mirrors; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + device->total_bytes) + break; + last_barrier = i; + } + } + + for (i = 0; i < max_mirrors; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) + break; + + if (wait) { + bh = __find_get_block(device->bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + BUG_ON(!bh); + brelse(bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) { + brelse(bh); + continue; + } + } else { + btrfs_set_super_bytenr(sb, bytenr); + + crc = ~(u32)0; + crc = btrfs_csum_data(NULL, (char *)sb + + BTRFS_CSUM_SIZE, crc, + BTRFS_SUPER_INFO_SIZE - + BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, sb->csum); + + bh = __getblk(device->bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); + + set_buffer_uptodate(bh); + get_bh(bh); + lock_buffer(bh); + bh->b_end_io = btrfs_end_buffer_write_sync; + } + + if (i == last_barrier && do_barriers && device->barriers) { + ret = submit_bh(WRITE_BARRIER, bh); + if (ret == -EOPNOTSUPP) { + printk("btrfs: disabling barriers on dev %s\n", + device->name); + set_buffer_uptodate(bh); + device->barriers = 0; + get_bh(bh); + lock_buffer(bh); + ret = submit_bh(WRITE, bh); + } + } else { + ret = submit_bh(WRITE, bh); + } + + if (!ret && wait) { + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + errors++; + } else if (ret) { + errors++; + } + if (wait) + brelse(bh); + } + return errors < i ? 0 : -1; +} + +int write_all_supers(struct btrfs_root *root, int max_mirrors) +{ + struct list_head *cur; + struct list_head *head = &root->fs_info->fs_devices->devices; + struct btrfs_device *dev; + struct btrfs_super_block *sb; + struct btrfs_dev_item *dev_item; + int ret; + int do_barriers; + int max_errors; + int total_errors = 0; + u64 flags; + + max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + do_barriers = !btrfs_test_opt(root, NOBARRIER); + + sb = &root->fs_info->super_for_commit; + dev_item = &sb->dev_item; + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (!dev->bdev) { + total_errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + btrfs_set_stack_device_generation(dev_item, 0); + btrfs_set_stack_device_type(dev_item, dev->type); + btrfs_set_stack_device_id(dev_item, dev->devid); + btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); + btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); + btrfs_set_stack_device_io_align(dev_item, dev->io_align); + btrfs_set_stack_device_io_width(dev_item, dev->io_width); + btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); + memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); + memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE); + + flags = btrfs_super_flags(sb); + btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); + + ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors); + if (ret) + total_errors++; + } + if (total_errors > max_errors) { + printk(KERN_ERR "btrfs: %d errors while writing supers\n", + total_errors); + BUG(); + } + + total_errors = 0; + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (!dev->bdev) + continue; + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors); + if (ret) + total_errors++; + } + if (total_errors > max_errors) { + printk(KERN_ERR "btrfs: %d errors while writing supers\n", + total_errors); + BUG(); + } + return 0; +} + +int write_ctree_super(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int max_mirrors) +{ + int ret; + + ret = write_all_supers(root, max_mirrors); + return ret; +} + +int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +{ + radix_tree_delete(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid); + if (root->anon_super.s_dev) { + down_write(&root->anon_super.s_umount); + kill_anon_super(&root->anon_super); + } + if (root->node) + free_extent_buffer(root->node); + if (root->commit_root) + free_extent_buffer(root->commit_root); + kfree(root->name); + kfree(root); + return 0; +} + +static int del_fs_roots(struct btrfs_fs_info *fs_info) +{ + int ret; + struct btrfs_root *gang[8]; + int i; + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) + btrfs_free_fs_root(fs_info, gang[i]); + } + return 0; +} + +int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) +{ + u64 root_objectid = 0; + struct btrfs_root *gang[8]; + int i; + int ret; + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) { + root_objectid = gang[i]->root_key.objectid; + ret = btrfs_find_dead_roots(fs_info->tree_root, + root_objectid, gang[i]); + BUG_ON(ret); + btrfs_orphan_cleanup(gang[i]); + } + root_objectid++; + } + return 0; +} + +int btrfs_commit_super(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret; + + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + trans = btrfs_start_transaction(root, 1); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + /* run commit again to drop the original snapshot */ + trans = btrfs_start_transaction(root, 1); + btrfs_commit_transaction(trans, root); + ret = btrfs_write_and_wait_transaction(NULL, root); + BUG_ON(ret); + + ret = write_ctree_super(NULL, root, 0); + return ret; +} + +int close_ctree(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + fs_info->closing = 1; + smp_mb(); + + kthread_stop(root->fs_info->transaction_kthread); + kthread_stop(root->fs_info->cleaner_kthread); + + if (!(fs_info->sb->s_flags & MS_RDONLY)) { + ret = btrfs_commit_super(root); + if (ret) + printk(KERN_ERR "btrfs: commit super ret %d\n", ret); + } + + if (fs_info->delalloc_bytes) { + printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", + fs_info->delalloc_bytes); + } + if (fs_info->total_ref_cache_size) { + printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", + (unsigned long long)fs_info->total_ref_cache_size); + } + + if (fs_info->extent_root->node) + free_extent_buffer(fs_info->extent_root->node); + + if (fs_info->tree_root->node) + free_extent_buffer(fs_info->tree_root->node); + + if (root->fs_info->chunk_root->node) + free_extent_buffer(root->fs_info->chunk_root->node); + + if (root->fs_info->dev_root->node) + free_extent_buffer(root->fs_info->dev_root->node); + + if (root->fs_info->csum_root->node) + free_extent_buffer(root->fs_info->csum_root->node); + + btrfs_free_block_groups(root->fs_info); + + del_fs_roots(fs_info); + + iput(fs_info->btree_inode); + + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->submit_workers); + +#if 0 + while (!list_empty(&fs_info->hashers)) { + struct btrfs_hasher *hasher; + hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher, + hashers); + list_del(&hasher->hashers); + crypto_free_hash(&fs_info->hash_tfm); + kfree(hasher); + } +#endif + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + + bdi_destroy(&fs_info->bdi); + + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); + kfree(fs_info->csum_root); + return 0; +} + +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) +{ + int ret; + struct inode *btree_inode = buf->first_page->mapping->host; + + ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf); + if (!ret) + return ret; + + ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, + parent_transid); + return !ret; +} + +int btrfs_set_buffer_uptodate(struct extent_buffer *buf) +{ + struct inode *btree_inode = buf->first_page->mapping->host; + return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, + buf); +} + +void btrfs_mark_buffer_dirty(struct extent_buffer *buf) +{ + struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + u64 transid = btrfs_header_generation(buf); + struct inode *btree_inode = root->fs_info->btree_inode; + + WARN_ON(!btrfs_tree_locked(buf)); + if (transid != root->fs_info->generation) { + printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " + "found %llu running %llu\n", + (unsigned long long)buf->start, + (unsigned long long)transid, + (unsigned long long)root->fs_info->generation); + WARN_ON(1); + } + set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); +} + +void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +{ + /* + * looks as though older kernels can get into trouble with + * this code, they end up stuck in balance_dirty_pages forever + */ + struct extent_io_tree *tree; + u64 num_dirty; + u64 start = 0; + unsigned long thresh = 32 * 1024 * 1024; + tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; + + if (current_is_pdflush() || current->flags & PF_MEMALLOC) + return; + + num_dirty = count_range_bits(tree, &start, (u64)-1, + thresh, EXTENT_DIRTY); + if (num_dirty > thresh) { + balance_dirty_pages_ratelimited_nr( + root->fs_info->btree_inode->i_mapping, 1); + } + return; +} + +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) +{ + struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + int ret; + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + if (ret == 0) + buf->flags |= EXTENT_UPTODATE; + return ret; +} + +int btree_lock_page_hook(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_buffer *eb; + unsigned long len; + u64 bytenr = page_offset(page); + + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + + len = page->private >> 2; + eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS); + if (!eb) + goto out; + + btrfs_tree_lock(eb); + spin_lock(&root->fs_info->hash_lock); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + spin_unlock(&root->fs_info->hash_lock); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); +out: + lock_page(page); + return 0; +} + +static struct extent_io_ops btree_extent_io_ops = { + .write_cache_pages_lock_hook = btree_lock_page_hook, + .readpage_end_io_hook = btree_readpage_end_io_hook, + .submit_bio_hook = btree_submit_bio_hook, + /* note we're sharing with inode.c for the merge bio hook */ + .merge_bio_hook = btrfs_merge_bio_hook, +}; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h new file mode 100644 index 00000000000..c0ff404c31b --- /dev/null +++ b/fs/btrfs/disk-io.h @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __DISKIO__ +#define __DISKIO__ + +#define BTRFS_SUPER_INFO_OFFSET (64 * 1024) +#define BTRFS_SUPER_INFO_SIZE 4096 + +#define BTRFS_SUPER_MIRROR_MAX 3 +#define BTRFS_SUPER_MIRROR_SHIFT 12 + +static inline u64 btrfs_sb_offset(int mirror) +{ + u64 start = 16 * 1024; + if (mirror) + return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror); + return BTRFS_SUPER_INFO_OFFSET; +} + +struct btrfs_device; +struct btrfs_fs_devices; + +struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, + u32 blocksize, u64 parent_transid); +int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, + u64 parent_transid); +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize); +int clean_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf); +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options); +int close_ctree(struct btrfs_root *root); +int write_ctree_super(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int max_mirrors); +struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); +int btrfs_commit_super(struct btrfs_root *root); +struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize); +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid); +struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + const char *name, int namelen); +struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + struct btrfs_key *location); +struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location); +int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); +int btrfs_insert_dev_radix(struct btrfs_root *root, + struct block_device *bdev, + u64 device_id, + u64 block_start, + u64 num_blocks); +void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); +int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); +void btrfs_mark_buffer_dirty(struct extent_buffer *buf); +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); +int btrfs_set_buffer_uptodate(struct extent_buffer *buf); +int wait_on_tree_block_writeback(struct btrfs_root *root, + struct extent_buffer *buf); +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); +u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len); +void btrfs_csum_final(u32 crc, char *result); +int btrfs_open_device(struct btrfs_device *dev); +int btrfs_verify_block_csum(struct btrfs_root *root, + struct extent_buffer *buf); +int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, + int metadata); +int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, + int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done); + +int btrfs_congested_async(struct btrfs_fs_info *info, int iodone); +unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); +int btrfs_write_tree_block(struct extent_buffer *buf); +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btree_lock_page_hook(struct page *page); +#endif diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c new file mode 100644 index 00000000000..85315d2c90d --- /dev/null +++ b/fs/btrfs/export.c @@ -0,0 +1,203 @@ +#include <linux/fs.h> +#include <linux/types.h> +#include "ctree.h" +#include "disk-io.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "export.h" +#include "compat.h" + +#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ + parent_objectid) / 4) +#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \ + parent_root_objectid) / 4) +#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4) + +static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + int connectable) +{ + struct btrfs_fid *fid = (struct btrfs_fid *)fh; + struct inode *inode = dentry->d_inode; + int len = *max_len; + int type; + + if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || + (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) + return 255; + + len = BTRFS_FID_SIZE_NON_CONNECTABLE; + type = FILEID_BTRFS_WITHOUT_PARENT; + + fid->objectid = BTRFS_I(inode)->location.objectid; + fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->gen = inode->i_generation; + + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + u64 parent_root_id; + + spin_lock(&dentry->d_lock); + + parent = dentry->d_parent->d_inode; + fid->parent_objectid = BTRFS_I(parent)->location.objectid; + fid->parent_gen = parent->i_generation; + parent_root_id = BTRFS_I(parent)->root->objectid; + + spin_unlock(&dentry->d_lock); + + if (parent_root_id != fid->root_objectid) { + fid->parent_root_objectid = parent_root_id; + len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; + type = FILEID_BTRFS_WITH_PARENT_ROOT; + } else { + len = BTRFS_FID_SIZE_CONNECTABLE; + type = FILEID_BTRFS_WITH_PARENT; + } + } + + *max_len = len; + return type; +} + +static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation) +{ + struct btrfs_root *root; + struct inode *inode; + struct btrfs_key key; + + key.objectid = root_objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + + root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key); + if (IS_ERR(root)) + return ERR_CAST(root); + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + inode = btrfs_iget(sb, &key, root, NULL); + if (IS_ERR(inode)) + return (void *)inode; + + if (generation != inode->i_generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return d_obtain_alias(inode); +} + +static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE) + return NULL; + root_objectid = fid->root_objectid; + } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) + return NULL; + root_objectid = fid->parent_root_objectid; + } else + return NULL; + + objectid = fid->parent_objectid; + generation = fid->parent_gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation); +} + +static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if ((fh_type != FILEID_BTRFS_WITH_PARENT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE) && + (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) && + (fh_type != FILEID_BTRFS_WITHOUT_PARENT || + fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE)) + return NULL; + + objectid = fid->objectid; + root_objectid = fid->root_objectid; + generation = fid->gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation); +} + +static struct dentry *btrfs_get_parent(struct dentry *child) +{ + struct inode *dir = child->d_inode; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + int slot; + u64 objectid; + int ret; + + path = btrfs_alloc_path(); + + key.objectid = dir->i_ino; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + /* Error */ + btrfs_free_path(path); + return ERR_PTR(ret); + } + leaf = path->nodes[0]; + slot = path->slots[0]; + if (ret) { + /* btrfs_search_slot() returns the slot where we'd want to + insert a backref for parent inode #0xFFFFFFFFFFFFFFFF. + The _real_ backref, telling us what the parent inode + _actually_ is, will be in the slot _before_ the one + that btrfs_search_slot() returns. */ + if (!slot) { + /* Unless there is _no_ key in the tree before... */ + btrfs_free_path(path); + return ERR_PTR(-EIO); + } + slot--; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + btrfs_free_path(path); + + if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) + return ERR_PTR(-EINVAL); + + objectid = key.offset; + + /* If we are already at the root of a subvol, return the real root */ + if (objectid == dir->i_ino) + return dget(dir->i_sb->s_root); + + /* Build a new key for the inode item */ + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); +} + +const struct export_operations btrfs_export_ops = { + .encode_fh = btrfs_encode_fh, + .fh_to_dentry = btrfs_fh_to_dentry, + .fh_to_parent = btrfs_fh_to_parent, + .get_parent = btrfs_get_parent, +}; diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h new file mode 100644 index 00000000000..074348a9584 --- /dev/null +++ b/fs/btrfs/export.h @@ -0,0 +1,19 @@ +#ifndef BTRFS_EXPORT_H +#define BTRFS_EXPORT_H + +#include <linux/exportfs.h> + +extern const struct export_operations btrfs_export_ops; + +struct btrfs_fid { + u64 objectid; + u64 root_objectid; + u32 gen; + + u64 parent_objectid; + u32 parent_gen; + + u64 parent_root_objectid; +} __attribute__ ((packed)); + +#endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c new file mode 100644 index 00000000000..293da650873 --- /dev/null +++ b/fs/btrfs/extent-tree.c @@ -0,0 +1,5986 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/version.h> +#include "compat.h" +#include "hash.h" +#include "crc32c.h" +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "volumes.h" +#include "locking.h" +#include "ref-cache.h" +#include "compat.h" + +#define PENDING_EXTENT_INSERT 0 +#define PENDING_EXTENT_DELETE 1 +#define PENDING_BACKREF_UPDATE 2 + +struct pending_extent_op { + int type; + u64 bytenr; + u64 num_bytes; + u64 parent; + u64 orig_parent; + u64 generation; + u64 orig_generation; + int level; + struct list_head list; + int del; +}; + +static int finish_current_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all); +static int del_pending_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all); +static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int is_data); +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc, + int mark_free); + +static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) +{ + return (cache->flags & bits) == bits; +} + +/* + * this adds the block group to the fs_info rb tree for the block group + * cache + */ +static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, + struct btrfs_block_group_cache *block_group) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_block_group_cache *cache; + + spin_lock(&info->block_group_cache_lock); + p = &info->block_group_cache_tree.rb_node; + + while (*p) { + parent = *p; + cache = rb_entry(parent, struct btrfs_block_group_cache, + cache_node); + if (block_group->key.objectid < cache->key.objectid) { + p = &(*p)->rb_left; + } else if (block_group->key.objectid > cache->key.objectid) { + p = &(*p)->rb_right; + } else { + spin_unlock(&info->block_group_cache_lock); + return -EEXIST; + } + } + + rb_link_node(&block_group->cache_node, parent, p); + rb_insert_color(&block_group->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + + return 0; +} + +/* + * This will return the block group at or after bytenr if contains is 0, else + * it will return the block group that contains the bytenr + */ +static struct btrfs_block_group_cache * +block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, + int contains) +{ + struct btrfs_block_group_cache *cache, *ret = NULL; + struct rb_node *n; + u64 end, start; + + spin_lock(&info->block_group_cache_lock); + n = info->block_group_cache_tree.rb_node; + + while (n) { + cache = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + end = cache->key.objectid + cache->key.offset - 1; + start = cache->key.objectid; + + if (bytenr < start) { + if (!contains && (!ret || start < ret->key.objectid)) + ret = cache; + n = n->rb_left; + } else if (bytenr > start) { + if (contains && bytenr <= end) { + ret = cache; + break; + } + n = n->rb_right; + } else { + ret = cache; + break; + } + } + if (ret) + atomic_inc(&ret->count); + spin_unlock(&info->block_group_cache_lock); + + return ret; +} + +/* + * this is only called by cache_block_group, since we could have freed extents + * we need to check the pinned_extents for any extents that can't be used yet + * since their free space will be released as soon as the transaction commits. + */ +static int add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_fs_info *info, u64 start, u64 end) +{ + u64 extent_start, extent_end, size; + int ret; + + mutex_lock(&info->pinned_mutex); + while (start < end) { + ret = find_first_extent_bit(&info->pinned_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY); + if (ret) + break; + + if (extent_start == start) { + start = extent_end + 1; + } else if (extent_start > start && extent_start < end) { + size = extent_start - start; + ret = btrfs_add_free_space(block_group, start, + size); + BUG_ON(ret); + start = extent_end + 1; + } else { + break; + } + } + + if (start < end) { + size = end - start; + ret = btrfs_add_free_space(block_group, start, size); + BUG_ON(ret); + } + mutex_unlock(&info->pinned_mutex); + + return 0; +} + +static int remove_sb_from_cache(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + u64 bytenr; + u64 *logical; + int stripe_len; + int i, nr, ret; + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + ret = btrfs_rmap_block(&root->fs_info->mapping_tree, + cache->key.objectid, bytenr, 0, + &logical, &nr, &stripe_len); + BUG_ON(ret); + while (nr--) { + btrfs_remove_free_space(cache, logical[nr], + stripe_len); + } + kfree(logical); + } + return 0; +} + +static int cache_block_group(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_path *path; + int ret = 0; + struct btrfs_key key; + struct extent_buffer *leaf; + int slot; + u64 last; + + if (!block_group) + return 0; + + root = root->fs_info->extent_root; + + if (block_group->cached) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 2; + /* + * we get into deadlocks with paths held by callers of this function. + * since the alloc_mutex is protecting things right now, just + * skip the locking here + */ + path->skip_locking = 1; + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); + key.objectid = last; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto err; + if (ret == 0) + continue; + else + break; + } + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid < block_group->key.objectid) + goto next; + + if (key.objectid >= block_group->key.objectid + + block_group->key.offset) + break; + + if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { + add_new_free_space(block_group, root->fs_info, last, + key.objectid); + + last = key.objectid + key.offset; + } +next: + path->slots[0]++; + } + + add_new_free_space(block_group, root->fs_info, last, + block_group->key.objectid + + block_group->key.offset); + + remove_sb_from_cache(root, block_group); + block_group->cached = 1; + ret = 0; +err: + btrfs_free_path(path); + return ret; +} + +/* + * return the block group that starts at or after bytenr + */ +static struct btrfs_block_group_cache * +btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) +{ + struct btrfs_block_group_cache *cache; + + cache = block_group_cache_tree_search(info, bytenr, 0); + + return cache; +} + +/* + * return the block group that contains teh given bytenr + */ +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, + u64 bytenr) +{ + struct btrfs_block_group_cache *cache; + + cache = block_group_cache_tree_search(info, bytenr, 1); + + return cache; +} + +static inline void put_block_group(struct btrfs_block_group_cache *cache) +{ + if (atomic_dec_and_test(&cache->count)) + kfree(cache); +} + +static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, + u64 flags) +{ + struct list_head *head = &info->space_info; + struct list_head *cur; + struct btrfs_space_info *found; + list_for_each(cur, head) { + found = list_entry(cur, struct btrfs_space_info, list); + if (found->flags == flags) + return found; + } + return NULL; +} + +static u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + +u64 btrfs_find_block_group(struct btrfs_root *root, + u64 search_start, u64 search_hint, int owner) +{ + struct btrfs_block_group_cache *cache; + u64 used; + u64 last = max(search_hint, search_start); + u64 group_start = 0; + int full_search = 0; + int factor = 9; + int wrapped = 0; +again: + while (1) { + cache = btrfs_lookup_first_block_group(root->fs_info, last); + if (!cache) + break; + + spin_lock(&cache->lock); + last = cache->key.objectid + cache->key.offset; + used = btrfs_block_group_used(&cache->item); + + if ((full_search || !cache->ro) && + block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) { + if (used + cache->pinned + cache->reserved < + div_factor(cache->key.offset, factor)) { + group_start = cache->key.objectid; + spin_unlock(&cache->lock); + put_block_group(cache); + goto found; + } + } + spin_unlock(&cache->lock); + put_block_group(cache); + cond_resched(); + } + if (!wrapped) { + last = search_start; + wrapped = 1; + goto again; + } + if (!full_search && factor < 10) { + last = search_start; + full_search = 1; + factor = 10; + goto again; + } +found: + return group_start; +} + +/* simple helper to search for an existing extent at a given offset */ +int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) +{ + int ret; + struct btrfs_key key; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + BUG_ON(!path); + key.objectid = start; + key.offset = len; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, + 0, 0); + btrfs_free_path(path); + return ret; +} + +/* + * Back reference rules. Back refs have three main goals: + * + * 1) differentiate between all holders of references to an extent so that + * when a reference is dropped we can make sure it was a valid reference + * before freeing the extent. + * + * 2) Provide enough information to quickly find the holders of an extent + * if we notice a given block is corrupted or bad. + * + * 3) Make it easy to migrate blocks for FS shrinking or storage pool + * maintenance. This is actually the same as #2, but with a slightly + * different use case. + * + * File extents can be referenced by: + * + * - multiple snapshots, subvolumes, or different generations in one subvol + * - different files inside a single subvolume + * - different offsets inside a file (bookend extents in file.c) + * + * The extent ref structure has fields for: + * + * - Objectid of the subvolume root + * - Generation number of the tree holding the reference + * - objectid of the file holding the reference + * - number of references holding by parent node (alway 1 for tree blocks) + * + * Btree leaf may hold multiple references to a file extent. In most cases, + * these references are from same file and the corresponding offsets inside + * the file are close together. + * + * When a file extent is allocated the fields are filled in: + * (root_key.objectid, trans->transid, inode objectid, 1) + * + * When a leaf is cow'd new references are added for every file extent found + * in the leaf. It looks similar to the create case, but trans->transid will + * be different when the block is cow'd. + * + * (root_key.objectid, trans->transid, inode objectid, + * number of references in the leaf) + * + * When a file extent is removed either during snapshot deletion or + * file truncation, we find the corresponding back reference and check + * the following fields: + * + * (btrfs_header_owner(leaf), btrfs_header_generation(leaf), + * inode objectid) + * + * Btree extents can be referenced by: + * + * - Different subvolumes + * - Different generations of the same subvolume + * + * When a tree block is created, back references are inserted: + * + * (root->root_key.objectid, trans->transid, level, 1) + * + * When a tree block is cow'd, new back references are added for all the + * blocks it points to. If the tree block isn't in reference counted root, + * the old back references are removed. These new back references are of + * the form (trans->transid will have increased since creation): + * + * (root->root_key.objectid, trans->transid, level, 1) + * + * When a backref is in deleting, the following fields are checked: + * + * if backref was for a tree root: + * (btrfs_header_owner(itself), btrfs_header_generation(itself), level) + * else + * (btrfs_header_owner(parent), btrfs_header_generation(parent), level) + * + * Back Reference Key composing: + * + * The key objectid corresponds to the first byte in the extent, the key + * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first + * byte of parent extent. If a extent is tree root, the key offset is set + * to the key objectid. + */ + +static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid, int del) +{ + struct btrfs_key key; + struct btrfs_extent_ref *ref; + struct extent_buffer *leaf; + u64 ref_objectid; + int ret; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_REF_KEY; + key.offset = parent; + + ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + ref_objectid = btrfs_ref_objectid(leaf, ref); + if (btrfs_ref_root(leaf, ref) != ref_root || + btrfs_ref_generation(leaf, ref) != ref_generation || + (ref_objectid != owner_objectid && + ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { + ret = -EIO; + WARN_ON(1); + goto out; + } + ret = 0; +out: + return ret; +} + +/* + * updates all the backrefs that are pending on update_list for the + * extent_root + */ +static noinline int update_backrefs(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_path *path, + struct list_head *update_list) +{ + struct btrfs_key key; + struct btrfs_extent_ref *ref; + struct btrfs_fs_info *info = extent_root->fs_info; + struct pending_extent_op *op; + struct extent_buffer *leaf; + int ret = 0; + struct list_head *cur = update_list->next; + u64 ref_objectid; + u64 ref_root = extent_root->root_key.objectid; + + op = list_entry(cur, struct pending_extent_op, list); + +search: + key.objectid = op->bytenr; + key.type = BTRFS_EXTENT_REF_KEY; + key.offset = op->orig_parent; + + ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1); + BUG_ON(ret); + + leaf = path->nodes[0]; + +loop: + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + + ref_objectid = btrfs_ref_objectid(leaf, ref); + + if (btrfs_ref_root(leaf, ref) != ref_root || + btrfs_ref_generation(leaf, ref) != op->orig_generation || + (ref_objectid != op->level && + ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { + printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, " + "root %llu, owner %u\n", + (unsigned long long)op->bytenr, + (unsigned long long)op->orig_parent, + (unsigned long long)ref_root, op->level); + btrfs_print_leaf(extent_root, leaf); + BUG(); + } + + key.objectid = op->bytenr; + key.offset = op->parent; + key.type = BTRFS_EXTENT_REF_KEY; + ret = btrfs_set_item_key_safe(trans, extent_root, path, &key); + BUG_ON(ret); + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + btrfs_set_ref_generation(leaf, ref, op->generation); + + cur = cur->next; + + list_del_init(&op->list); + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, GFP_NOFS); + kfree(op); + + if (cur == update_list) { + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(extent_root, path); + goto out; + } + + op = list_entry(cur, struct pending_extent_op, list); + + path->slots[0]++; + while (path->slots[0] < btrfs_header_nritems(leaf)) { + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid == op->bytenr && + key.type == BTRFS_EXTENT_REF_KEY) + goto loop; + path->slots[0]++; + } + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(extent_root, path); + goto search; + +out: + return 0; +} + +static noinline int insert_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_path *path, + struct list_head *insert_list, int nr) +{ + struct btrfs_key *keys; + u32 *data_size; + struct pending_extent_op *op; + struct extent_buffer *leaf; + struct list_head *cur = insert_list->next; + struct btrfs_fs_info *info = extent_root->fs_info; + u64 ref_root = extent_root->root_key.objectid; + int i = 0, last = 0, ret; + int total = nr * 2; + + if (!nr) + return 0; + + keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS); + if (!keys) + return -ENOMEM; + + data_size = kzalloc(total * sizeof(u32), GFP_NOFS); + if (!data_size) { + kfree(keys); + return -ENOMEM; + } + + list_for_each_entry(op, insert_list, list) { + keys[i].objectid = op->bytenr; + keys[i].offset = op->num_bytes; + keys[i].type = BTRFS_EXTENT_ITEM_KEY; + data_size[i] = sizeof(struct btrfs_extent_item); + i++; + + keys[i].objectid = op->bytenr; + keys[i].offset = op->parent; + keys[i].type = BTRFS_EXTENT_REF_KEY; + data_size[i] = sizeof(struct btrfs_extent_ref); + i++; + } + + op = list_entry(cur, struct pending_extent_op, list); + i = 0; + while (i < total) { + int c; + ret = btrfs_insert_some_items(trans, extent_root, path, + keys+i, data_size+i, total-i); + BUG_ON(ret < 0); + + if (last && ret > 1) + BUG(); + + leaf = path->nodes[0]; + for (c = 0; c < ret; c++) { + int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY; + + /* + * if the first item we inserted was a backref, then + * the EXTENT_ITEM will be the odd c's, else it will + * be the even c's + */ + if ((ref_first && (c % 2)) || + (!ref_first && !(c % 2))) { + struct btrfs_extent_item *itm; + + itm = btrfs_item_ptr(leaf, path->slots[0] + c, + struct btrfs_extent_item); + btrfs_set_extent_refs(path->nodes[0], itm, 1); + op->del++; + } else { + struct btrfs_extent_ref *ref; + + ref = btrfs_item_ptr(leaf, path->slots[0] + c, + struct btrfs_extent_ref); + btrfs_set_ref_root(leaf, ref, ref_root); + btrfs_set_ref_generation(leaf, ref, + op->generation); + btrfs_set_ref_objectid(leaf, ref, op->level); + btrfs_set_ref_num_refs(leaf, ref, 1); + op->del++; + } + + /* + * using del to see when its ok to free up the + * pending_extent_op. In the case where we insert the + * last item on the list in order to help do batching + * we need to not free the extent op until we actually + * insert the extent_item + */ + if (op->del == 2) { + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, + GFP_NOFS); + cur = cur->next; + list_del_init(&op->list); + kfree(op); + if (cur != insert_list) + op = list_entry(cur, + struct pending_extent_op, + list); + } + } + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(extent_root, path); + + /* + * Ok backref's and items usually go right next to eachother, + * but if we could only insert 1 item that means that we + * inserted on the end of a leaf, and we have no idea what may + * be on the next leaf so we just play it safe. In order to + * try and help this case we insert the last thing on our + * insert list so hopefully it will end up being the last + * thing on the leaf and everything else will be before it, + * which will let us insert a whole bunch of items at the same + * time. + */ + if (ret == 1 && !last && (i + ret < total)) { + /* + * last: where we will pick up the next time around + * i: our current key to insert, will be total - 1 + * cur: the current op we are screwing with + * op: duh + */ + last = i + ret; + i = total - 1; + cur = insert_list->prev; + op = list_entry(cur, struct pending_extent_op, list); + } else if (last) { + /* + * ok we successfully inserted the last item on the + * list, lets reset everything + * + * i: our current key to insert, so where we left off + * last time + * last: done with this + * cur: the op we are messing with + * op: duh + * total: since we inserted the last key, we need to + * decrement total so we dont overflow + */ + i = last; + last = 0; + total--; + if (i < total) { + cur = insert_list->next; + op = list_entry(cur, struct pending_extent_op, + list); + } + } else { + i += ret; + } + + cond_resched(); + } + ret = 0; + kfree(keys); + kfree(data_size); + return ret; +} + +static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_ref *ref; + u32 num_refs; + int ret; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_REF_KEY; + key.offset = parent; + + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref)); + if (ret == 0) { + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + btrfs_set_ref_root(leaf, ref, ref_root); + btrfs_set_ref_generation(leaf, ref, ref_generation); + btrfs_set_ref_objectid(leaf, ref, owner_objectid); + btrfs_set_ref_num_refs(leaf, ref, 1); + } else if (ret == -EEXIST) { + u64 existing_owner; + BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + if (btrfs_ref_root(leaf, ref) != ref_root || + btrfs_ref_generation(leaf, ref) != ref_generation) { + ret = -EIO; + WARN_ON(1); + goto out; + } + + num_refs = btrfs_ref_num_refs(leaf, ref); + BUG_ON(num_refs == 0); + btrfs_set_ref_num_refs(leaf, ref, num_refs + 1); + + existing_owner = btrfs_ref_objectid(leaf, ref); + if (existing_owner != owner_objectid && + existing_owner != BTRFS_MULTIPLE_OBJECTIDS) { + btrfs_set_ref_objectid(leaf, ref, + BTRFS_MULTIPLE_OBJECTIDS); + } + ret = 0; + } else { + goto out; + } + btrfs_mark_buffer_dirty(path->nodes[0]); +out: + btrfs_release_path(root, path); + return ret; +} + +static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + struct extent_buffer *leaf; + struct btrfs_extent_ref *ref; + u32 num_refs; + int ret = 0; + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + num_refs = btrfs_ref_num_refs(leaf, ref); + BUG_ON(num_refs == 0); + num_refs -= 1; + if (num_refs == 0) { + ret = btrfs_del_item(trans, root, path); + } else { + btrfs_set_ref_num_refs(leaf, ref, num_refs); + btrfs_mark_buffer_dirty(leaf); + } + btrfs_release_path(root, path); + return ret; +} + +#ifdef BIO_RW_DISCARD +static void btrfs_issue_discard(struct block_device *bdev, + u64 start, u64 len) +{ + blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); +} +#endif + +static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ +#ifdef BIO_RW_DISCARD + int ret; + u64 map_length = num_bytes; + struct btrfs_multi_bio *multi = NULL; + + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, + bytenr, &map_length, &multi, 0); + if (!ret) { + struct btrfs_bio_stripe *stripe = multi->stripes; + int i; + + if (map_length > num_bytes) + map_length = num_bytes; + + for (i = 0; i < multi->num_stripes; i++, stripe++) { + btrfs_issue_discard(stripe->dev->bdev, + stripe->physical, + map_length); + } + kfree(multi); + } + + return ret; +#else + return 0; +#endif +} + +static noinline int free_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct list_head *del_list) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_path *path; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + struct list_head *cur; + struct pending_extent_op *op; + struct btrfs_extent_item *ei; + int ret, num_to_del, extent_slot = 0, found_extent = 0; + u32 refs; + u64 bytes_freed = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + +search: + /* search for the backref for the current ref we want to delete */ + cur = del_list->next; + op = list_entry(cur, struct pending_extent_op, list); + ret = lookup_extent_backref(trans, extent_root, path, op->bytenr, + op->orig_parent, + extent_root->root_key.objectid, + op->orig_generation, op->level, 1); + if (ret) { + printk(KERN_ERR "btrfs unable to find backref byte nr %llu " + "root %llu gen %llu owner %u\n", + (unsigned long long)op->bytenr, + (unsigned long long)extent_root->root_key.objectid, + (unsigned long long)op->orig_generation, op->level); + btrfs_print_leaf(extent_root, path->nodes[0]); + WARN_ON(1); + goto out; + } + + extent_slot = path->slots[0]; + num_to_del = 1; + found_extent = 0; + + /* + * if we aren't the first item on the leaf we can move back one and see + * if our ref is right next to our extent item + */ + if (likely(extent_slot)) { + extent_slot--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + extent_slot); + if (found_key.objectid == op->bytenr && + found_key.type == BTRFS_EXTENT_ITEM_KEY && + found_key.offset == op->num_bytes) { + num_to_del++; + found_extent = 1; + } + } + + /* + * if we didn't find the extent we need to delete the backref and then + * search for the extent item key so we can update its ref count + */ + if (!found_extent) { + key.objectid = op->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = op->num_bytes; + + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + btrfs_release_path(extent_root, path); + ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); + BUG_ON(ret); + extent_slot = path->slots[0]; + } + + /* this is where we update the ref count for the extent */ + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + BUG_ON(refs == 0); + refs--; + btrfs_set_extent_refs(leaf, ei, refs); + + btrfs_mark_buffer_dirty(leaf); + + /* + * This extent needs deleting. The reason cur_slot is extent_slot + + * num_to_del is because extent_slot points to the slot where the extent + * is, and if the backref was not right next to the extent we will be + * deleting at least 1 item, and will want to start searching at the + * slot directly next to extent_slot. However if we did find the + * backref next to the extent item them we will be deleting at least 2 + * items and will want to start searching directly after the ref slot + */ + if (!refs) { + struct list_head *pos, *n, *end; + int cur_slot = extent_slot+num_to_del; + u64 super_used; + u64 root_used; + + path->slots[0] = extent_slot; + bytes_freed = op->num_bytes; + + mutex_lock(&info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, op->bytenr, + op->num_bytes, op->level >= + BTRFS_FIRST_FREE_OBJECTID); + mutex_unlock(&info->pinned_mutex); + BUG_ON(ret < 0); + op->del = ret; + + /* + * we need to see if we can delete multiple things at once, so + * start looping through the list of extents we are wanting to + * delete and see if their extent/backref's are right next to + * eachother and the extents only have 1 ref + */ + for (pos = cur->next; pos != del_list; pos = pos->next) { + struct pending_extent_op *tmp; + + tmp = list_entry(pos, struct pending_extent_op, list); + + /* we only want to delete extent+ref at this stage */ + if (cur_slot >= btrfs_header_nritems(leaf) - 1) + break; + + btrfs_item_key_to_cpu(leaf, &found_key, cur_slot); + if (found_key.objectid != tmp->bytenr || + found_key.type != BTRFS_EXTENT_ITEM_KEY || + found_key.offset != tmp->num_bytes) + break; + + /* check to make sure this extent only has one ref */ + ei = btrfs_item_ptr(leaf, cur_slot, + struct btrfs_extent_item); + if (btrfs_extent_refs(leaf, ei) != 1) + break; + + btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1); + if (found_key.objectid != tmp->bytenr || + found_key.type != BTRFS_EXTENT_REF_KEY || + found_key.offset != tmp->orig_parent) + break; + + /* + * the ref is right next to the extent, we can set the + * ref count to 0 since we will delete them both now + */ + btrfs_set_extent_refs(leaf, ei, 0); + + /* pin down the bytes for this extent */ + mutex_lock(&info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, tmp->bytenr, + tmp->num_bytes, tmp->level >= + BTRFS_FIRST_FREE_OBJECTID); + mutex_unlock(&info->pinned_mutex); + BUG_ON(ret < 0); + + /* + * use the del field to tell if we need to go ahead and + * free up the extent when we delete the item or not. + */ + tmp->del = ret; + bytes_freed += tmp->num_bytes; + + num_to_del += 2; + cur_slot += 2; + } + end = pos; + + /* update the free space counters */ + spin_lock(&info->delalloc_lock); + super_used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, + super_used - bytes_freed); + + root_used = btrfs_root_used(&extent_root->root_item); + btrfs_set_root_used(&extent_root->root_item, + root_used - bytes_freed); + spin_unlock(&info->delalloc_lock); + + /* delete the items */ + ret = btrfs_del_items(trans, extent_root, path, + path->slots[0], num_to_del); + BUG_ON(ret); + + /* + * loop through the extents we deleted and do the cleanup work + * on them + */ + for (pos = cur, n = pos->next; pos != end; + pos = n, n = pos->next) { + struct pending_extent_op *tmp; + tmp = list_entry(pos, struct pending_extent_op, list); + + /* + * remember tmp->del tells us wether or not we pinned + * down the extent + */ + ret = update_block_group(trans, extent_root, + tmp->bytenr, tmp->num_bytes, 0, + tmp->del); + BUG_ON(ret); + + list_del_init(&tmp->list); + unlock_extent(&info->extent_ins, tmp->bytenr, + tmp->bytenr + tmp->num_bytes - 1, + GFP_NOFS); + kfree(tmp); + } + } else if (refs && found_extent) { + /* + * the ref and extent were right next to eachother, but the + * extent still has a ref, so just free the backref and keep + * going + */ + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + + list_del_init(&op->list); + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, GFP_NOFS); + kfree(op); + } else { + /* + * the extent has multiple refs and the backref we were looking + * for was not right next to it, so just unlock and go next, + * we're good to go + */ + list_del_init(&op->list); + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, GFP_NOFS); + kfree(op); + } + + btrfs_release_path(extent_root, path); + if (!list_empty(del_list)) + goto search; + +out: + btrfs_free_path(path); + return ret; +} + +static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 orig_parent, u64 parent, + u64 orig_root, u64 ref_root, + u64 orig_generation, u64 ref_generation, + u64 owner_objectid) +{ + int ret; + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_path *path; + + if (root == root->fs_info->extent_root) { + struct pending_extent_op *extent_op; + u64 num_bytes; + + BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL); + num_bytes = btrfs_level_size(root, (int)owner_objectid); + mutex_lock(&root->fs_info->extent_ins_mutex); + if (test_range_bit(&root->fs_info->extent_ins, bytenr, + bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { + u64 priv; + ret = get_state_private(&root->fs_info->extent_ins, + bytenr, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *) + (unsigned long)priv; + BUG_ON(extent_op->parent != orig_parent); + BUG_ON(extent_op->generation != orig_generation); + + extent_op->parent = parent; + extent_op->generation = ref_generation; + } else { + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + + extent_op->type = PENDING_BACKREF_UPDATE; + extent_op->bytenr = bytenr; + extent_op->num_bytes = num_bytes; + extent_op->parent = parent; + extent_op->orig_parent = orig_parent; + extent_op->generation = ref_generation; + extent_op->orig_generation = orig_generation; + extent_op->level = (int)owner_objectid; + INIT_LIST_HEAD(&extent_op->list); + extent_op->del = 0; + + set_extent_bits(&root->fs_info->extent_ins, + bytenr, bytenr + num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + set_state_private(&root->fs_info->extent_ins, + bytenr, (unsigned long)extent_op); + } + mutex_unlock(&root->fs_info->extent_ins_mutex); + return 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = lookup_extent_backref(trans, extent_root, path, + bytenr, orig_parent, orig_root, + orig_generation, owner_objectid, 1); + if (ret) + goto out; + ret = remove_extent_backref(trans, extent_root, path); + if (ret) + goto out; + ret = insert_extent_backref(trans, extent_root, path, bytenr, + parent, ref_root, ref_generation, + owner_objectid); + BUG_ON(ret); + finish_current_insert(trans, extent_root, 0); + del_pending_extents(trans, extent_root, 0); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 orig_parent, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid) +{ + int ret; + if (ref_root == BTRFS_TREE_LOG_OBJECTID && + owner_objectid < BTRFS_FIRST_FREE_OBJECTID) + return 0; + ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent, + parent, ref_root, ref_root, + ref_generation, ref_generation, + owner_objectid); + return ret; +} + +static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 orig_parent, u64 parent, + u64 orig_root, u64 ref_root, + u64 orig_generation, u64 ref_generation, + u64 owner_objectid) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_extent_item *item; + u32 refs; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, + 0, 1); + if (ret < 0) + return ret; + BUG_ON(ret == 0 || path->slots[0] == 0); + + path->slots[0]--; + l = path->nodes[0]; + + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + if (key.objectid != bytenr) { + btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]); + printk(KERN_ERR "btrfs wanted %llu found %llu\n", + (unsigned long long)bytenr, + (unsigned long long)key.objectid); + BUG(); + } + BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); + + item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(l, item); + btrfs_set_extent_refs(l, item, refs + 1); + btrfs_mark_buffer_dirty(path->nodes[0]); + + btrfs_release_path(root->fs_info->extent_root, path); + + path->reada = 1; + ret = insert_extent_backref(trans, root->fs_info->extent_root, + path, bytenr, parent, + ref_root, ref_generation, + owner_objectid); + BUG_ON(ret); + finish_current_insert(trans, root->fs_info->extent_root, 0); + del_pending_extents(trans, root->fs_info->extent_root, 0); + + btrfs_free_path(path); + return 0; +} + +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid) +{ + int ret; + if (ref_root == BTRFS_TREE_LOG_OBJECTID && + owner_objectid < BTRFS_FIRST_FREE_OBJECTID) + return 0; + ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent, + 0, ref_root, 0, ref_generation, + owner_objectid); + return ret; +} + +int btrfs_extent_post_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + finish_current_insert(trans, root->fs_info->extent_root, 1); + del_pending_extents(trans, root->fs_info->extent_root, 1); + return 0; +} + +int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u32 *refs) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_extent_item *item; + + WARN_ON(num_bytes < root->sectorsize); + path = btrfs_alloc_path(); + path->reada = 1; + key.objectid = bytenr; + key.offset = num_bytes; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, + 0, 0); + if (ret < 0) + goto out; + if (ret != 0) { + btrfs_print_leaf(root, path->nodes[0]); + printk(KERN_INFO "btrfs failed to find block number %llu\n", + (unsigned long long)bytenr); + BUG(); + } + l = path->nodes[0]; + item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); + *refs = btrfs_extent_refs(l, item); +out: + btrfs_free_path(path); + return 0; +} + +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 bytenr) +{ + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_extent_ref *ref_item; + struct btrfs_key key; + struct btrfs_key found_key; + u64 ref_root; + u64 last_snapshot; + u32 nritems; + int ret; + + key.objectid = bytenr; + key.offset = (u64)-1; + key.type = BTRFS_EXTENT_ITEM_KEY; + + path = btrfs_alloc_path(); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + ret = -ENOENT; + if (path->slots[0] == 0) + goto out; + + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != bytenr || + found_key.type != BTRFS_EXTENT_ITEM_KEY) + goto out; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret == 0) + continue; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != bytenr) + break; + + if (found_key.type != BTRFS_EXTENT_REF_KEY) { + path->slots[0]++; + continue; + } + + ref_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + ref_root = btrfs_ref_root(leaf, ref_item); + if ((ref_root != root->root_key.objectid && + ref_root != BTRFS_TREE_LOG_OBJECTID) || + objectid != btrfs_ref_objectid(leaf, ref_item)) { + ret = 1; + goto out; + } + if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) { + ret = 1; + goto out; + } + + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, u32 nr_extents) +{ + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + u64 root_gen; + u32 nritems; + int i; + int level; + int ret = 0; + int shared = 0; + + if (!root->ref_cows) + return 0; + + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + shared = 0; + root_gen = root->root_key.offset; + } else { + shared = 1; + root_gen = trans->transid - 1; + } + + level = btrfs_header_level(buf); + nritems = btrfs_header_nritems(buf); + + if (level == 0) { + struct btrfs_leaf_ref *ref; + struct btrfs_extent_info *info; + + ref = btrfs_alloc_leaf_ref(root, nr_extents); + if (!ref) { + ret = -ENOMEM; + goto out; + } + + ref->root_gen = root_gen; + ref->bytenr = buf->start; + ref->owner = btrfs_header_owner(buf); + ref->generation = btrfs_header_generation(buf); + ref->nritems = nr_extents; + info = ref->extents; + + for (i = 0; nr_extents > 0 && i < nritems; i++) { + u64 disk_bytenr; + btrfs_item_key_to_cpu(buf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (disk_bytenr == 0) + continue; + + info->bytenr = disk_bytenr; + info->num_bytes = + btrfs_file_extent_disk_num_bytes(buf, fi); + info->objectid = key.objectid; + info->offset = key.offset; + info++; + } + + ret = btrfs_add_leaf_ref(root, ref, shared); + if (ret == -EEXIST && shared) { + struct btrfs_leaf_ref *old; + old = btrfs_lookup_leaf_ref(root, ref->bytenr); + BUG_ON(!old); + btrfs_remove_leaf_ref(root, old); + btrfs_free_leaf_ref(root, old); + ret = btrfs_add_leaf_ref(root, ref, shared); + } + WARN_ON(ret); + btrfs_free_leaf_ref(root, ref); + } +out: + return ret; +} + +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *orig_buf, struct extent_buffer *buf, + u32 *nr_extents) +{ + u64 bytenr; + u64 ref_root; + u64 orig_root; + u64 ref_generation; + u64 orig_generation; + u32 nritems; + u32 nr_file_extents = 0; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int level; + int ret = 0; + int faili = 0; + int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, + u64, u64, u64, u64, u64, u64, u64, u64); + + ref_root = btrfs_header_owner(buf); + ref_generation = btrfs_header_generation(buf); + orig_root = btrfs_header_owner(orig_buf); + orig_generation = btrfs_header_generation(orig_buf); + + nritems = btrfs_header_nritems(buf); + level = btrfs_header_level(buf); + + if (root->ref_cows) { + process_func = __btrfs_inc_extent_ref; + } else { + if (level == 0 && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + goto out; + if (level != 0 && + root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) + goto out; + process_func = __btrfs_update_extent_ref; + } + + for (i = 0; i < nritems; i++) { + cond_resched(); + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; + + nr_file_extents++; + + ret = process_func(trans, root, bytenr, + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + key.objectid); + + if (ret) { + faili = i; + WARN_ON(1); + goto fail; + } + } else { + bytenr = btrfs_node_blockptr(buf, i); + ret = process_func(trans, root, bytenr, + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + level - 1); + if (ret) { + faili = i; + WARN_ON(1); + goto fail; + } + } + } +out: + if (nr_extents) { + if (level == 0) + *nr_extents = nr_file_extents; + else + *nr_extents = nritems; + } + return 0; +fail: + WARN_ON(1); + return ret; +} + +int btrfs_update_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *orig_buf, + struct extent_buffer *buf, int start_slot, int nr) + +{ + u64 bytenr; + u64 ref_root; + u64 orig_root; + u64 ref_generation; + u64 orig_generation; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int ret; + int slot; + int level; + + BUG_ON(start_slot < 0); + BUG_ON(start_slot + nr > btrfs_header_nritems(buf)); + + ref_root = btrfs_header_owner(buf); + ref_generation = btrfs_header_generation(buf); + orig_root = btrfs_header_owner(orig_buf); + orig_generation = btrfs_header_generation(orig_buf); + level = btrfs_header_level(buf); + + if (!root->ref_cows) { + if (level == 0 && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + return 0; + if (level != 0 && + root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) + return 0; + } + + for (i = 0, slot = start_slot; i < nr; i++, slot++) { + cond_resched(); + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, slot); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, slot, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; + ret = __btrfs_update_extent_ref(trans, root, bytenr, + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + key.objectid); + if (ret) + goto fail; + } else { + bytenr = btrfs_node_blockptr(buf, slot); + ret = __btrfs_update_extent_ref(trans, root, bytenr, + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + level - 1); + if (ret) + goto fail; + } + } + return 0; +fail: + WARN_ON(1); + return -1; +} + +static int write_one_cache_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_block_group_cache *cache) +{ + int ret; + int pending_ret; + struct btrfs_root *extent_root = root->fs_info->extent_root; + unsigned long bi; + struct extent_buffer *leaf; + + ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); + if (ret < 0) + goto fail; + BUG_ON(ret); + + leaf = path->nodes[0]; + bi = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(extent_root, path); +fail: + finish_current_insert(trans, extent_root, 0); + pending_ret = del_pending_extents(trans, extent_root, 0); + if (ret) + return ret; + if (pending_ret) + return pending_ret; + return 0; + +} + +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache, *entry; + struct rb_node *n; + int err = 0; + int werr = 0; + struct btrfs_path *path; + u64 last = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + cache = NULL; + spin_lock(&root->fs_info->block_group_cache_lock); + for (n = rb_first(&root->fs_info->block_group_cache_tree); + n; n = rb_next(n)) { + entry = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + if (entry->dirty) { + cache = entry; + break; + } + } + spin_unlock(&root->fs_info->block_group_cache_lock); + + if (!cache) + break; + + cache->dirty = 0; + last += cache->key.offset; + + err = write_one_cache_group(trans, root, + path, cache); + /* + * if we fail to write the cache group, we want + * to keep it marked dirty in hopes that a later + * write will work + */ + if (err) { + werr = err; + continue; + } + } + btrfs_free_path(path); + return werr; +} + +int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) +{ + struct btrfs_block_group_cache *block_group; + int readonly = 0; + + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); + if (!block_group || block_group->ro) + readonly = 1; + if (block_group) + put_block_group(block_group); + return readonly; +} + +static int update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + struct btrfs_space_info **space_info) +{ + struct btrfs_space_info *found; + + found = __find_space_info(info, flags); + if (found) { + spin_lock(&found->lock); + found->total_bytes += total_bytes; + found->bytes_used += bytes_used; + found->full = 0; + spin_unlock(&found->lock); + *space_info = found; + return 0; + } + found = kzalloc(sizeof(*found), GFP_NOFS); + if (!found) + return -ENOMEM; + + list_add(&found->list, &info->space_info); + INIT_LIST_HEAD(&found->block_groups); + init_rwsem(&found->groups_sem); + spin_lock_init(&found->lock); + found->flags = flags; + found->total_bytes = total_bytes; + found->bytes_used = bytes_used; + found->bytes_pinned = 0; + found->bytes_reserved = 0; + found->bytes_readonly = 0; + found->full = 0; + found->force_alloc = 0; + *space_info = found; + return 0; +} + +static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP); + if (extra_flags) { + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; + } +} + +static void set_block_group_readonly(struct btrfs_block_group_cache *cache) +{ + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + if (!cache->ro) { + cache->space_info->bytes_readonly += cache->key.offset - + btrfs_block_group_used(&cache->item); + cache->ro = 1; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); +} + +u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) +{ + u64 num_devices = root->fs_info->fs_devices->rw_devices; + + if (num_devices == 1) + flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); + if (num_devices < 4) + flags &= ~BTRFS_BLOCK_GROUP_RAID10; + + if ((flags & BTRFS_BLOCK_GROUP_DUP) && + (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10))) { + flags &= ~BTRFS_BLOCK_GROUP_DUP; + } + + if ((flags & BTRFS_BLOCK_GROUP_RAID1) && + (flags & BTRFS_BLOCK_GROUP_RAID10)) { + flags &= ~BTRFS_BLOCK_GROUP_RAID1; + } + + if ((flags & BTRFS_BLOCK_GROUP_RAID0) && + ((flags & BTRFS_BLOCK_GROUP_RAID1) | + (flags & BTRFS_BLOCK_GROUP_RAID10) | + (flags & BTRFS_BLOCK_GROUP_DUP))) + flags &= ~BTRFS_BLOCK_GROUP_RAID0; + return flags; +} + +static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force) +{ + struct btrfs_space_info *space_info; + u64 thresh; + int ret = 0; + + mutex_lock(&extent_root->fs_info->chunk_mutex); + + flags = btrfs_reduce_alloc_profile(extent_root, flags); + + space_info = __find_space_info(extent_root->fs_info, flags); + if (!space_info) { + ret = update_space_info(extent_root->fs_info, flags, + 0, 0, &space_info); + BUG_ON(ret); + } + BUG_ON(!space_info); + + spin_lock(&space_info->lock); + if (space_info->force_alloc) { + force = 1; + space_info->force_alloc = 0; + } + if (space_info->full) { + spin_unlock(&space_info->lock); + goto out; + } + + thresh = space_info->total_bytes - space_info->bytes_readonly; + thresh = div_factor(thresh, 6); + if (!force && + (space_info->bytes_used + space_info->bytes_pinned + + space_info->bytes_reserved + alloc_bytes) < thresh) { + spin_unlock(&space_info->lock); + goto out; + } + spin_unlock(&space_info->lock); + + ret = btrfs_alloc_chunk(trans, extent_root, flags); + if (ret) + space_info->full = 1; +out: + mutex_unlock(&extent_root->fs_info->chunk_mutex); + return ret; +} + +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc, + int mark_free) +{ + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *info = root->fs_info; + u64 total = num_bytes; + u64 old_val; + u64 byte_in_group; + + while (total) { + cache = btrfs_lookup_block_group(info, bytenr); + if (!cache) + return -1; + byte_in_group = bytenr - cache->key.objectid; + WARN_ON(byte_in_group > cache->key.offset); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->dirty = 1; + old_val = btrfs_block_group_used(&cache->item); + num_bytes = min(total, cache->key.offset - byte_in_group); + if (alloc) { + old_val += num_bytes; + cache->space_info->bytes_used += num_bytes; + if (cache->ro) + cache->space_info->bytes_readonly -= num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + } else { + old_val -= num_bytes; + cache->space_info->bytes_used -= num_bytes; + if (cache->ro) + cache->space_info->bytes_readonly += num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + if (mark_free) { + int ret; + + ret = btrfs_discard_extent(root, bytenr, + num_bytes); + WARN_ON(ret); + + ret = btrfs_add_free_space(cache, bytenr, + num_bytes); + WARN_ON(ret); + } + } + put_block_group(cache); + total -= num_bytes; + bytenr += num_bytes; + } + return 0; +} + +static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) +{ + struct btrfs_block_group_cache *cache; + u64 bytenr; + + cache = btrfs_lookup_first_block_group(root->fs_info, search_start); + if (!cache) + return 0; + + bytenr = cache->key.objectid; + put_block_group(cache); + + return bytenr; +} + +int btrfs_update_pinned_extents(struct btrfs_root *root, + u64 bytenr, u64 num, int pin) +{ + u64 len; + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *fs_info = root->fs_info; + + WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex)); + if (pin) { + set_extent_dirty(&fs_info->pinned_extents, + bytenr, bytenr + num - 1, GFP_NOFS); + } else { + clear_extent_dirty(&fs_info->pinned_extents, + bytenr, bytenr + num - 1, GFP_NOFS); + } + while (num > 0) { + cache = btrfs_lookup_block_group(fs_info, bytenr); + BUG_ON(!cache); + len = min(num, cache->key.offset - + (bytenr - cache->key.objectid)); + if (pin) { + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += len; + cache->space_info->bytes_pinned += len; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + fs_info->total_pinned += len; + } else { + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned -= len; + cache->space_info->bytes_pinned -= len; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + fs_info->total_pinned -= len; + if (cache->cached) + btrfs_add_free_space(cache, bytenr, len); + } + put_block_group(cache); + bytenr += len; + num -= len; + } + return 0; +} + +static int update_reserved_extents(struct btrfs_root *root, + u64 bytenr, u64 num, int reserve) +{ + u64 len; + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *fs_info = root->fs_info; + + while (num > 0) { + cache = btrfs_lookup_block_group(fs_info, bytenr); + BUG_ON(!cache); + len = min(num, cache->key.offset - + (bytenr - cache->key.objectid)); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + if (reserve) { + cache->reserved += len; + cache->space_info->bytes_reserved += len; + } else { + cache->reserved -= len; + cache->space_info->bytes_reserved -= len; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + put_block_group(cache); + bytenr += len; + num -= len; + } + return 0; +} + +int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) +{ + u64 last = 0; + u64 start; + u64 end; + struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; + int ret; + + mutex_lock(&root->fs_info->pinned_mutex); + while (1) { + ret = find_first_extent_bit(pinned_extents, last, + &start, &end, EXTENT_DIRTY); + if (ret) + break; + set_extent_dirty(copy, start, end, GFP_NOFS); + last = end + 1; + } + mutex_unlock(&root->fs_info->pinned_mutex); + return 0; +} + +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_io_tree *unpin) +{ + u64 start; + u64 end; + int ret; + + mutex_lock(&root->fs_info->pinned_mutex); + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + + ret = btrfs_discard_extent(root, start, end + 1 - start); + + btrfs_update_pinned_extents(root, start, end + 1 - start, 0); + clear_extent_dirty(unpin, start, end, GFP_NOFS); + + if (need_resched()) { + mutex_unlock(&root->fs_info->pinned_mutex); + cond_resched(); + mutex_lock(&root->fs_info->pinned_mutex); + } + } + mutex_unlock(&root->fs_info->pinned_mutex); + return ret; +} + +static int finish_current_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all) +{ + u64 start; + u64 end; + u64 priv; + u64 search = 0; + u64 skipped = 0; + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_path *path; + struct pending_extent_op *extent_op, *tmp; + struct list_head insert_list, update_list; + int ret; + int num_inserts = 0, max_inserts; + + path = btrfs_alloc_path(); + INIT_LIST_HEAD(&insert_list); + INIT_LIST_HEAD(&update_list); + + max_inserts = extent_root->leafsize / + (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) + + sizeof(struct btrfs_extent_ref) + + sizeof(struct btrfs_extent_item)); +again: + mutex_lock(&info->extent_ins_mutex); + while (1) { + ret = find_first_extent_bit(&info->extent_ins, search, &start, + &end, EXTENT_WRITEBACK); + if (ret) { + if (skipped && all && !num_inserts) { + skipped = 0; + search = 0; + continue; + } + mutex_unlock(&info->extent_ins_mutex); + break; + } + + ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); + if (!ret) { + skipped = 1; + search = end + 1; + if (need_resched()) { + mutex_unlock(&info->extent_ins_mutex); + cond_resched(); + mutex_lock(&info->extent_ins_mutex); + } + continue; + } + + ret = get_state_private(&info->extent_ins, start, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *)(unsigned long) priv; + + if (extent_op->type == PENDING_EXTENT_INSERT) { + num_inserts++; + list_add_tail(&extent_op->list, &insert_list); + search = end + 1; + if (num_inserts == max_inserts) { + mutex_unlock(&info->extent_ins_mutex); + break; + } + } else if (extent_op->type == PENDING_BACKREF_UPDATE) { + list_add_tail(&extent_op->list, &update_list); + search = end + 1; + } else { + BUG(); + } + } + + /* + * process the update list, clear the writeback bit for it, and if + * somebody marked this thing for deletion then just unlock it and be + * done, the free_extents will handle it + */ + mutex_lock(&info->extent_ins_mutex); + list_for_each_entry_safe(extent_op, tmp, &update_list, list) { + clear_extent_bits(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + if (extent_op->del) { + list_del_init(&extent_op->list); + unlock_extent(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes + - 1, GFP_NOFS); + kfree(extent_op); + } + } + mutex_unlock(&info->extent_ins_mutex); + + /* + * still have things left on the update list, go ahead an update + * everything + */ + if (!list_empty(&update_list)) { + ret = update_backrefs(trans, extent_root, path, &update_list); + BUG_ON(ret); + } + + /* + * if no inserts need to be done, but we skipped some extents and we + * need to make sure everything is cleaned then reset everything and + * go back to the beginning + */ + if (!num_inserts && all && skipped) { + search = 0; + skipped = 0; + INIT_LIST_HEAD(&update_list); + INIT_LIST_HEAD(&insert_list); + goto again; + } else if (!num_inserts) { + goto out; + } + + /* + * process the insert extents list. Again if we are deleting this + * extent, then just unlock it, pin down the bytes if need be, and be + * done with it. Saves us from having to actually insert the extent + * into the tree and then subsequently come along and delete it + */ + mutex_lock(&info->extent_ins_mutex); + list_for_each_entry_safe(extent_op, tmp, &insert_list, list) { + clear_extent_bits(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + if (extent_op->del) { + u64 used; + list_del_init(&extent_op->list); + unlock_extent(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes + - 1, GFP_NOFS); + + mutex_lock(&extent_root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, + extent_op->bytenr, + extent_op->num_bytes, 0); + mutex_unlock(&extent_root->fs_info->pinned_mutex); + + spin_lock(&info->delalloc_lock); + used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, + used - extent_op->num_bytes); + used = btrfs_root_used(&extent_root->root_item); + btrfs_set_root_used(&extent_root->root_item, + used - extent_op->num_bytes); + spin_unlock(&info->delalloc_lock); + + ret = update_block_group(trans, extent_root, + extent_op->bytenr, + extent_op->num_bytes, + 0, ret > 0); + BUG_ON(ret); + kfree(extent_op); + num_inserts--; + } + } + mutex_unlock(&info->extent_ins_mutex); + + ret = insert_extents(trans, extent_root, path, &insert_list, + num_inserts); + BUG_ON(ret); + + /* + * if we broke out of the loop in order to insert stuff because we hit + * the maximum number of inserts at a time we can handle, then loop + * back and pick up where we left off + */ + if (num_inserts == max_inserts) { + INIT_LIST_HEAD(&insert_list); + INIT_LIST_HEAD(&update_list); + num_inserts = 0; + goto again; + } + + /* + * again, if we need to make absolutely sure there are no more pending + * extent operations left and we know that we skipped some, go back to + * the beginning and do it all again + */ + if (all && skipped) { + INIT_LIST_HEAD(&insert_list); + INIT_LIST_HEAD(&update_list); + search = 0; + skipped = 0; + num_inserts = 0; + goto again; + } +out: + btrfs_free_path(path); + return 0; +} + +static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int is_data) +{ + int err = 0; + struct extent_buffer *buf; + + if (is_data) + goto pinit; + + buf = btrfs_find_tree_block(root, bytenr, num_bytes); + if (!buf) + goto pinit; + + /* we can reuse a block if it hasn't been written + * and it is from this transaction. We can't + * reuse anything from the tree log root because + * it has tiny sub-transactions. + */ + if (btrfs_buffer_uptodate(buf, 0) && + btrfs_try_tree_lock(buf)) { + u64 header_owner = btrfs_header_owner(buf); + u64 header_transid = btrfs_header_generation(buf); + if (header_owner != BTRFS_TREE_LOG_OBJECTID && + header_owner != BTRFS_TREE_RELOC_OBJECTID && + header_transid == trans->transid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + clean_tree_block(NULL, root, buf); + btrfs_tree_unlock(buf); + free_extent_buffer(buf); + return 1; + } + btrfs_tree_unlock(buf); + } + free_extent_buffer(buf); +pinit: + btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); + + BUG_ON(err < 0); + return 0; +} + +/* + * remove an extent from the root, returns 0 on success + */ +static int __free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin, int mark_free) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_root *extent_root = info->extent_root; + struct extent_buffer *leaf; + int ret; + int extent_slot = 0; + int found_extent = 0; + int num_to_del = 1; + struct btrfs_extent_item *ei; + u32 refs; + + key.objectid = bytenr; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + key.offset = num_bytes; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + ret = lookup_extent_backref(trans, extent_root, path, + bytenr, parent, root_objectid, + ref_generation, owner_objectid, 1); + if (ret == 0) { + struct btrfs_key found_key; + extent_slot = path->slots[0]; + while (extent_slot > 0) { + extent_slot--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + extent_slot); + if (found_key.objectid != bytenr) + break; + if (found_key.type == BTRFS_EXTENT_ITEM_KEY && + found_key.offset == num_bytes) { + found_extent = 1; + break; + } + if (path->slots[0] - extent_slot > 5) + break; + } + if (!found_extent) { + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + btrfs_release_path(extent_root, path); + ret = btrfs_search_slot(trans, extent_root, + &key, path, -1, 1); + if (ret) { + printk(KERN_ERR "umm, got %d back from search" + ", was looking for %llu\n", ret, + (unsigned long long)bytenr); + btrfs_print_leaf(extent_root, path->nodes[0]); + } + BUG_ON(ret); + extent_slot = path->slots[0]; + } + } else { + btrfs_print_leaf(extent_root, path->nodes[0]); + WARN_ON(1); + printk(KERN_ERR "btrfs unable to find ref byte nr %llu " + "root %llu gen %llu owner %llu\n", + (unsigned long long)bytenr, + (unsigned long long)root_objectid, + (unsigned long long)ref_generation, + (unsigned long long)owner_objectid); + } + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, extent_slot, + struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + BUG_ON(refs == 0); + refs -= 1; + btrfs_set_extent_refs(leaf, ei, refs); + + btrfs_mark_buffer_dirty(leaf); + + if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) { + struct btrfs_extent_ref *ref; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1); + /* if the back ref and the extent are next to each other + * they get deleted below in one shot + */ + path->slots[0] = extent_slot; + num_to_del = 2; + } else if (found_extent) { + /* otherwise delete the extent back ref */ + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + /* if refs are 0, we need to setup the path for deletion */ + if (refs == 0) { + btrfs_release_path(extent_root, path); + ret = btrfs_search_slot(trans, extent_root, &key, path, + -1, 1); + BUG_ON(ret); + } + } + + if (refs == 0) { + u64 super_used; + u64 root_used; + + if (pin) { + mutex_lock(&root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, root, bytenr, num_bytes, + owner_objectid >= BTRFS_FIRST_FREE_OBJECTID); + mutex_unlock(&root->fs_info->pinned_mutex); + if (ret > 0) + mark_free = 1; + BUG_ON(ret < 0); + } + /* block accounting for super block */ + spin_lock(&info->delalloc_lock); + super_used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, + super_used - num_bytes); + + /* block accounting for root item */ + root_used = btrfs_root_used(&root->root_item); + btrfs_set_root_used(&root->root_item, + root_used - num_bytes); + spin_unlock(&info->delalloc_lock); + ret = btrfs_del_items(trans, extent_root, path, path->slots[0], + num_to_del); + BUG_ON(ret); + btrfs_release_path(extent_root, path); + + if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_del_csums(trans, root, bytenr, num_bytes); + BUG_ON(ret); + } + + ret = update_block_group(trans, root, bytenr, num_bytes, 0, + mark_free); + BUG_ON(ret); + } + btrfs_free_path(path); + finish_current_insert(trans, extent_root, 0); + return ret; +} + +/* + * find all the blocks marked as pending in the radix tree and remove + * them from the extent map + */ +static int del_pending_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all) +{ + int ret; + int err = 0; + u64 start; + u64 end; + u64 priv; + u64 search = 0; + int nr = 0, skipped = 0; + struct extent_io_tree *pending_del; + struct extent_io_tree *extent_ins; + struct pending_extent_op *extent_op; + struct btrfs_fs_info *info = extent_root->fs_info; + struct list_head delete_list; + + INIT_LIST_HEAD(&delete_list); + extent_ins = &extent_root->fs_info->extent_ins; + pending_del = &extent_root->fs_info->pending_del; + +again: + mutex_lock(&info->extent_ins_mutex); + while (1) { + ret = find_first_extent_bit(pending_del, search, &start, &end, + EXTENT_WRITEBACK); + if (ret) { + if (all && skipped && !nr) { + search = 0; + continue; + } + mutex_unlock(&info->extent_ins_mutex); + break; + } + + ret = try_lock_extent(extent_ins, start, end, GFP_NOFS); + if (!ret) { + search = end+1; + skipped = 1; + + if (need_resched()) { + mutex_unlock(&info->extent_ins_mutex); + cond_resched(); + mutex_lock(&info->extent_ins_mutex); + } + + continue; + } + BUG_ON(ret < 0); + + ret = get_state_private(pending_del, start, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *)(unsigned long)priv; + + clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK, + GFP_NOFS); + if (!test_range_bit(extent_ins, start, end, + EXTENT_WRITEBACK, 0)) { + list_add_tail(&extent_op->list, &delete_list); + nr++; + } else { + kfree(extent_op); + + ret = get_state_private(&info->extent_ins, start, + &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *) + (unsigned long)priv; + + clear_extent_bits(&info->extent_ins, start, end, + EXTENT_WRITEBACK, GFP_NOFS); + + if (extent_op->type == PENDING_BACKREF_UPDATE) { + list_add_tail(&extent_op->list, &delete_list); + search = end + 1; + nr++; + continue; + } + + mutex_lock(&extent_root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, start, + end + 1 - start, 0); + mutex_unlock(&extent_root->fs_info->pinned_mutex); + + ret = update_block_group(trans, extent_root, start, + end + 1 - start, 0, ret > 0); + + unlock_extent(extent_ins, start, end, GFP_NOFS); + BUG_ON(ret); + kfree(extent_op); + } + if (ret) + err = ret; + + search = end + 1; + + if (need_resched()) { + mutex_unlock(&info->extent_ins_mutex); + cond_resched(); + mutex_lock(&info->extent_ins_mutex); + } + } + + if (nr) { + ret = free_extents(trans, extent_root, &delete_list); + BUG_ON(ret); + } + + if (all && skipped) { + INIT_LIST_HEAD(&delete_list); + search = 0; + nr = 0; + goto again; + } + + return err; +} + +/* + * remove an extent from the root, returns 0 on success + */ +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin) +{ + struct btrfs_root *extent_root = root->fs_info->extent_root; + int pending_ret; + int ret; + + WARN_ON(num_bytes < root->sectorsize); + if (root == extent_root) { + struct pending_extent_op *extent_op = NULL; + + mutex_lock(&root->fs_info->extent_ins_mutex); + if (test_range_bit(&root->fs_info->extent_ins, bytenr, + bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { + u64 priv; + ret = get_state_private(&root->fs_info->extent_ins, + bytenr, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *) + (unsigned long)priv; + + extent_op->del = 1; + if (extent_op->type == PENDING_EXTENT_INSERT) { + mutex_unlock(&root->fs_info->extent_ins_mutex); + return 0; + } + } + + if (extent_op) { + ref_generation = extent_op->orig_generation; + parent = extent_op->orig_parent; + } + + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + + extent_op->type = PENDING_EXTENT_DELETE; + extent_op->bytenr = bytenr; + extent_op->num_bytes = num_bytes; + extent_op->parent = parent; + extent_op->orig_parent = parent; + extent_op->generation = ref_generation; + extent_op->orig_generation = ref_generation; + extent_op->level = (int)owner_objectid; + INIT_LIST_HEAD(&extent_op->list); + extent_op->del = 0; + + set_extent_bits(&root->fs_info->pending_del, + bytenr, bytenr + num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + set_state_private(&root->fs_info->pending_del, + bytenr, (unsigned long)extent_op); + mutex_unlock(&root->fs_info->extent_ins_mutex); + return 0; + } + /* if metadata always pin */ + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_block_group_cache *cache; + + /* btrfs_free_reserved_extent */ + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); + btrfs_add_free_space(cache, bytenr, num_bytes); + put_block_group(cache); + update_reserved_extents(root, bytenr, num_bytes, 0); + return 0; + } + pin = 1; + } + + /* if data pin when any transaction has committed this */ + if (ref_generation != trans->transid) + pin = 1; + + ret = __free_extent(trans, root, bytenr, num_bytes, parent, + root_objectid, ref_generation, + owner_objectid, pin, pin == 0); + + finish_current_insert(trans, root->fs_info->extent_root, 0); + pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0); + return ret ? ret : pending_ret; +} + +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin) +{ + int ret; + + ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent, + root_objectid, ref_generation, + owner_objectid, pin); + return ret; +} + +static u64 stripe_align(struct btrfs_root *root, u64 val) +{ + u64 mask = ((u64)root->stripesize - 1); + u64 ret = (val + mask) & ~mask; + return ret; +} + +/* + * walks the btree of allocated extents and find a hole of a given size. + * The key ins is changed to record the hole: + * ins->objectid == block start + * ins->flags = BTRFS_EXTENT_ITEM_KEY + * ins->offset == number of blocks + * Any available blocks before search_start are skipped. + */ +static noinline int find_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *orig_root, + u64 num_bytes, u64 empty_size, + u64 search_start, u64 search_end, + u64 hint_byte, struct btrfs_key *ins, + u64 exclude_start, u64 exclude_nr, + int data) +{ + int ret = 0; + struct btrfs_root *root = orig_root->fs_info->extent_root; + u64 total_needed = num_bytes; + u64 *last_ptr = NULL; + u64 last_wanted = 0; + struct btrfs_block_group_cache *block_group = NULL; + int chunk_alloc_done = 0; + int empty_cluster = 2 * 1024 * 1024; + int allowed_chunk_alloc = 0; + struct list_head *head = NULL, *cur = NULL; + int loop = 0; + int extra_loop = 0; + struct btrfs_space_info *space_info; + + WARN_ON(num_bytes < root->sectorsize); + btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); + ins->objectid = 0; + ins->offset = 0; + + if (orig_root->ref_cows || empty_size) + allowed_chunk_alloc = 1; + + if (data & BTRFS_BLOCK_GROUP_METADATA) { + last_ptr = &root->fs_info->last_alloc; + empty_cluster = 64 * 1024; + } + + if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) + last_ptr = &root->fs_info->last_data_alloc; + + if (last_ptr) { + if (*last_ptr) { + hint_byte = *last_ptr; + last_wanted = *last_ptr; + } else + empty_size += empty_cluster; + } else { + empty_cluster = 0; + } + search_start = max(search_start, first_logical_byte(root, 0)); + search_start = max(search_start, hint_byte); + + if (last_wanted && search_start != last_wanted) { + last_wanted = 0; + empty_size += empty_cluster; + } + + total_needed += empty_size; + block_group = btrfs_lookup_block_group(root->fs_info, search_start); + if (!block_group) + block_group = btrfs_lookup_first_block_group(root->fs_info, + search_start); + space_info = __find_space_info(root->fs_info, data); + + down_read(&space_info->groups_sem); + while (1) { + struct btrfs_free_space *free_space; + /* + * the only way this happens if our hint points to a block + * group thats not of the proper type, while looping this + * should never happen + */ + if (empty_size) + extra_loop = 1; + + if (!block_group) + goto new_group_no_lock; + + if (unlikely(!block_group->cached)) { + mutex_lock(&block_group->cache_mutex); + ret = cache_block_group(root, block_group); + mutex_unlock(&block_group->cache_mutex); + if (ret) + break; + } + + mutex_lock(&block_group->alloc_mutex); + if (unlikely(!block_group_bits(block_group, data))) + goto new_group; + + if (unlikely(block_group->ro)) + goto new_group; + + free_space = btrfs_find_free_space(block_group, search_start, + total_needed); + if (free_space) { + u64 start = block_group->key.objectid; + u64 end = block_group->key.objectid + + block_group->key.offset; + + search_start = stripe_align(root, free_space->offset); + + /* move on to the next group */ + if (search_start + num_bytes >= search_end) + goto new_group; + + /* move on to the next group */ + if (search_start + num_bytes > end) + goto new_group; + + if (last_wanted && search_start != last_wanted) { + total_needed += empty_cluster; + empty_size += empty_cluster; + last_wanted = 0; + /* + * if search_start is still in this block group + * then we just re-search this block group + */ + if (search_start >= start && + search_start < end) { + mutex_unlock(&block_group->alloc_mutex); + continue; + } + + /* else we go to the next block group */ + goto new_group; + } + + if (exclude_nr > 0 && + (search_start + num_bytes > exclude_start && + search_start < exclude_start + exclude_nr)) { + search_start = exclude_start + exclude_nr; + /* + * if search_start is still in this block group + * then we just re-search this block group + */ + if (search_start >= start && + search_start < end) { + mutex_unlock(&block_group->alloc_mutex); + last_wanted = 0; + continue; + } + + /* else we go to the next block group */ + goto new_group; + } + + ins->objectid = search_start; + ins->offset = num_bytes; + + btrfs_remove_free_space_lock(block_group, search_start, + num_bytes); + /* we are all good, lets return */ + mutex_unlock(&block_group->alloc_mutex); + break; + } +new_group: + mutex_unlock(&block_group->alloc_mutex); + put_block_group(block_group); + block_group = NULL; +new_group_no_lock: + /* don't try to compare new allocations against the + * last allocation any more + */ + last_wanted = 0; + + /* + * Here's how this works. + * loop == 0: we were searching a block group via a hint + * and didn't find anything, so we start at + * the head of the block groups and keep searching + * loop == 1: we're searching through all of the block groups + * if we hit the head again we have searched + * all of the block groups for this space and we + * need to try and allocate, if we cant error out. + * loop == 2: we allocated more space and are looping through + * all of the block groups again. + */ + if (loop == 0) { + head = &space_info->block_groups; + cur = head->next; + loop++; + } else if (loop == 1 && cur == head) { + int keep_going; + + /* at this point we give up on the empty_size + * allocations and just try to allocate the min + * space. + * + * The extra_loop field was set if an empty_size + * allocation was attempted above, and if this + * is try we need to try the loop again without + * the additional empty_size. + */ + total_needed -= empty_size; + empty_size = 0; + keep_going = extra_loop; + loop++; + + if (allowed_chunk_alloc && !chunk_alloc_done) { + up_read(&space_info->groups_sem); + ret = do_chunk_alloc(trans, root, num_bytes + + 2 * 1024 * 1024, data, 1); + down_read(&space_info->groups_sem); + if (ret < 0) + goto loop_check; + head = &space_info->block_groups; + /* + * we've allocated a new chunk, keep + * trying + */ + keep_going = 1; + chunk_alloc_done = 1; + } else if (!allowed_chunk_alloc) { + space_info->force_alloc = 1; + } +loop_check: + if (keep_going) { + cur = head->next; + extra_loop = 0; + } else { + break; + } + } else if (cur == head) { + break; + } + + block_group = list_entry(cur, struct btrfs_block_group_cache, + list); + atomic_inc(&block_group->count); + + search_start = block_group->key.objectid; + cur = cur->next; + } + + /* we found what we needed */ + if (ins->objectid) { + if (!(data & BTRFS_BLOCK_GROUP_DATA)) + trans->block_group = block_group->key.objectid; + + if (last_ptr) + *last_ptr = ins->objectid + ins->offset; + ret = 0; + } else if (!ret) { + printk(KERN_ERR "btrfs searching for %llu bytes, " + "num_bytes %llu, loop %d, allowed_alloc %d\n", + (unsigned long long)total_needed, + (unsigned long long)num_bytes, + loop, allowed_chunk_alloc); + ret = -ENOSPC; + } + if (block_group) + put_block_group(block_group); + + up_read(&space_info->groups_sem); + return ret; +} + +static void dump_space_info(struct btrfs_space_info *info, u64 bytes) +{ + struct btrfs_block_group_cache *cache; + struct list_head *l; + + printk(KERN_INFO "space_info has %llu free, is %sfull\n", + (unsigned long long)(info->total_bytes - info->bytes_used - + info->bytes_pinned - info->bytes_reserved), + (info->full) ? "" : "not "); + + down_read(&info->groups_sem); + list_for_each(l, &info->block_groups) { + cache = list_entry(l, struct btrfs_block_group_cache, list); + spin_lock(&cache->lock); + printk(KERN_INFO "block group %llu has %llu bytes, %llu used " + "%llu pinned %llu reserved\n", + (unsigned long long)cache->key.objectid, + (unsigned long long)cache->key.offset, + (unsigned long long)btrfs_block_group_used(&cache->item), + (unsigned long long)cache->pinned, + (unsigned long long)cache->reserved); + btrfs_dump_free_space(cache, bytes); + spin_unlock(&cache->lock); + } + up_read(&info->groups_sem); +} + +static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data) +{ + int ret; + u64 search_start = 0; + u64 alloc_profile; + struct btrfs_fs_info *info = root->fs_info; + + if (data) { + alloc_profile = info->avail_data_alloc_bits & + info->data_alloc_profile; + data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; + } else if (root == root->fs_info->chunk_root) { + alloc_profile = info->avail_system_alloc_bits & + info->system_alloc_profile; + data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; + } else { + alloc_profile = info->avail_metadata_alloc_bits & + info->metadata_alloc_profile; + data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; + } +again: + data = btrfs_reduce_alloc_profile(root, data); + /* + * the only place that sets empty_size is btrfs_realloc_node, which + * is not called recursively on allocations + */ + if (empty_size || root->ref_cows) { + if (!(data & BTRFS_BLOCK_GROUP_METADATA)) { + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, + BTRFS_BLOCK_GROUP_METADATA | + (info->metadata_alloc_profile & + info->avail_metadata_alloc_bits), 0); + } + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes + 2 * 1024 * 1024, data, 0); + } + + WARN_ON(num_bytes < root->sectorsize); + ret = find_free_extent(trans, root, num_bytes, empty_size, + search_start, search_end, hint_byte, ins, + trans->alloc_exclude_start, + trans->alloc_exclude_nr, data); + + if (ret == -ENOSPC && num_bytes > min_alloc_size) { + num_bytes = num_bytes >> 1; + num_bytes = num_bytes & ~(root->sectorsize - 1); + num_bytes = max(num_bytes, min_alloc_size); + do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes, data, 1); + goto again; + } + if (ret) { + struct btrfs_space_info *sinfo; + + sinfo = __find_space_info(root->fs_info, data); + printk(KERN_ERR "btrfs allocation failed flags %llu, " + "wanted %llu\n", (unsigned long long)data, + (unsigned long long)num_bytes); + dump_space_info(sinfo, num_bytes); + BUG(); + } + + return ret; +} + +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) +{ + struct btrfs_block_group_cache *cache; + int ret = 0; + + cache = btrfs_lookup_block_group(root->fs_info, start); + if (!cache) { + printk(KERN_ERR "Unable to find block group for %llu\n", + (unsigned long long)start); + return -ENOSPC; + } + + ret = btrfs_discard_extent(root, start, len); + + btrfs_add_free_space(cache, start, len); + put_block_group(cache); + update_reserved_extents(root, start, len, 0); + + return ret; +} + +int btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data) +{ + int ret; + ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, + empty_size, hint_byte, search_end, ins, + data); + update_reserved_extents(root, ins->objectid, ins->offset, 1); + return ret; +} + +static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins) +{ + int ret; + int pending_ret; + u64 super_used; + u64 root_used; + u64 num_bytes = ins->offset; + u32 sizes[2]; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_root *extent_root = info->extent_root; + struct btrfs_extent_item *extent_item; + struct btrfs_extent_ref *ref; + struct btrfs_path *path; + struct btrfs_key keys[2]; + + if (parent == 0) + parent = ins->objectid; + + /* block accounting for super block */ + spin_lock(&info->delalloc_lock); + super_used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes); + + /* block accounting for root item */ + root_used = btrfs_root_used(&root->root_item); + btrfs_set_root_used(&root->root_item, root_used + num_bytes); + spin_unlock(&info->delalloc_lock); + + if (root == extent_root) { + struct pending_extent_op *extent_op; + + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + + extent_op->type = PENDING_EXTENT_INSERT; + extent_op->bytenr = ins->objectid; + extent_op->num_bytes = ins->offset; + extent_op->parent = parent; + extent_op->orig_parent = 0; + extent_op->generation = ref_generation; + extent_op->orig_generation = 0; + extent_op->level = (int)owner; + INIT_LIST_HEAD(&extent_op->list); + extent_op->del = 0; + + mutex_lock(&root->fs_info->extent_ins_mutex); + set_extent_bits(&root->fs_info->extent_ins, ins->objectid, + ins->objectid + ins->offset - 1, + EXTENT_WRITEBACK, GFP_NOFS); + set_state_private(&root->fs_info->extent_ins, + ins->objectid, (unsigned long)extent_op); + mutex_unlock(&root->fs_info->extent_ins_mutex); + goto update_block; + } + + memcpy(&keys[0], ins, sizeof(*ins)); + keys[1].objectid = ins->objectid; + keys[1].type = BTRFS_EXTENT_REF_KEY; + keys[1].offset = parent; + sizes[0] = sizeof(*extent_item); + sizes[1] = sizeof(*ref); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + ret = btrfs_insert_empty_items(trans, extent_root, path, keys, + sizes, 2); + BUG_ON(ret); + + extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(path->nodes[0], extent_item, 1); + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_extent_ref); + + btrfs_set_ref_root(path->nodes[0], ref, root_objectid); + btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); + btrfs_set_ref_objectid(path->nodes[0], ref, owner); + btrfs_set_ref_num_refs(path->nodes[0], ref, 1); + + btrfs_mark_buffer_dirty(path->nodes[0]); + + trans->alloc_exclude_start = 0; + trans->alloc_exclude_nr = 0; + btrfs_free_path(path); + finish_current_insert(trans, extent_root, 0); + pending_ret = del_pending_extents(trans, extent_root, 0); + + if (ret) + goto out; + if (pending_ret) { + ret = pending_ret; + goto out; + } + +update_block: + ret = update_block_group(trans, root, ins->objectid, + ins->offset, 1, 0); + if (ret) { + printk(KERN_ERR "btrfs update block group failed for %llu " + "%llu\n", (unsigned long long)ins->objectid, + (unsigned long long)ins->offset); + BUG(); + } +out: + return ret; +} + +int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins) +{ + int ret; + + if (root_objectid == BTRFS_TREE_LOG_OBJECTID) + return 0; + ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, + ref_generation, owner, ins); + update_reserved_extents(root, ins->objectid, ins->offset, 0); + return ret; +} + +/* + * this is used by the tree logging recovery code. It records that + * an extent has been allocated and makes sure to clear the free + * space cache bits as well + */ +int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins) +{ + int ret; + struct btrfs_block_group_cache *block_group; + + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + mutex_lock(&block_group->cache_mutex); + cache_block_group(root, block_group); + mutex_unlock(&block_group->cache_mutex); + + ret = btrfs_remove_free_space(block_group, ins->objectid, + ins->offset); + BUG_ON(ret); + put_block_group(block_group); + ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, + ref_generation, owner, ins); + return ret; +} + +/* + * finds a free extent and does all the dirty work required for allocation + * returns the key for the extent through ins, and a tree buffer for + * the first block of the extent through buf. + * + * returns 0 if everything worked, non-zero otherwise. + */ +int btrfs_alloc_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 parent, u64 min_alloc_size, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, u64 data) +{ + int ret; + + ret = __btrfs_reserve_extent(trans, root, num_bytes, + min_alloc_size, empty_size, hint_byte, + search_end, ins, data); + BUG_ON(ret); + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = __btrfs_alloc_reserved_extent(trans, root, parent, + root_objectid, ref_generation, + owner_objectid, ins); + BUG_ON(ret); + + } else { + update_reserved_extents(root, ins->objectid, ins->offset, 1); + } + return ret; +} + +struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct extent_buffer *buf; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return ERR_PTR(-ENOMEM); + btrfs_set_header_generation(buf, trans->transid); + btrfs_tree_lock(buf); + clean_tree_block(trans, root, buf); + btrfs_set_buffer_uptodate(buf); + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + set_extent_dirty(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + } else { + set_extent_dirty(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + } + trans->blocks_used++; + return buf; +} + +/* + * helper function to allocate a block for a given tree + * returns the tree buffer or NULL. + */ +struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u32 blocksize, u64 parent, + u64 root_objectid, + u64 ref_generation, + int level, + u64 hint, + u64 empty_size) +{ + struct btrfs_key ins; + int ret; + struct extent_buffer *buf; + + ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize, + root_objectid, ref_generation, level, + empty_size, hint, (u64)-1, &ins, 0); + if (ret) { + BUG_ON(ret > 0); + return ERR_PTR(ret); + } + + buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize); + return buf; +} + +int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *leaf) +{ + u64 leaf_owner; + u64 leaf_generation; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int nritems; + int ret; + + BUG_ON(!btrfs_is_leaf(leaf)); + nritems = btrfs_header_nritems(leaf); + leaf_owner = btrfs_header_owner(leaf); + leaf_generation = btrfs_header_generation(leaf); + + for (i = 0; i < nritems; i++) { + u64 disk_bytenr; + cond_resched(); + + btrfs_item_key_to_cpu(leaf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + /* + * FIXME make sure to insert a trans record that + * repeats the snapshot del on crash + */ + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (disk_bytenr == 0) + continue; + + ret = __btrfs_free_extent(trans, root, disk_bytenr, + btrfs_file_extent_disk_num_bytes(leaf, fi), + leaf->start, leaf_owner, leaf_generation, + key.objectid, 0); + BUG_ON(ret); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + } + return 0; +} + +static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_leaf_ref *ref) +{ + int i; + int ret; + struct btrfs_extent_info *info = ref->extents; + + for (i = 0; i < ref->nritems; i++) { + ret = __btrfs_free_extent(trans, root, info->bytenr, + info->num_bytes, ref->bytenr, + ref->owner, ref->generation, + info->objectid, 0); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + + BUG_ON(ret); + info++; + } + + return 0; +} + +static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, + u64 len, u32 *refs) +{ + int ret; + + ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs); + BUG_ON(ret); + +#if 0 /* some debugging code in case we see problems here */ + /* if the refs count is one, it won't get increased again. But + * if the ref count is > 1, someone may be decreasing it at + * the same time we are. + */ + if (*refs != 1) { + struct extent_buffer *eb = NULL; + eb = btrfs_find_create_tree_block(root, start, len); + if (eb) + btrfs_tree_lock(eb); + + mutex_lock(&root->fs_info->alloc_mutex); + ret = lookup_extent_ref(NULL, root, start, len, refs); + BUG_ON(ret); + mutex_unlock(&root->fs_info->alloc_mutex); + + if (eb) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + if (*refs == 1) { + printk(KERN_ERR "btrfs block %llu went down to one " + "during drop_snap\n", (unsigned long long)start); + } + + } +#endif + + cond_resched(); + return ret; +} + +/* + * helper function for drop_snapshot, this walks down the tree dropping ref + * counts as it goes. + */ +static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level) +{ + u64 root_owner; + u64 root_gen; + u64 bytenr; + u64 ptr_gen; + struct extent_buffer *next; + struct extent_buffer *cur; + struct extent_buffer *parent; + struct btrfs_leaf_ref *ref; + u32 blocksize; + int ret; + u32 refs; + + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start, + path->nodes[*level]->len, &refs); + BUG_ON(ret); + if (refs > 1) + goto out; + + /* + * walk down to the last node level and free all the leaves + */ + while (*level >= 0) { + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + cur = path->nodes[*level]; + + if (btrfs_header_level(cur) != *level) + WARN_ON(1); + + if (path->slots[*level] >= + btrfs_header_nritems(cur)) + break; + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, cur); + BUG_ON(ret); + break; + } + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + blocksize = btrfs_level_size(root, *level - 1); + + ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); + BUG_ON(ret); + if (refs != 1) { + parent = path->nodes[*level]; + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + path->slots[*level]++; + + ret = __btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + root_owner, root_gen, + *level - 1, 1); + BUG_ON(ret); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + + continue; + } + /* + * at this point, we have a single ref, and since the + * only place referencing this extent is a dead root + * the reference count should never go higher. + * So, we don't need to check it again + */ + if (*level == 1) { + ref = btrfs_lookup_leaf_ref(root, bytenr); + if (ref && ref->generation != ptr_gen) { + btrfs_free_leaf_ref(root, ref); + ref = NULL; + } + if (ref) { + ret = cache_drop_leaf_ref(trans, root, ref); + BUG_ON(ret); + btrfs_remove_leaf_ref(root, ref); + btrfs_free_leaf_ref(root, ref); + *level = 0; + break; + } + } + next = btrfs_find_tree_block(root, bytenr, blocksize); + if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) { + free_extent_buffer(next); + + next = read_tree_block(root, bytenr, blocksize, + ptr_gen); + cond_resched(); +#if 0 + /* + * this is a debugging check and can go away + * the ref should never go all the way down to 1 + * at this point + */ + ret = lookup_extent_ref(NULL, root, bytenr, blocksize, + &refs); + BUG_ON(ret); + WARN_ON(refs != 1); +#endif + } + WARN_ON(*level <= 0); + if (path->nodes[*level-1]) + free_extent_buffer(path->nodes[*level-1]); + path->nodes[*level-1] = next; + *level = btrfs_header_level(next); + path->slots[*level] = 0; + cond_resched(); + } +out: + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + if (path->nodes[*level] == root->node) { + parent = path->nodes[*level]; + bytenr = path->nodes[*level]->start; + } else { + parent = path->nodes[*level + 1]; + bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]); + } + + blocksize = btrfs_level_size(root, *level); + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + ret = __btrfs_free_extent(trans, root, bytenr, blocksize, + parent->start, root_owner, root_gen, + *level, 1); + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level += 1; + BUG_ON(ret); + + cond_resched(); + return 0; +} + +/* + * helper function for drop_subtree, this function is similar to + * walk_down_tree. The main difference is that it checks reference + * counts while tree blocks are locked. + */ +static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level) +{ + struct extent_buffer *next; + struct extent_buffer *cur; + struct extent_buffer *parent; + u64 bytenr; + u64 ptr_gen; + u32 blocksize; + u32 refs; + int ret; + + cur = path->nodes[*level]; + ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len, + &refs); + BUG_ON(ret); + if (refs > 1) + goto out; + + while (*level >= 0) { + cur = path->nodes[*level]; + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, cur); + BUG_ON(ret); + clean_tree_block(trans, root, cur); + break; + } + if (path->slots[*level] >= btrfs_header_nritems(cur)) { + clean_tree_block(trans, root, cur); + break; + } + + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + blocksize = btrfs_level_size(root, *level - 1); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + + next = read_tree_block(root, bytenr, blocksize, ptr_gen); + btrfs_tree_lock(next); + + ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, + &refs); + BUG_ON(ret); + if (refs > 1) { + parent = path->nodes[*level]; + ret = btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + *level - 1, 1); + BUG_ON(ret); + path->slots[*level]++; + btrfs_tree_unlock(next); + free_extent_buffer(next); + continue; + } + + *level = btrfs_header_level(next); + path->nodes[*level] = next; + path->slots[*level] = 0; + path->locks[*level] = 1; + cond_resched(); + } +out: + parent = path->nodes[*level + 1]; + bytenr = path->nodes[*level]->start; + blocksize = path->nodes[*level]->len; + + ret = btrfs_free_extent(trans, root, bytenr, blocksize, + parent->start, btrfs_header_owner(parent), + btrfs_header_generation(parent), *level, 1); + BUG_ON(ret); + + if (path->locks[*level]) { + btrfs_tree_unlock(path->nodes[*level]); + path->locks[*level] = 0; + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level += 1; + cond_resched(); + return 0; +} + +/* + * helper for dropping snapshots. This walks back up the tree in the path + * to find the first node higher up where we haven't yet gone through + * all the slots + */ +static noinline int walk_up_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int *level, int max_level) +{ + u64 root_owner; + u64 root_gen; + struct btrfs_root_item *root_item = &root->root_item; + int i; + int slot; + int ret; + + for (i = *level; i < max_level && path->nodes[i]; i++) { + slot = path->slots[i]; + if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { + struct extent_buffer *node; + struct btrfs_disk_key disk_key; + node = path->nodes[i]; + path->slots[i]++; + *level = i; + WARN_ON(*level == 0); + btrfs_node_key(node, &disk_key, path->slots[i]); + memcpy(&root_item->drop_progress, + &disk_key, sizeof(disk_key)); + root_item->drop_level = i; + return 0; + } else { + struct extent_buffer *parent; + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; + + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + clean_tree_block(trans, root, path->nodes[*level]); + ret = btrfs_free_extent(trans, root, + path->nodes[*level]->start, + path->nodes[*level]->len, + parent->start, root_owner, + root_gen, *level, 1); + BUG_ON(ret); + if (path->locks[*level]) { + btrfs_tree_unlock(path->nodes[*level]); + path->locks[*level] = 0; + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level = i + 1; + } + } + return 1; +} + +/* + * drop the reference count on the tree rooted at 'snap'. This traverses + * the tree freeing any blocks that have a ref count of zero after being + * decremented. + */ +int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root + *root) +{ + int ret = 0; + int wret; + int level; + struct btrfs_path *path; + int i; + int orig_level; + struct btrfs_root_item *root_item = &root->root_item; + + WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); + path = btrfs_alloc_path(); + BUG_ON(!path); + + level = btrfs_header_level(root->node); + orig_level = level; + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + path->nodes[level] = root->node; + extent_buffer_get(root->node); + path->slots[level] = 0; + } else { + struct btrfs_key key; + struct btrfs_disk_key found_key; + struct extent_buffer *node; + + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + level = root_item->drop_level; + path->lowest_level = level; + wret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (wret < 0) { + ret = wret; + goto out; + } + node = path->nodes[level]; + btrfs_node_key(node, &found_key, path->slots[level]); + WARN_ON(memcmp(&found_key, &root_item->drop_progress, + sizeof(found_key))); + /* + * unlock our path, this is safe because only this + * function is allowed to delete this snapshot + */ + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (path->nodes[i] && path->locks[i]) { + path->locks[i] = 0; + btrfs_tree_unlock(path->nodes[i]); + } + } + } + while (1) { + wret = walk_down_tree(trans, root, path, &level); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + + wret = walk_up_tree(trans, root, path, &level, + BTRFS_MAX_LEVEL); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + if (trans->transaction->in_commit) { + ret = -EAGAIN; + break; + } + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + } + for (i = 0; i <= orig_level; i++) { + if (path->nodes[i]) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + } +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent) +{ + struct btrfs_path *path; + int level; + int parent_level; + int ret = 0; + int wret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + BUG_ON(!btrfs_tree_locked(parent)); + parent_level = btrfs_header_level(parent); + extent_buffer_get(parent); + path->nodes[parent_level] = parent; + path->slots[parent_level] = btrfs_header_nritems(parent); + + BUG_ON(!btrfs_tree_locked(node)); + level = btrfs_header_level(node); + extent_buffer_get(node); + path->nodes[level] = node; + path->slots[level] = 0; + + while (1) { + wret = walk_down_subtree(trans, root, path, &level); + if (wret < 0) + ret = wret; + if (wret != 0) + break; + + wret = walk_up_tree(trans, root, path, &level, parent_level); + if (wret < 0) + ret = wret; + if (wret != 0) + break; + } + + btrfs_free_path(path); + return ret; +} + +static unsigned long calc_ra(unsigned long start, unsigned long last, + unsigned long nr) +{ + return min(last, start + nr - 1); +} + +static noinline int relocate_inode_pages(struct inode *inode, u64 start, + u64 len) +{ + u64 page_start; + u64 page_end; + unsigned long first_index; + unsigned long last_index; + unsigned long i; + struct page *page; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct file_ra_state *ra; + struct btrfs_ordered_extent *ordered; + unsigned int total_read = 0; + unsigned int total_dirty = 0; + int ret = 0; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + + mutex_lock(&inode->i_mutex); + first_index = start >> PAGE_CACHE_SHIFT; + last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; + + /* make sure the dirty trick played by the caller work */ + ret = invalidate_inode_pages2_range(inode->i_mapping, + first_index, last_index); + if (ret) + goto out_unlock; + + file_ra_state_init(ra, inode->i_mapping); + + for (i = first_index ; i <= last_index; i++) { + if (total_read % ra->ra_pages == 0) { + btrfs_force_ra(inode->i_mapping, ra, NULL, i, + calc_ra(i, last_index, ra->ra_pages)); + } + total_read++; +again: + if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) + BUG_ON(1); + page = grab_cache_page(inode->i_mapping, i); + if (!page) { + ret = -ENOMEM; + goto out_unlock; + } + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + ret = -EIO; + goto out_unlock; + } + } + wait_on_page_writeback(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + set_page_extent_mapped(page); + + if (i == first_index) + set_extent_bits(io_tree, page_start, page_end, + EXTENT_BOUNDARY, GFP_NOFS); + btrfs_set_extent_delalloc(inode, page_start, page_end); + + set_page_dirty(page); + total_dirty++; + + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + } + +out_unlock: + kfree(ra); + mutex_unlock(&inode->i_mutex); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, total_dirty); + return ret; +} + +static noinline int relocate_data_extent(struct inode *reloc_inode, + struct btrfs_key *extent_key, + u64 offset) +{ + struct btrfs_root *root = BTRFS_I(reloc_inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree; + struct extent_map *em; + u64 start = extent_key->objectid - offset; + u64 end = start + extent_key->offset - 1; + + em = alloc_extent_map(GFP_NOFS); + BUG_ON(!em || IS_ERR(em)); + + em->start = start; + em->len = extent_key->offset; + em->block_len = extent_key->offset; + em->block_start = extent_key->objectid; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + /* setup extent map to cheat btrfs_readpage */ + lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); + while (1) { + int ret; + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(reloc_inode, start, end, 0); + } + unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); + + return relocate_inode_pages(reloc_inode, start, extent_key->offset); +} + +struct btrfs_ref_path { + u64 extent_start; + u64 nodes[BTRFS_MAX_LEVEL]; + u64 root_objectid; + u64 root_generation; + u64 owner_objectid; + u32 num_refs; + int lowest_level; + int current_level; + int shared_level; + + struct btrfs_key node_keys[BTRFS_MAX_LEVEL]; + u64 new_nodes[BTRFS_MAX_LEVEL]; +}; + +struct disk_extent { + u64 ram_bytes; + u64 disk_bytenr; + u64 disk_num_bytes; + u64 offset; + u64 num_bytes; + u8 compression; + u8 encryption; + u16 other_encoding; +}; + +static int is_cowonly_root(u64 root_objectid) +{ + if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || + root_objectid == BTRFS_EXTENT_TREE_OBJECTID || + root_objectid == BTRFS_CHUNK_TREE_OBJECTID || + root_objectid == BTRFS_DEV_TREE_OBJECTID || + root_objectid == BTRFS_TREE_LOG_OBJECTID || + root_objectid == BTRFS_CSUM_TREE_OBJECTID) + return 1; + return 0; +} + +static noinline int __next_ref_path(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_ref_path *ref_path, + int first_time) +{ + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_extent_ref *ref; + struct btrfs_key key; + struct btrfs_key found_key; + u64 bytenr; + u32 nritems; + int level; + int ret = 1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (first_time) { + ref_path->lowest_level = -1; + ref_path->current_level = -1; + ref_path->shared_level = -1; + goto walk_up; + } +walk_down: + level = ref_path->current_level - 1; + while (level >= -1) { + u64 parent; + if (level < ref_path->lowest_level) + break; + + if (level >= 0) + bytenr = ref_path->nodes[level]; + else + bytenr = ref_path->extent_start; + BUG_ON(bytenr == 0); + + parent = ref_path->nodes[level + 1]; + ref_path->nodes[level + 1] = 0; + ref_path->current_level = level; + BUG_ON(parent == 0); + + key.objectid = bytenr; + key.offset = parent + 1; + key.type = BTRFS_EXTENT_REF_KEY; + + ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret > 0) + goto next; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid == bytenr && + found_key.type == BTRFS_EXTENT_REF_KEY) { + if (level < ref_path->shared_level) + ref_path->shared_level = level; + goto found; + } +next: + level--; + btrfs_release_path(extent_root, path); + cond_resched(); + } + /* reached lowest level */ + ret = 1; + goto out; +walk_up: + level = ref_path->current_level; + while (level < BTRFS_MAX_LEVEL - 1) { + u64 ref_objectid; + + if (level >= 0) + bytenr = ref_path->nodes[level]; + else + bytenr = ref_path->extent_start; + + BUG_ON(bytenr == 0); + + key.objectid = bytenr; + key.offset = 0; + key.type = BTRFS_EXTENT_REF_KEY; + + ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret > 0) { + /* the extent was freed by someone */ + if (ref_path->lowest_level == level) + goto out; + btrfs_release_path(extent_root, path); + goto walk_down; + } + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != bytenr || + found_key.type != BTRFS_EXTENT_REF_KEY) { + /* the extent was freed by someone */ + if (ref_path->lowest_level == level) { + ret = 1; + goto out; + } + btrfs_release_path(extent_root, path); + goto walk_down; + } +found: + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + ref_objectid = btrfs_ref_objectid(leaf, ref); + if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) { + if (first_time) { + level = (int)ref_objectid; + BUG_ON(level >= BTRFS_MAX_LEVEL); + ref_path->lowest_level = level; + ref_path->current_level = level; + ref_path->nodes[level] = bytenr; + } else { + WARN_ON(ref_objectid != level); + } + } else { + WARN_ON(level != -1); + } + first_time = 0; + + if (ref_path->lowest_level == level) { + ref_path->owner_objectid = ref_objectid; + ref_path->num_refs = btrfs_ref_num_refs(leaf, ref); + } + + /* + * the block is tree root or the block isn't in reference + * counted tree. + */ + if (found_key.objectid == found_key.offset || + is_cowonly_root(btrfs_ref_root(leaf, ref))) { + ref_path->root_objectid = btrfs_ref_root(leaf, ref); + ref_path->root_generation = + btrfs_ref_generation(leaf, ref); + if (level < 0) { + /* special reference from the tree log */ + ref_path->nodes[0] = found_key.offset; + ref_path->current_level = 0; + } + ret = 0; + goto out; + } + + level++; + BUG_ON(ref_path->nodes[level] != 0); + ref_path->nodes[level] = found_key.offset; + ref_path->current_level = level; + + /* + * the reference was created in the running transaction, + * no need to continue walking up. + */ + if (btrfs_ref_generation(leaf, ref) == trans->transid) { + ref_path->root_objectid = btrfs_ref_root(leaf, ref); + ref_path->root_generation = + btrfs_ref_generation(leaf, ref); + ret = 0; + goto out; + } + + btrfs_release_path(extent_root, path); + cond_resched(); + } + /* reached max tree level, but no tree root found. */ + BUG(); +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_first_ref_path(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_ref_path *ref_path, + u64 extent_start) +{ + memset(ref_path, 0, sizeof(*ref_path)); + ref_path->extent_start = extent_start; + + return __next_ref_path(trans, extent_root, ref_path, 1); +} + +static int btrfs_next_ref_path(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_ref_path *ref_path) +{ + return __next_ref_path(trans, extent_root, ref_path, 0); +} + +static noinline int get_new_locations(struct inode *reloc_inode, + struct btrfs_key *extent_key, + u64 offset, int no_fragment, + struct disk_extent **extents, + int *nr_extents) +{ + struct btrfs_root *root = BTRFS_I(reloc_inode)->root; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + struct disk_extent *exts = *extents; + struct btrfs_key found_key; + u64 cur_pos; + u64 last_byte; + u32 nritems; + int nr = 0; + int max = *nr_extents; + int ret; + + WARN_ON(!no_fragment && *extents); + if (!exts) { + max = 1; + exts = kmalloc(sizeof(*exts) * max, GFP_NOFS); + if (!exts) + return -ENOMEM; + } + + path = btrfs_alloc_path(); + BUG_ON(!path); + + cur_pos = extent_key->objectid - offset; + last_byte = extent_key->objectid + extent_key->offset; + ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, + cur_pos, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.offset != cur_pos || + found_key.type != BTRFS_EXTENT_DATA_KEY || + found_key.objectid != reloc_inode->i_ino) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) != + BTRFS_FILE_EXTENT_REG || + btrfs_file_extent_disk_bytenr(leaf, fi) == 0) + break; + + if (nr == max) { + struct disk_extent *old = exts; + max *= 2; + exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); + memcpy(exts, old, sizeof(*exts) * nr); + if (old != *extents) + kfree(old); + } + + exts[nr].disk_bytenr = + btrfs_file_extent_disk_bytenr(leaf, fi); + exts[nr].disk_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, fi); + exts[nr].offset = btrfs_file_extent_offset(leaf, fi); + exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi); + exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + exts[nr].compression = btrfs_file_extent_compression(leaf, fi); + exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi); + exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf, + fi); + BUG_ON(exts[nr].offset > 0); + BUG_ON(exts[nr].compression || exts[nr].encryption); + BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes); + + cur_pos += exts[nr].num_bytes; + nr++; + + if (cur_pos + offset >= last_byte) + break; + + if (no_fragment) { + ret = 1; + goto out; + } + path->slots[0]++; + } + + BUG_ON(cur_pos + offset > last_byte); + if (cur_pos + offset < last_byte) { + ret = -ENOENT; + goto out; + } + ret = 0; +out: + btrfs_free_path(path); + if (ret) { + if (exts != *extents) + kfree(exts); + } else { + *extents = exts; + *nr_extents = nr; + } + return ret; +} + +static noinline int replace_one_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *extent_key, + struct btrfs_key *leaf_key, + struct btrfs_ref_path *ref_path, + struct disk_extent *new_extents, + int nr_extents) +{ + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct inode *inode = NULL; + struct btrfs_key key; + u64 lock_start = 0; + u64 lock_end = 0; + u64 num_bytes; + u64 ext_offset; + u64 first_pos; + u32 nritems; + int nr_scaned = 0; + int extent_locked = 0; + int extent_type; + int ret; + + memcpy(&key, leaf_key, sizeof(key)); + first_pos = INT_LIMIT(loff_t) - extent_key->offset; + if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { + if (key.objectid < ref_path->owner_objectid || + (key.objectid == ref_path->owner_objectid && + key.type < BTRFS_EXTENT_DATA_KEY)) { + key.objectid = ref_path->owner_objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + } + } + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); +next: + if (extent_locked && ret > 0) { + /* + * the file extent item was modified by someone + * before the extent got locked. + */ + unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + extent_locked = 0; + } + + if (path->slots[0] >= nritems) { + if (++nr_scaned > 2) + break; + + BUG_ON(extent_locked); + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { + if ((key.objectid > ref_path->owner_objectid) || + (key.objectid == ref_path->owner_objectid && + key.type > BTRFS_EXTENT_DATA_KEY) || + (key.offset >= first_pos + extent_key->offset)) + break; + } + + if (inode && key.objectid != inode->i_ino) { + BUG_ON(extent_locked); + btrfs_release_path(root, path); + mutex_unlock(&inode->i_mutex); + iput(inode); + inode = NULL; + continue; + } + + if (key.type != BTRFS_EXTENT_DATA_KEY) { + path->slots[0]++; + ret = 1; + goto next; + } + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + if ((extent_type != BTRFS_FILE_EXTENT_REG && + extent_type != BTRFS_FILE_EXTENT_PREALLOC) || + (btrfs_file_extent_disk_bytenr(leaf, fi) != + extent_key->objectid)) { + path->slots[0]++; + ret = 1; + goto next; + } + + num_bytes = btrfs_file_extent_num_bytes(leaf, fi); + ext_offset = btrfs_file_extent_offset(leaf, fi); + + if (first_pos > key.offset - ext_offset) + first_pos = key.offset - ext_offset; + + if (!extent_locked) { + lock_start = key.offset; + lock_end = lock_start + num_bytes - 1; + } else { + if (lock_start > key.offset || + lock_end + 1 < key.offset + num_bytes) { + unlock_extent(&BTRFS_I(inode)->io_tree, + lock_start, lock_end, GFP_NOFS); + extent_locked = 0; + } + } + + if (!inode) { + btrfs_release_path(root, path); + + inode = btrfs_iget_locked(root->fs_info->sb, + key.objectid, root); + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->location.objectid = + key.objectid; + BTRFS_I(inode)->location.type = + BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } + /* + * some code call btrfs_commit_transaction while + * holding the i_mutex, so we can't use mutex_lock + * here. + */ + if (is_bad_inode(inode) || + !mutex_trylock(&inode->i_mutex)) { + iput(inode); + inode = NULL; + key.offset = (u64)-1; + goto skip; + } + } + + if (!extent_locked) { + struct btrfs_ordered_extent *ordered; + + btrfs_release_path(root, path); + + lock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + lock_end); + if (ordered && + ordered->file_offset <= lock_end && + ordered->file_offset + ordered->len > lock_start) { + unlock_extent(&BTRFS_I(inode)->io_tree, + lock_start, lock_end, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + key.offset += num_bytes; + goto skip; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + + extent_locked = 1; + continue; + } + + if (nr_extents == 1) { + /* update extent pointer in place */ + btrfs_set_file_extent_disk_bytenr(leaf, fi, + new_extents[0].disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, + new_extents[0].disk_num_bytes); + btrfs_mark_buffer_dirty(leaf); + + btrfs_drop_extent_cache(inode, key.offset, + key.offset + num_bytes - 1, 0); + + ret = btrfs_inc_extent_ref(trans, root, + new_extents[0].disk_bytenr, + new_extents[0].disk_num_bytes, + leaf->start, + root->root_key.objectid, + trans->transid, + key.objectid); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, root, + extent_key->objectid, + extent_key->offset, + leaf->start, + btrfs_header_owner(leaf), + btrfs_header_generation(leaf), + key.objectid, 0); + BUG_ON(ret); + + btrfs_release_path(root, path); + key.offset += num_bytes; + } else { + BUG_ON(1); +#if 0 + u64 alloc_hint; + u64 extent_len; + int i; + /* + * drop old extent pointer at first, then insert the + * new pointers one bye one + */ + btrfs_release_path(root, path); + ret = btrfs_drop_extents(trans, root, inode, key.offset, + key.offset + num_bytes, + key.offset, &alloc_hint); + BUG_ON(ret); + + for (i = 0; i < nr_extents; i++) { + if (ext_offset >= new_extents[i].num_bytes) { + ext_offset -= new_extents[i].num_bytes; + continue; + } + extent_len = min(new_extents[i].num_bytes - + ext_offset, num_bytes); + + ret = btrfs_insert_empty_item(trans, root, + path, &key, + sizeof(*fi)); + BUG_ON(ret); + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_disk_bytenr(leaf, fi, + new_extents[i].disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, + new_extents[i].disk_num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, + new_extents[i].ram_bytes); + + btrfs_set_file_extent_compression(leaf, fi, + new_extents[i].compression); + btrfs_set_file_extent_encryption(leaf, fi, + new_extents[i].encryption); + btrfs_set_file_extent_other_encoding(leaf, fi, + new_extents[i].other_encoding); + + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_len); + ext_offset += new_extents[i].offset; + btrfs_set_file_extent_offset(leaf, fi, + ext_offset); + btrfs_mark_buffer_dirty(leaf); + + btrfs_drop_extent_cache(inode, key.offset, + key.offset + extent_len - 1, 0); + + ret = btrfs_inc_extent_ref(trans, root, + new_extents[i].disk_bytenr, + new_extents[i].disk_num_bytes, + leaf->start, + root->root_key.objectid, + trans->transid, key.objectid); + BUG_ON(ret); + btrfs_release_path(root, path); + + inode_add_bytes(inode, extent_len); + + ext_offset = 0; + num_bytes -= extent_len; + key.offset += extent_len; + + if (num_bytes == 0) + break; + } + BUG_ON(i >= nr_extents); +#endif + } + + if (extent_locked) { + unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + extent_locked = 0; + } +skip: + if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && + key.offset >= first_pos + extent_key->offset) + break; + + cond_resched(); + } + ret = 0; +out: + btrfs_release_path(root, path); + if (inode) { + mutex_unlock(&inode->i_mutex); + if (extent_locked) { + unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + } + iput(inode); + } + return ret; +} + +int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, u64 orig_start) +{ + int level; + int ret; + + BUG_ON(btrfs_header_generation(buf) != trans->transid); + BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + + level = btrfs_header_level(buf); + if (level == 0) { + struct btrfs_leaf_ref *ref; + struct btrfs_leaf_ref *orig_ref; + + orig_ref = btrfs_lookup_leaf_ref(root, orig_start); + if (!orig_ref) + return -ENOENT; + + ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems); + if (!ref) { + btrfs_free_leaf_ref(root, orig_ref); + return -ENOMEM; + } + + ref->nritems = orig_ref->nritems; + memcpy(ref->extents, orig_ref->extents, + sizeof(ref->extents[0]) * ref->nritems); + + btrfs_free_leaf_ref(root, orig_ref); + + ref->root_gen = trans->transid; + ref->bytenr = buf->start; + ref->owner = btrfs_header_owner(buf); + ref->generation = btrfs_header_generation(buf); + ret = btrfs_add_leaf_ref(root, ref, 0); + WARN_ON(ret); + btrfs_free_leaf_ref(root, ref); + } + return 0; +} + +static noinline int invalidate_extent_cache(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_block_group_cache *group, + struct btrfs_root *target_root) +{ + struct btrfs_key key; + struct inode *inode = NULL; + struct btrfs_file_extent_item *fi; + u64 num_bytes; + u64 skip_objectid = 0; + u32 nritems; + u32 i; + + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(leaf, &key, i); + if (key.objectid == skip_objectid || + key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) + continue; + if (!inode || inode->i_ino != key.objectid) { + iput(inode); + inode = btrfs_ilookup(target_root->fs_info->sb, + key.objectid, target_root, 1); + } + if (!inode) { + skip_objectid = key.objectid; + continue; + } + num_bytes = btrfs_file_extent_num_bytes(leaf, fi); + + lock_extent(&BTRFS_I(inode)->io_tree, key.offset, + key.offset + num_bytes - 1, GFP_NOFS); + btrfs_drop_extent_cache(inode, key.offset, + key.offset + num_bytes - 1, 1); + unlock_extent(&BTRFS_I(inode)->io_tree, key.offset, + key.offset + num_bytes - 1, GFP_NOFS); + cond_resched(); + } + iput(inode); + return 0; +} + +static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_block_group_cache *group, + struct inode *reloc_inode) +{ + struct btrfs_key key; + struct btrfs_key extent_key; + struct btrfs_file_extent_item *fi; + struct btrfs_leaf_ref *ref; + struct disk_extent *new_extent; + u64 bytenr; + u64 num_bytes; + u32 nritems; + u32 i; + int ext_index; + int nr_extent; + int ret; + + new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); + BUG_ON(!new_extent); + + ref = btrfs_lookup_leaf_ref(root, leaf->start); + BUG_ON(!ref); + + ext_index = -1; + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(leaf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + if (bytenr == 0) + continue; + + ext_index++; + if (bytenr >= group->key.objectid + group->key.offset || + bytenr + num_bytes <= group->key.objectid) + continue; + + extent_key.objectid = bytenr; + extent_key.offset = num_bytes; + extent_key.type = BTRFS_EXTENT_ITEM_KEY; + nr_extent = 1; + ret = get_new_locations(reloc_inode, &extent_key, + group->key.objectid, 1, + &new_extent, &nr_extent); + if (ret > 0) + continue; + BUG_ON(ret < 0); + + BUG_ON(ref->extents[ext_index].bytenr != bytenr); + BUG_ON(ref->extents[ext_index].num_bytes != num_bytes); + ref->extents[ext_index].bytenr = new_extent->disk_bytenr; + ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes; + + btrfs_set_file_extent_disk_bytenr(leaf, fi, + new_extent->disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, + new_extent->disk_num_bytes); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_inc_extent_ref(trans, root, + new_extent->disk_bytenr, + new_extent->disk_num_bytes, + leaf->start, + root->root_key.objectid, + trans->transid, key.objectid); + BUG_ON(ret); + ret = btrfs_free_extent(trans, root, + bytenr, num_bytes, leaf->start, + btrfs_header_owner(leaf), + btrfs_header_generation(leaf), + key.objectid, 0); + BUG_ON(ret); + cond_resched(); + } + kfree(new_extent); + BUG_ON(ext_index + 1 != ref->nritems); + btrfs_free_leaf_ref(root, ref); + return 0; +} + +int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + int ret; + + if (root->reloc_root) { + reloc_root = root->reloc_root; + root->reloc_root = NULL; + list_add(&reloc_root->dead_list, + &root->fs_info->dead_reloc_roots); + + btrfs_set_root_bytenr(&reloc_root->root_item, + reloc_root->node->start); + btrfs_set_root_level(&root->root_item, + btrfs_header_level(reloc_root->node)); + memset(&reloc_root->root_item.drop_progress, 0, + sizeof(struct btrfs_disk_key)); + reloc_root->root_item.drop_level = 0; + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &reloc_root->root_key, + &reloc_root->root_item); + BUG_ON(ret); + } + return 0; +} + +int btrfs_drop_dead_reloc_roots(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *reloc_root; + struct btrfs_root *prev_root = NULL; + struct list_head dead_roots; + int ret; + unsigned long nr; + + INIT_LIST_HEAD(&dead_roots); + list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots); + + while (!list_empty(&dead_roots)) { + reloc_root = list_entry(dead_roots.prev, + struct btrfs_root, dead_list); + list_del_init(&reloc_root->dead_list); + + BUG_ON(reloc_root->commit_root != NULL); + while (1) { + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + + mutex_lock(&root->fs_info->drop_mutex); + ret = btrfs_drop_snapshot(trans, reloc_root); + if (ret != -EAGAIN) + break; + mutex_unlock(&root->fs_info->drop_mutex); + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + btrfs_btree_balance_dirty(root, nr); + } + + free_extent_buffer(reloc_root->node); + + ret = btrfs_del_root(trans, root->fs_info->tree_root, + &reloc_root->root_key); + BUG_ON(ret); + mutex_unlock(&root->fs_info->drop_mutex); + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + btrfs_btree_balance_dirty(root, nr); + + kfree(prev_root); + prev_root = reloc_root; + } + if (prev_root) { + btrfs_remove_leaf_refs(prev_root, (u64)-1, 0); + kfree(prev_root); + } + return 0; +} + +int btrfs_add_dead_reloc_root(struct btrfs_root *root) +{ + list_add(&root->dead_list, &root->fs_info->dead_reloc_roots); + return 0; +} + +int btrfs_cleanup_reloc_trees(struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct btrfs_trans_handle *trans; + struct btrfs_key location; + int found; + int ret; + + mutex_lock(&root->fs_info->tree_reloc_mutex); + ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL); + BUG_ON(ret); + found = !list_empty(&root->fs_info->dead_reloc_roots); + mutex_unlock(&root->fs_info->tree_reloc_mutex); + + if (found) { + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + } + + location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; + location.offset = (u64)-1; + location.type = BTRFS_ROOT_ITEM_KEY; + + reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); + BUG_ON(!reloc_root); + btrfs_orphan_cleanup(reloc_root); + return 0; +} + +static noinline int init_reloc_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct extent_buffer *eb; + struct btrfs_root_item *root_item; + struct btrfs_key root_key; + int ret; + + BUG_ON(!root->ref_cows); + if (root->reloc_root) + return 0; + + root_item = kmalloc(sizeof(*root_item), GFP_NOFS); + BUG_ON(!root_item); + + ret = btrfs_copy_root(trans, root, root->commit_root, + &eb, BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + + root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; + root_key.offset = root->root_key.objectid; + root_key.type = BTRFS_ROOT_ITEM_KEY; + + memcpy(root_item, &root->root_item, sizeof(root_item)); + btrfs_set_root_refs(root_item, 0); + btrfs_set_root_bytenr(root_item, eb->start); + btrfs_set_root_level(root_item, btrfs_header_level(eb)); + btrfs_set_root_generation(root_item, trans->transid); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + ret = btrfs_insert_root(trans, root->fs_info->tree_root, + &root_key, root_item); + BUG_ON(ret); + kfree(root_item); + + reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &root_key); + BUG_ON(!reloc_root); + reloc_root->last_trans = trans->transid; + reloc_root->commit_root = NULL; + reloc_root->ref_tree = &root->fs_info->reloc_ref_tree; + + root->reloc_root = reloc_root; + return 0; +} + +/* + * Core function of space balance. + * + * The idea is using reloc trees to relocate tree blocks in reference + * counted roots. There is one reloc tree for each subvol, and all + * reloc trees share same root key objectid. Reloc trees are snapshots + * of the latest committed roots of subvols (root->commit_root). + * + * To relocate a tree block referenced by a subvol, there are two steps. + * COW the block through subvol's reloc tree, then update block pointer + * in the subvol to point to the new block. Since all reloc trees share + * same root key objectid, doing special handing for tree blocks owned + * by them is easy. Once a tree block has been COWed in one reloc tree, + * we can use the resulting new block directly when the same block is + * required to COW again through other reloc trees. By this way, relocated + * tree blocks are shared between reloc trees, so they are also shared + * between subvols. + */ +static noinline int relocate_one_path(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *first_key, + struct btrfs_ref_path *ref_path, + struct btrfs_block_group_cache *group, + struct inode *reloc_inode) +{ + struct btrfs_root *reloc_root; + struct extent_buffer *eb = NULL; + struct btrfs_key *keys; + u64 *nodes; + int level; + int shared_level; + int lowest_level = 0; + int ret; + + if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) + lowest_level = ref_path->owner_objectid; + + if (!root->ref_cows) { + path->lowest_level = lowest_level; + ret = btrfs_search_slot(trans, root, first_key, path, 0, 1); + BUG_ON(ret < 0); + path->lowest_level = 0; + btrfs_release_path(root, path); + return 0; + } + + mutex_lock(&root->fs_info->tree_reloc_mutex); + ret = init_reloc_tree(trans, root); + BUG_ON(ret); + reloc_root = root->reloc_root; + + shared_level = ref_path->shared_level; + ref_path->shared_level = BTRFS_MAX_LEVEL - 1; + + keys = ref_path->node_keys; + nodes = ref_path->new_nodes; + memset(&keys[shared_level + 1], 0, + sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1)); + memset(&nodes[shared_level + 1], 0, + sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1)); + + if (nodes[lowest_level] == 0) { + path->lowest_level = lowest_level; + ret = btrfs_search_slot(trans, reloc_root, first_key, path, + 0, 1); + BUG_ON(ret); + for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) { + eb = path->nodes[level]; + if (!eb || eb == reloc_root->node) + break; + nodes[level] = eb->start; + if (level == 0) + btrfs_item_key_to_cpu(eb, &keys[level], 0); + else + btrfs_node_key_to_cpu(eb, &keys[level], 0); + } + if (nodes[0] && + ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + eb = path->nodes[0]; + ret = replace_extents_in_leaf(trans, reloc_root, eb, + group, reloc_inode); + BUG_ON(ret); + } + btrfs_release_path(reloc_root, path); + } else { + ret = btrfs_merge_path(trans, reloc_root, keys, nodes, + lowest_level); + BUG_ON(ret); + } + + /* + * replace tree blocks in the fs tree with tree blocks in + * the reloc tree. + */ + ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level); + BUG_ON(ret < 0); + + if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_search_slot(trans, reloc_root, first_key, path, + 0, 0); + BUG_ON(ret); + extent_buffer_get(path->nodes[0]); + eb = path->nodes[0]; + btrfs_release_path(reloc_root, path); + ret = invalidate_extent_cache(reloc_root, eb, group, root); + BUG_ON(ret); + free_extent_buffer(eb); + } + + mutex_unlock(&root->fs_info->tree_reloc_mutex); + path->lowest_level = 0; + return 0; +} + +static noinline int relocate_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *first_key, + struct btrfs_ref_path *ref_path) +{ + int ret; + + ret = relocate_one_path(trans, root, path, first_key, + ref_path, NULL, NULL); + BUG_ON(ret); + + if (root == root->fs_info->extent_root) + btrfs_extent_post_op(trans, root); + + return 0; +} + +static noinline int del_extent_zero(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_path *path, + struct btrfs_key *extent_key) +{ + int ret; + + ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1); + if (ret) + goto out; + ret = btrfs_del_item(trans, extent_root, path); +out: + btrfs_release_path(extent_root, path); + return ret; +} + +static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info, + struct btrfs_ref_path *ref_path) +{ + struct btrfs_key root_key; + + root_key.objectid = ref_path->root_objectid; + root_key.type = BTRFS_ROOT_ITEM_KEY; + if (is_cowonly_root(ref_path->root_objectid)) + root_key.offset = 0; + else + root_key.offset = (u64)-1; + + return btrfs_read_fs_root_no_name(fs_info, &root_key); +} + +static noinline int relocate_one_extent(struct btrfs_root *extent_root, + struct btrfs_path *path, + struct btrfs_key *extent_key, + struct btrfs_block_group_cache *group, + struct inode *reloc_inode, int pass) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *found_root; + struct btrfs_ref_path *ref_path = NULL; + struct disk_extent *new_extents = NULL; + int nr_extents = 0; + int loops; + int ret; + int level; + struct btrfs_key first_key; + u64 prev_block = 0; + + + trans = btrfs_start_transaction(extent_root, 1); + BUG_ON(!trans); + + if (extent_key->objectid == 0) { + ret = del_extent_zero(trans, extent_root, path, extent_key); + goto out; + } + + ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS); + if (!ref_path) { + ret = -ENOMEM; + goto out; + } + + for (loops = 0; ; loops++) { + if (loops == 0) { + ret = btrfs_first_ref_path(trans, extent_root, ref_path, + extent_key->objectid); + } else { + ret = btrfs_next_ref_path(trans, extent_root, ref_path); + } + if (ret < 0) + goto out; + if (ret > 0) + break; + + if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID || + ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID) + continue; + + found_root = read_ref_root(extent_root->fs_info, ref_path); + BUG_ON(!found_root); + /* + * for reference counted tree, only process reference paths + * rooted at the latest committed root. + */ + if (found_root->ref_cows && + ref_path->root_generation != found_root->root_key.offset) + continue; + + if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + if (pass == 0) { + /* + * copy data extents to new locations + */ + u64 group_start = group->key.objectid; + ret = relocate_data_extent(reloc_inode, + extent_key, + group_start); + if (ret < 0) + goto out; + break; + } + level = 0; + } else { + level = ref_path->owner_objectid; + } + + if (prev_block != ref_path->nodes[level]) { + struct extent_buffer *eb; + u64 block_start = ref_path->nodes[level]; + u64 block_size = btrfs_level_size(found_root, level); + + eb = read_tree_block(found_root, block_start, + block_size, 0); + btrfs_tree_lock(eb); + BUG_ON(level != btrfs_header_level(eb)); + + if (level == 0) + btrfs_item_key_to_cpu(eb, &first_key, 0); + else + btrfs_node_key_to_cpu(eb, &first_key, 0); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + prev_block = block_start; + } + + btrfs_record_root_in_trans(found_root); + if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + /* + * try to update data extent references while + * keeping metadata shared between snapshots. + */ + if (pass == 1) { + ret = relocate_one_path(trans, found_root, + path, &first_key, ref_path, + group, reloc_inode); + if (ret < 0) + goto out; + continue; + } + /* + * use fallback method to process the remaining + * references. + */ + if (!new_extents) { + u64 group_start = group->key.objectid; + new_extents = kmalloc(sizeof(*new_extents), + GFP_NOFS); + nr_extents = 1; + ret = get_new_locations(reloc_inode, + extent_key, + group_start, 1, + &new_extents, + &nr_extents); + if (ret) + goto out; + } + ret = replace_one_extent(trans, found_root, + path, extent_key, + &first_key, ref_path, + new_extents, nr_extents); + } else { + ret = relocate_tree_block(trans, found_root, path, + &first_key, ref_path); + } + if (ret < 0) + goto out; + } + ret = 0; +out: + btrfs_end_transaction(trans, extent_root); + kfree(new_extents); + kfree(ref_path); + return ret; +} + +static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) +{ + u64 num_devices; + u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + + num_devices = root->fs_info->fs_devices->rw_devices; + if (num_devices == 1) { + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* turn raid0 into single device chunks */ + if (flags & BTRFS_BLOCK_GROUP_RAID0) + return stripped; + + /* turn mirroring into duplication */ + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + return stripped | BTRFS_BLOCK_GROUP_DUP; + return flags; + } else { + /* they already had raid on here, just return */ + if (flags & stripped) + return flags; + + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* switch duplicated blocks with raid1 */ + if (flags & BTRFS_BLOCK_GROUP_DUP) + return stripped | BTRFS_BLOCK_GROUP_RAID1; + + /* turn single device chunks into raid0 */ + return stripped | BTRFS_BLOCK_GROUP_RAID0; + } + return flags; +} + +static int __alloc_chunk_for_shrink(struct btrfs_root *root, + struct btrfs_block_group_cache *shrink_block_group, + int force) +{ + struct btrfs_trans_handle *trans; + u64 new_alloc_flags; + u64 calc; + + spin_lock(&shrink_block_group->lock); + if (btrfs_block_group_used(&shrink_block_group->item) > 0) { + spin_unlock(&shrink_block_group->lock); + + trans = btrfs_start_transaction(root, 1); + spin_lock(&shrink_block_group->lock); + + new_alloc_flags = update_block_group_flags(root, + shrink_block_group->flags); + if (new_alloc_flags != shrink_block_group->flags) { + calc = + btrfs_block_group_used(&shrink_block_group->item); + } else { + calc = shrink_block_group->key.offset; + } + spin_unlock(&shrink_block_group->lock); + + do_chunk_alloc(trans, root->fs_info->extent_root, + calc + 2 * 1024 * 1024, new_alloc_flags, force); + + btrfs_end_transaction(trans, root); + } else + spin_unlock(&shrink_block_group->lock); + return 0; +} + +static int __insert_orphan_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 size) +{ + struct btrfs_path *path; + struct btrfs_inode_item *item; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_inode(trans, root, path, objectid); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + btrfs_set_inode_generation(leaf, item, 1); + btrfs_set_inode_size(leaf, item, size); + btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root, path); +out: + btrfs_free_path(path); + return ret; +} + +static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *group) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + struct btrfs_key root_key; + u64 objectid = BTRFS_FIRST_FREE_OBJECTID; + int err = 0; + + root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(root)) + return ERR_CAST(root); + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + err = btrfs_find_free_objectid(trans, root, objectid, &objectid); + if (err) + goto out; + + err = __insert_orphan_inode(trans, root, objectid, group->key.offset); + BUG_ON(err); + + err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, + group->key.offset, 0, group->key.offset, + 0, 0, 0); + BUG_ON(err); + + inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->location.objectid = objectid; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + BUG_ON(is_bad_inode(inode)); + } else { + BUG_ON(1); + } + BTRFS_I(inode)->index_cnt = group->key.objectid; + + err = btrfs_orphan_add(trans, inode); +out: + btrfs_end_transaction(trans, root); + if (err) { + if (inode) + iput(inode); + inode = ERR_PTR(err); + } + return inode; +} + +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) +{ + + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct list_head list; + size_t offset; + int ret; + u64 disk_bytenr; + + INIT_LIST_HEAD(&list); + + ordered = btrfs_lookup_ordered_extent(inode, file_pos); + BUG_ON(ordered->file_offset != file_pos || ordered->len != len); + + disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, + disk_bytenr + len - 1, &list); + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del_init(&sums->list); + + sector_sum = sums->sums; + sums->bytenr = ordered->start; + + offset = 0; + while (offset < sums->len) { + sector_sum->bytenr += ordered->start - disk_bytenr; + sector_sum++; + offset += root->sectorsize; + } + + btrfs_add_ordered_sum(inode, ordered, sums); + } + btrfs_put_ordered_extent(ordered); + return 0; +} + +int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) +{ + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_fs_info *info = root->fs_info; + struct extent_buffer *leaf; + struct inode *reloc_inode; + struct btrfs_block_group_cache *block_group; + struct btrfs_key key; + u64 skipped; + u64 cur_byte; + u64 total_found; + u32 nritems; + int ret; + int progress; + int pass = 0; + + root = root->fs_info->extent_root; + + block_group = btrfs_lookup_block_group(info, group_start); + BUG_ON(!block_group); + + printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n", + (unsigned long long)block_group->key.objectid, + (unsigned long long)block_group->flags); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + reloc_inode = create_reloc_inode(info, block_group); + BUG_ON(IS_ERR(reloc_inode)); + + __alloc_chunk_for_shrink(root, block_group, 1); + set_block_group_readonly(block_group); + + btrfs_start_delalloc_inodes(info->tree_root); + btrfs_wait_ordered_extents(info->tree_root, 0); +again: + skipped = 0; + total_found = 0; + progress = 0; + key.objectid = block_group->key.objectid; + key.offset = 0; + key.type = 0; + cur_byte = key.objectid; + + trans = btrfs_start_transaction(info->tree_root, 1); + btrfs_commit_transaction(trans, info->tree_root); + + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_clean_old_snapshots(info->tree_root); + btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); + mutex_unlock(&root->fs_info->cleaner_mutex); + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; +next: + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret == 1) { + ret = 0; + break; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid >= block_group->key.objectid + + block_group->key.offset) + break; + + if (progress && need_resched()) { + btrfs_release_path(root, path); + cond_resched(); + progress = 0; + continue; + } + progress = 1; + + if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY || + key.objectid + key.offset <= cur_byte) { + path->slots[0]++; + goto next; + } + + total_found++; + cur_byte = key.objectid + key.offset; + btrfs_release_path(root, path); + + __alloc_chunk_for_shrink(root, block_group, 0); + ret = relocate_one_extent(root, path, &key, block_group, + reloc_inode, pass); + BUG_ON(ret < 0); + if (ret > 0) + skipped++; + + key.objectid = cur_byte; + key.type = 0; + key.offset = 0; + } + + btrfs_release_path(root, path); + + if (pass == 0) { + btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1); + invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1); + } + + if (total_found > 0) { + printk(KERN_INFO "btrfs found %llu extents in pass %d\n", + (unsigned long long)total_found, pass); + pass++; + if (total_found == skipped && pass > 2) { + iput(reloc_inode); + reloc_inode = create_reloc_inode(info, block_group); + pass = 0; + } + goto again; + } + + /* delete reloc_inode */ + iput(reloc_inode); + + /* unpin extents in this range */ + trans = btrfs_start_transaction(info->tree_root, 1); + btrfs_commit_transaction(trans, info->tree_root); + + spin_lock(&block_group->lock); + WARN_ON(block_group->pinned > 0); + WARN_ON(block_group->reserved > 0); + WARN_ON(btrfs_block_group_used(&block_group->item) > 0); + spin_unlock(&block_group->lock); + put_block_group(block_group); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int find_first_block_group(struct btrfs_root *root, + struct btrfs_path *path, struct btrfs_key *key) +{ + int ret = 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + int slot; + + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + slot = path->slots[0]; + leaf = path->nodes[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid >= key->objectid && + found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { + ret = 0; + goto out; + } + path->slots[0]++; + } + ret = -ENOENT; +out: + return ret; +} + +int btrfs_free_block_groups(struct btrfs_fs_info *info) +{ + struct btrfs_block_group_cache *block_group; + struct rb_node *n; + + spin_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + block_group = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + rb_erase(&block_group->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + + btrfs_remove_free_space_cache(block_group); + down_write(&block_group->space_info->groups_sem); + list_del(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + WARN_ON(atomic_read(&block_group->count) != 1); + kfree(block_group); + + spin_lock(&info->block_group_cache_lock); + } + spin_unlock(&info->block_group_cache_lock); + return 0; +} + +int btrfs_read_block_groups(struct btrfs_root *root) +{ + struct btrfs_path *path; + int ret; + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *space_info; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + root = info->extent_root; + key.objectid = 0; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + ret = find_first_block_group(root, path, &key); + if (ret > 0) { + ret = 0; + goto error; + } + if (ret != 0) + goto error; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) { + ret = -ENOMEM; + break; + } + + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + mutex_init(&cache->alloc_mutex); + mutex_init(&cache->cache_mutex); + INIT_LIST_HEAD(&cache->list); + read_extent_buffer(leaf, &cache->item, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(cache->item)); + memcpy(&cache->key, &found_key, sizeof(found_key)); + + key.objectid = found_key.objectid + found_key.offset; + btrfs_release_path(root, path); + cache->flags = btrfs_block_group_flags(&cache->item); + + ret = update_space_info(info, cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + &space_info); + BUG_ON(ret); + cache->space_info = space_info; + down_write(&space_info->groups_sem); + list_add_tail(&cache->list, &space_info->block_groups); + up_write(&space_info->groups_sem); + + ret = btrfs_add_block_group_cache(root->fs_info, cache); + BUG_ON(ret); + + set_avail_alloc_bits(root->fs_info, cache->flags); + if (btrfs_chunk_readonly(root, cache->key.objectid)) + set_block_group_readonly(cache); + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, + u64 size) +{ + int ret; + struct btrfs_root *extent_root; + struct btrfs_block_group_cache *cache; + + extent_root = root->fs_info->extent_root; + + root->fs_info->last_trans_new_blockgroup = trans->transid; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return -ENOMEM; + + cache->key.objectid = chunk_offset; + cache->key.offset = size; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + mutex_init(&cache->alloc_mutex); + mutex_init(&cache->cache_mutex); + INIT_LIST_HEAD(&cache->list); + + btrfs_set_block_group_used(&cache->item, bytes_used); + btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); + cache->flags = type; + btrfs_set_block_group_flags(&cache->item, type); + + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, + &cache->space_info); + BUG_ON(ret); + down_write(&cache->space_info->groups_sem); + list_add_tail(&cache->list, &cache->space_info->block_groups); + up_write(&cache->space_info->groups_sem); + + ret = btrfs_add_block_group_cache(root->fs_info, cache); + BUG_ON(ret); + + ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, + sizeof(cache->item)); + BUG_ON(ret); + + finish_current_insert(trans, extent_root, 0); + ret = del_pending_extents(trans, extent_root, 0); + BUG_ON(ret); + set_avail_alloc_bits(extent_root->fs_info, type); + + return 0; +} + +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 group_start) +{ + struct btrfs_path *path; + struct btrfs_block_group_cache *block_group; + struct btrfs_key key; + int ret; + + root = root->fs_info->extent_root; + + block_group = btrfs_lookup_block_group(root->fs_info, group_start); + BUG_ON(!block_group); + BUG_ON(!block_group->ro); + + memcpy(&key, &block_group->key, sizeof(key)); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + btrfs_remove_free_space_cache(block_group); + rb_erase(&block_group->cache_node, + &root->fs_info->block_group_cache_tree); + down_write(&block_group->space_info->groups_sem); + list_del(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + spin_lock(&block_group->space_info->lock); + block_group->space_info->total_bytes -= block_group->key.offset; + block_group->space_info->bytes_readonly -= block_group->key.offset; + spin_unlock(&block_group->space_info->lock); + block_group->space_info->full = 0; + + put_block_group(block_group); + put_block_group(block_group); + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -EIO; + if (ret < 0) + goto out; + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c new file mode 100644 index 00000000000..e086d407f1f --- /dev/null +++ b/fs/btrfs/extent_io.c @@ -0,0 +1,3717 @@ +#include <linux/bitops.h> +#include <linux/slab.h> +#include <linux/bio.h> +#include <linux/mm.h> +#include <linux/gfp.h> +#include <linux/pagemap.h> +#include <linux/page-flags.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/blkdev.h> +#include <linux/swap.h> +#include <linux/version.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include "extent_io.h" +#include "extent_map.h" +#include "compat.h" +#include "ctree.h" +#include "btrfs_inode.h" + +/* temporary define until extent_map moves out of btrfs */ +struct kmem_cache *btrfs_cache_create(const char *name, size_t size, + unsigned long extra_flags, + void (*ctor)(void *, struct kmem_cache *, + unsigned long)); + +static struct kmem_cache *extent_state_cache; +static struct kmem_cache *extent_buffer_cache; + +static LIST_HEAD(buffers); +static LIST_HEAD(states); + +#define LEAK_DEBUG 0 +#ifdef LEAK_DEBUG +static DEFINE_SPINLOCK(leak_lock); +#endif + +#define BUFFER_LRU_MAX 64 + +struct tree_entry { + u64 start; + u64 end; + struct rb_node rb_node; +}; + +struct extent_page_data { + struct bio *bio; + struct extent_io_tree *tree; + get_extent_t *get_extent; + + /* tells writepage not to lock the state bits for this range + * it still does the unlocking + */ + int extent_locked; +}; + +int __init extent_io_init(void) +{ + extent_state_cache = btrfs_cache_create("extent_state", + sizeof(struct extent_state), 0, + NULL); + if (!extent_state_cache) + return -ENOMEM; + + extent_buffer_cache = btrfs_cache_create("extent_buffers", + sizeof(struct extent_buffer), 0, + NULL); + if (!extent_buffer_cache) + goto free_state_cache; + return 0; + +free_state_cache: + kmem_cache_destroy(extent_state_cache); + return -ENOMEM; +} + +void extent_io_exit(void) +{ + struct extent_state *state; + struct extent_buffer *eb; + + while (!list_empty(&states)) { + state = list_entry(states.next, struct extent_state, leak_list); + printk(KERN_ERR "btrfs state leak: start %llu end %llu " + "state %lu in tree %p refs %d\n", + (unsigned long long)state->start, + (unsigned long long)state->end, + state->state, state->tree, atomic_read(&state->refs)); + list_del(&state->leak_list); + kmem_cache_free(extent_state_cache, state); + + } + + while (!list_empty(&buffers)) { + eb = list_entry(buffers.next, struct extent_buffer, leak_list); + printk(KERN_ERR "btrfs buffer leak start %llu len %lu " + "refs %d\n", (unsigned long long)eb->start, + eb->len, atomic_read(&eb->refs)); + list_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); + } + if (extent_state_cache) + kmem_cache_destroy(extent_state_cache); + if (extent_buffer_cache) + kmem_cache_destroy(extent_buffer_cache); +} + +void extent_io_tree_init(struct extent_io_tree *tree, + struct address_space *mapping, gfp_t mask) +{ + tree->state.rb_node = NULL; + tree->buffer.rb_node = NULL; + tree->ops = NULL; + tree->dirty_bytes = 0; + spin_lock_init(&tree->lock); + spin_lock_init(&tree->buffer_lock); + tree->mapping = mapping; +} + +static struct extent_state *alloc_extent_state(gfp_t mask) +{ + struct extent_state *state; +#ifdef LEAK_DEBUG + unsigned long flags; +#endif + + state = kmem_cache_alloc(extent_state_cache, mask); + if (!state) + return state; + state->state = 0; + state->private = 0; + state->tree = NULL; +#ifdef LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_add(&state->leak_list, &states); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + atomic_set(&state->refs, 1); + init_waitqueue_head(&state->wq); + return state; +} + +static void free_extent_state(struct extent_state *state) +{ + if (!state) + return; + if (atomic_dec_and_test(&state->refs)) { +#ifdef LEAK_DEBUG + unsigned long flags; +#endif + WARN_ON(state->tree); +#ifdef LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_del(&state->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + kmem_cache_free(extent_state_cache, state); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct tree_entry *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (offset < entry->start) + p = &(*p)->rb_left; + else if (offset > entry->end) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct tree_entry, rb_node); + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret) +{ + struct rb_root *root = &tree->state; + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct tree_entry *entry; + struct tree_entry *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct tree_entry, rb_node); + prev = n; + prev_entry = entry; + + if (offset < entry->start) + n = n->rb_left; + else if (offset > entry->end) + n = n->rb_right; + else + return n; + } + + if (prev_ret) { + orig_prev = prev; + while (prev && offset > prev_entry->end) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *prev_ret = prev; + prev = orig_prev; + } + + if (next_ret) { + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *next_ret = prev; + } + return NULL; +} + +static inline struct rb_node *tree_search(struct extent_io_tree *tree, + u64 offset) +{ + struct rb_node *prev = NULL; + struct rb_node *ret; + + ret = __etree_search(tree, offset, &prev, NULL); + if (!ret) + return prev; + return ret; +} + +static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree, + u64 offset, struct rb_node *node) +{ + struct rb_root *root = &tree->buffer; + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct extent_buffer *eb; + + while (*p) { + parent = *p; + eb = rb_entry(parent, struct extent_buffer, rb_node); + + if (offset < eb->start) + p = &(*p)->rb_left; + else if (offset > eb->start) + p = &(*p)->rb_right; + else + return eb; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct extent_buffer *buffer_search(struct extent_io_tree *tree, + u64 offset) +{ + struct rb_root *root = &tree->buffer; + struct rb_node *n = root->rb_node; + struct extent_buffer *eb; + + while (n) { + eb = rb_entry(n, struct extent_buffer, rb_node); + if (offset < eb->start) + n = n->rb_left; + else if (offset > eb->start) + n = n->rb_right; + else + return eb; + } + return NULL; +} + +/* + * utility function to look for merge candidates inside a given range. + * Any extents with matching state are merged together into a single + * extent in the tree. Extents with EXTENT_IO in their state field + * are not merged because the end_io handlers need to be able to do + * operations on them without sleeping (or doing allocations/splits). + * + * This should be called with the tree lock held. + */ +static int merge_state(struct extent_io_tree *tree, + struct extent_state *state) +{ + struct extent_state *other; + struct rb_node *other_node; + + if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + return 0; + + other_node = rb_prev(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->end == state->start - 1 && + other->state == state->state) { + state->start = other->start; + other->tree = NULL; + rb_erase(&other->rb_node, &tree->state); + free_extent_state(other); + } + } + other_node = rb_next(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->start == state->end + 1 && + other->state == state->state) { + other->start = state->start; + state->tree = NULL; + rb_erase(&state->rb_node, &tree->state); + free_extent_state(state); + } + } + return 0; +} + +static void set_state_cb(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long bits) +{ + if (tree->ops && tree->ops->set_bit_hook) { + tree->ops->set_bit_hook(tree->mapping->host, state->start, + state->end, state->state, bits); + } +} + +static void clear_state_cb(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long bits) +{ + if (tree->ops && tree->ops->clear_bit_hook) { + tree->ops->clear_bit_hook(tree->mapping->host, state->start, + state->end, state->state, bits); + } +} + +/* + * insert an extent_state struct into the tree. 'bits' are set on the + * struct before it is inserted. + * + * This may return -EEXIST if the extent is already there, in which case the + * state struct is freed. + * + * The tree lock is not taken internally. This is a utility function and + * probably isn't what you want to call (see set/clear_extent_bit). + */ +static int insert_state(struct extent_io_tree *tree, + struct extent_state *state, u64 start, u64 end, + int bits) +{ + struct rb_node *node; + + if (end < start) { + printk(KERN_ERR "btrfs end < start %llu %llu\n", + (unsigned long long)end, + (unsigned long long)start); + WARN_ON(1); + } + if (bits & EXTENT_DIRTY) + tree->dirty_bytes += end - start + 1; + set_state_cb(tree, state, bits); + state->state |= bits; + state->start = start; + state->end = end; + node = tree_insert(&tree->state, end, &state->rb_node); + if (node) { + struct extent_state *found; + found = rb_entry(node, struct extent_state, rb_node); + printk(KERN_ERR "btrfs found node %llu %llu on insert of " + "%llu %llu\n", (unsigned long long)found->start, + (unsigned long long)found->end, + (unsigned long long)start, (unsigned long long)end); + free_extent_state(state); + return -EEXIST; + } + state->tree = tree; + merge_state(tree, state); + return 0; +} + +/* + * split a given extent state struct in two, inserting the preallocated + * struct 'prealloc' as the newly created second half. 'split' indicates an + * offset inside 'orig' where it should be split. + * + * Before calling, + * the tree has 'orig' at [orig->start, orig->end]. After calling, there + * are two extent state structs in the tree: + * prealloc: [orig->start, split - 1] + * orig: [ split, orig->end ] + * + * The tree locks are not taken by this function. They need to be held + * by the caller. + */ +static int split_state(struct extent_io_tree *tree, struct extent_state *orig, + struct extent_state *prealloc, u64 split) +{ + struct rb_node *node; + prealloc->start = orig->start; + prealloc->end = split - 1; + prealloc->state = orig->state; + orig->start = split; + + node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); + if (node) { + struct extent_state *found; + found = rb_entry(node, struct extent_state, rb_node); + free_extent_state(prealloc); + return -EEXIST; + } + prealloc->tree = tree; + return 0; +} + +/* + * utility function to clear some bits in an extent state struct. + * it will optionally wake up any one waiting on this state (wake == 1), or + * forcibly remove the state from the tree (delete == 1). + * + * If no bits are set on the state struct after clearing things, the + * struct is freed and removed from the tree + */ +static int clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, int bits, int wake, + int delete) +{ + int ret = state->state & bits; + + if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + WARN_ON(range > tree->dirty_bytes); + tree->dirty_bytes -= range; + } + clear_state_cb(tree, state, bits); + state->state &= ~bits; + if (wake) + wake_up(&state->wq); + if (delete || state->state == 0) { + if (state->tree) { + clear_state_cb(tree, state, state->state); + rb_erase(&state->rb_node, &tree->state); + state->tree = NULL; + free_extent_state(state); + } else { + WARN_ON(1); + } + } else { + merge_state(tree, state); + } + return ret; +} + +/* + * clear some bits on a range in the tree. This may require splitting + * or inserting elements in the tree, so the gfp mask is used to + * indicate which allocations or sleeping are allowed. + * + * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove + * the given range from the tree regardless of state (ie for truncate). + * + * the range [start, end] is inclusive. + * + * This takes the tree lock, and returns < 0 on error, > 0 if any of the + * bits were already set, or zero if none of the bits were already set. + */ +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int wake, int delete, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err; + int set = 0; + +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + /* + * this search will find the extents that end after + * our range starts + */ + node = tree_search(tree, start); + if (!node) + goto out; + state = rb_entry(node, struct extent_state, rb_node); + if (state->start > end) + goto out; + WARN_ON(state->end < start); + + /* + * | ---- desired range ---- | + * | state | or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip + * bits on second half. + * + * If the extent we found extends past our range, we + * just split and search again. It'll get split again + * the next time though. + * + * If the extent we found is inside our range, we clear + * the desired bit on it. + */ + + if (state->start < start) { + if (!prealloc) + prealloc = alloc_extent_state(GFP_ATOMIC); + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + start = state->end + 1; + set |= clear_state_bit(tree, state, bits, + wake, delete); + } else { + start = state->start; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and clear the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + if (!prealloc) + prealloc = alloc_extent_state(GFP_ATOMIC); + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + if (wake) + wake_up(&state->wq); + set |= clear_state_bit(tree, prealloc, bits, + wake, delete); + prealloc = NULL; + goto out; + } + + start = state->end + 1; + set |= clear_state_bit(tree, state, bits, wake, delete); + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return set; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +static int wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) + __releases(tree->lock) + __acquires(tree->lock) +{ + DEFINE_WAIT(wait); + prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&tree->lock); + schedule(); + spin_lock(&tree->lock); + finish_wait(&state->wq, &wait); + return 0; +} + +/* + * waits for one or more bits to clear on a range in the state tree. + * The range [start, end] is inclusive. + * The tree lock is taken by this function + */ +int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) +{ + struct extent_state *state; + struct rb_node *node; + + spin_lock(&tree->lock); +again: + while (1) { + /* + * this search will find all the extents that end after + * our range starts + */ + node = tree_search(tree, start); + if (!node) + break; + + state = rb_entry(node, struct extent_state, rb_node); + + if (state->start > end) + goto out; + + if (state->state & bits) { + start = state->start; + atomic_inc(&state->refs); + wait_on_state(tree, state); + free_extent_state(state); + goto again; + } + start = state->end + 1; + + if (start > end) + break; + + if (need_resched()) { + spin_unlock(&tree->lock); + cond_resched(); + spin_lock(&tree->lock); + } + } +out: + spin_unlock(&tree->lock); + return 0; +} + +static void set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, + int bits) +{ + if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + tree->dirty_bytes += range; + } + set_state_cb(tree, state, bits); + state->state |= bits; +} + +/* + * set some bits on a range in the tree. This may require allocations + * or sleeping, so the gfp mask is used to indicate what is allowed. + * + * If 'exclusive' == 1, this will fail with -EEXIST if some part of the + * range already has the desired bits set. The start of the existing + * range is returned in failed_start in this case. + * + * [start, end] is inclusive + * This takes the tree lock. + */ +static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive, u64 *failed_start, + gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err = 0; + int set; + u64 last_start; + u64 last_end; +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + err = insert_state(tree, prealloc, start, end, bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + goto out; + } + + state = rb_entry(node, struct extent_state, rb_node); + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + set = state->state & bits; + if (set && exclusive) { + *failed_start = state->start; + err = -EEXIST; + goto out; + } + set_state_bits(tree, state, bits); + start = state->end + 1; + merge_state(tree, state); + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on + * second half. + * + * If the extent we found extends past our + * range, we just split and search again. It'll get split + * again the next time though. + * + * If the extent we found is inside our range, we set the + * desired bit on it. + */ + if (state->start < start) { + set = state->state & bits; + if (exclusive && set) { + *failed_start = start; + err = -EEXIST; + goto out; + } + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, bits); + start = state->end + 1; + merge_state(tree, state); + } else { + start = state->start; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and + * ignore the extent we found. + */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + err = insert_state(tree, prealloc, start, this_end, + bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + if (err) + goto out; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + set = state->state & bits; + if (exclusive && set) { + *failed_start = start; + err = -EEXIST; + goto out; + } + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + set_state_bits(tree, prealloc, bits); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +/* wrappers around set/clear extent bit */ +int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, + mask); +} + +int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask); +} + +int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) +{ + return set_extent_bit(tree, start, end, bits, 0, NULL, + mask); +} + +int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, bits, 0, 0, mask); +} + +int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_DIRTY, + 0, NULL, mask); +} + +int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); +} + +int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask); +} + +int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, + mask); +} + +static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); +} + +int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, + mask); +} + +static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); +} + +static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, + 0, NULL, mask); +} + +static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask); +} + +int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) +{ + return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK); +} + +/* + * either insert or lock state struct between start and end use mask to tell + * us if waiting is desired. + */ +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +{ + int err; + u64 failed_start; + while (1) { + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, + &failed_start, mask); + if (err == -EEXIST && (mask & __GFP_WAIT)) { + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + start = failed_start; + } else { + break; + } + WARN_ON(start > end); + } + return err; +} + +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + int err; + u64 failed_start; + + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, + &failed_start, mask); + if (err == -EEXIST) { + if (failed_start > start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, 1, 0, mask); + return 0; + } + return 1; +} + +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); +} + +/* + * helper function to set pages and extents in the tree dirty + */ +int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + BUG_ON(!page); + __set_page_dirty_nobuffers(page); + page_cache_release(page); + index++; + } + set_extent_dirty(tree, start, end, GFP_NOFS); + return 0; +} + +/* + * helper function to set both pages and extents in the tree writeback + */ +static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + BUG_ON(!page); + set_page_writeback(page); + page_cache_release(page); + index++; + } + set_extent_writeback(tree, start, end, GFP_NOFS); + return 0; +} + +/* + * find the first offset in the io tree with 'bits' set. zero is + * returned if we find something, and *start_ret and *end_ret are + * set to reflect the state struct that was found. + * + * If nothing was found, 1 is returned, < 0 on error + */ +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && (state->state & bits)) { + *start_ret = state->start; + *end_ret = state->end; + ret = 0; + break; + } + node = rb_next(node); + if (!node) + break; + } +out: + spin_unlock(&tree->lock); + return ret; +} + +/* find the first state struct with 'bits' set after 'start', and + * return it. tree->lock must be held. NULL will returned if + * nothing was found after 'start' + */ +struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, int bits) +{ + struct rb_node *node; + struct extent_state *state; + + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && (state->state & bits)) + return state; + + node = rb_next(node); + if (!node) + break; + } +out: + return NULL; +} + +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_delalloc_range(struct extent_io_tree *tree, + u64 *start, u64 *end, u64 max_bytes) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 found = 0; + u64 total_bytes = 0; + + spin_lock(&tree->lock); + + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, cur_start); + if (!node) { + if (!found) + *end = (u64)-1; + goto out; + } + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (found && (state->start != cur_start || + (state->state & EXTENT_BOUNDARY))) { + goto out; + } + if (!(state->state & EXTENT_DELALLOC)) { + if (!found) + *end = state->end; + goto out; + } + if (!found) + *start = state->start; + found++; + *end = state->end; + cur_start = state->end + 1; + node = rb_next(node); + if (!node) + break; + total_bytes += state->end - state->start + 1; + if (total_bytes >= max_bytes) + break; + } +out: + spin_unlock(&tree->lock); + return found; +} + +static noinline int __unlock_for_delalloc(struct inode *inode, + struct page *locked_page, + u64 start, u64 end) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + + if (index == locked_page->index && end_index == index) + return 0; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, nr_pages, + ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (pages[i] != locked_page) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} + +static noinline int lock_delalloc_pages(struct inode *inode, + struct page *locked_page, + u64 delalloc_start, + u64 delalloc_end) +{ + unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; + unsigned long start_index = index; + unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; + unsigned long pages_locked = 0; + struct page *pages[16]; + unsigned long nrpages; + int ret; + int i; + + /* the caller is responsible for locking the start index */ + if (index == locked_page->index && index == end_index) + return 0; + + /* skip the page at the start index */ + nrpages = end_index - index + 1; + while (nrpages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nrpages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + ret = -EAGAIN; + goto done; + } + /* now we have an array of pages, lock them all */ + for (i = 0; i < ret; i++) { + /* + * the caller is taking responsibility for + * locked_page + */ + if (pages[i] != locked_page) { + lock_page(pages[i]); + if (!PageDirty(pages[i]) || + pages[i]->mapping != inode->i_mapping) { + ret = -EAGAIN; + unlock_page(pages[i]); + page_cache_release(pages[i]); + goto done; + } + } + page_cache_release(pages[i]); + pages_locked++; + } + nrpages -= ret; + index += ret; + cond_resched(); + } + ret = 0; +done: + if (ret && pages_locked) { + __unlock_for_delalloc(inode, locked_page, + delalloc_start, + ((u64)(start_index + pages_locked - 1)) << + PAGE_CACHE_SHIFT); + } + return ret; +} + +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, + u64 *start, u64 *end, + u64 max_bytes) +{ + u64 delalloc_start; + u64 delalloc_end; + u64 found; + int ret; + int loops = 0; + +again: + /* step one, find a bunch of delalloc bytes starting at start */ + delalloc_start = *start; + delalloc_end = 0; + found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, + max_bytes); + if (!found || delalloc_end <= *start) { + *start = delalloc_start; + *end = delalloc_end; + return found; + } + + /* + * start comes from the offset of locked_page. We have to lock + * pages in order, so we can't process delalloc bytes before + * locked_page + */ + if (delalloc_start < *start) + delalloc_start = *start; + + /* + * make sure to limit the number of pages we try to lock down + * if we're looping. + */ + if (delalloc_end + 1 - delalloc_start > max_bytes && loops) + delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; + + /* step two, lock all the pages after the page that has start */ + ret = lock_delalloc_pages(inode, locked_page, + delalloc_start, delalloc_end); + if (ret == -EAGAIN) { + /* some of the pages are gone, lets avoid looping by + * shortening the size of the delalloc range we're searching + */ + if (!loops) { + unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); + max_bytes = PAGE_CACHE_SIZE - offset; + loops = 1; + goto again; + } else { + found = 0; + goto out_failed; + } + } + BUG_ON(ret); + + /* step three, lock the state bits for the whole range */ + lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + + /* then test to make sure it is all still delalloc */ + ret = test_range_bit(tree, delalloc_start, delalloc_end, + EXTENT_DELALLOC, 1); + if (!ret) { + unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + __unlock_for_delalloc(inode, locked_page, + delalloc_start, delalloc_end); + cond_resched(); + goto again; + } + *start = delalloc_start; + *end = delalloc_end; +out_failed: + return found; +} + +int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, + int unlock_pages, + int clear_unlock, + int clear_delalloc, int clear_dirty, + int set_writeback, + int end_writeback) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + int clear_bits = 0; + + if (clear_unlock) + clear_bits |= EXTENT_LOCKED; + if (clear_dirty) + clear_bits |= EXTENT_DIRTY; + + if (clear_delalloc) + clear_bits |= EXTENT_DELALLOC; + + clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); + if (!(unlock_pages || clear_dirty || set_writeback || end_writeback)) + return 0; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (pages[i] == locked_page) { + page_cache_release(pages[i]); + continue; + } + if (clear_dirty) + clear_page_dirty_for_io(pages[i]); + if (set_writeback) + set_page_writeback(pages[i]); + if (end_writeback) + end_page_writeback(pages[i]); + if (unlock_pages) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} + +/* + * count the number of bytes in the tree that have a given bit(s) + * set. This can be fairly slow, except for EXTENT_DIRTY which is + * cached. The total number found is returned. + */ +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, u64 max_bytes, + unsigned long bits) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 total_bytes = 0; + int found = 0; + + if (search_end <= cur_start) { + WARN_ON(1); + return 0; + } + + spin_lock(&tree->lock); + if (cur_start == 0 && bits == EXTENT_DIRTY) { + total_bytes = tree->dirty_bytes; + goto out; + } + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, cur_start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->start > search_end) + break; + if (state->end >= cur_start && (state->state & bits)) { + total_bytes += min(search_end, state->end) + 1 - + max(cur_start, state->start); + if (total_bytes >= max_bytes) + break; + if (!found) { + *start = state->start; + found = 1; + } + } + node = rb_next(node); + if (!node) + break; + } +out: + spin_unlock(&tree->lock); + return total_bytes; +} + +#if 0 +/* + * helper function to lock both pages and extents in the tree. + * pages must be locked first. + */ +static int lock_range(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + int err; + + while (index <= end_index) { + page = grab_cache_page(tree->mapping, index); + if (!page) { + err = -ENOMEM; + goto failed; + } + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto failed; + } + index++; + } + lock_extent(tree, start, end, GFP_NOFS); + return 0; + +failed: + /* + * we failed above in getting the page at 'index', so we undo here + * up to but not including the page at 'index' + */ + end_index = index; + index = start >> PAGE_CACHE_SHIFT; + while (index < end_index) { + page = find_get_page(tree->mapping, index); + unlock_page(page); + page_cache_release(page); + index++; + } + return err; +} + +/* + * helper function to unlock both pages and extents in the tree. + */ +static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + unlock_page(page); + page_cache_release(page); + index++; + } + unlock_extent(tree, start, end, GFP_NOFS); + return 0; +} +#endif + +/* + * set the private field for a given byte offset in the tree. If there isn't + * an extent_state there already, this does nothing. + */ +int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + state->private = private; +out: + spin_unlock(&tree->lock); + return ret; +} + +int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + *private = state->private; +out: + spin_unlock(&tree->lock); + return ret; +} + +/* + * searches a range in the state tree for a given mask. + * If 'filled' == 1, this returns 1 only if every extent in the tree + * has the bits set. Otherwise, 1 is returned if any bit in the + * range is found set. + */ +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int filled) +{ + struct extent_state *state = NULL; + struct rb_node *node; + int bitset = 0; + + spin_lock(&tree->lock); + node = tree_search(tree, start); + while (node && start <= end) { + state = rb_entry(node, struct extent_state, rb_node); + + if (filled && state->start > start) { + bitset = 0; + break; + } + + if (state->start > end) + break; + + if (state->state & bits) { + bitset = 1; + if (!filled) + break; + } else if (filled) { + bitset = 0; + break; + } + start = state->end + 1; + if (start > end) + break; + node = rb_next(node); + if (!node) { + if (filled) + bitset = 0; + break; + } + } + spin_unlock(&tree->lock); + return bitset; +} + +/* + * helper function to set a given page up to date if all the + * extents in the tree for that page are up to date + */ +static int check_page_uptodate(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) + SetPageUptodate(page); + return 0; +} + +/* + * helper function to unlock a page if all the extents in the tree + * for that page are unlocked + */ +static int check_page_locked(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) + unlock_page(page); + return 0; +} + +/* + * helper function to end page writeback if all the extents + * in the tree for that page are done with writeback + */ +static int check_page_writeback(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) + end_page_writeback(page); + return 0; +} + +/* lots and lots of room for performance fixes in the end_bio funcs */ + +/* + * after a writepage IO is done, we need to: + * clear the uptodate bits on error + * clear the writeback bits in the extent tree for this IO + * end_page_writeback if the page has no more pending IO + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. + */ +static void end_bio_extent_writepage(struct bio *bio, int err) +{ + int uptodate = err == 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree; + u64 start; + u64 end; + int whole_page; + int ret; + + do { + struct page *page = bvec->bv_page; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) + whole_page = 1; + else + whole_page = 0; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + if (tree->ops && tree->ops->writepage_end_io_hook) { + ret = tree->ops->writepage_end_io_hook(page, start, + end, NULL, uptodate); + if (ret) + uptodate = 0; + } + + if (!uptodate && tree->ops && + tree->ops->writepage_io_failed_hook) { + ret = tree->ops->writepage_io_failed_hook(bio, page, + start, end, NULL); + if (ret == 0) { + uptodate = (err == 0); + continue; + } + } + + if (!uptodate) { + clear_extent_uptodate(tree, start, end, GFP_ATOMIC); + ClearPageUptodate(page); + SetPageError(page); + } + + clear_extent_writeback(tree, start, end, GFP_ATOMIC); + + if (whole_page) + end_page_writeback(page); + else + check_page_writeback(tree, page); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +/* + * after a readpage IO is done, we need to: + * clear the uptodate bits on error + * set the uptodate bits if things worked + * set the page up to date if all extents in the tree are uptodate + * clear the lock bit in the extent tree + * unlock the page if there are no other extents locked for it + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. + */ +static void end_bio_extent_readpage(struct bio *bio, int err) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree; + u64 start; + u64 end; + int whole_page; + int ret; + + if (err) + uptodate = 0; + + do { + struct page *page = bvec->bv_page; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) + whole_page = 1; + else + whole_page = 0; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { + ret = tree->ops->readpage_end_io_hook(page, start, end, + NULL); + if (ret) + uptodate = 0; + } + if (!uptodate && tree->ops && + tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook(bio, page, + start, end, NULL); + if (ret == 0) { + uptodate = + test_bit(BIO_UPTODATE, &bio->bi_flags); + if (err) + uptodate = 0; + continue; + } + } + + if (uptodate) { + set_extent_uptodate(tree, start, end, + GFP_ATOMIC); + } + unlock_extent(tree, start, end, GFP_ATOMIC); + + if (whole_page) { + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { + if (uptodate) { + check_page_uptodate(tree, page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + check_page_locked(tree, page); + } + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +/* + * IO done from prepare_write is pretty simple, we just unlock + * the structs in the extent tree when done, and set the uptodate bits + * as appropriate. + */ +static void end_bio_extent_preparewrite(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree; + u64 start; + u64 end; + + do { + struct page *page = bvec->bv_page; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (uptodate) { + set_extent_uptodate(tree, start, end, GFP_ATOMIC); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + + unlock_extent(tree, start, end, GFP_ATOMIC); + + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +static struct bio * +extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags) +{ + struct bio *bio; + + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_size = 0; + bio->bi_bdev = bdev; + bio->bi_sector = first_sector; + } + return bio; +} + +static int submit_one_bio(int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + int ret = 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct page *page = bvec->bv_page; + struct extent_io_tree *tree = bio->bi_private; + u64 start; + u64 end; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + bio->bi_private = NULL; + + bio_get(bio); + + if (tree->ops && tree->ops->submit_bio_hook) + tree->ops->submit_bio_hook(page->mapping->host, rw, bio, + mirror_num, bio_flags); + else + submit_bio(rw, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + bio_put(bio); + return ret; +} + +static int submit_extent_page(int rw, struct extent_io_tree *tree, + struct page *page, sector_t sector, + size_t size, unsigned long offset, + struct block_device *bdev, + struct bio **bio_ret, + unsigned long max_pages, + bio_end_io_t end_io_func, + int mirror_num, + unsigned long prev_bio_flags, + unsigned long bio_flags) +{ + int ret = 0; + struct bio *bio; + int nr; + int contig = 0; + int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; + int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; + size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); + + if (bio_ret && *bio_ret) { + bio = *bio_ret; + if (old_compressed) + contig = bio->bi_sector == sector; + else + contig = bio->bi_sector + (bio->bi_size >> 9) == + sector; + + if (prev_bio_flags != bio_flags || !contig || + (tree->ops && tree->ops->merge_bio_hook && + tree->ops->merge_bio_hook(page, offset, page_size, bio, + bio_flags)) || + bio_add_page(bio, page, page_size, offset) < page_size) { + ret = submit_one_bio(rw, bio, mirror_num, + prev_bio_flags); + bio = NULL; + } else { + return 0; + } + } + if (this_compressed) + nr = BIO_MAX_PAGES; + else + nr = bio_get_nr_vecs(bdev); + + bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + + bio_add_page(bio, page, page_size, offset); + bio->bi_end_io = end_io_func; + bio->bi_private = tree; + + if (bio_ret) + *bio_ret = bio; + else + ret = submit_one_bio(rw, bio, mirror_num, bio_flags); + + return ret; +} + +void set_page_extent_mapped(struct page *page) +{ + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, EXTENT_PAGE_PRIVATE); + } +} + +static void set_page_extent_head(struct page *page, unsigned long len) +{ + set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); +} + +/* + * basic readpage implementation. Locked extent state structs are inserted + * into the tree that are removed when the IO is done (by the end_io + * handlers) + */ +static int __extent_read_full_page(struct extent_io_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct bio **bio, int mirror_num, + unsigned long *bio_flags) +{ + struct inode *inode = page->mapping->host; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + u64 cur_end; + sector_t sector; + struct extent_map *em; + struct block_device *bdev; + int ret; + int nr = 0; + size_t page_offset = 0; + size_t iosize; + size_t disk_io_size; + size_t blocksize = inode->i_sb->s_blocksize; + unsigned long this_bio_flag = 0; + + set_page_extent_mapped(page); + + end = page_end; + lock_extent(tree, start, end, GFP_NOFS); + + if (page->index == last_byte >> PAGE_CACHE_SHIFT) { + char *userpage; + size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + iosize = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + zero_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + } + } + while (cur <= end) { + if (cur >= last_byte) { + char *userpage; + iosize = PAGE_CACHE_SIZE - page_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + page_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + set_extent_uptodate(tree, cur, cur + iosize - 1, + GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + break; + } + em = get_extent(inode, page, page_offset, cur, + end - cur + 1, 0); + if (IS_ERR(em) || !em) { + SetPageError(page); + unlock_extent(tree, cur, end, GFP_NOFS); + break; + } + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + this_bio_flag = EXTENT_BIO_COMPRESSED; + + iosize = min(extent_map_end(em) - cur, end - cur + 1); + cur_end = min(extent_map_end(em) - 1, end); + iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + if (this_bio_flag & EXTENT_BIO_COMPRESSED) { + disk_io_size = em->block_len; + sector = em->block_start >> 9; + } else { + sector = (em->block_start + extent_offset) >> 9; + disk_io_size = iosize; + } + bdev = em->bdev; + block_start = em->block_start; + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + block_start = EXTENT_MAP_HOLE; + free_extent_map(em); + em = NULL; + + /* we've found a hole, just zero and go on */ + if (block_start == EXTENT_MAP_HOLE) { + char *userpage; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + page_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + + set_extent_uptodate(tree, cur, cur + iosize - 1, + GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + /* the get_extent function already copied into the page */ + if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { + check_page_uptodate(tree, page); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + /* we have an inline extent but it didn't get marked up + * to date. Error out + */ + if (block_start == EXTENT_MAP_INLINE) { + SetPageError(page); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + + ret = 0; + if (tree->ops && tree->ops->readpage_io_hook) { + ret = tree->ops->readpage_io_hook(page, cur, + cur + iosize - 1); + } + if (!ret) { + unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; + pnr -= page->index; + ret = submit_extent_page(READ, tree, page, + sector, disk_io_size, page_offset, + bdev, bio, pnr, + end_bio_extent_readpage, mirror_num, + *bio_flags, + this_bio_flag); + nr++; + *bio_flags = this_bio_flag; + } + if (ret) + SetPageError(page); + cur = cur + iosize; + page_offset += iosize; + } + if (!nr) { + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + } + return 0; +} + +int extent_read_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent) +{ + struct bio *bio = NULL; + unsigned long bio_flags = 0; + int ret; + + ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, + &bio_flags); + if (bio) + submit_one_bio(READ, bio, 0, bio_flags); + return ret; +} + +/* + * the writepage semantics are similar to regular writepage. extent + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback. Then the lock bits are removed + * and the end_io handler clears the writeback ranges + */ +static int __extent_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct inode *inode = page->mapping->host; + struct extent_page_data *epd = data; + struct extent_io_tree *tree = epd->tree; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 delalloc_start; + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + u64 iosize; + u64 unlock_start; + sector_t sector; + struct extent_map *em; + struct block_device *bdev; + int ret; + int nr = 0; + size_t pg_offset = 0; + size_t blocksize; + loff_t i_size = i_size_read(inode); + unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; + u64 nr_delalloc; + u64 delalloc_end; + int page_started; + int compressed; + unsigned long nr_written = 0; + + WARN_ON(!PageLocked(page)); + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); + if (page->index > end_index || + (page->index == end_index && !pg_offset)) { + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + return 0; + } + + if (page->index == end_index) { + char *userpage; + + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + pg_offset, 0, + PAGE_CACHE_SIZE - pg_offset); + kunmap_atomic(userpage, KM_USER0); + flush_dcache_page(page); + } + pg_offset = 0; + + set_page_extent_mapped(page); + + delalloc_start = start; + delalloc_end = 0; + page_started = 0; + if (!epd->extent_locked) { + while (delalloc_end < page_end) { + nr_delalloc = find_lock_delalloc_range(inode, tree, + page, + &delalloc_start, + &delalloc_end, + 128 * 1024 * 1024); + if (nr_delalloc == 0) { + delalloc_start = delalloc_end + 1; + continue; + } + tree->ops->fill_delalloc(inode, page, delalloc_start, + delalloc_end, &page_started, + &nr_written); + delalloc_start = delalloc_end + 1; + } + + /* did the fill delalloc function already unlock and start + * the IO? + */ + if (page_started) { + ret = 0; + goto update_nr_written; + } + } + lock_extent(tree, start, page_end, GFP_NOFS); + + unlock_start = start; + + if (tree->ops && tree->ops->writepage_start_hook) { + ret = tree->ops->writepage_start_hook(page, start, + page_end); + if (ret == -EAGAIN) { + unlock_extent(tree, start, page_end, GFP_NOFS); + redirty_page_for_writepage(wbc, page); + unlock_page(page); + ret = 0; + goto update_nr_written; + } + } + + nr_written++; + + end = page_end; + if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) + printk(KERN_ERR "btrfs delalloc bits after lock_extent\n"); + + if (last_byte <= start) { + clear_extent_dirty(tree, start, page_end, GFP_NOFS); + unlock_extent(tree, start, page_end, GFP_NOFS); + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + page_end, NULL, 1); + unlock_start = page_end + 1; + goto done; + } + + set_extent_uptodate(tree, start, page_end, GFP_NOFS); + blocksize = inode->i_sb->s_blocksize; + + while (cur <= end) { + if (cur >= last_byte) { + clear_extent_dirty(tree, cur, page_end, GFP_NOFS); + unlock_extent(tree, unlock_start, page_end, GFP_NOFS); + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + page_end, NULL, 1); + unlock_start = page_end + 1; + break; + } + em = epd->get_extent(inode, page, pg_offset, cur, + end - cur + 1, 1); + if (IS_ERR(em) || !em) { + SetPageError(page); + break; + } + + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + iosize = min(extent_map_end(em) - cur, end - cur + 1); + iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + sector = (em->block_start + extent_offset) >> 9; + bdev = em->bdev; + block_start = em->block_start; + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + free_extent_map(em); + em = NULL; + + /* + * compressed and inline extents are written through other + * paths in the FS + */ + if (compressed || block_start == EXTENT_MAP_HOLE || + block_start == EXTENT_MAP_INLINE) { + clear_extent_dirty(tree, cur, + cur + iosize - 1, GFP_NOFS); + + unlock_extent(tree, unlock_start, cur + iosize - 1, + GFP_NOFS); + + /* + * end_io notification does not happen here for + * compressed extents + */ + if (!compressed && tree->ops && + tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + cur + iosize - 1, + NULL, 1); + else if (compressed) { + /* we don't want to end_page_writeback on + * a compressed extent. this happens + * elsewhere + */ + nr++; + } + + cur += iosize; + pg_offset += iosize; + unlock_start = cur; + continue; + } + /* leave this out until we have a page_mkwrite call */ + if (0 && !test_range_bit(tree, cur, cur + iosize - 1, + EXTENT_DIRTY, 0)) { + cur = cur + iosize; + pg_offset += iosize; + continue; + } + + clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); + if (tree->ops && tree->ops->writepage_io_hook) { + ret = tree->ops->writepage_io_hook(page, cur, + cur + iosize - 1); + } else { + ret = 0; + } + if (ret) { + SetPageError(page); + } else { + unsigned long max_nr = end_index + 1; + + set_range_writeback(tree, cur, cur + iosize - 1); + if (!PageWriteback(page)) { + printk(KERN_ERR "btrfs warning page %lu not " + "writeback, cur %llu end %llu\n", + page->index, (unsigned long long)cur, + (unsigned long long)end); + } + + ret = submit_extent_page(WRITE, tree, page, sector, + iosize, pg_offset, bdev, + &epd->bio, max_nr, + end_bio_extent_writepage, + 0, 0, 0); + if (ret) + SetPageError(page); + } + cur = cur + iosize; + pg_offset += iosize; + nr++; + } +done: + if (nr == 0) { + /* make sure the mapping tag for page dirty gets cleared */ + set_page_writeback(page); + end_page_writeback(page); + } + if (unlock_start <= page_end) + unlock_extent(tree, unlock_start, page_end, GFP_NOFS); + unlock_page(page); + +update_nr_written: + wbc->nr_to_write -= nr_written; + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && + wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) + page->mapping->writeback_index = page->index + nr_written; + return 0; +} + +/** + * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @writepage: function called for each page + * @data: data passed to writepage function + * + * If a page is already under I/O, write_cache_pages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. + */ +static int extent_write_cache_pages(struct extent_io_tree *tree, + struct address_space *mapping, + struct writeback_control *wbc, + writepage_t writepage, void *data, + void (*flush_fn)(void *)) +{ + struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int range_whole = 0; + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + return 0; + } + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + scanned = 1; + } +retry: + while (!done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + if (tree->ops && tree->ops->write_cache_pages_lock_hook) + tree->ops->write_cache_pages_lock_hook(page); + else + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) { + if (PageWriteback(page)) + flush_fn(data); + wait_on_page_writeback(page); + } + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + ret = (*writepage)(page, wbc, data); + + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { + unlock_page(page); + ret = 0; + } + if (ret || wbc->nr_to_write <= 0) + done = 1; + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; + } + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + return ret; +} + +static noinline void flush_write_bio(void *data) +{ + struct extent_page_data *epd = data; + if (epd->bio) { + submit_one_bio(WRITE, epd->bio, 0, 0); + epd->bio = NULL; + } +} + +int extent_write_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + int ret; + struct address_space *mapping = page->mapping; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 0, + }; + struct writeback_control wbc_writepages = { + .bdi = wbc->bdi, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .nr_to_write = 64, + .range_start = page_offset(page) + PAGE_CACHE_SIZE, + .range_end = (loff_t)-1, + }; + + + ret = __extent_writepage(page, wbc, &epd); + + extent_write_cache_pages(tree, mapping, &wbc_writepages, + __extent_writepage, &epd, flush_write_bio); + if (epd.bio) + submit_one_bio(WRITE, epd.bio, 0, 0); + return ret; +} + +int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, + u64 start, u64 end, get_extent_t *get_extent, + int mode) +{ + int ret = 0; + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 1, + }; + struct writeback_control wbc_writepages = { + .bdi = inode->i_mapping->backing_dev_info, + .sync_mode = mode, + .older_than_this = NULL, + .nr_to_write = nr_pages * 2, + .range_start = start, + .range_end = end + 1, + }; + + while (start <= end) { + page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + if (clear_page_dirty_for_io(page)) + ret = __extent_writepage(page, &wbc_writepages, &epd); + else { + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + start + PAGE_CACHE_SIZE - 1, + NULL, 1); + unlock_page(page); + } + page_cache_release(page); + start += PAGE_CACHE_SIZE; + } + + if (epd.bio) + submit_one_bio(WRITE, epd.bio, 0, 0); + return ret; +} + +int extent_writepages(struct extent_io_tree *tree, + struct address_space *mapping, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + int ret = 0; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 0, + }; + + ret = extent_write_cache_pages(tree, mapping, wbc, + __extent_writepage, &epd, + flush_write_bio); + if (epd.bio) + submit_one_bio(WRITE, epd.bio, 0, 0); + return ret; +} + +int extent_readpages(struct extent_io_tree *tree, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages, + get_extent_t get_extent) +{ + struct bio *bio = NULL; + unsigned page_idx; + struct pagevec pvec; + unsigned long bio_flags = 0; + + pagevec_init(&pvec, 0); + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + /* + * what we want to do here is call add_to_page_cache_lru, + * but that isn't exported, so we reproduce it here + */ + if (!add_to_page_cache(page, mapping, + page->index, GFP_KERNEL)) { + + /* open coding of lru_cache_add, also not exported */ + page_cache_get(page); + if (!pagevec_add(&pvec, page)) + __pagevec_lru_add_file(&pvec); + __extent_read_full_page(tree, page, get_extent, + &bio, 0, &bio_flags); + } + page_cache_release(page); + } + if (pagevec_count(&pvec)) + __pagevec_lru_add_file(&pvec); + BUG_ON(!list_empty(pages)); + if (bio) + submit_one_bio(READ, bio, 0, bio_flags); + return 0; +} + +/* + * basic invalidatepage code, this waits on any locked or writeback + * ranges corresponding to the page, and then deletes any extent state + * records from the tree + */ +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset) +{ + u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); + u64 end = start + PAGE_CACHE_SIZE - 1; + size_t blocksize = page->mapping->host->i_sb->s_blocksize; + + start += (offset + blocksize - 1) & ~(blocksize - 1); + if (start > end) + return 0; + + lock_extent(tree, start, end, GFP_NOFS); + wait_on_extent_writeback(tree, start, end); + clear_extent_bit(tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, + 1, 1, GFP_NOFS); + return 0; +} + +/* + * simple commit_write call, set_range_dirty is used to mark both + * the pages and the extent records as dirty + */ +int extent_commit_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to) +{ + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + + set_page_extent_mapped(page); + set_page_dirty(page); + + if (pos > inode->i_size) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + return 0; +} + +int extent_prepare_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to, get_extent_t *get_extent) +{ + u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + u64 block_start; + u64 orig_block_start; + u64 block_end; + u64 cur_end; + struct extent_map *em; + unsigned blocksize = 1 << inode->i_blkbits; + size_t page_offset = 0; + size_t block_off_start; + size_t block_off_end; + int err = 0; + int iocount = 0; + int ret = 0; + int isnew; + + set_page_extent_mapped(page); + + block_start = (page_start + from) & ~((u64)blocksize - 1); + block_end = (page_start + to - 1) | (blocksize - 1); + orig_block_start = block_start; + + lock_extent(tree, page_start, page_end, GFP_NOFS); + while (block_start <= block_end) { + em = get_extent(inode, page, page_offset, block_start, + block_end - block_start + 1, 1); + if (IS_ERR(em) || !em) + goto err; + + cur_end = min(block_end, extent_map_end(em) - 1); + block_off_start = block_start & (PAGE_CACHE_SIZE - 1); + block_off_end = block_off_start + blocksize; + isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS); + + if (!PageUptodate(page) && isnew && + (block_off_end > to || block_off_start < from)) { + void *kaddr; + + kaddr = kmap_atomic(page, KM_USER0); + if (block_off_end > to) + memset(kaddr + to, 0, block_off_end - to); + if (block_off_start < from) + memset(kaddr + block_off_start, 0, + from - block_off_start); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + } + if ((em->block_start != EXTENT_MAP_HOLE && + em->block_start != EXTENT_MAP_INLINE) && + !isnew && !PageUptodate(page) && + (block_off_end > to || block_off_start < from) && + !test_range_bit(tree, block_start, cur_end, + EXTENT_UPTODATE, 1)) { + u64 sector; + u64 extent_offset = block_start - em->start; + size_t iosize; + sector = (em->block_start + extent_offset) >> 9; + iosize = (cur_end - block_start + blocksize) & + ~((u64)blocksize - 1); + /* + * we've already got the extent locked, but we + * need to split the state such that our end_bio + * handler can clear the lock. + */ + set_extent_bit(tree, block_start, + block_start + iosize - 1, + EXTENT_LOCKED, 0, NULL, GFP_NOFS); + ret = submit_extent_page(READ, tree, page, + sector, iosize, page_offset, em->bdev, + NULL, 1, + end_bio_extent_preparewrite, 0, + 0, 0); + iocount++; + block_start = block_start + iosize; + } else { + set_extent_uptodate(tree, block_start, cur_end, + GFP_NOFS); + unlock_extent(tree, block_start, cur_end, GFP_NOFS); + block_start = cur_end + 1; + } + page_offset = block_start & (PAGE_CACHE_SIZE - 1); + free_extent_map(em); + } + if (iocount) { + wait_extent_bit(tree, orig_block_start, + block_end, EXTENT_LOCKED); + } + check_page_uptodate(tree, page); +err: + /* FIXME, zero out newly allocated blocks on error */ + return err; +} + +/* + * a helper for releasepage, this tests for areas of the page that + * are locked or under IO and drops the related state bits if it is safe + * to drop the page. + */ +int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + int ret = 1; + + if (test_range_bit(tree, start, end, + EXTENT_IOBITS | EXTENT_ORDERED, 0)) + ret = 0; + else { + if ((mask & GFP_NOFS) == GFP_NOFS) + mask = GFP_NOFS; + clear_extent_bit(tree, start, end, EXTENT_UPTODATE, + 1, 1, mask); + } + return ret; +} + +/* + * a helper for releasepage. As long as there are no locked extents + * in the range corresponding to the page, both state records and extent + * map records are removed + */ +int try_release_extent_mapping(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask) +{ + struct extent_map *em; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + + if ((mask & __GFP_WAIT) && + page->mapping->host->i_size > 16 * 1024 * 1024) { + u64 len; + while (start <= end) { + len = end - start + 1; + spin_lock(&map->lock); + em = lookup_extent_mapping(map, start, len); + if (!em || IS_ERR(em)) { + spin_unlock(&map->lock); + break; + } + if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || + em->start != start) { + spin_unlock(&map->lock); + free_extent_map(em); + break; + } + if (!test_range_bit(tree, em->start, + extent_map_end(em) - 1, + EXTENT_LOCKED | EXTENT_WRITEBACK | + EXTENT_ORDERED, + 0)) { + remove_extent_mapping(map, em); + /* once for the rb tree */ + free_extent_map(em); + } + start = extent_map_end(em); + spin_unlock(&map->lock); + + /* once for us */ + free_extent_map(em); + } + } + return try_release_extent_state(map, tree, page, mask); +} + +sector_t extent_bmap(struct address_space *mapping, sector_t iblock, + get_extent_t *get_extent) +{ + struct inode *inode = mapping->host; + u64 start = iblock << inode->i_blkbits; + sector_t sector = 0; + size_t blksize = (1 << inode->i_blkbits); + struct extent_map *em; + + lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, + GFP_NOFS); + em = get_extent(inode, NULL, 0, start, blksize, 0); + unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, + GFP_NOFS); + if (!em || IS_ERR(em)) + return 0; + + if (em->block_start > EXTENT_MAP_LAST_BYTE) + goto out; + + sector = (em->block_start + start - em->start) >> inode->i_blkbits; +out: + free_extent_map(em); + return sector; +} + +static inline struct page *extent_buffer_page(struct extent_buffer *eb, + unsigned long i) +{ + struct page *p; + struct address_space *mapping; + + if (i == 0) + return eb->first_page; + i += eb->start >> PAGE_CACHE_SHIFT; + mapping = eb->first_page->mapping; + if (!mapping) + return NULL; + + /* + * extent_buffer_page is only called after pinning the page + * by increasing the reference count. So we know the page must + * be in the radix tree. + */ + rcu_read_lock(); + p = radix_tree_lookup(&mapping->page_tree, i); + rcu_read_unlock(); + + return p; +} + +static inline unsigned long num_extent_pages(u64 start, u64 len) +{ + return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + (start >> PAGE_CACHE_SHIFT); +} + +static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, + unsigned long len, + gfp_t mask) +{ + struct extent_buffer *eb = NULL; +#ifdef LEAK_DEBUG + unsigned long flags; +#endif + + eb = kmem_cache_zalloc(extent_buffer_cache, mask); + eb->start = start; + eb->len = len; + mutex_init(&eb->mutex); +#ifdef LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_add(&eb->leak_list, &buffers); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + atomic_set(&eb->refs, 1); + + return eb; +} + +static void __free_extent_buffer(struct extent_buffer *eb) +{ +#ifdef LEAK_DEBUG + unsigned long flags; + spin_lock_irqsave(&leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + kmem_cache_free(extent_buffer_cache, eb); +} + +struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + struct page *page0, + gfp_t mask) +{ + unsigned long num_pages = num_extent_pages(start, len); + unsigned long i; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct extent_buffer *eb; + struct extent_buffer *exists = NULL; + struct page *p; + struct address_space *mapping = tree->mapping; + int uptodate = 1; + + spin_lock(&tree->buffer_lock); + eb = buffer_search(tree, start); + if (eb) { + atomic_inc(&eb->refs); + spin_unlock(&tree->buffer_lock); + mark_page_accessed(eb->first_page); + return eb; + } + spin_unlock(&tree->buffer_lock); + + eb = __alloc_extent_buffer(tree, start, len, mask); + if (!eb) + return NULL; + + if (page0) { + eb->first_page = page0; + i = 1; + index++; + page_cache_get(page0); + mark_page_accessed(page0); + set_page_extent_mapped(page0); + set_page_extent_head(page0, len); + uptodate = PageUptodate(page0); + } else { + i = 0; + } + for (; i < num_pages; i++, index++) { + p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); + if (!p) { + WARN_ON(1); + goto free_eb; + } + set_page_extent_mapped(p); + mark_page_accessed(p); + if (i == 0) { + eb->first_page = p; + set_page_extent_head(p, len); + } else { + set_page_private(p, EXTENT_PAGE_PRIVATE); + } + if (!PageUptodate(p)) + uptodate = 0; + unlock_page(p); + } + if (uptodate) + eb->flags |= EXTENT_UPTODATE; + eb->flags |= EXTENT_BUFFER_FILLED; + + spin_lock(&tree->buffer_lock); + exists = buffer_tree_insert(tree, start, &eb->rb_node); + if (exists) { + /* add one reference for the caller */ + atomic_inc(&exists->refs); + spin_unlock(&tree->buffer_lock); + goto free_eb; + } + spin_unlock(&tree->buffer_lock); + + /* add one reference for the tree */ + atomic_inc(&eb->refs); + return eb; + +free_eb: + if (!atomic_dec_and_test(&eb->refs)) + return exists; + for (index = 1; index < i; index++) + page_cache_release(extent_buffer_page(eb, index)); + page_cache_release(extent_buffer_page(eb, 0)); + __free_extent_buffer(eb); + return exists; +} + +struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + gfp_t mask) +{ + struct extent_buffer *eb; + + spin_lock(&tree->buffer_lock); + eb = buffer_search(tree, start); + if (eb) + atomic_inc(&eb->refs); + spin_unlock(&tree->buffer_lock); + + if (eb) + mark_page_accessed(eb->first_page); + + return eb; +} + +void free_extent_buffer(struct extent_buffer *eb) +{ + if (!eb) + return; + + if (!atomic_dec_and_test(&eb->refs)) + return; + + WARN_ON(1); +} + +int clear_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + int set; + unsigned long i; + unsigned long num_pages; + struct page *page; + + u64 start = eb->start; + u64 end = start + eb->len - 1; + + set = clear_extent_dirty(tree, start, end, GFP_NOFS); + num_pages = num_extent_pages(eb->start, eb->len); + + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!set && !PageDirty(page)) + continue; + + lock_page(page); + if (i == 0) + set_page_extent_head(page, eb->len); + else + set_page_private(page, EXTENT_PAGE_PRIVATE); + + /* + * if we're on the last page or the first page and the + * block isn't aligned on a page boundary, do extra checks + * to make sure we don't clean page that is partially dirty + */ + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { + start = (u64)page->index << PAGE_CACHE_SHIFT; + end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, + EXTENT_DIRTY, 0)) { + unlock_page(page); + continue; + } + } + clear_page_dirty_for_io(page); + spin_lock_irq(&page->mapping->tree_lock); + if (!PageDirty(page)) { + radix_tree_tag_clear(&page->mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&page->mapping->tree_lock); + unlock_page(page); + } + return 0; +} + +int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + return wait_on_extent_writeback(tree, eb->start, + eb->start + eb->len - 1); +} + +int set_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *page = extent_buffer_page(eb, i); + /* writepage may need to do something special for the + * first page, we have to make sure page->private is + * properly set. releasepage may drop page->private + * on us if the page isn't already dirty. + */ + lock_page(page); + if (i == 0) { + set_page_extent_head(page, eb->len); + } else if (PagePrivate(page) && + page->private != EXTENT_PAGE_PRIVATE) { + set_page_extent_mapped(page); + } + __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); + set_extent_dirty(tree, page_offset(page), + page_offset(page) + PAGE_CACHE_SIZE - 1, + GFP_NOFS); + unlock_page(page); + } + return 0; +} + +int clear_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + eb->flags &= ~EXTENT_UPTODATE; + + clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (page) + ClearPageUptodate(page); + } + return 0; +} + +int set_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + + set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { + check_page_uptodate(tree, page); + continue; + } + SetPageUptodate(page); + } + return 0; +} + +int extent_range_uptodate(struct extent_io_tree *tree, + u64 start, u64 end) +{ + struct page *page; + int ret; + int pg_uptodate = 1; + int uptodate; + unsigned long index; + + ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1); + if (ret) + return 1; + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + page = find_get_page(tree->mapping, index); + uptodate = PageUptodate(page); + page_cache_release(page); + if (!uptodate) { + pg_uptodate = 0; + break; + } + start += PAGE_CACHE_SIZE; + } + return pg_uptodate; +} + +int extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + int ret = 0; + unsigned long num_pages; + unsigned long i; + struct page *page; + int pg_uptodate = 1; + + if (eb->flags & EXTENT_UPTODATE) + return 1; + + ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1); + if (ret) + return ret; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!PageUptodate(page)) { + pg_uptodate = 0; + break; + } + } + return pg_uptodate; +} + +int read_extent_buffer_pages(struct extent_io_tree *tree, + struct extent_buffer *eb, + u64 start, int wait, + get_extent_t *get_extent, int mirror_num) +{ + unsigned long i; + unsigned long start_i; + struct page *page; + int err; + int ret = 0; + int locked_pages = 0; + int all_uptodate = 1; + int inc_all_pages = 0; + unsigned long num_pages; + struct bio *bio = NULL; + unsigned long bio_flags = 0; + + if (eb->flags & EXTENT_UPTODATE) + return 0; + + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1)) { + return 0; + } + + if (start) { + WARN_ON(start < eb->start); + start_i = (start >> PAGE_CACHE_SHIFT) - + (eb->start >> PAGE_CACHE_SHIFT); + } else { + start_i = 0; + } + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!wait) { + if (!trylock_page(page)) + goto unlock_exit; + } else { + lock_page(page); + } + locked_pages++; + if (!PageUptodate(page)) + all_uptodate = 0; + } + if (all_uptodate) { + if (start_i == 0) + eb->flags |= EXTENT_UPTODATE; + goto unlock_exit; + } + + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (inc_all_pages) + page_cache_get(page); + if (!PageUptodate(page)) { + if (start_i == 0) + inc_all_pages = 1; + ClearPageError(page); + err = __extent_read_full_page(tree, page, + get_extent, &bio, + mirror_num, &bio_flags); + if (err) + ret = err; + } else { + unlock_page(page); + } + } + + if (bio) + submit_one_bio(READ, bio, mirror_num, bio_flags); + + if (ret || !wait) + return ret; + + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + wait_on_page_locked(page); + if (!PageUptodate(page)) + ret = -EIO; + } + + if (!ret) + eb->flags |= EXTENT_UPTODATE; + return ret; + +unlock_exit: + i = start_i; + while (locked_pages > 0) { + page = extent_buffer_page(eb, i); + i++; + unlock_page(page); + locked_pages--; + } + return ret; +} + +void read_extent_buffer(struct extent_buffer *eb, void *dstv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *dst = (char *)dstv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + kaddr = kmap_atomic(page, KM_USER1); + memcpy(dst, kaddr + offset, cur); + kunmap_atomic(kaddr, KM_USER1); + + dst += cur; + len -= cur; + offset = 0; + i++; + } +} + +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + size_t offset = start & (PAGE_CACHE_SIZE - 1); + char *kaddr; + struct page *p; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + unsigned long end_i = (start_offset + start + min_len - 1) >> + PAGE_CACHE_SHIFT; + + if (i != end_i) + return -EINVAL; + + if (i == 0) { + offset = start_offset; + *map_start = 0; + } else { + offset = 0; + *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; + } + + if (start + min_len > eb->len) { + printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " + "wanted %lu %lu\n", (unsigned long long)eb->start, + eb->len, start, min_len); + WARN_ON(1); + } + + p = extent_buffer_page(eb, i); + kaddr = kmap_atomic(p, km); + *token = kaddr; + *map = kaddr + offset; + *map_len = PAGE_CACHE_SIZE - offset; + return 0; +} + +int map_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, + char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + int err; + int save = 0; + if (eb->map_token) { + unmap_extent_buffer(eb, eb->map_token, km); + eb->map_token = NULL; + save = 1; + WARN_ON(!mutex_is_locked(&eb->mutex)); + } + err = map_private_extent_buffer(eb, start, min_len, token, map, + map_start, map_len, km); + if (!err && save) { + eb->map_token = *token; + eb->kaddr = *map; + eb->map_start = *map_start; + eb->map_len = *map_len; + } + return err; +} + +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) +{ + kunmap_atomic(token, km); +} + +int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *ptr = (char *)ptrv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + int ret = 0; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + + kaddr = kmap_atomic(page, KM_USER0); + ret = memcmp(ptr, kaddr + offset, cur); + kunmap_atomic(kaddr, KM_USER0); + if (ret) + break; + + ptr += cur; + len -= cur; + offset = 0; + i++; + } + return ret; +} + +void write_extent_buffer(struct extent_buffer *eb, const void *srcv, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *src = (char *)srcv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + kaddr = kmap_atomic(page, KM_USER1); + memcpy(kaddr + offset, src, cur); + kunmap_atomic(kaddr, KM_USER1); + + src += cur; + len -= cur; + offset = 0; + i++; + } +} + +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, c, cur); + kunmap_atomic(kaddr, KM_USER0); + + len -= cur; + offset = 0; + i++; + } +} + +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len) +{ + u64 dst_len = dst->len; + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + + WARN_ON(src->len != dst_len); + + offset = (start_offset + dst_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(dst, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); + + kaddr = kmap_atomic(page, KM_USER0); + read_extent_buffer(src, kaddr + offset, src_offset, cur); + kunmap_atomic(kaddr, KM_USER0); + + src_offset += cur; + len -= cur; + offset = 0; + i++; + } +} + +static void move_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + if (dst_page == src_page) { + memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); + } else { + char *src_kaddr = kmap_atomic(src_page, KM_USER1); + char *p = dst_kaddr + dst_off + len; + char *s = src_kaddr + src_off + len; + + while (len--) + *--p = *--s; + + kunmap_atomic(src_kaddr, KM_USER1); + } + kunmap_atomic(dst_kaddr, KM_USER0); +} + +static void copy_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *src_kaddr; + + if (dst_page != src_page) + src_kaddr = kmap_atomic(src_page, KM_USER1); + else + src_kaddr = dst_kaddr; + + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); + kunmap_atomic(dst_kaddr, KM_USER0); + if (dst_page != src_page) + kunmap_atomic(src_kaddr, KM_USER1); +} + +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + "len %lu dst len %lu\n", src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + "len %lu dst len %lu\n", dst_offset, len, dst->len); + BUG_ON(1); + } + + while (len > 0) { + dst_off_in_page = (start_offset + dst_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = (start_offset + src_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - + src_off_in_page)); + cur = min_t(unsigned long, cur, + (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); + + copy_pages(extent_buffer_page(dst, dst_i), + extent_buffer_page(dst, src_i), + dst_off_in_page, src_off_in_page, cur); + + src_offset += cur; + dst_offset += cur; + len -= cur; + } +} + +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + unsigned long dst_end = dst_offset + len - 1; + unsigned long src_end = src_offset + len - 1; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + "len %lu len %lu\n", src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + "len %lu len %lu\n", dst_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset < src_offset) { + memcpy_extent_buffer(dst, dst_offset, src_offset, len); + return; + } + while (len > 0) { + dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; + + dst_off_in_page = (start_offset + dst_end) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = (start_offset + src_end) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + cur = min_t(unsigned long, len, src_off_in_page + 1); + cur = min(cur, dst_off_in_page + 1); + move_pages(extent_buffer_page(dst, dst_i), + extent_buffer_page(dst, src_i), + dst_off_in_page - cur + 1, + src_off_in_page - cur + 1, cur); + + dst_end -= cur; + src_end -= cur; + len -= cur; + } +} + +int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) +{ + u64 start = page_offset(page); + struct extent_buffer *eb; + int ret = 1; + unsigned long i; + unsigned long num_pages; + + spin_lock(&tree->buffer_lock); + eb = buffer_search(tree, start); + if (!eb) + goto out; + + if (atomic_read(&eb->refs) > 1) { + ret = 0; + goto out; + } + /* at this point we can safely release the extent buffer */ + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) + page_cache_release(extent_buffer_page(eb, i)); + rb_erase(&eb->rb_node, &tree->buffer); + __free_extent_buffer(eb); +out: + spin_unlock(&tree->buffer_lock); + return ret; +} diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h new file mode 100644 index 00000000000..c5b483a7913 --- /dev/null +++ b/fs/btrfs/extent_io.h @@ -0,0 +1,269 @@ +#ifndef __EXTENTIO__ +#define __EXTENTIO__ + +#include <linux/rbtree.h> + +/* bits for the extent state */ +#define EXTENT_DIRTY 1 +#define EXTENT_WRITEBACK (1 << 1) +#define EXTENT_UPTODATE (1 << 2) +#define EXTENT_LOCKED (1 << 3) +#define EXTENT_NEW (1 << 4) +#define EXTENT_DELALLOC (1 << 5) +#define EXTENT_DEFRAG (1 << 6) +#define EXTENT_DEFRAG_DONE (1 << 7) +#define EXTENT_BUFFER_FILLED (1 << 8) +#define EXTENT_ORDERED (1 << 9) +#define EXTENT_ORDERED_METADATA (1 << 10) +#define EXTENT_BOUNDARY (1 << 11) +#define EXTENT_NODATASUM (1 << 12) +#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) + +/* flags for bio submission */ +#define EXTENT_BIO_COMPRESSED 1 + +/* + * page->private values. Every page that is controlled by the extent + * map has page->private set to one. + */ +#define EXTENT_PAGE_PRIVATE 1 +#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 + +struct extent_state; + +typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags); +struct extent_io_ops { + int (*fill_delalloc)(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written); + int (*writepage_start_hook)(struct page *page, u64 start, u64 end); + int (*writepage_io_hook)(struct page *page, u64 start, u64 end); + extent_submit_bio_hook_t *submit_bio_hook; + int (*merge_bio_hook)(struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags); + int (*readpage_io_hook)(struct page *page, u64 start, u64 end); + int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state); + int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state); + int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, + struct extent_state *state); + int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate); + int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits); + int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits); + int (*write_cache_pages_lock_hook)(struct page *page); +}; + +struct extent_io_tree { + struct rb_root state; + struct rb_root buffer; + struct address_space *mapping; + u64 dirty_bytes; + spinlock_t lock; + spinlock_t buffer_lock; + struct extent_io_ops *ops; +}; + +struct extent_state { + u64 start; + u64 end; /* inclusive */ + struct rb_node rb_node; + struct extent_io_tree *tree; + wait_queue_head_t wq; + atomic_t refs; + unsigned long state; + + /* for use by the FS */ + u64 private; + + struct list_head leak_list; +}; + +struct extent_buffer { + u64 start; + unsigned long len; + char *map_token; + char *kaddr; + unsigned long map_start; + unsigned long map_len; + struct page *first_page; + atomic_t refs; + int flags; + struct list_head leak_list; + struct rb_node rb_node; + struct mutex mutex; +}; + +struct extent_map_tree; + +static inline struct extent_state *extent_state_next(struct extent_state *state) +{ + struct rb_node *node; + node = rb_next(&state->rb_node); + if (!node) + return NULL; + return rb_entry(node, struct extent_state, rb_node); +} + +typedef struct extent_map *(get_extent_t)(struct inode *inode, + struct page *page, + size_t page_offset, + u64 start, u64 len, + int create); + +void extent_io_tree_init(struct extent_io_tree *tree, + struct address_space *mapping, gfp_t mask); +int try_release_extent_mapping(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask); +int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page); +int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask); +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int extent_read_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent); +int __init extent_io_init(void); +void extent_io_exit(void); + +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, + u64 max_bytes, unsigned long bits); + +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int filled); +int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int wake, int delete, gfp_t mask); +int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); +int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask); +int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits); +struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, int bits); +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset); +int extent_write_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc); +int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, + u64 start, u64 end, get_extent_t *get_extent, + int mode); +int extent_writepages(struct extent_io_tree *tree, + struct address_space *mapping, + get_extent_t *get_extent, + struct writeback_control *wbc); +int extent_readpages(struct extent_io_tree *tree, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages, + get_extent_t get_extent); +int extent_prepare_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to, get_extent_t *get_extent); +int extent_commit_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to); +sector_t extent_bmap(struct address_space *mapping, sector_t iblock, + get_extent_t *get_extent); +int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end); +int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); +int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); +void set_page_extent_mapped(struct page *page); + +struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + struct page *page0, + gfp_t mask); +struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + gfp_t mask); +void free_extent_buffer(struct extent_buffer *eb); +int read_extent_buffer_pages(struct extent_io_tree *tree, + struct extent_buffer *eb, u64 start, int wait, + get_extent_t *get_extent, int mirror_num); + +static inline void extent_buffer_get(struct extent_buffer *eb) +{ + atomic_inc(&eb->refs); +} + +int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len); +void read_extent_buffer(struct extent_buffer *eb, void *dst, + unsigned long start, + unsigned long len); +void write_extent_buffer(struct extent_buffer *eb, const void *src, + unsigned long start, unsigned long len); +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len); +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len); +int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, + struct extent_buffer *eb); +int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end); +int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); +int clear_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int clear_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km); +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km); +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); +int release_extent_buffer_tail_pages(struct extent_buffer *eb); +int extent_range_uptodate(struct extent_io_tree *tree, + u64 start, u64 end); +int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, + int unlock_page, + int clear_unlock, + int clear_delalloc, int clear_dirty, + int set_writeback, + int end_writeback); +#endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c new file mode 100644 index 00000000000..4a83e33ada3 --- /dev/null +++ b/fs/btrfs/extent_map.c @@ -0,0 +1,351 @@ +#include <linux/err.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/version.h> +#include <linux/hardirq.h> +#include "extent_map.h" + +/* temporary define until extent_map moves out of btrfs */ +struct kmem_cache *btrfs_cache_create(const char *name, size_t size, + unsigned long extra_flags, + void (*ctor)(void *, struct kmem_cache *, + unsigned long)); + +static struct kmem_cache *extent_map_cache; + +int __init extent_map_init(void) +{ + extent_map_cache = btrfs_cache_create("extent_map", + sizeof(struct extent_map), 0, + NULL); + if (!extent_map_cache) + return -ENOMEM; + return 0; +} + +void extent_map_exit(void) +{ + if (extent_map_cache) + kmem_cache_destroy(extent_map_cache); +} + +/** + * extent_map_tree_init - initialize extent map tree + * @tree: tree to initialize + * @mask: flags for memory allocations during tree operations + * + * Initialize the extent tree @tree. Should be called for each new inode + * or other user of the extent_map interface. + */ +void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) +{ + tree->map.rb_node = NULL; + spin_lock_init(&tree->lock); +} +EXPORT_SYMBOL(extent_map_tree_init); + +/** + * alloc_extent_map - allocate new extent map structure + * @mask: memory allocation flags + * + * Allocate a new extent_map structure. The new structure is + * returned with a reference count of one and needs to be + * freed using free_extent_map() + */ +struct extent_map *alloc_extent_map(gfp_t mask) +{ + struct extent_map *em; + em = kmem_cache_alloc(extent_map_cache, mask); + if (!em || IS_ERR(em)) + return em; + em->in_tree = 0; + em->flags = 0; + atomic_set(&em->refs, 1); + return em; +} +EXPORT_SYMBOL(alloc_extent_map); + +/** + * free_extent_map - drop reference count of an extent_map + * @em: extent map beeing releasead + * + * Drops the reference out on @em by one and free the structure + * if the reference count hits zero. + */ +void free_extent_map(struct extent_map *em) +{ + if (!em) + return; + WARN_ON(atomic_read(&em->refs) == 0); + if (atomic_dec_and_test(&em->refs)) { + WARN_ON(em->in_tree); + kmem_cache_free(extent_map_cache, em); + } +} +EXPORT_SYMBOL(free_extent_map); + +static struct rb_node *tree_insert(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct extent_map *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct extent_map, rb_node); + + WARN_ON(!entry->in_tree); + + if (offset < entry->start) + p = &(*p)->rb_left; + else if (offset >= extent_map_end(entry)) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct extent_map, rb_node); + entry->in_tree = 1; + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * search through the tree for an extent_map with a given offset. If + * it can't be found, try to find some neighboring extents + */ +static struct rb_node *__tree_search(struct rb_root *root, u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret) +{ + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct extent_map *entry; + struct extent_map *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct extent_map, rb_node); + prev = n; + prev_entry = entry; + + WARN_ON(!entry->in_tree); + + if (offset < entry->start) + n = n->rb_left; + else if (offset >= extent_map_end(entry)) + n = n->rb_right; + else + return n; + } + + if (prev_ret) { + orig_prev = prev; + while (prev && offset >= extent_map_end(prev_entry)) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + *prev_ret = prev; + prev = orig_prev; + } + + if (next_ret) { + prev_entry = rb_entry(prev, struct extent_map, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + *next_ret = prev; + } + return NULL; +} + +/* + * look for an offset in the tree, and if it can't be found, return + * the first offset we can find smaller than 'offset'. + */ +static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) +{ + struct rb_node *prev; + struct rb_node *ret; + ret = __tree_search(root, offset, &prev, NULL); + if (!ret) + return prev; + return ret; +} + +/* check to see if two extent_map structs are adjacent and safe to merge */ +static int mergable_maps(struct extent_map *prev, struct extent_map *next) +{ + if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) + return 0; + + /* + * don't merge compressed extents, we need to know their + * actual size + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) + return 0; + + if (extent_map_end(prev) == next->start && + prev->flags == next->flags && + prev->bdev == next->bdev && + ((next->block_start == EXTENT_MAP_HOLE && + prev->block_start == EXTENT_MAP_HOLE) || + (next->block_start == EXTENT_MAP_INLINE && + prev->block_start == EXTENT_MAP_INLINE) || + (next->block_start == EXTENT_MAP_DELALLOC && + prev->block_start == EXTENT_MAP_DELALLOC) || + (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && + next->block_start == extent_map_block_end(prev)))) { + return 1; + } + return 0; +} + +/** + * add_extent_mapping - add new extent map to the extent tree + * @tree: tree to insert new map in + * @em: map to insert + * + * Insert @em into @tree or perform a simple forward/backward merge with + * existing mappings. The extent_map struct passed in will be inserted + * into the tree directly, with an additional reference taken, or a + * reference dropped if the merge attempt was sucessfull. + */ +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em) +{ + int ret = 0; + struct extent_map *merge = NULL; + struct rb_node *rb; + struct extent_map *exist; + + exist = lookup_extent_mapping(tree, em->start, em->len); + if (exist) { + free_extent_map(exist); + ret = -EEXIST; + goto out; + } + assert_spin_locked(&tree->lock); + rb = tree_insert(&tree->map, em->start, &em->rb_node); + if (rb) { + ret = -EEXIST; + free_extent_map(merge); + goto out; + } + atomic_inc(&em->refs); + if (em->start != 0) { + rb = rb_prev(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(merge, em)) { + em->start = merge->start; + em->len += merge->len; + em->block_len += merge->block_len; + em->block_start = merge->block_start; + merge->in_tree = 0; + rb_erase(&merge->rb_node, &tree->map); + free_extent_map(merge); + } + } + rb = rb_next(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(em, merge)) { + em->len += merge->len; + em->block_len += merge->len; + rb_erase(&merge->rb_node, &tree->map); + merge->in_tree = 0; + free_extent_map(merge); + } +out: + return ret; +} +EXPORT_SYMBOL(add_extent_mapping); + +/* simple helper to do math around the end of an extent, handling wrap */ +static u64 range_end(u64 start, u64 len) +{ + if (start + len < start) + return (u64)-1; + return start + len; +} + +/** + * lookup_extent_mapping - lookup extent_map + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. There may be additional objects in the tree that + * intersect, so check the object returned carefully to make sure that no + * additional lookups are needed. + */ +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + struct extent_map *em; + struct rb_node *rb_node; + struct rb_node *prev = NULL; + struct rb_node *next = NULL; + u64 end = range_end(start, len); + + assert_spin_locked(&tree->lock); + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node && prev) { + em = rb_entry(prev, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + } + if (!rb_node && next) { + em = rb_entry(next, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + } + if (!rb_node) { + em = NULL; + goto out; + } + if (IS_ERR(rb_node)) { + em = ERR_PTR(PTR_ERR(rb_node)); + goto out; + } + em = rb_entry(rb_node, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + + em = NULL; + goto out; + +found: + atomic_inc(&em->refs); +out: + return em; +} +EXPORT_SYMBOL(lookup_extent_mapping); + +/** + * remove_extent_mapping - removes an extent_map from the extent tree + * @tree: extent tree to remove from + * @em: extent map beeing removed + * + * Removes @em from @tree. No reference counts are dropped, and no checks + * are done to see if the range is in use + */ +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) +{ + int ret = 0; + + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + assert_spin_locked(&tree->lock); + rb_erase(&em->rb_node, &tree->map); + em->in_tree = 0; + return ret; +} +EXPORT_SYMBOL(remove_extent_mapping); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h new file mode 100644 index 00000000000..fb6eeef06bb --- /dev/null +++ b/fs/btrfs/extent_map.h @@ -0,0 +1,62 @@ +#ifndef __EXTENTMAP__ +#define __EXTENTMAP__ + +#include <linux/rbtree.h> + +#define EXTENT_MAP_LAST_BYTE (u64)-4 +#define EXTENT_MAP_HOLE (u64)-3 +#define EXTENT_MAP_INLINE (u64)-2 +#define EXTENT_MAP_DELALLOC (u64)-1 + +/* bits for the flags field */ +#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ +#define EXTENT_FLAG_COMPRESSED 1 +#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ +#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ + +struct extent_map { + struct rb_node rb_node; + + /* all of these are in bytes */ + u64 start; + u64 len; + u64 orig_start; + u64 block_start; + u64 block_len; + unsigned long flags; + struct block_device *bdev; + atomic_t refs; + int in_tree; +}; + +struct extent_map_tree { + struct rb_root map; + spinlock_t lock; +}; + +static inline u64 extent_map_end(struct extent_map *em) +{ + if (em->start + em->len < em->start) + return (u64)-1; + return em->start + em->len; +} + +static inline u64 extent_map_block_end(struct extent_map *em) +{ + if (em->block_start + em->block_len < em->block_start) + return (u64)-1; + return em->block_start + em->block_len; +} + +void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask); +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em); +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); + +struct extent_map *alloc_extent_map(gfp_t mask); +void free_extent_map(struct extent_map *em); +int __init extent_map_init(void); +void extent_map_exit(void); +#endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c new file mode 100644 index 00000000000..964652435fd --- /dev/null +++ b/fs/btrfs/file-item.c @@ -0,0 +1,831 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/bio.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" + +#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) * 2) / \ + size) - 1)) + +#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ + sizeof(struct btrfs_ordered_sum)) / \ + sizeof(struct btrfs_sector_sum) * \ + (r)->sectorsize - (r)->sectorsize) + +int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding) +{ + int ret = 0; + struct btrfs_file_extent_item *item; + struct btrfs_key file_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + + path = btrfs_alloc_path(); + BUG_ON(!path); + file_key.objectid = objectid; + file_key.offset = pos; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + + ret = btrfs_insert_empty_item(trans, root, path, &file_key, + sizeof(*item)); + if (ret < 0) + goto out; + BUG_ON(ret); + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset); + btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); + btrfs_set_file_extent_offset(leaf, item, offset); + btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes); + btrfs_set_file_extent_generation(leaf, item, trans->transid); + btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_compression(leaf, item, compression); + btrfs_set_file_extent_encryption(leaf, item, encryption); + btrfs_set_file_extent_other_encoding(leaf, item, other_encoding); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return ret; +} + +struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow) +{ + int ret; + struct btrfs_key file_key; + struct btrfs_key found_key; + struct btrfs_csum_item *item; + struct extent_buffer *leaf; + u64 csum_offset = 0; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int csums_in_item; + + file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + file_key.offset = bytenr; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); + if (ret < 0) + goto fail; + leaf = path->nodes[0]; + if (ret > 0) { + ret = 1; + if (path->slots[0] == 0) + goto fail; + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY) + goto fail; + + csum_offset = (bytenr - found_key.offset) >> + root->fs_info->sb->s_blocksize_bits; + csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); + csums_in_item /= csum_size; + + if (csum_offset >= csums_in_item) { + ret = -EFBIG; + goto fail; + } + } + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); + return item; +fail: + if (ret > 0) + ret = -ENOENT; + return ERR_PTR(ret); +} + + +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 offset, int mod) +{ + int ret; + struct btrfs_key file_key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + + file_key.objectid = objectid; + file_key.offset = offset; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); + return ret; +} + + +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst) +{ + u32 sum; + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + u64 offset; + u64 item_start_offset = 0; + u64 item_last_offset = 0; + u64 disk_bytenr; + u32 diff; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int ret; + struct btrfs_path *path; + struct btrfs_csum_item *item = NULL; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + + path = btrfs_alloc_path(); + if (bio->bi_size > PAGE_CACHE_SIZE * 8) + path->reada = 2; + + WARN_ON(bio->bi_vcnt <= 0); + + disk_bytenr = (u64)bio->bi_sector << 9; + while (bio_index < bio->bi_vcnt) { + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); + if (ret == 0) + goto found; + + if (!item || disk_bytenr < item_start_offset || + disk_bytenr >= item_last_offset) { + struct btrfs_key found_key; + u32 item_size; + + if (item) + btrfs_release_path(root, path); + item = btrfs_lookup_csum(NULL, root->fs_info->csum_root, + path, disk_bytenr, 0); + if (IS_ERR(item)) { + ret = PTR_ERR(item); + if (ret == -ENOENT || ret == -EFBIG) + ret = 0; + sum = 0; + if (BTRFS_I(inode)->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + set_extent_bits(io_tree, offset, + offset + bvec->bv_len - 1, + EXTENT_NODATASUM, GFP_NOFS); + } else { + printk(KERN_INFO "btrfs no csum found " + "for inode %lu start %llu\n", + inode->i_ino, + (unsigned long long)offset); + } + item = NULL; + btrfs_release_path(root, path); + goto found; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + + item_start_offset = found_key.offset; + item_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + item_last_offset = item_start_offset + + (item_size / csum_size) * + root->sectorsize; + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + } + /* + * this byte range must be able to fit inside + * a single leaf so it will also fit inside a u32 + */ + diff = disk_bytenr - item_start_offset; + diff = diff / root->sectorsize; + diff = diff * csum_size; + + read_extent_buffer(path->nodes[0], &sum, + ((unsigned long)item) + diff, + csum_size); +found: + if (dst) + *dst++ = sum; + else + set_state_private(io_tree, offset, sum); + disk_bytenr += bvec->bv_len; + bio_index++; + bvec++; + } + btrfs_free_path(path); + return 0; +} + +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_csum_item *item; + unsigned long offset; + int ret; + size_t size; + u64 csum_end; + u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = start; + key.type = BTRFS_EXTENT_CSUM_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + if (ret > 0 && path->slots[0] > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == BTRFS_EXTENT_CSUM_KEY) { + offset = (start - key.offset) >> + root->fs_info->sb->s_blocksize_bits; + if (offset * csum_size < + btrfs_item_size_nr(leaf, path->slots[0] - 1)) + path->slots[0]--; + } + } + + while (start <= end) { + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto fail; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY) + break; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.offset > end) + break; + + if (key.offset > start) + start = key.offset; + + size = btrfs_item_size_nr(leaf, path->slots[0]); + csum_end = key.offset + (size / csum_size) * root->sectorsize; + if (csum_end <= start) { + path->slots[0]++; + continue; + } + + csum_end = min(csum_end, end + 1); + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + while (start < csum_end) { + size = min_t(size_t, csum_end - start, + MAX_ORDERED_SUM_BYTES(root)); + sums = kzalloc(btrfs_ordered_sum_size(root, size), + GFP_NOFS); + BUG_ON(!sums); + + sector_sum = sums->sums; + sums->bytenr = start; + sums->len = size; + + offset = (start - key.offset) >> + root->fs_info->sb->s_blocksize_bits; + offset *= csum_size; + + while (size > 0) { + read_extent_buffer(path->nodes[0], + §or_sum->sum, + ((unsigned long)item) + + offset, csum_size); + sector_sum->bytenr = start; + + size -= root->sectorsize; + start += root->sectorsize; + offset += csum_size; + sector_sum++; + } + list_add_tail(&sums->list, list); + } + path->slots[0]++; + } + ret = 0; +fail: + btrfs_free_path(path); + return ret; +} + +int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 file_start, int contig) +{ + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + char *data; + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + unsigned long total_bytes = 0; + unsigned long this_sum_bytes = 0; + u64 offset; + u64 disk_bytenr; + + WARN_ON(bio->bi_vcnt <= 0); + sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); + if (!sums) + return -ENOMEM; + + sector_sum = sums->sums; + disk_bytenr = (u64)bio->bi_sector << 9; + sums->len = bio->bi_size; + INIT_LIST_HEAD(&sums->list); + + if (contig) + offset = file_start; + else + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + + ordered = btrfs_lookup_ordered_extent(inode, offset); + BUG_ON(!ordered); + sums->bytenr = ordered->start; + + while (bio_index < bio->bi_vcnt) { + if (!contig) + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + + if (!contig && (offset >= ordered->file_offset + ordered->len || + offset < ordered->file_offset)) { + unsigned long bytes_left; + sums->len = this_sum_bytes; + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + + bytes_left = bio->bi_size - total_bytes; + + sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), + GFP_NOFS); + BUG_ON(!sums); + sector_sum = sums->sums; + sums->len = bytes_left; + ordered = btrfs_lookup_ordered_extent(inode, offset); + BUG_ON(!ordered); + sums->bytenr = ordered->start; + } + + data = kmap_atomic(bvec->bv_page, KM_USER0); + sector_sum->sum = ~(u32)0; + sector_sum->sum = btrfs_csum_data(root, + data + bvec->bv_offset, + sector_sum->sum, + bvec->bv_len); + kunmap_atomic(data, KM_USER0); + btrfs_csum_final(sector_sum->sum, + (char *)§or_sum->sum); + sector_sum->bytenr = disk_bytenr; + + sector_sum++; + bio_index++; + total_bytes += bvec->bv_len; + this_sum_bytes += bvec->bv_len; + disk_bytenr += bvec->bv_len; + offset += bvec->bv_len; + bvec++; + } + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + return 0; +} + +/* + * helper function for csum removal, this expects the + * key to describe the csum pointed to by the path, and it expects + * the csum to overlap the range [bytenr, len] + * + * The csum should not be entirely contained in the range and the + * range should not be entirely contained in the csum. + * + * This calls btrfs_truncate_item with the correct args based on the + * overlap, and fixes up the key as required. + */ +static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u64 bytenr, u64 len) +{ + struct extent_buffer *leaf; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + u64 csum_end; + u64 end_byte = bytenr + len; + u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; + int ret; + + leaf = path->nodes[0]; + csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end <<= root->fs_info->sb->s_blocksize_bits; + csum_end += key->offset; + + if (key->offset < bytenr && csum_end <= end_byte) { + /* + * [ bytenr - len ] + * [ ] + * [csum ] + * A simple truncate off the end of the item + */ + u32 new_size = (bytenr - key->offset) >> blocksize_bits; + new_size *= csum_size; + ret = btrfs_truncate_item(trans, root, path, new_size, 1); + BUG_ON(ret); + } else if (key->offset >= bytenr && csum_end > end_byte && + end_byte > key->offset) { + /* + * [ bytenr - len ] + * [ ] + * [csum ] + * we need to truncate from the beginning of the csum + */ + u32 new_size = (csum_end - end_byte) >> blocksize_bits; + new_size *= csum_size; + + ret = btrfs_truncate_item(trans, root, path, new_size, 0); + BUG_ON(ret); + + key->offset = end_byte; + ret = btrfs_set_item_key_safe(trans, root, path, key); + BUG_ON(ret); + } else { + BUG(); + } + return 0; +} + +/* + * deletes the csum items from the csum tree for a given + * range of bytes. + */ +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len) +{ + struct btrfs_path *path; + struct btrfs_key key; + u64 end_byte = bytenr + len; + u64 csum_end; + struct extent_buffer *leaf; + int ret; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int blocksize_bits = root->fs_info->sb->s_blocksize_bits; + + root = root->fs_info->csum_root; + + path = btrfs_alloc_path(); + + while (1) { + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = end_byte - 1; + key.type = BTRFS_EXTENT_CSUM_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY) { + break; + } + + if (key.offset >= end_byte) + break; + + csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end <<= blocksize_bits; + csum_end += key.offset; + + /* this csum ends before we start, we're done */ + if (csum_end <= bytenr) + break; + + /* delete the entire item, it is inside our range */ + if (key.offset >= bytenr && csum_end <= end_byte) { + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + if (key.offset == bytenr) + break; + } else if (key.offset < bytenr && csum_end > end_byte) { + unsigned long offset; + unsigned long shift_len; + unsigned long item_offset; + /* + * [ bytenr - len ] + * [csum ] + * + * Our bytes are in the middle of the csum, + * we need to split this item and insert a new one. + * + * But we can't drop the path because the + * csum could change, get removed, extended etc. + * + * The trick here is the max size of a csum item leaves + * enough room in the tree block for a single + * item header. So, we split the item in place, + * adding a new header pointing to the existing + * bytes. Then we loop around again and we have + * a nicely formed csum item that we can neatly + * truncate. + */ + offset = (bytenr - key.offset) >> blocksize_bits; + offset *= csum_size; + + shift_len = (len >> blocksize_bits) * csum_size; + + item_offset = btrfs_item_ptr_offset(leaf, + path->slots[0]); + + memset_extent_buffer(leaf, 0, item_offset + offset, + shift_len); + key.offset = bytenr; + + /* + * btrfs_split_item returns -EAGAIN when the + * item changed size or key + */ + ret = btrfs_split_item(trans, root, path, &key, offset); + BUG_ON(ret && ret != -EAGAIN); + + key.offset = end_byte - 1; + } else { + ret = truncate_one_csum(trans, root, path, + &key, bytenr, len); + BUG_ON(ret); + if (key.offset < bytenr) + break; + } + btrfs_release_path(root, path); + } +out: + btrfs_free_path(path); + return 0; +} + +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums) +{ + u64 bytenr; + int ret; + struct btrfs_key file_key; + struct btrfs_key found_key; + u64 next_offset; + u64 total_bytes = 0; + int found_next; + struct btrfs_path *path; + struct btrfs_csum_item *item; + struct btrfs_csum_item *item_end; + struct extent_buffer *leaf = NULL; + u64 csum_offset; + struct btrfs_sector_sum *sector_sum; + u32 nritems; + u32 ins_size; + char *eb_map; + char *eb_token; + unsigned long map_len; + unsigned long map_start; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + + path = btrfs_alloc_path(); + BUG_ON(!path); + sector_sum = sums->sums; +again: + next_offset = (u64)-1; + found_next = 0; + file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + file_key.offset = sector_sum->bytenr; + bytenr = sector_sum->bytenr; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + + item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); + if (!IS_ERR(item)) { + leaf = path->nodes[0]; + ret = 0; + goto found; + } + ret = PTR_ERR(item); + if (ret == -EFBIG) { + u32 item_size; + /* we found one, but it isn't big enough yet */ + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if ((item_size / csum_size) >= + MAX_CSUM_ITEMS(root, csum_size)) { + /* already at max size, make a new one */ + goto insert; + } + } else { + int slot = path->slots[0] + 1; + /* we didn't find a csum item, insert one */ + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems - 1) { + ret = btrfs_next_leaf(root, path); + if (ret == 1) + found_next = 1; + if (ret != 0) + goto insert; + slot = 0; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); + if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + found_key.type != BTRFS_EXTENT_CSUM_KEY) { + found_next = 1; + goto insert; + } + next_offset = found_key.offset; + found_next = 1; + goto insert; + } + + /* + * at this point, we know the tree has an item, but it isn't big + * enough yet to put our csum in. Grow it + */ + btrfs_release_path(root, path); + ret = btrfs_search_slot(trans, root, &file_key, path, + csum_size, 1); + if (ret < 0) + goto fail_unlock; + + if (ret > 0) { + if (path->slots[0] == 0) + goto insert; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + csum_offset = (bytenr - found_key.offset) >> + root->fs_info->sb->s_blocksize_bits; + + if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY || + found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { + goto insert; + } + + if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / + csum_size) { + u32 diff = (csum_offset + 1) * csum_size; + + /* + * is the item big enough already? we dropped our lock + * before and need to recheck + */ + if (diff < btrfs_item_size_nr(leaf, path->slots[0])) + goto csum; + + diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); + if (diff != csum_size) + goto insert; + + ret = btrfs_extend_item(trans, root, path, diff); + BUG_ON(ret); + goto csum; + } + +insert: + btrfs_release_path(root, path); + csum_offset = 0; + if (found_next) { + u64 tmp = total_bytes + root->sectorsize; + u64 next_sector = sector_sum->bytenr; + struct btrfs_sector_sum *next = sector_sum + 1; + + while (tmp < sums->len) { + if (next_sector + root->sectorsize != next->bytenr) + break; + tmp += root->sectorsize; + next_sector = next->bytenr; + next++; + } + tmp = min(tmp, next_offset - file_key.offset); + tmp >>= root->fs_info->sb->s_blocksize_bits; + tmp = max((u64)1, tmp); + tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); + ins_size = csum_size * tmp; + } else { + ins_size = csum_size; + } + ret = btrfs_insert_empty_item(trans, root, path, &file_key, + ins_size); + if (ret < 0) + goto fail_unlock; + if (ret != 0) { + WARN_ON(1); + goto fail_unlock; + } +csum: + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + ret = 0; + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); +found: + item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + + btrfs_item_size_nr(leaf, path->slots[0])); + eb_token = NULL; + cond_resched(); +next_sector: + + if (!eb_token || + (unsigned long)item + csum_size >= map_start + map_len) { + int err; + + if (eb_token) + unmap_extent_buffer(leaf, eb_token, KM_USER1); + eb_token = NULL; + err = map_private_extent_buffer(leaf, (unsigned long)item, + csum_size, + &eb_token, &eb_map, + &map_start, &map_len, KM_USER1); + if (err) + eb_token = NULL; + } + if (eb_token) { + memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)), + §or_sum->sum, csum_size); + } else { + write_extent_buffer(leaf, §or_sum->sum, + (unsigned long)item, csum_size); + } + + total_bytes += root->sectorsize; + sector_sum++; + if (total_bytes < sums->len) { + item = (struct btrfs_csum_item *)((char *)item + + csum_size); + if (item < item_end && bytenr + PAGE_CACHE_SIZE == + sector_sum->bytenr) { + bytenr = sector_sum->bytenr; + goto next_sector; + } + } + if (eb_token) { + unmap_extent_buffer(leaf, eb_token, KM_USER1); + eb_token = NULL; + } + btrfs_mark_buffer_dirty(path->nodes[0]); + cond_resched(); + if (total_bytes < sums->len) { + btrfs_release_path(root, path); + goto again; + } +out: + btrfs_free_path(path); + return ret; + +fail_unlock: + goto out; +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c new file mode 100644 index 00000000000..90268334145 --- /dev/null +++ b/fs/btrfs/file.c @@ -0,0 +1,1288 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/version.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "tree-log.h" +#include "locking.h" +#include "compat.h" + + +/* simple helper to fault in pages and copy. This should go away + * and be replaced with calls into generic code. + */ +static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, + int write_bytes, + struct page **prepared_pages, + const char __user *buf) +{ + long page_fault = 0; + int i; + int offset = pos & (PAGE_CACHE_SIZE - 1); + + for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { + size_t count = min_t(size_t, + PAGE_CACHE_SIZE - offset, write_bytes); + struct page *page = prepared_pages[i]; + fault_in_pages_readable(buf, count); + + /* Copy data from userspace to the current page */ + kmap(page); + page_fault = __copy_from_user(page_address(page) + offset, + buf, count); + /* Flush processor's dcache for this page */ + flush_dcache_page(page); + kunmap(page); + buf += count; + write_bytes -= count; + + if (page_fault) + break; + } + return page_fault ? -EFAULT : 0; +} + +/* + * unlocks pages after btrfs_file_write is done with them + */ +static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) +{ + size_t i; + for (i = 0; i < num_pages; i++) { + if (!pages[i]) + break; + /* page checked is some magic around finding pages that + * have been modified without going through btrfs_set_page_dirty + * clear it here + */ + ClearPageChecked(pages[i]); + unlock_page(pages[i]); + mark_page_accessed(pages[i]); + page_cache_release(pages[i]); + } +} + +/* + * after copy_from_user, pages need to be dirtied and we need to make + * sure holes are created between the current EOF and the start of + * any next extents (if required). + * + * this also makes the decision about creating an inline extent vs + * doing real data extents, marking pages dirty and delalloc as required. + */ +static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct file *file, + struct page **pages, + size_t num_pages, + loff_t pos, + size_t write_bytes) +{ + int err = 0; + int i; + struct inode *inode = fdentry(file)->d_inode; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + u64 hint_byte; + u64 num_bytes; + u64 start_pos; + u64 end_of_last_block; + u64 end_pos = pos + write_bytes; + loff_t isize = i_size_read(inode); + + start_pos = pos & ~((u64)root->sectorsize - 1); + num_bytes = (write_bytes + pos - start_pos + + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + end_of_last_block = start_pos + num_bytes - 1; + + lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); + trans = btrfs_join_transaction(root, 1); + if (!trans) { + err = -ENOMEM; + goto out_unlock; + } + btrfs_set_trans_block_group(trans, inode); + hint_byte = 0; + + set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS); + + /* check for reserved extents on each page, we don't want + * to reset the delalloc bit on things that already have + * extents reserved. + */ + btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); + for (i = 0; i < num_pages; i++) { + struct page *p = pages[i]; + SetPageUptodate(p); + ClearPageChecked(p); + set_page_dirty(p); + } + if (end_pos > isize) { + i_size_write(inode, end_pos); + btrfs_update_inode(trans, root, inode); + } + err = btrfs_end_transaction(trans, root); +out_unlock: + unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); + return err; +} + +/* + * this drops all the extents in the cache that intersect the range + * [start, end]. Existing extents are split as required. + */ +int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned) +{ + struct extent_map *em; + struct extent_map *split = NULL; + struct extent_map *split2 = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 len = end - start + 1; + int ret; + int testend = 1; + unsigned long flags; + int compressed = 0; + + WARN_ON(end < start); + if (end == (u64)-1) { + len = (u64)-1; + testend = 0; + } + while (1) { + if (!split) + split = alloc_extent_map(GFP_NOFS); + if (!split2) + split2 = alloc_extent_map(GFP_NOFS); + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) { + spin_unlock(&em_tree->lock); + break; + } + flags = em->flags; + if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { + spin_unlock(&em_tree->lock); + if (em->start <= start && + (!testend || em->start + em->len >= start + len)) { + free_extent_map(em); + break; + } + if (start < em->start) { + len = em->start - start; + } else { + len = start + len - (em->start + em->len); + start = em->start + em->len; + } + free_extent_map(em); + continue; + } + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + remove_extent_mapping(em_tree, em); + + if (em->block_start < EXTENT_MAP_LAST_BYTE && + em->start < start) { + split->start = em->start; + split->len = start - em->start; + split->orig_start = em->orig_start; + split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + + split->bdev = em->bdev; + split->flags = flags; + ret = add_extent_mapping(em_tree, split); + BUG_ON(ret); + free_extent_map(split); + split = split2; + split2 = NULL; + } + if (em->block_start < EXTENT_MAP_LAST_BYTE && + testend && em->start + em->len > start + len) { + u64 diff = start + len - em->start; + + split->start = start + len; + split->len = em->start + em->len - (start + len); + split->bdev = em->bdev; + split->flags = flags; + + if (compressed) { + split->block_len = em->block_len; + split->block_start = em->block_start; + split->orig_start = em->orig_start; + } else { + split->block_len = split->len; + split->block_start = em->block_start + diff; + split->orig_start = split->start; + } + + ret = add_extent_mapping(em_tree, split); + BUG_ON(ret); + free_extent_map(split); + split = NULL; + } + spin_unlock(&em_tree->lock); + + /* once for us */ + free_extent_map(em); + /* once for the tree*/ + free_extent_map(em); + } + if (split) + free_extent_map(split); + if (split2) + free_extent_map(split2); + return 0; +} + +int btrfs_check_file(struct btrfs_root *root, struct inode *inode) +{ + return 0; +#if 0 + struct btrfs_path *path; + struct btrfs_key found_key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *extent; + u64 last_offset = 0; + int nritems; + int slot; + int found_type; + int ret; + int err = 0; + u64 extent_end = 0; + + path = btrfs_alloc_path(); + ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino, + last_offset, 0); + while (1) { + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + nritems = btrfs_header_nritems(path->nodes[0]); + } + slot = path->slots[0]; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid != inode->i_ino) + break; + if (found_key.type != BTRFS_EXTENT_DATA_KEY) + goto out; + + if (found_key.offset < last_offset) { + WARN_ON(1); + btrfs_print_leaf(root, leaf); + printk(KERN_ERR "inode %lu found offset %llu " + "expected %llu\n", inode->i_ino, + (unsigned long long)found_key.offset, + (unsigned long long)last_offset); + err = 1; + goto out; + } + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, extent); + if (found_type == BTRFS_FILE_EXTENT_REG) { + extent_end = found_key.offset + + btrfs_file_extent_num_bytes(leaf, extent); + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + struct btrfs_item *item; + item = btrfs_item_nr(leaf, slot); + extent_end = found_key.offset + + btrfs_file_extent_inline_len(leaf, extent); + extent_end = (extent_end + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + } + last_offset = extent_end; + path->slots[0]++; + } + if (0 && last_offset < inode->i_size) { + WARN_ON(1); + btrfs_print_leaf(root, leaf); + printk(KERN_ERR "inode %lu found offset %llu size %llu\n", + inode->i_ino, (unsigned long long)last_offset, + (unsigned long long)inode->i_size); + err = 1; + + } +out: + btrfs_free_path(path); + return err; +#endif +} + +/* + * this is very complex, but the basic idea is to drop all extents + * in the range start - end. hint_block is filled in with a block number + * that would be a good hint to the block allocator for this file. + * + * If an extent intersects the range but is not entirely inside the range + * it is either truncated or split. Anything entirely inside the range + * is deleted from the tree. + * + * inline_limit is used to tell this code which offsets in the file to keep + * if they contain inline extents. + */ +noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, u64 end, u64 inline_limit, u64 *hint_byte) +{ + u64 extent_end = 0; + u64 locked_end = end; + u64 search_start = start; + u64 leaf_start; + u64 ram_bytes = 0; + u64 orig_parent = 0; + u64 disk_bytenr = 0; + u8 compression; + u8 encryption; + u16 other_encoding = 0; + u64 root_gen; + u64 root_owner; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *extent; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_file_extent_item old; + int keep; + int slot; + int bookend; + int found_type = 0; + int found_extent; + int found_inline; + int recow; + int ret; + + inline_limit = 0; + btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + while (1) { + recow = 0; + btrfs_release_path(root, path); + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + search_start, -1); + if (ret < 0) + goto out; + if (ret > 0) { + if (path->slots[0] == 0) { + ret = 0; + goto out; + } + path->slots[0]--; + } +next_slot: + keep = 0; + bookend = 0; + found_extent = 0; + found_inline = 0; + leaf_start = 0; + root_gen = 0; + root_owner = 0; + compression = 0; + encryption = 0; + extent = NULL; + leaf = path->nodes[0]; + slot = path->slots[0]; + ret = 0; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY && + key.offset >= end) { + goto out; + } + if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || + key.objectid != inode->i_ino) { + goto out; + } + if (recow) { + search_start = max(key.offset, start); + continue; + } + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, extent); + compression = btrfs_file_extent_compression(leaf, + extent); + encryption = btrfs_file_extent_encryption(leaf, + extent); + other_encoding = btrfs_file_extent_other_encoding(leaf, + extent); + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = + btrfs_file_extent_disk_bytenr(leaf, + extent); + if (extent_end) + *hint_byte = extent_end; + + extent_end = key.offset + + btrfs_file_extent_num_bytes(leaf, extent); + ram_bytes = btrfs_file_extent_ram_bytes(leaf, + extent); + found_extent = 1; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + found_inline = 1; + extent_end = key.offset + + btrfs_file_extent_inline_len(leaf, extent); + } + } else { + extent_end = search_start; + } + + /* we found nothing we can drop */ + if ((!found_extent && !found_inline) || + search_start >= extent_end) { + int nextret; + u32 nritems; + nritems = btrfs_header_nritems(leaf); + if (slot >= nritems - 1) { + nextret = btrfs_next_leaf(root, path); + if (nextret) + goto out; + recow = 1; + } else { + path->slots[0]++; + } + goto next_slot; + } + + if (end <= extent_end && start >= key.offset && found_inline) + *hint_byte = EXTENT_MAP_INLINE; + + if (found_extent) { + read_extent_buffer(leaf, &old, (unsigned long)extent, + sizeof(old)); + root_gen = btrfs_header_generation(leaf); + root_owner = btrfs_header_owner(leaf); + leaf_start = leaf->start; + } + + if (end < extent_end && end >= key.offset) { + bookend = 1; + if (found_inline && start <= key.offset) + keep = 1; + } + + if (bookend && found_extent) { + if (locked_end < extent_end) { + ret = try_lock_extent(&BTRFS_I(inode)->io_tree, + locked_end, extent_end - 1, + GFP_NOFS); + if (!ret) { + btrfs_release_path(root, path); + lock_extent(&BTRFS_I(inode)->io_tree, + locked_end, extent_end - 1, + GFP_NOFS); + locked_end = extent_end; + continue; + } + locked_end = extent_end; + } + orig_parent = path->nodes[0]->start; + disk_bytenr = le64_to_cpu(old.disk_bytenr); + if (disk_bytenr != 0) { + ret = btrfs_inc_extent_ref(trans, root, + disk_bytenr, + le64_to_cpu(old.disk_num_bytes), + orig_parent, root->root_key.objectid, + trans->transid, inode->i_ino); + BUG_ON(ret); + } + } + + if (found_inline) { + u64 mask = root->sectorsize - 1; + search_start = (extent_end + mask) & ~mask; + } else + search_start = extent_end; + + /* truncate existing extent */ + if (start > key.offset) { + u64 new_num; + u64 old_num; + keep = 1; + WARN_ON(start & (root->sectorsize - 1)); + if (found_extent) { + new_num = start - key.offset; + old_num = btrfs_file_extent_num_bytes(leaf, + extent); + *hint_byte = + btrfs_file_extent_disk_bytenr(leaf, + extent); + if (btrfs_file_extent_disk_bytenr(leaf, + extent)) { + inode_sub_bytes(inode, old_num - + new_num); + } + btrfs_set_file_extent_num_bytes(leaf, + extent, new_num); + btrfs_mark_buffer_dirty(leaf); + } else if (key.offset < inline_limit && + (end > extent_end) && + (inline_limit < extent_end)) { + u32 new_size; + new_size = btrfs_file_extent_calc_inline_size( + inline_limit - key.offset); + inode_sub_bytes(inode, extent_end - + inline_limit); + btrfs_set_file_extent_ram_bytes(leaf, extent, + new_size); + if (!compression && !encryption) { + btrfs_truncate_item(trans, root, path, + new_size, 1); + } + } + } + /* delete the entire extent */ + if (!keep) { + if (found_inline) + inode_sub_bytes(inode, extent_end - + key.offset); + ret = btrfs_del_item(trans, root, path); + /* TODO update progress marker and return */ + BUG_ON(ret); + extent = NULL; + btrfs_release_path(root, path); + /* the extent will be freed later */ + } + if (bookend && found_inline && start <= key.offset) { + u32 new_size; + new_size = btrfs_file_extent_calc_inline_size( + extent_end - end); + inode_sub_bytes(inode, end - key.offset); + btrfs_set_file_extent_ram_bytes(leaf, extent, + new_size); + if (!compression && !encryption) + ret = btrfs_truncate_item(trans, root, path, + new_size, 0); + BUG_ON(ret); + } + /* create bookend, splitting the extent in two */ + if (bookend && found_extent) { + struct btrfs_key ins; + ins.objectid = inode->i_ino; + ins.offset = end; + btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); + + btrfs_release_path(root, path); + ret = btrfs_insert_empty_item(trans, root, path, &ins, + sizeof(*extent)); + BUG_ON(ret); + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + write_extent_buffer(leaf, &old, + (unsigned long)extent, sizeof(old)); + + btrfs_set_file_extent_compression(leaf, extent, + compression); + btrfs_set_file_extent_encryption(leaf, extent, + encryption); + btrfs_set_file_extent_other_encoding(leaf, extent, + other_encoding); + btrfs_set_file_extent_offset(leaf, extent, + le64_to_cpu(old.offset) + end - key.offset); + WARN_ON(le64_to_cpu(old.num_bytes) < + (extent_end - end)); + btrfs_set_file_extent_num_bytes(leaf, extent, + extent_end - end); + + /* + * set the ram bytes to the size of the full extent + * before splitting. This is a worst case flag, + * but its the best we can do because we don't know + * how splitting affects compression + */ + btrfs_set_file_extent_ram_bytes(leaf, extent, + ram_bytes); + btrfs_set_file_extent_type(leaf, extent, found_type); + + btrfs_mark_buffer_dirty(path->nodes[0]); + + if (disk_bytenr != 0) { + ret = btrfs_update_extent_ref(trans, root, + disk_bytenr, orig_parent, + leaf->start, + root->root_key.objectid, + trans->transid, ins.objectid); + + BUG_ON(ret); + } + btrfs_release_path(root, path); + if (disk_bytenr != 0) + inode_add_bytes(inode, extent_end - end); + } + + if (found_extent && !keep) { + u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr); + + if (old_disk_bytenr != 0) { + inode_sub_bytes(inode, + le64_to_cpu(old.num_bytes)); + ret = btrfs_free_extent(trans, root, + old_disk_bytenr, + le64_to_cpu(old.disk_num_bytes), + leaf_start, root_owner, + root_gen, key.objectid, 0); + BUG_ON(ret); + *hint_byte = old_disk_bytenr; + } + } + + if (search_start >= end) { + ret = 0; + goto out; + } + } +out: + btrfs_free_path(path); + if (locked_end > end) { + unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, + GFP_NOFS); + } + btrfs_check_file(root, inode); + return ret; +} + +static int extent_mergeable(struct extent_buffer *leaf, int slot, + u64 objectid, u64 bytenr, u64 *start, u64 *end) +{ + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 extent_end; + + if (slot < 0 || slot >= btrfs_header_nritems(leaf)) + return 0; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || + btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || + btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + return 0; + + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if ((*start && *start != key.offset) || (*end && *end != extent_end)) + return 0; + + *start = key.offset; + *end = extent_end; + return 1; +} + +/* + * Mark extent in the range start - end as written. + * + * This changes extent type from 'pre-allocated' to 'regular'. If only + * part of extent is marked as written, the extent will be split into + * two or three. + */ +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end) +{ + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 bytenr; + u64 num_bytes; + u64 extent_end; + u64 extent_offset; + u64 other_start; + u64 other_end; + u64 split = start; + u64 locked_end = end; + u64 orig_parent; + int extent_type; + int split_end = 1; + int ret; + + btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = btrfs_alloc_path(); + BUG_ON(!path); +again: + key.objectid = inode->i_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + if (split == start) + key.offset = split; + else + key.offset = split - 1; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + BUG_ON(key.objectid != inode->i_ino || + key.type != BTRFS_EXTENT_DATA_KEY); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC); + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + BUG_ON(key.offset > start || extent_end < end); + + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); + + if (key.offset == start) + split = end; + + if (key.offset == start && extent_end == end) { + int del_nr = 0; + int del_slot = 0; + u64 leaf_owner = btrfs_header_owner(leaf); + u64 leaf_gen = btrfs_header_generation(leaf); + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, + bytenr, &other_start, &other_end)) { + extent_end = other_end; + del_slot = path->slots[0] + 1; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + leaf->start, leaf_owner, + leaf_gen, inode->i_ino, 0); + BUG_ON(ret); + } + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino, + bytenr, &other_start, &other_end)) { + key.offset = other_start; + del_slot = path->slots[0]; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + leaf->start, leaf_owner, + leaf_gen, inode->i_ino, 0); + BUG_ON(ret); + } + split_end = 0; + if (del_nr == 0) { + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + goto done; + } + + fi = btrfs_item_ptr(leaf, del_slot - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - key.offset); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + BUG_ON(ret); + goto done; + } else if (split == start) { + if (locked_end < extent_end) { + ret = try_lock_extent(&BTRFS_I(inode)->io_tree, + locked_end, extent_end - 1, GFP_NOFS); + if (!ret) { + btrfs_release_path(root, path); + lock_extent(&BTRFS_I(inode)->io_tree, + locked_end, extent_end - 1, GFP_NOFS); + locked_end = extent_end; + goto again; + } + locked_end = extent_end; + } + btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); + extent_offset += split - key.offset; + } else { + BUG_ON(key.offset != start); + btrfs_set_file_extent_offset(leaf, fi, extent_offset + + split - key.offset); + btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); + key.offset = split; + btrfs_set_item_key_safe(trans, root, path, &key); + extent_end = split; + } + + if (extent_end == end) { + split_end = 0; + extent_type = BTRFS_FILE_EXTENT_REG; + } + if (extent_end == end && split == start) { + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, + bytenr, &other_start, &other_end)) { + path->slots[0]++; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + key.offset = split; + btrfs_set_item_key_safe(trans, root, path, &key); + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + other_end - split); + goto done; + } + } + if (extent_end == end && split == end) { + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino, + bytenr, &other_start, &other_end)) { + path->slots[0]--; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - + other_start); + goto done; + } + } + + btrfs_mark_buffer_dirty(leaf); + + orig_parent = leaf->start; + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, + orig_parent, root->root_key.objectid, + trans->transid, inode->i_ino); + BUG_ON(ret); + btrfs_release_path(root, path); + + key.offset = start; + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi)); + BUG_ON(ret); + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_type(leaf, fi, extent_type); + btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); + btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_compression(leaf, fi, 0); + btrfs_set_file_extent_encryption(leaf, fi, 0); + btrfs_set_file_extent_other_encoding(leaf, fi, 0); + + if (orig_parent != leaf->start) { + ret = btrfs_update_extent_ref(trans, root, bytenr, + orig_parent, leaf->start, + root->root_key.objectid, + trans->transid, inode->i_ino); + BUG_ON(ret); + } +done: + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root, path); + if (split_end && split == start) { + split = end; + goto again; + } + if (locked_end > end) { + unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, + GFP_NOFS); + } + btrfs_free_path(path); + return 0; +} + +/* + * this gets pages into the page cache and locks them down, it also properly + * waits for data=ordered extents to finish before allowing the pages to be + * modified. + */ +static noinline int prepare_pages(struct btrfs_root *root, struct file *file, + struct page **pages, size_t num_pages, + loff_t pos, unsigned long first_index, + unsigned long last_index, size_t write_bytes) +{ + int i; + unsigned long index = pos >> PAGE_CACHE_SHIFT; + struct inode *inode = fdentry(file)->d_inode; + int err = 0; + u64 start_pos; + u64 last_pos; + + start_pos = pos & ~((u64)root->sectorsize - 1); + last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; + + if (start_pos > inode->i_size) { + err = btrfs_cont_expand(inode, start_pos); + if (err) + return err; + } + + memset(pages, 0, num_pages * sizeof(struct page *)); +again: + for (i = 0; i < num_pages; i++) { + pages[i] = grab_cache_page(inode->i_mapping, index + i); + if (!pages[i]) { + err = -ENOMEM; + BUG_ON(1); + } + wait_on_page_writeback(pages[i]); + } + if (start_pos < inode->i_size) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + last_pos - 1); + if (ordered && + ordered->file_offset + ordered->len > start_pos && + ordered->file_offset < last_pos) { + btrfs_put_ordered_extent(ordered); + unlock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); + for (i = 0; i < num_pages; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + btrfs_wait_ordered_range(inode, start_pos, + last_pos - start_pos); + goto again; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + + clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, + last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, + GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); + } + for (i = 0; i < num_pages; i++) { + clear_page_dirty_for_io(pages[i]); + set_page_extent_mapped(pages[i]); + WARN_ON(!PageLocked(pages[i])); + } + return 0; +} + +static ssize_t btrfs_file_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + loff_t pos; + loff_t start_pos; + ssize_t num_written = 0; + ssize_t err = 0; + int ret = 0; + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page **pages = NULL; + int nrptrs; + struct page *pinned[2]; + unsigned long first_index; + unsigned long last_index; + int will_write; + + will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) || + (file->f_flags & O_DIRECT)); + + nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, + PAGE_CACHE_SIZE / (sizeof(struct page *))); + pinned[0] = NULL; + pinned[1] = NULL; + + pos = *ppos; + start_pos = pos; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + current->backing_dev_info = inode->i_mapping->backing_dev_info; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out_nolock; + if (count == 0) + goto out_nolock; + + err = file_remove_suid(file); + if (err) + goto out_nolock; + file_update_time(file); + + pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + + mutex_lock(&inode->i_mutex); + BTRFS_I(inode)->sequence++; + first_index = pos >> PAGE_CACHE_SHIFT; + last_index = (pos + count) >> PAGE_CACHE_SHIFT; + + /* + * there are lots of better ways to do this, but this code + * makes sure the first and last page in the file range are + * up to date and ready for cow + */ + if ((pos & (PAGE_CACHE_SIZE - 1))) { + pinned[0] = grab_cache_page(inode->i_mapping, first_index); + if (!PageUptodate(pinned[0])) { + ret = btrfs_readpage(NULL, pinned[0]); + BUG_ON(ret); + wait_on_page_locked(pinned[0]); + } else { + unlock_page(pinned[0]); + } + } + if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { + pinned[1] = grab_cache_page(inode->i_mapping, last_index); + if (!PageUptodate(pinned[1])) { + ret = btrfs_readpage(NULL, pinned[1]); + BUG_ON(ret); + wait_on_page_locked(pinned[1]); + } else { + unlock_page(pinned[1]); + } + } + + while (count > 0) { + size_t offset = pos & (PAGE_CACHE_SIZE - 1); + size_t write_bytes = min(count, nrptrs * + (size_t)PAGE_CACHE_SIZE - + offset); + size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + WARN_ON(num_pages > nrptrs); + memset(pages, 0, sizeof(struct page *) * nrptrs); + + ret = btrfs_check_free_space(root, write_bytes, 0); + if (ret) + goto out; + + ret = prepare_pages(root, file, pages, num_pages, + pos, first_index, last_index, + write_bytes); + if (ret) + goto out; + + ret = btrfs_copy_from_user(pos, num_pages, + write_bytes, pages, buf); + if (ret) { + btrfs_drop_pages(pages, num_pages); + goto out; + } + + ret = dirty_and_release_pages(NULL, root, file, pages, + num_pages, pos, write_bytes); + btrfs_drop_pages(pages, num_pages); + if (ret) + goto out; + + if (will_write) { + btrfs_fdatawrite_range(inode->i_mapping, pos, + pos + write_bytes - 1, + WB_SYNC_NONE); + } else { + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + num_pages); + if (num_pages < + (root->leafsize >> PAGE_CACHE_SHIFT) + 1) + btrfs_btree_balance_dirty(root, 1); + btrfs_throttle(root); + } + + buf += write_bytes; + count -= write_bytes; + pos += write_bytes; + num_written += write_bytes; + + cond_resched(); + } +out: + mutex_unlock(&inode->i_mutex); + +out_nolock: + kfree(pages); + if (pinned[0]) + page_cache_release(pinned[0]); + if (pinned[1]) + page_cache_release(pinned[1]); + *ppos = pos; + + if (num_written > 0 && will_write) { + struct btrfs_trans_handle *trans; + + err = btrfs_wait_ordered_range(inode, start_pos, num_written); + if (err) + num_written = err; + + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { + trans = btrfs_start_transaction(root, 1); + ret = btrfs_log_dentry_safe(trans, root, + file->f_dentry); + if (ret == 0) { + btrfs_sync_log(trans, root); + btrfs_end_transaction(trans, root); + } else { + btrfs_commit_transaction(trans, root); + } + } + if (file->f_flags & O_DIRECT) { + invalidate_mapping_pages(inode->i_mapping, + start_pos >> PAGE_CACHE_SHIFT, + (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); + } + } + current->backing_dev_info = NULL; + return num_written ? num_written : err; +} + +int btrfs_release_file(struct inode *inode, struct file *filp) +{ + if (filp->private_data) + btrfs_ioctl_trans_end(filp); + return 0; +} + +/* + * fsync call for both files and directories. This logs the inode into + * the tree log instead of forcing full commits whenever possible. + * + * It needs to call filemap_fdatawait so that all ordered extent updates are + * in the metadata btree are up to date for copying to the log. + * + * It drops the inode mutex before doing the tree log commit. This is an + * important optimization for directories because holding the mutex prevents + * new operations on the dir while we write to disk. + */ +int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + struct btrfs_trans_handle *trans; + + /* + * check the transaction that last modified this inode + * and see if its already been committed + */ + if (!BTRFS_I(inode)->last_trans) + goto out; + + mutex_lock(&root->fs_info->trans_mutex); + if (BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed) { + BTRFS_I(inode)->last_trans = 0; + mutex_unlock(&root->fs_info->trans_mutex); + goto out; + } + mutex_unlock(&root->fs_info->trans_mutex); + + root->fs_info->tree_log_batch++; + filemap_fdatawrite(inode->i_mapping); + btrfs_wait_ordered_range(inode, 0, (u64)-1); + root->fs_info->tree_log_batch++; + + /* + * ok we haven't committed the transaction yet, lets do a commit + */ + if (file->private_data) + btrfs_ioctl_trans_end(file); + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); + if (ret < 0) + goto out; + + /* we've logged all the items and now have a consistent + * version of the file in the log. It is possible that + * someone will come in and modify the file, but that's + * fine because the log is consistent on disk, and we + * have references to all of the file's extents + * + * It is possible that someone will come in and log the + * file again, but that will end up using the synchronization + * inside btrfs_sync_log to keep things safe. + */ + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + + if (ret > 0) { + ret = btrfs_commit_transaction(trans, root); + } else { + btrfs_sync_log(trans, root); + ret = btrfs_end_transaction(trans, root); + } + mutex_lock(&file->f_dentry->d_inode->i_mutex); +out: + return ret > 0 ? EIO : ret; +} + +static struct vm_operations_struct btrfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = btrfs_page_mkwrite, +}; + +static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) +{ + vma->vm_ops = &btrfs_file_vm_ops; + file_accessed(filp); + return 0; +} + +struct file_operations btrfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .splice_read = generic_file_splice_read, + .write = btrfs_file_write, + .mmap = btrfs_file_mmap, + .open = generic_file_open, + .release = btrfs_release_file, + .fsync = btrfs_sync_file, + .unlocked_ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_ioctl, +#endif +}; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c new file mode 100644 index 00000000000..d1e5f0e84c5 --- /dev/null +++ b/fs/btrfs/free-space-cache.c @@ -0,0 +1,495 @@ +/* + * Copyright (C) 2008 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" + +static int tree_insert_offset(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_free_space *info; + + while (*p) { + parent = *p; + info = rb_entry(parent, struct btrfs_free_space, offset_index); + + if (offset < info->offset) + p = &(*p)->rb_left; + else if (offset > info->offset) + p = &(*p)->rb_right; + else + return -EEXIST; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + + return 0; +} + +static int tree_insert_bytes(struct rb_root *root, u64 bytes, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_free_space *info; + + while (*p) { + parent = *p; + info = rb_entry(parent, struct btrfs_free_space, bytes_index); + + if (bytes < info->bytes) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + + return 0; +} + +/* + * searches the tree for the given offset. If contains is set we will return + * the free space that contains the given offset. If contains is not set we + * will return the free space that starts at or after the given offset and is + * at least bytes long. + */ +static struct btrfs_free_space *tree_search_offset(struct rb_root *root, + u64 offset, u64 bytes, + int contains) +{ + struct rb_node *n = root->rb_node; + struct btrfs_free_space *entry, *ret = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_free_space, offset_index); + + if (offset < entry->offset) { + if (!contains && + (!ret || entry->offset < ret->offset) && + (bytes <= entry->bytes)) + ret = entry; + n = n->rb_left; + } else if (offset > entry->offset) { + if ((entry->offset + entry->bytes - 1) >= offset && + bytes <= entry->bytes) { + ret = entry; + break; + } + n = n->rb_right; + } else { + if (bytes > entry->bytes) { + n = n->rb_right; + continue; + } + ret = entry; + break; + } + } + + return ret; +} + +/* + * return a chunk at least bytes size, as close to offset that we can get. + */ +static struct btrfs_free_space *tree_search_bytes(struct rb_root *root, + u64 offset, u64 bytes) +{ + struct rb_node *n = root->rb_node; + struct btrfs_free_space *entry, *ret = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_free_space, bytes_index); + + if (bytes < entry->bytes) { + /* + * We prefer to get a hole size as close to the size we + * are asking for so we don't take small slivers out of + * huge holes, but we also want to get as close to the + * offset as possible so we don't have a whole lot of + * fragmentation. + */ + if (offset <= entry->offset) { + if (!ret) + ret = entry; + else if (entry->bytes < ret->bytes) + ret = entry; + else if (entry->offset < ret->offset) + ret = entry; + } + n = n->rb_left; + } else if (bytes > entry->bytes) { + n = n->rb_right; + } else { + /* + * Ok we may have multiple chunks of the wanted size, + * so we don't want to take the first one we find, we + * want to take the one closest to our given offset, so + * keep searching just in case theres a better match. + */ + n = n->rb_right; + if (offset > entry->offset) + continue; + else if (!ret || entry->offset < ret->offset) + ret = entry; + } + } + + return ret; +} + +static void unlink_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + rb_erase(&info->offset_index, &block_group->free_space_offset); + rb_erase(&info->bytes_index, &block_group->free_space_bytes); +} + +static int link_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + int ret = 0; + + + ret = tree_insert_offset(&block_group->free_space_offset, info->offset, + &info->offset_index); + if (ret) + return ret; + + ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, + &info->bytes_index); + if (ret) + return ret; + + return ret; +} + +static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *right_info; + struct btrfs_free_space *left_info; + struct btrfs_free_space *info = NULL; + struct btrfs_free_space *alloc_info; + int ret = 0; + + alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + if (!alloc_info) + return -ENOMEM; + + /* + * first we want to see if there is free space adjacent to the range we + * are adding, if there is remove that struct and add a new one to + * cover the entire range + */ + right_info = tree_search_offset(&block_group->free_space_offset, + offset+bytes, 0, 1); + left_info = tree_search_offset(&block_group->free_space_offset, + offset-1, 0, 1); + + if (right_info && right_info->offset == offset+bytes) { + unlink_free_space(block_group, right_info); + info = right_info; + info->offset = offset; + info->bytes += bytes; + } else if (right_info && right_info->offset != offset+bytes) { + printk(KERN_ERR "btrfs adding space in the middle of an " + "existing free space area. existing: " + "offset=%llu, bytes=%llu. new: offset=%llu, " + "bytes=%llu\n", (unsigned long long)right_info->offset, + (unsigned long long)right_info->bytes, + (unsigned long long)offset, + (unsigned long long)bytes); + BUG(); + } + + if (left_info) { + unlink_free_space(block_group, left_info); + + if (unlikely((left_info->offset + left_info->bytes) != + offset)) { + printk(KERN_ERR "btrfs free space to the left " + "of new free space isn't " + "quite right. existing: offset=%llu, " + "bytes=%llu. new: offset=%llu, bytes=%llu\n", + (unsigned long long)left_info->offset, + (unsigned long long)left_info->bytes, + (unsigned long long)offset, + (unsigned long long)bytes); + BUG(); + } + + if (info) { + info->offset = left_info->offset; + info->bytes += left_info->bytes; + kfree(left_info); + } else { + info = left_info; + info->bytes += bytes; + } + } + + if (info) { + ret = link_free_space(block_group, info); + if (!ret) + info = NULL; + goto out; + } + + info = alloc_info; + alloc_info = NULL; + info->offset = offset; + info->bytes = bytes; + + ret = link_free_space(block_group, info); + if (ret) + kfree(info); +out: + if (ret) { + printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); + if (ret == -EEXIST) + BUG(); + } + + kfree(alloc_info); + + return ret; +} + +static int +__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *info; + int ret = 0; + + info = tree_search_offset(&block_group->free_space_offset, offset, 0, + 1); + + if (info && info->offset == offset) { + if (info->bytes < bytes) { + printk(KERN_ERR "Found free space at %llu, size %llu," + "trying to use %llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (unsigned long long)bytes); + WARN_ON(1); + ret = -EINVAL; + goto out; + } + unlink_free_space(block_group, info); + + if (info->bytes == bytes) { + kfree(info); + goto out; + } + + info->offset += bytes; + info->bytes -= bytes; + + ret = link_free_space(block_group, info); + BUG_ON(ret); + } else if (info && info->offset < offset && + info->offset + info->bytes >= offset + bytes) { + u64 old_start = info->offset; + /* + * we're freeing space in the middle of the info, + * this can happen during tree log replay + * + * first unlink the old info and then + * insert it again after the hole we're creating + */ + unlink_free_space(block_group, info); + if (offset + bytes < info->offset + info->bytes) { + u64 old_end = info->offset + info->bytes; + + info->offset = offset + bytes; + info->bytes = old_end - info->offset; + ret = link_free_space(block_group, info); + BUG_ON(ret); + } else { + /* the hole we're creating ends at the end + * of the info struct, just free the info + */ + kfree(info); + } + + /* step two, insert a new info struct to cover anything + * before the hole + */ + ret = __btrfs_add_free_space(block_group, old_start, + offset - old_start); + BUG_ON(ret); + } else { + WARN_ON(1); + } +out: + return ret; +} + +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + int ret; + struct btrfs_free_space *sp; + + mutex_lock(&block_group->alloc_mutex); + ret = __btrfs_add_free_space(block_group, offset, bytes); + sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); + BUG_ON(!sp); + mutex_unlock(&block_group->alloc_mutex); + + return ret; +} + +int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + int ret; + struct btrfs_free_space *sp; + + ret = __btrfs_add_free_space(block_group, offset, bytes); + sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); + BUG_ON(!sp); + + return ret; +} + +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + int ret = 0; + + mutex_lock(&block_group->alloc_mutex); + ret = __btrfs_remove_free_space(block_group, offset, bytes); + mutex_unlock(&block_group->alloc_mutex); + + return ret; +} + +int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + int ret; + + ret = __btrfs_remove_free_space(block_group, offset, bytes); + + return ret; +} + +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes) +{ + struct btrfs_free_space *info; + struct rb_node *n; + int count = 0; + + for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + if (info->bytes >= bytes) + count++; + } + printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" + "\n", count); +} + +u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space *info; + struct rb_node *n; + u64 ret = 0; + + for (n = rb_first(&block_group->free_space_offset); n; + n = rb_next(n)) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + ret += info->bytes; + } + + return ret; +} + +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space *info; + struct rb_node *node; + + mutex_lock(&block_group->alloc_mutex); + while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, bytes_index); + unlink_free_space(block_group, info); + kfree(info); + if (need_resched()) { + mutex_unlock(&block_group->alloc_mutex); + cond_resched(); + mutex_lock(&block_group->alloc_mutex); + } + } + mutex_unlock(&block_group->alloc_mutex); +} + +#if 0 +static struct btrfs_free_space *btrfs_find_free_space_offset(struct + btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes) +{ + struct btrfs_free_space *ret; + + mutex_lock(&block_group->alloc_mutex); + ret = tree_search_offset(&block_group->free_space_offset, offset, + bytes, 0); + mutex_unlock(&block_group->alloc_mutex); + + return ret; +} + +static struct btrfs_free_space *btrfs_find_free_space_bytes(struct + btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes) +{ + struct btrfs_free_space *ret; + + mutex_lock(&block_group->alloc_mutex); + + ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); + mutex_unlock(&block_group->alloc_mutex); + + return ret; +} +#endif + +struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes) +{ + struct btrfs_free_space *ret = NULL; + + ret = tree_search_offset(&block_group->free_space_offset, offset, + bytes, 0); + if (!ret) + ret = tree_search_bytes(&block_group->free_space_bytes, + offset, bytes); + + return ret; +} diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h new file mode 100644 index 00000000000..2a020b27676 --- /dev/null +++ b/fs/btrfs/hash.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __HASH__ +#define __HASH__ + +#include "crc32c.h" +static inline u64 btrfs_name_hash(const char *name, int len) +{ + return btrfs_crc32c((u32)~1, name, len); +} +#endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c new file mode 100644 index 00000000000..3d46fa1f29a --- /dev/null +++ b/fs/btrfs/inode-item.c @@ -0,0 +1,206 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +static int find_name_in_backref(struct btrfs_path *path, const char *name, + int name_len, struct btrfs_inode_ref **ref_ret) +{ + struct extent_buffer *leaf; + struct btrfs_inode_ref *ref; + unsigned long ptr; + unsigned long name_ptr; + u32 item_size; + u32 cur_offset = 0; + int len; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + while (cur_offset < item_size) { + ref = (struct btrfs_inode_ref *)(ptr + cur_offset); + len = btrfs_inode_ref_name_len(leaf, ref); + name_ptr = (unsigned long)(ref + 1); + cur_offset += len + sizeof(*ref); + if (len != name_len) + continue; + if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) { + *ref_ret = ref; + return 1; + } + } + return 0; +} + +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_ref *ref; + struct extent_buffer *leaf; + unsigned long ptr; + unsigned long item_start; + u32 item_size; + u32 sub_item_len; + int ret; + int del_len = name_len + sizeof(*ref); + + key.objectid = inode_objectid; + key.offset = ref_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = -ENOENT; + goto out; + } else if (ret < 0) { + goto out; + } + if (!find_name_in_backref(path, name, name_len, &ref)) { + ret = -ENOENT; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + + if (index) + *index = btrfs_inode_ref_index(leaf, ref); + + if (del_len == item_size) { + ret = btrfs_del_item(trans, root, path); + goto out; + } + ptr = (unsigned long)ref; + sub_item_len = name_len + sizeof(*ref); + item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, + item_size - (ptr + sub_item_len - item_start)); + ret = btrfs_truncate_item(trans, root, path, + item_size - sub_item_len, 1); + BUG_ON(ret); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_ref *ref; + unsigned long ptr; + int ret; + int ins_len = name_len + sizeof(*ref); + + key.objectid = inode_objectid; + key.offset = ref_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + ins_len); + if (ret == -EEXIST) { + u32 old_size; + + if (find_name_in_backref(path, name, name_len, &ref)) + goto out; + + old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + ret = btrfs_extend_item(trans, root, path, ins_len); + BUG_ON(ret); + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, index); + ptr = (unsigned long)(ref + 1); + ret = 0; + } else if (ret < 0) { + goto out; + } else { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, index); + ptr = (unsigned long)(ref + 1); + } + write_extent_buffer(path->nodes[0], name, ptr, name_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid) +{ + struct btrfs_key key; + int ret; + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_inode_item)); + if (ret == 0 && objectid > root->highest_inode) + root->highest_inode = objectid; + return ret; +} + +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod) +{ + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key found_key; + + ret = btrfs_search_slot(trans, root, location, path, ins_len, cow); + if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY && + location->offset == (u64)-1 && path->slots[0] != 0) { + slot = path->slots[0] - 1; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid == location->objectid && + btrfs_key_type(&found_key) == btrfs_key_type(location)) { + path->slots[0]--; + return 0; + } + } + return ret; +} diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c new file mode 100644 index 00000000000..2aa79873eb4 --- /dev/null +++ b/fs/btrfs/inode-map.c @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) +{ + struct btrfs_path *path; + int ret; + struct extent_buffer *l; + struct btrfs_key search_key; + struct btrfs_key found_key; + int slot; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + search_key.objectid = BTRFS_LAST_FREE_OBJECTID; + search_key.type = -1; + search_key.offset = (u64)-1; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); + if (path->slots[0] > 0) { + slot = path->slots[0] - 1; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + *objectid = found_key.objectid; + } else { + *objectid = BTRFS_FIRST_FREE_OBJECTID; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +/* + * walks the btree of allocated inodes and find a hole. + */ +int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 dirid, u64 *objectid) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + int slot = 0; + u64 last_ino = 0; + int start_found; + struct extent_buffer *l; + struct btrfs_key search_key; + u64 search_start = dirid; + + mutex_lock(&root->objectid_mutex); + if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID && + root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) { + *objectid = ++root->last_inode_alloc; + mutex_unlock(&root->objectid_mutex); + return 0; + } + path = btrfs_alloc_path(); + BUG_ON(!path); + search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID); + search_key.objectid = search_start; + search_key.type = 0; + search_key.offset = 0; + + btrfs_init_path(path); + start_found = 0; + ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); + if (ret < 0) + goto error; + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; + if (!start_found) { + *objectid = search_start; + start_found = 1; + goto found; + } + *objectid = last_ino > search_start ? + last_ino : search_start; + goto found; + } + btrfs_item_key_to_cpu(l, &key, slot); + if (key.objectid >= search_start) { + if (start_found) { + if (last_ino < search_start) + last_ino = search_start; + if (key.objectid > last_ino) { + *objectid = last_ino; + goto found; + } + } else if (key.objectid > search_start) { + *objectid = search_start; + goto found; + } + } + if (key.objectid >= BTRFS_LAST_FREE_OBJECTID) + break; + + start_found = 1; + last_ino = key.objectid + 1; + path->slots[0]++; + } + BUG_ON(1); +found: + btrfs_release_path(root, path); + btrfs_free_path(path); + BUG_ON(*objectid < search_start); + mutex_unlock(&root->objectid_mutex); + return 0; +error: + btrfs_release_path(root, path); + btrfs_free_path(path); + mutex_unlock(&root->objectid_mutex); + return ret; +} diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c new file mode 100644 index 00000000000..8adfe059ab4 --- /dev/null +++ b/fs/btrfs/inode.c @@ -0,0 +1,5035 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/bit_spinlock.h> +#include <linux/version.h> +#include <linux/xattr.h> +#include <linux/posix_acl.h> +#include <linux/falloc.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "volumes.h" +#include "ordered-data.h" +#include "xattr.h" +#include "tree-log.h" +#include "ref-cache.h" +#include "compression.h" + +struct btrfs_iget_args { + u64 ino; + struct btrfs_root *root; +}; + +static struct inode_operations btrfs_dir_inode_operations; +static struct inode_operations btrfs_symlink_inode_operations; +static struct inode_operations btrfs_dir_ro_inode_operations; +static struct inode_operations btrfs_special_inode_operations; +static struct inode_operations btrfs_file_inode_operations; +static struct address_space_operations btrfs_aops; +static struct address_space_operations btrfs_symlink_aops; +static struct file_operations btrfs_dir_file_operations; +static struct extent_io_ops btrfs_extent_io_ops; + +static struct kmem_cache *btrfs_inode_cachep; +struct kmem_cache *btrfs_trans_handle_cachep; +struct kmem_cache *btrfs_transaction_cachep; +struct kmem_cache *btrfs_bit_radix_cachep; +struct kmem_cache *btrfs_path_cachep; + +#define S_SHIFT 12 +static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, +}; + +static void btrfs_truncate(struct inode *inode); +static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock); + +/* + * a very lame attempt at stopping writes when the FS is 85% full. There + * are countless ways this is incorrect, but it is better than nothing. + */ +int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, + int for_del) +{ + u64 total; + u64 used; + u64 thresh; + int ret = 0; + + spin_lock(&root->fs_info->delalloc_lock); + total = btrfs_super_total_bytes(&root->fs_info->super_copy); + used = btrfs_super_bytes_used(&root->fs_info->super_copy); + if (for_del) + thresh = total * 90; + else + thresh = total * 85; + + do_div(thresh, 100); + + if (used + root->fs_info->delalloc_bytes + num_required > thresh) + ret = -ENOSPC; + spin_unlock(&root->fs_info->delalloc_lock); + return ret; +} + +/* + * this does all the hard work for inserting an inline extent into + * the btree. The caller should have done a btrfs_drop_extents so that + * no overlapping inline items exist in the btree + */ +static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, size_t size, size_t compressed_size, + struct page **compressed_pages) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct page *page = NULL; + char *kaddr; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + int err = 0; + int ret; + size_t cur_size = size; + size_t datasize; + unsigned long offset; + int use_compress = 0; + + if (compressed_size && compressed_pages) { + use_compress = 1; + cur_size = compressed_size; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + btrfs_set_trans_block_group(trans, inode); + + key.objectid = inode->i_ino; + key.offset = start; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + datasize = btrfs_file_extent_calc_inline_size(cur_size); + + inode_add_bytes(inode, size); + ret = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + BUG_ON(ret); + if (ret) { + err = ret; + goto fail; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, size); + ptr = btrfs_file_extent_inline_start(ei); + + if (use_compress) { + struct page *cpage; + int i = 0; + while (compressed_size > 0) { + cpage = compressed_pages[i]; + cur_size = min_t(unsigned long, compressed_size, + PAGE_CACHE_SIZE); + + kaddr = kmap(cpage); + write_extent_buffer(leaf, kaddr, ptr, cur_size); + kunmap(cpage); + + i++; + ptr += cur_size; + compressed_size -= cur_size; + } + btrfs_set_file_extent_compression(leaf, ei, + BTRFS_COMPRESS_ZLIB); + } else { + page = find_get_page(inode->i_mapping, + start >> PAGE_CACHE_SHIFT); + btrfs_set_file_extent_compression(leaf, ei, 0); + kaddr = kmap_atomic(page, KM_USER0); + offset = start & (PAGE_CACHE_SIZE - 1); + write_extent_buffer(leaf, kaddr + offset, ptr, size); + kunmap_atomic(kaddr, KM_USER0); + page_cache_release(page); + } + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + BTRFS_I(inode)->disk_i_size = inode->i_size; + btrfs_update_inode(trans, root, inode); + return 0; +fail: + btrfs_free_path(path); + return err; +} + + +/* + * conditionally insert an inline extent into the file. This + * does the checks required to make sure the data is small enough + * to fit as an inline extent. + */ +static int cow_file_range_inline(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end, + size_t compressed_size, + struct page **compressed_pages) +{ + u64 isize = i_size_read(inode); + u64 actual_end = min(end + 1, isize); + u64 inline_len = actual_end - start; + u64 aligned_end = (end + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + u64 hint_byte; + u64 data_len = inline_len; + int ret; + + if (compressed_size) + data_len = compressed_size; + + if (start > 0 || + actual_end >= PAGE_CACHE_SIZE || + data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || + (!compressed_size && + (actual_end & (root->sectorsize - 1)) == 0) || + end + 1 < isize || + data_len > root->fs_info->max_inline) { + return 1; + } + + ret = btrfs_drop_extents(trans, root, inode, start, + aligned_end, start, &hint_byte); + BUG_ON(ret); + + if (isize > actual_end) + inline_len = min_t(u64, isize, actual_end); + ret = insert_inline_extent(trans, root, inode, start, + inline_len, compressed_size, + compressed_pages); + BUG_ON(ret); + btrfs_drop_extent_cache(inode, start, aligned_end, 0); + return 0; +} + +struct async_extent { + u64 start; + u64 ram_size; + u64 compressed_size; + struct page **pages; + unsigned long nr_pages; + struct list_head list; +}; + +struct async_cow { + struct inode *inode; + struct btrfs_root *root; + struct page *locked_page; + u64 start; + u64 end; + struct list_head extents; + struct btrfs_work work; +}; + +static noinline int add_async_extent(struct async_cow *cow, + u64 start, u64 ram_size, + u64 compressed_size, + struct page **pages, + unsigned long nr_pages) +{ + struct async_extent *async_extent; + + async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); + async_extent->start = start; + async_extent->ram_size = ram_size; + async_extent->compressed_size = compressed_size; + async_extent->pages = pages; + async_extent->nr_pages = nr_pages; + list_add_tail(&async_extent->list, &cow->extents); + return 0; +} + +/* + * we create compressed extents in two phases. The first + * phase compresses a range of pages that have already been + * locked (both pages and state bits are locked). + * + * This is done inside an ordered work queue, and the compression + * is spread across many cpus. The actual IO submission is step + * two, and the ordered work queue takes care of making sure that + * happens in the same order things were put onto the queue by + * writepages and friends. + * + * If this code finds it can't get good compression, it puts an + * entry onto the work queue to write the uncompressed bytes. This + * makes sure that both compressed inodes and uncompressed inodes + * are written in the same order that pdflush sent them down. + */ +static noinline int compress_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, + struct async_cow *async_cow, + int *num_added) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 num_bytes; + u64 orig_start; + u64 disk_num_bytes; + u64 blocksize = root->sectorsize; + u64 actual_end; + u64 isize = i_size_read(inode); + int ret = 0; + struct page **pages = NULL; + unsigned long nr_pages; + unsigned long nr_pages_ret = 0; + unsigned long total_compressed = 0; + unsigned long total_in = 0; + unsigned long max_compressed = 128 * 1024; + unsigned long max_uncompressed = 128 * 1024; + int i; + int will_compress; + + orig_start = start; + + actual_end = min_t(u64, isize, end + 1); +again: + will_compress = 0; + nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; + nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + + total_compressed = actual_end - start; + + /* we want to make sure that amount of ram required to uncompress + * an extent is reasonable, so we limit the total size in ram + * of a compressed extent to 128k. This is a crucial number + * because it also controls how easily we can spread reads across + * cpus for decompression. + * + * We also want to make sure the amount of IO required to do + * a random read is reasonably small, so we limit the size of + * a compressed extent to 128k. + */ + total_compressed = min(total_compressed, max_uncompressed); + num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = max(blocksize, num_bytes); + disk_num_bytes = num_bytes; + total_in = 0; + ret = 0; + + /* + * we do compression for mount -o compress and when the + * inode has not been flagged as nocompress. This flag can + * change at any time if we discover bad compression ratios. + */ + if (!btrfs_test_flag(inode, NOCOMPRESS) && + btrfs_test_opt(root, COMPRESS)) { + WARN_ON(pages); + pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + + ret = btrfs_zlib_compress_pages(inode->i_mapping, start, + total_compressed, pages, + nr_pages, &nr_pages_ret, + &total_in, + &total_compressed, + max_compressed); + + if (!ret) { + unsigned long offset = total_compressed & + (PAGE_CACHE_SIZE - 1); + struct page *page = pages[nr_pages_ret - 1]; + char *kaddr; + + /* zero the tail end of the last page, we might be + * sending it down to disk + */ + if (offset) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, + PAGE_CACHE_SIZE - offset); + kunmap_atomic(kaddr, KM_USER0); + } + will_compress = 1; + } + } + if (start == 0) { + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + btrfs_set_trans_block_group(trans, inode); + + /* lets try to make an inline extent */ + if (ret || total_in < (actual_end - start)) { + /* we didn't compress the entire range, try + * to make an uncompressed inline extent. + */ + ret = cow_file_range_inline(trans, root, inode, + start, end, 0, NULL); + } else { + /* try making a compressed inline extent */ + ret = cow_file_range_inline(trans, root, inode, + start, end, + total_compressed, pages); + } + btrfs_end_transaction(trans, root); + if (ret == 0) { + /* + * inline extent creation worked, we don't need + * to create any more async work items. Unlock + * and free up our temp pages. + */ + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, 1, 0, + 0, 1, 1, 1); + ret = 0; + goto free_pages_out; + } + } + + if (will_compress) { + /* + * we aren't doing an inline extent round the compressed size + * up to a block size boundary so the allocator does sane + * things + */ + total_compressed = (total_compressed + blocksize - 1) & + ~(blocksize - 1); + + /* + * one last check to make sure the compression is really a + * win, compare the page count read with the blocks on disk + */ + total_in = (total_in + PAGE_CACHE_SIZE - 1) & + ~(PAGE_CACHE_SIZE - 1); + if (total_compressed >= total_in) { + will_compress = 0; + } else { + disk_num_bytes = total_compressed; + num_bytes = total_in; + } + } + if (!will_compress && pages) { + /* + * the compression code ran but failed to make things smaller, + * free any pages it allocated and our page pointer array + */ + for (i = 0; i < nr_pages_ret; i++) { + WARN_ON(pages[i]->mapping); + page_cache_release(pages[i]); + } + kfree(pages); + pages = NULL; + total_compressed = 0; + nr_pages_ret = 0; + + /* flag the file so we don't compress in the future */ + btrfs_set_flag(inode, NOCOMPRESS); + } + if (will_compress) { + *num_added += 1; + + /* the async work queues will take care of doing actual + * allocation on disk for these compressed pages, + * and will submit them to the elevator. + */ + add_async_extent(async_cow, start, num_bytes, + total_compressed, pages, nr_pages_ret); + + if (start + num_bytes < end && start + num_bytes < actual_end) { + start += num_bytes; + pages = NULL; + cond_resched(); + goto again; + } + } else { + /* + * No compression, but we still need to write the pages in + * the file we've been given so far. redirty the locked + * page if it corresponds to our extent and set things up + * for the async work queue to run cow_file_range to do + * the normal delalloc dance + */ + if (page_offset(locked_page) >= start && + page_offset(locked_page) <= end) { + __set_page_dirty_nobuffers(locked_page); + /* unlocked later on in the async handlers */ + } + add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); + *num_added += 1; + } + +out: + return 0; + +free_pages_out: + for (i = 0; i < nr_pages_ret; i++) { + WARN_ON(pages[i]->mapping); + page_cache_release(pages[i]); + } + kfree(pages); + + goto out; +} + +/* + * phase two of compressed writeback. This is the ordered portion + * of the code, which only gets called in the order the work was + * queued. We walk all the async extents created by compress_file_range + * and send them down to the disk. + */ +static noinline int submit_compressed_extents(struct inode *inode, + struct async_cow *async_cow) +{ + struct async_extent *async_extent; + u64 alloc_hint = 0; + struct btrfs_trans_handle *trans; + struct btrfs_key ins; + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree; + int ret; + + if (list_empty(&async_cow->extents)) + return 0; + + trans = btrfs_join_transaction(root, 1); + + while (!list_empty(&async_cow->extents)) { + async_extent = list_entry(async_cow->extents.next, + struct async_extent, list); + list_del(&async_extent->list); + + io_tree = &BTRFS_I(inode)->io_tree; + + /* did the compression code fall back to uncompressed IO? */ + if (!async_extent->pages) { + int page_started = 0; + unsigned long nr_written = 0; + + lock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, GFP_NOFS); + + /* allocate blocks */ + cow_file_range(inode, async_cow->locked_page, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + &page_started, &nr_written, 0); + + /* + * if page_started, cow_file_range inserted an + * inline extent and took care of all the unlocking + * and IO for us. Otherwise, we need to submit + * all those pages down to the drive. + */ + if (!page_started) + extent_write_locked_range(io_tree, + inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + btrfs_get_extent, + WB_SYNC_ALL); + kfree(async_extent); + cond_resched(); + continue; + } + + lock_extent(io_tree, async_extent->start, + async_extent->start + async_extent->ram_size - 1, + GFP_NOFS); + /* + * here we're doing allocation and writeback of the + * compressed pages + */ + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + + ret = btrfs_reserve_extent(trans, root, + async_extent->compressed_size, + async_extent->compressed_size, + 0, alloc_hint, + (u64)-1, &ins, 1); + BUG_ON(ret); + em = alloc_extent_map(GFP_NOFS); + em->start = async_extent->start; + em->len = async_extent->ram_size; + em->orig_start = em->start; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + + while (1) { + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + } + + ret = btrfs_add_ordered_extent(inode, async_extent->start, + ins.objectid, + async_extent->ram_size, + ins.offset, + BTRFS_ORDERED_COMPRESSED); + BUG_ON(ret); + + btrfs_end_transaction(trans, root); + + /* + * clear dirty, set writeback and unlock the pages. + */ + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, 1, 1, 0, 1, 1, 0); + + ret = btrfs_submit_compressed_write(inode, + async_extent->start, + async_extent->ram_size, + ins.objectid, + ins.offset, async_extent->pages, + async_extent->nr_pages); + + BUG_ON(ret); + trans = btrfs_join_transaction(root, 1); + alloc_hint = ins.objectid + ins.offset; + kfree(async_extent); + cond_resched(); + } + + btrfs_end_transaction(trans, root); + return 0; +} + +/* + * when extent_io.c finds a delayed allocation range in the file, + * the call backs end up in this code. The basic idea is to + * allocate extents on disk for the range, and create ordered data structs + * in ram to track those extents. + * + * locked_page is the page that writepage had locked already. We use + * it to make sure we don't do extra locks or unlocks. + * + * *page_started is set to one if we unlock locked_page and do everything + * required to start IO on it. It may be clean and already done with + * IO when we return. + */ +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, + int unlock) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 alloc_hint = 0; + u64 num_bytes; + unsigned long ram_size; + u64 disk_num_bytes; + u64 cur_alloc_size; + u64 blocksize = root->sectorsize; + u64 actual_end; + u64 isize = i_size_read(inode); + struct btrfs_key ins; + struct extent_map *em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + int ret = 0; + + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + btrfs_set_trans_block_group(trans, inode); + + actual_end = min_t(u64, isize, end + 1); + + num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = max(blocksize, num_bytes); + disk_num_bytes = num_bytes; + ret = 0; + + if (start == 0) { + /* lets try to make an inline extent */ + ret = cow_file_range_inline(trans, root, inode, + start, end, 0, NULL); + if (ret == 0) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, 1, 1, + 1, 1, 1, 1); + *nr_written = *nr_written + + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; + *page_started = 1; + ret = 0; + goto out; + } + } + + BUG_ON(disk_num_bytes > + btrfs_super_total_bytes(&root->fs_info->super_copy)); + + btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); + + while (disk_num_bytes > 0) { + cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); + ret = btrfs_reserve_extent(trans, root, cur_alloc_size, + root->sectorsize, 0, alloc_hint, + (u64)-1, &ins, 1); + BUG_ON(ret); + + em = alloc_extent_map(GFP_NOFS); + em->start = start; + em->orig_start = em->start; + + ram_size = ins.offset; + em->len = ins.offset; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (1) { + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, start, + start + ram_size - 1, 0); + } + + cur_alloc_size = ins.offset; + ret = btrfs_add_ordered_extent(inode, start, ins.objectid, + ram_size, cur_alloc_size, 0); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_reloc_clone_csums(inode, start, + cur_alloc_size); + BUG_ON(ret); + } + + if (disk_num_bytes < cur_alloc_size) + break; + + /* we're not doing compressed IO, don't unlock the first + * page (which the caller expects to stay locked), don't + * clear any dirty bits and don't set any writeback bits + */ + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + start, start + ram_size - 1, + locked_page, unlock, 1, + 1, 0, 0, 0); + disk_num_bytes -= cur_alloc_size; + num_bytes -= cur_alloc_size; + alloc_hint = ins.objectid + ins.offset; + start += cur_alloc_size; + } +out: + ret = 0; + btrfs_end_transaction(trans, root); + + return ret; +} + +/* + * work queue call back to started compression on a file and pages + */ +static noinline void async_cow_start(struct btrfs_work *work) +{ + struct async_cow *async_cow; + int num_added = 0; + async_cow = container_of(work, struct async_cow, work); + + compress_file_range(async_cow->inode, async_cow->locked_page, + async_cow->start, async_cow->end, async_cow, + &num_added); + if (num_added == 0) + async_cow->inode = NULL; +} + +/* + * work queue call back to submit previously compressed pages + */ +static noinline void async_cow_submit(struct btrfs_work *work) +{ + struct async_cow *async_cow; + struct btrfs_root *root; + unsigned long nr_pages; + + async_cow = container_of(work, struct async_cow, work); + + root = async_cow->root; + nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + + atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); + + if (atomic_read(&root->fs_info->async_delalloc_pages) < + 5 * 1042 * 1024 && + waitqueue_active(&root->fs_info->async_submit_wait)) + wake_up(&root->fs_info->async_submit_wait); + + if (async_cow->inode) + submit_compressed_extents(async_cow->inode, async_cow); +} + +static noinline void async_cow_free(struct btrfs_work *work) +{ + struct async_cow *async_cow; + async_cow = container_of(work, struct async_cow, work); + kfree(async_cow); +} + +static int cow_file_range_async(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) +{ + struct async_cow *async_cow; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long nr_pages; + u64 cur_end; + int limit = 10 * 1024 * 1042; + + if (!btrfs_test_opt(root, COMPRESS)) { + return cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); + } + + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | + EXTENT_DELALLOC, 1, 0, GFP_NOFS); + while (start < end) { + async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); + async_cow->inode = inode; + async_cow->root = root; + async_cow->locked_page = locked_page; + async_cow->start = start; + + if (btrfs_test_flag(inode, NOCOMPRESS)) + cur_end = end; + else + cur_end = min(end, start + 512 * 1024 - 1); + + async_cow->end = cur_end; + INIT_LIST_HEAD(&async_cow->extents); + + async_cow->work.func = async_cow_start; + async_cow->work.ordered_func = async_cow_submit; + async_cow->work.ordered_free = async_cow_free; + async_cow->work.flags = 0; + + nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); + + btrfs_queue_worker(&root->fs_info->delalloc_workers, + &async_cow->work); + + if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->async_delalloc_pages) < + limit)); + } + + while (atomic_read(&root->fs_info->async_submit_draining) && + atomic_read(&root->fs_info->async_delalloc_pages)) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->async_delalloc_pages) == + 0)); + } + + *nr_written += nr_pages; + start = cur_end + 1; + } + *page_started = 1; + return 0; +} + +static noinline int csum_exist_in_range(struct btrfs_root *root, + u64 bytenr, u64 num_bytes) +{ + int ret; + struct btrfs_ordered_sum *sums; + LIST_HEAD(list); + + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, + bytenr + num_bytes - 1, &list); + if (ret == 0 && list_empty(&list)) + return 0; + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del(&sums->list); + kfree(sums); + } + return 1; +} + +/* + * when nowcow writeback call back. This checks for snapshots or COW copies + * of the extents that exist in the file, and COWs the file as required. + * + * If no cow copies or snapshots exist, we write directly to the existing + * blocks on disk + */ +static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, int force, + unsigned long *nr_written) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct btrfs_key found_key; + u64 cow_start; + u64 cur_offset; + u64 extent_end; + u64 disk_bytenr; + u64 num_bytes; + int extent_type; + int ret; + int type; + int nocow; + int check_prev = 1; + + path = btrfs_alloc_path(); + BUG_ON(!path); + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + + cow_start = (u64)-1; + cur_offset = start; + while (1) { + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + cur_offset, 0); + BUG_ON(ret < 0); + if (ret > 0 && path->slots[0] > 0 && check_prev) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, + path->slots[0] - 1); + if (found_key.objectid == inode->i_ino && + found_key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + check_prev = 0; +next_slot: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + BUG_ON(1); + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + nocow = 0; + disk_bytenr = 0; + num_bytes = 0; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid > inode->i_ino || + found_key.type > BTRFS_EXTENT_DATA_KEY || + found_key.offset > end) + break; + + if (found_key.offset > cur_offset) { + extent_end = found_key.offset; + goto out_check; + } + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + extent_end = found_key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } + if (disk_bytenr == 0) + goto out_check; + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + goto out_check; + if (extent_type == BTRFS_FILE_EXTENT_REG && !force) + goto out_check; + if (btrfs_extent_readonly(root, disk_bytenr)) + goto out_check; + if (btrfs_cross_ref_exist(trans, root, inode->i_ino, + disk_bytenr)) + goto out_check; + disk_bytenr += btrfs_file_extent_offset(leaf, fi); + disk_bytenr += cur_offset - found_key.offset; + num_bytes = min(end + 1, extent_end) - cur_offset; + /* + * force cow if csum exists in the range. + * this ensure that csum for a given extent are + * either valid or do not exist. + */ + if (csum_exist_in_range(root, disk_bytenr, num_bytes)) + goto out_check; + nocow = 1; + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = found_key.offset + + btrfs_file_extent_inline_len(leaf, fi); + extent_end = ALIGN(extent_end, root->sectorsize); + } else { + BUG_ON(1); + } +out_check: + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } + if (!nocow) { + if (cow_start == (u64)-1) + cow_start = cur_offset; + cur_offset = extent_end; + if (cur_offset > end) + break; + path->slots[0]++; + goto next_slot; + } + + btrfs_release_path(root, path); + if (cow_start != (u64)-1) { + ret = cow_file_range(inode, locked_page, cow_start, + found_key.offset - 1, page_started, + nr_written, 1); + BUG_ON(ret); + cow_start = (u64)-1; + } + + if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + struct extent_map *em; + struct extent_map_tree *em_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + em = alloc_extent_map(GFP_NOFS); + em->start = cur_offset; + em->orig_start = em->start; + em->len = num_bytes; + em->block_len = num_bytes; + em->block_start = disk_bytenr; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + while (1) { + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); + } + type = BTRFS_ORDERED_PREALLOC; + } else { + type = BTRFS_ORDERED_NOCOW; + } + + ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, + num_bytes, num_bytes, type); + BUG_ON(ret); + + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + cur_offset, cur_offset + num_bytes - 1, + locked_page, 1, 1, 1, 0, 0, 0); + cur_offset = extent_end; + if (cur_offset > end) + break; + } + btrfs_release_path(root, path); + + if (cur_offset <= end && cow_start == (u64)-1) + cow_start = cur_offset; + if (cow_start != (u64)-1) { + ret = cow_file_range(inode, locked_page, cow_start, end, + page_started, nr_written, 1); + BUG_ON(ret); + } + + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + btrfs_free_path(path); + return 0; +} + +/* + * extent_io.c call back to do delayed allocation processing + */ +static int run_delalloc_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) +{ + int ret; + + if (btrfs_test_flag(inode, NODATACOW)) + ret = run_delalloc_nocow(inode, locked_page, start, end, + page_started, 1, nr_written); + else if (btrfs_test_flag(inode, PREALLOC)) + ret = run_delalloc_nocow(inode, locked_page, start, end, + page_started, 0, nr_written); + else + ret = cow_file_range_async(inode, locked_page, start, end, + page_started, nr_written); + + return ret; +} + +/* + * extent_io.c set_bit_hook, used to track delayed allocation + * bytes in this file, and to maintain the list of inodes that + * have pending delalloc work to be done. + */ +static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits) +{ + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testeing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ + if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + spin_lock(&root->fs_info->delalloc_lock); + BTRFS_I(inode)->delalloc_bytes += end - start + 1; + root->fs_info->delalloc_bytes += end - start + 1; + if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_add_tail(&BTRFS_I(inode)->delalloc_inodes, + &root->fs_info->delalloc_inodes); + } + spin_unlock(&root->fs_info->delalloc_lock); + } + return 0; +} + +/* + * extent_io.c clear_bit_hook, see set_bit_hook for why + */ +static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits) +{ + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testeing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ + if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + + spin_lock(&root->fs_info->delalloc_lock); + if (end - start + 1 > root->fs_info->delalloc_bytes) { + printk(KERN_INFO "btrfs warning: delalloc account " + "%llu %llu\n", + (unsigned long long)end - start + 1, + (unsigned long long) + root->fs_info->delalloc_bytes); + root->fs_info->delalloc_bytes = 0; + BTRFS_I(inode)->delalloc_bytes = 0; + } else { + root->fs_info->delalloc_bytes -= end - start + 1; + BTRFS_I(inode)->delalloc_bytes -= end - start + 1; + } + if (BTRFS_I(inode)->delalloc_bytes == 0 && + !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_del_init(&BTRFS_I(inode)->delalloc_inodes); + } + spin_unlock(&root->fs_info->delalloc_lock); + } + return 0; +} + +/* + * extent_io.c merge_bio_hook, this must check the chunk tree to make sure + * we don't create bios that span stripes or chunks + */ +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct btrfs_mapping_tree *map_tree; + u64 logical = (u64)bio->bi_sector << 9; + u64 length = 0; + u64 map_length; + int ret; + + if (bio_flags & EXTENT_BIO_COMPRESSED) + return 0; + + length = bio->bi_size; + map_tree = &root->fs_info->mapping_tree; + map_length = length; + ret = btrfs_map_block(map_tree, READ, logical, + &map_length, NULL, 0); + + if (map_length < length + size) + return 1; + return 0; +} + +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. + * + * At IO completion time the cums attached on the ordered extent record + * are inserted into the btree + */ +static int __btrfs_submit_bio_start(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); + BUG_ON(ret); + return 0; +} + +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. + * + * At IO completion time the cums attached on the ordered extent record + * are inserted into the btree + */ +static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + return btrfs_map_bio(root, rw, bio, mirror_num, 1); +} + +/* + * extent_io.c submission hook. This does the right thing for csum calculation + * on write, or reading the csums from the tree before a read + */ +static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + int skip_sum; + + skip_sum = btrfs_test_flag(inode, NODATASUM); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + if (!(rw & (1 << BIO_RW))) { + if (bio_flags & EXTENT_BIO_COMPRESSED) { + return btrfs_submit_compressed_read(inode, bio, + mirror_num, bio_flags); + } else if (!skip_sum) + btrfs_lookup_bio_sums(root, inode, bio, NULL); + goto mapit; + } else if (!skip_sum) { + /* csum items have already been cloned */ + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + goto mapit; + /* we're doing a write, do the async checksumming */ + return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, + bio_flags, __btrfs_submit_bio_start, + __btrfs_submit_bio_done); + } + +mapit: + return btrfs_map_bio(root, rw, bio, mirror_num, 0); +} + +/* + * given a list of ordered sums record them in the inode. This happens + * at IO completion time based on sums calculated at bio submission time. + */ +static noinline int add_pending_csums(struct btrfs_trans_handle *trans, + struct inode *inode, u64 file_offset, + struct list_head *list) +{ + struct list_head *cur; + struct btrfs_ordered_sum *sum; + + btrfs_set_trans_block_group(trans, inode); + list_for_each(cur, list) { + sum = list_entry(cur, struct btrfs_ordered_sum, list); + btrfs_csum_file_blocks(trans, + BTRFS_I(inode)->root->fs_info->csum_root, sum); + } + return 0; +} + +int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end) +{ + if ((end & (PAGE_CACHE_SIZE - 1)) == 0) + WARN_ON(1); + return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, + GFP_NOFS); +} + +/* see btrfs_writepage_start_hook for details on why this is required */ +struct btrfs_writepage_fixup { + struct page *page; + struct btrfs_work work; +}; + +static void btrfs_writepage_fixup_worker(struct btrfs_work *work) +{ + struct btrfs_writepage_fixup *fixup; + struct btrfs_ordered_extent *ordered; + struct page *page; + struct inode *inode; + u64 page_start; + u64 page_end; + + fixup = container_of(work, struct btrfs_writepage_fixup, work); + page = fixup->page; +again: + lock_page(page); + if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { + ClearPageChecked(page); + goto out_page; + } + + inode = page->mapping->host; + page_start = page_offset(page); + page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; + + lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); + + /* already ordered? We're done */ + if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_ORDERED, 0)) { + goto out; + } + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(&BTRFS_I(inode)->io_tree, page_start, + page_end, GFP_NOFS); + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + goto again; + } + + btrfs_set_extent_delalloc(inode, page_start, page_end); + ClearPageChecked(page); +out: + unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); +out_page: + unlock_page(page); + page_cache_release(page); +} + +/* + * There are a few paths in the higher layers of the kernel that directly + * set the page dirty bit without asking the filesystem if it is a + * good idea. This causes problems because we want to make sure COW + * properly happens and the data=ordered rules are followed. + * + * In our case any range that doesn't have the ORDERED bit set + * hasn't been properly setup for IO. We kick off an async process + * to fix it up. The async helper will wait for ordered extents, set + * the delalloc bit and make it safe to write the page. + */ +static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) +{ + struct inode *inode = page->mapping->host; + struct btrfs_writepage_fixup *fixup; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, + EXTENT_ORDERED, 0); + if (ret) + return 0; + + if (PageChecked(page)) + return -EAGAIN; + + fixup = kzalloc(sizeof(*fixup), GFP_NOFS); + if (!fixup) + return -EAGAIN; + + SetPageChecked(page); + page_cache_get(page); + fixup->work.func = btrfs_writepage_fixup_worker; + fixup->page = page; + btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); + return -EAGAIN; +} + +static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, + struct inode *inode, u64 file_pos, + u64 disk_bytenr, u64 disk_num_bytes, + u64 num_bytes, u64 ram_bytes, + u8 compression, u8 encryption, + u16 other_encoding, int extent_type) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *fi; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key ins; + u64 hint; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + ret = btrfs_drop_extents(trans, root, inode, file_pos, + file_pos + num_bytes, file_pos, &hint); + BUG_ON(ret); + + ins.objectid = inode->i_ino; + ins.offset = file_pos; + ins.type = BTRFS_EXTENT_DATA_KEY; + ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); + BUG_ON(ret); + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_type(leaf, fi, extent_type); + btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); + btrfs_set_file_extent_compression(leaf, fi, compression); + btrfs_set_file_extent_encryption(leaf, fi, encryption); + btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); + btrfs_mark_buffer_dirty(leaf); + + inode_add_bytes(inode, num_bytes); + btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0); + + ins.objectid = disk_bytenr; + ins.offset = disk_num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + ret = btrfs_alloc_reserved_extent(trans, root, leaf->start, + root->root_key.objectid, + trans->transid, inode->i_ino, &ins); + BUG_ON(ret); + + btrfs_free_path(path); + return 0; +} + +/* as ordered data IO finishes, this gets called so we can finish + * an ordered extent if the range of bytes in the file it covers are + * fully written. + */ +static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_ordered_extent *ordered_extent; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int compressed = 0; + int ret; + + ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1); + if (!ret) + return 0; + + trans = btrfs_join_transaction(root, 1); + + ordered_extent = btrfs_lookup_ordered_extent(inode, start); + BUG_ON(!ordered_extent); + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) + goto nocow; + + lock_extent(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + GFP_NOFS); + + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) + compressed = 1; + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { + BUG_ON(compressed); + ret = btrfs_mark_extent_written(trans, root, inode, + ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len); + BUG_ON(ret); + } else { + ret = insert_reserved_file_extent(trans, inode, + ordered_extent->file_offset, + ordered_extent->start, + ordered_extent->disk_len, + ordered_extent->len, + ordered_extent->len, + compressed, 0, 0, + BTRFS_FILE_EXTENT_REG); + BUG_ON(ret); + } + unlock_extent(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + GFP_NOFS); +nocow: + add_pending_csums(trans, inode, ordered_extent->file_offset, + &ordered_extent->list); + + mutex_lock(&BTRFS_I(inode)->extent_mutex); + btrfs_ordered_update_i_size(inode, ordered_extent); + btrfs_update_inode(trans, root, inode); + btrfs_remove_ordered_extent(inode, ordered_extent); + mutex_unlock(&BTRFS_I(inode)->extent_mutex); + + /* once for us */ + btrfs_put_ordered_extent(ordered_extent); + /* once for the tree */ + btrfs_put_ordered_extent(ordered_extent); + + btrfs_end_transaction(trans, root); + return 0; +} + +static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate) +{ + return btrfs_finish_ordered_io(page->mapping->host, start, end); +} + +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. + */ +struct io_failure_record { + struct page *page; + u64 start; + u64 len; + u64 logical; + unsigned long bio_flags; + int last_mirror; +}; + +static int btrfs_io_failed_hook(struct bio *failed_bio, + struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct io_failure_record *failrec = NULL; + u64 private; + struct extent_map *em; + struct inode *inode = page->mapping->host; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct bio *bio; + int num_copies; + int ret; + int rw; + u64 logical; + + ret = get_state_private(failure_tree, start, &private); + if (ret) { + failrec = kmalloc(sizeof(*failrec), GFP_NOFS); + if (!failrec) + return -ENOMEM; + failrec->start = start; + failrec->len = end - start + 1; + failrec->last_mirror = 0; + failrec->bio_flags = 0; + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (em->start > start || em->start + em->len < start) { + free_extent_map(em); + em = NULL; + } + spin_unlock(&em_tree->lock); + + if (!em || IS_ERR(em)) { + kfree(failrec); + return -EIO; + } + logical = start - em->start; + logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + logical = em->block_start; + failrec->bio_flags = EXTENT_BIO_COMPRESSED; + } + failrec->logical = logical; + free_extent_map(em); + set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | + EXTENT_DIRTY, GFP_NOFS); + set_state_private(failure_tree, start, + (u64)(unsigned long)failrec); + } else { + failrec = (struct io_failure_record *)(unsigned long)private; + } + num_copies = btrfs_num_copies( + &BTRFS_I(inode)->root->fs_info->mapping_tree, + failrec->logical, failrec->len); + failrec->last_mirror++; + if (!state) { + spin_lock(&BTRFS_I(inode)->io_tree.lock); + state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, + failrec->start, + EXTENT_LOCKED); + if (state && state->start != failrec->start) + state = NULL; + spin_unlock(&BTRFS_I(inode)->io_tree.lock); + } + if (!state || failrec->last_mirror > num_copies) { + set_state_private(failure_tree, failrec->start, 0); + clear_extent_bits(failure_tree, failrec->start, + failrec->start + failrec->len - 1, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + kfree(failrec); + return -EIO; + } + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_private = state; + bio->bi_end_io = failed_bio->bi_end_io; + bio->bi_sector = failrec->logical >> 9; + bio->bi_bdev = failed_bio->bi_bdev; + bio->bi_size = 0; + + bio_add_page(bio, page, failrec->len, start - page_offset(page)); + if (failed_bio->bi_rw & (1 << BIO_RW)) + rw = WRITE; + else + rw = READ; + + BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, + failrec->last_mirror, + failrec->bio_flags); + return 0; +} + +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */ +static int btrfs_clean_io_failures(struct inode *inode, u64 start) +{ + u64 private; + u64 private_failure; + struct io_failure_record *failure; + int ret; + + private = 0; + if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, + (u64)-1, 1, EXTENT_DIRTY)) { + ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, + start, &private_failure); + if (ret == 0) { + failure = (struct io_failure_record *)(unsigned long) + private_failure; + set_state_private(&BTRFS_I(inode)->io_failure_tree, + failure->start, 0); + clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, + failure->start, + failure->start + failure->len - 1, + EXTENT_DIRTY | EXTENT_LOCKED, + GFP_NOFS); + kfree(failure); + } + } + return 0; +} + +/* + * when reads are done, we need to check csums to verify the data is correct + * if there's a match, we allow the bio to finish. If not, we go through + * the io_failure_record routines to find good copies + */ +static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); + struct inode *inode = page->mapping->host; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + char *kaddr; + u64 private = ~(u32)0; + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + u32 csum = ~(u32)0; + + if (PageChecked(page)) { + ClearPageChecked(page); + goto good; + } + if (btrfs_test_flag(inode, NODATASUM)) + return 0; + + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && + test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) { + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, + GFP_NOFS); + return 0; + } + + if (state && state->start == start) { + private = state->private; + ret = 0; + } else { + ret = get_state_private(io_tree, start, &private); + } + kaddr = kmap_atomic(page, KM_USER0); + if (ret) + goto zeroit; + + csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); + btrfs_csum_final(csum, (char *)&csum); + if (csum != private) + goto zeroit; + + kunmap_atomic(kaddr, KM_USER0); +good: + /* if the io failure tree for this inode is non-empty, + * check to see if we've recovered from a failed IO + */ + btrfs_clean_io_failures(inode, start); + return 0; + +zeroit: + printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " + "private %llu\n", page->mapping->host->i_ino, + (unsigned long long)start, csum, + (unsigned long long)private); + memset(kaddr + offset, 1, end - start + 1); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + if (private == 0) + return 0; + return -EIO; +} + +/* + * This creates an orphan entry for the given inode in case something goes + * wrong in the middle of an unlink/truncate. + */ +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + spin_lock(&root->list_lock); + + /* already on the orphan list, we're good */ + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + spin_unlock(&root->list_lock); + return 0; + } + + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + + spin_unlock(&root->list_lock); + + /* + * insert an orphan item to track this unlinked/truncated file + */ + ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); + + return ret; +} + +/* + * We have done the truncate/delete so we can go ahead and remove the orphan + * item for this particular inode. + */ +int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + spin_lock(&root->list_lock); + + if (list_empty(&BTRFS_I(inode)->i_orphan)) { + spin_unlock(&root->list_lock); + return 0; + } + + list_del_init(&BTRFS_I(inode)->i_orphan); + if (!trans) { + spin_unlock(&root->list_lock); + return 0; + } + + spin_unlock(&root->list_lock); + + ret = btrfs_del_orphan_item(trans, root, inode->i_ino); + + return ret; +} + +/* + * this cleans up any orphans that may be left on the list from the last use + * of this root. + */ +void btrfs_orphan_cleanup(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_item *item; + struct btrfs_key key, found_key; + struct btrfs_trans_handle *trans; + struct inode *inode; + int ret = 0, nr_unlink = 0, nr_truncate = 0; + + path = btrfs_alloc_path(); + if (!path) + return; + path->reada = -1; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = (u64)-1; + + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + printk(KERN_ERR "Error searching slot for orphan: %d" + "\n", ret); + break; + } + + /* + * if ret == 0 means we found what we were searching for, which + * is weird, but possible, so only screw with path if we didnt + * find the key and see if we have stuff that matches + */ + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + /* pull out the item */ + leaf = path->nodes[0]; + item = btrfs_item_nr(leaf, path->slots[0]); + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + /* make sure the item matches what we want */ + if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) + break; + if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) + break; + + /* release the path since we're done with it */ + btrfs_release_path(root, path); + + /* + * this is where we are basically btrfs_lookup, without the + * crossing root thing. we store the inode number in the + * offset of the orphan item. + */ + inode = btrfs_iget_locked(root->fs_info->sb, + found_key.offset, root); + if (!inode) + break; + + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + + /* have to set the location manually */ + BTRFS_I(inode)->location.objectid = inode->i_ino; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } + + /* + * add this inode to the orphan list so btrfs_orphan_del does + * the proper thing when we hit it + */ + spin_lock(&root->list_lock); + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + spin_unlock(&root->list_lock); + + /* + * if this is a bad inode, means we actually succeeded in + * removing the inode, but not the orphan record, which means + * we need to manually delete the orphan since iput will just + * do a destroy_inode + */ + if (is_bad_inode(inode)) { + trans = btrfs_start_transaction(root, 1); + btrfs_orphan_del(trans, inode); + btrfs_end_transaction(trans, root); + iput(inode); + continue; + } + + /* if we have links, this was a truncate, lets do that */ + if (inode->i_nlink) { + nr_truncate++; + btrfs_truncate(inode); + } else { + nr_unlink++; + } + + /* this will do delete_inode and everything for us */ + iput(inode); + } + + if (nr_unlink) + printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); + if (nr_truncate) + printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); + + btrfs_free_path(path); +} + +/* + * read an inode from the btree into the in-memory inode + */ +void btrfs_read_locked_inode(struct inode *inode) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_inode_item *inode_item; + struct btrfs_timespec *tspec; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key location; + u64 alloc_group_block; + u32 rdev; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); + + ret = btrfs_lookup_inode(NULL, root, path, &location, 0); + if (ret) + goto make_bad; + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + + inode->i_mode = btrfs_inode_mode(leaf, inode_item); + inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); + inode->i_uid = btrfs_inode_uid(leaf, inode_item); + inode->i_gid = btrfs_inode_gid(leaf, inode_item); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + + tspec = btrfs_inode_atime(inode_item); + inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + tspec = btrfs_inode_mtime(inode_item); + inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + tspec = btrfs_inode_ctime(inode_item); + inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); + BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); + BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); + inode->i_generation = BTRFS_I(inode)->generation; + inode->i_rdev = 0; + rdev = btrfs_inode_rdev(leaf, inode_item); + + BTRFS_I(inode)->index_cnt = (u64)-1; + BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + + alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, + alloc_group_block, 0); + btrfs_free_path(path); + inode_item = NULL; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + break; + case S_IFDIR: + inode->i_fop = &btrfs_dir_file_operations; + if (root == root->fs_info->tree_root) + inode->i_op = &btrfs_dir_ro_inode_operations; + else + inode->i_op = &btrfs_dir_inode_operations; + break; + case S_IFLNK: + inode->i_op = &btrfs_symlink_inode_operations; + inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + break; + default: + init_special_inode(inode, inode->i_mode, rdev); + break; + } + return; + +make_bad: + btrfs_free_path(path); + make_bad_inode(inode); +} + +/* + * given a leaf and an inode, copy the inode fields into the leaf + */ +static void fill_inode_item(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf, + struct btrfs_inode_item *item, + struct inode *inode) +{ + btrfs_set_inode_uid(leaf, item, inode->i_uid); + btrfs_set_inode_gid(leaf, item, inode->i_gid); + btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + + btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec); + + btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); + btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); + btrfs_set_inode_sequence(leaf, item, BTRFS_I(inode)->sequence); + btrfs_set_inode_transid(leaf, item, trans->transid); + btrfs_set_inode_rdev(leaf, item, inode->i_rdev); + btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); + btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); +} + +/* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + struct btrfs_inode_item *inode_item; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_lookup_inode(trans, root, path, + &BTRFS_I(inode)->location, 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto failed; + } + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + + fill_inode_item(trans, leaf, inode_item, inode); + btrfs_mark_buffer_dirty(leaf); + btrfs_set_inode_last_trans(trans, inode); + ret = 0; +failed: + btrfs_free_path(path); + return ret; +} + + +/* + * unlink helper that gets used here in inode.c and in the tree logging + * recovery code. It remove a link in a directory with a given name, and + * also drops the back refs in the inode to the directory + */ +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) +{ + struct btrfs_path *path; + int ret = 0; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 index; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto err; + } + + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto err; + } + if (!di) { + ret = -ENOENT; + goto err; + } + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto err; + btrfs_release_path(root, path); + + ret = btrfs_del_inode_ref(trans, root, name, name_len, + inode->i_ino, + dir->i_ino, &index); + if (ret) { + printk(KERN_INFO "btrfs failed to delete reference to %.*s, " + "inode %lu parent %lu\n", name_len, name, + inode->i_ino, dir->i_ino); + goto err; + } + + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, + index, name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto err; + } + if (!di) { + ret = -ENOENT; + goto err; + } + ret = btrfs_delete_one_dir_name(trans, root, path, di); + btrfs_release_path(root, path); + + ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, + inode, dir->i_ino); + BUG_ON(ret != 0 && ret != -ENOENT); + if (ret != -ENOENT) + BTRFS_I(dir)->log_dirty_trans = trans->transid; + + ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, + dir, index); + BUG_ON(ret); +err: + btrfs_free_path(path); + if (ret) + goto out; + + btrfs_i_size_write(dir, dir->i_size - name_len * 2); + inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; + btrfs_update_inode(trans, root, dir); + btrfs_drop_nlink(inode); + ret = btrfs_update_inode(trans, root, inode); + dir->i_sb->s_dirt = 1; +out: + return ret; +} + +static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct btrfs_root *root; + struct btrfs_trans_handle *trans; + struct inode *inode = dentry->d_inode; + int ret; + unsigned long nr = 0; + + root = BTRFS_I(dir)->root; + + ret = btrfs_check_free_space(root, 1, 1); + if (ret) + goto fail; + + trans = btrfs_start_transaction(root, 1); + + btrfs_set_trans_block_group(trans, dir); + ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + + if (inode->i_nlink == 0) + ret = btrfs_orphan_add(trans, inode); + + nr = trans->blocks_used; + + btrfs_end_transaction_throttle(trans, root); +fail: + btrfs_btree_balance_dirty(root, nr); + return ret; +} + +static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int err = 0; + int ret; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_trans_handle *trans; + unsigned long nr = 0; + + /* + * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir + * the root of a subvolume or snapshot + */ + if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || + inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + return -ENOTEMPTY; + } + + ret = btrfs_check_free_space(root, 1, 1); + if (ret) + goto fail; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_orphan_add(trans, inode); + if (err) + goto fail_trans; + + /* now the directory is empty */ + err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + if (!err) + btrfs_i_size_write(inode, 0); + +fail_trans: + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); +fail: + btrfs_btree_balance_dirty(root, nr); + + if (ret && !err) + err = ret; + return err; +} + +#if 0 +/* + * when truncating bytes in a file, it is possible to avoid reading + * the leaves that contain only checksum items. This can be the + * majority of the IO required to delete a large file, but it must + * be done carefully. + * + * The keys in the level just above the leaves are checked to make sure + * the lowest key in a given leaf is a csum key, and starts at an offset + * after the new size. + * + * Then the key for the next leaf is checked to make sure it also has + * a checksum item for the same file. If it does, we know our target leaf + * contains only checksum items, and it can be safely freed without reading + * it. + * + * This is just an optimization targeted at large files. It may do + * nothing. It will return 0 unless things went badly. + */ +static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct inode *inode, u64 new_size) +{ + struct btrfs_key key; + int ret; + int nritems; + struct btrfs_key found_key; + struct btrfs_key other_key; + struct btrfs_leaf_ref *ref; + u64 leaf_gen; + u64 leaf_start; + + path->lowest_level = 1; + key.objectid = inode->i_ino; + key.type = BTRFS_CSUM_ITEM_KEY; + key.offset = new_size; +again: + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (path->nodes[1] == NULL) { + ret = 0; + goto out; + } + ret = 0; + btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]); + nritems = btrfs_header_nritems(path->nodes[1]); + + if (!nritems) + goto out; + + if (path->slots[1] >= nritems) + goto next_node; + + /* did we find a key greater than anything we want to delete? */ + if (found_key.objectid > inode->i_ino || + (found_key.objectid == inode->i_ino && found_key.type > key.type)) + goto out; + + /* we check the next key in the node to make sure the leave contains + * only checksum items. This comparison doesn't work if our + * leaf is the last one in the node + */ + if (path->slots[1] + 1 >= nritems) { +next_node: + /* search forward from the last key in the node, this + * will bring us into the next node in the tree + */ + btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1); + + /* unlikely, but we inc below, so check to be safe */ + if (found_key.offset == (u64)-1) + goto out; + + /* search_forward needs a path with locks held, do the + * search again for the original key. It is possible + * this will race with a balance and return a path that + * we could modify, but this drop is just an optimization + * and is allowed to miss some leaves. + */ + btrfs_release_path(root, path); + found_key.offset++; + + /* setup a max key for search_forward */ + other_key.offset = (u64)-1; + other_key.type = key.type; + other_key.objectid = key.objectid; + + path->keep_locks = 1; + ret = btrfs_search_forward(root, &found_key, &other_key, + path, 0, 0); + path->keep_locks = 0; + if (ret || found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + goto out; + } + + key.offset = found_key.offset; + btrfs_release_path(root, path); + cond_resched(); + goto again; + } + + /* we know there's one more slot after us in the tree, + * read that key so we can verify it is also a checksum item + */ + btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1); + + if (found_key.objectid < inode->i_ino) + goto next_key; + + if (found_key.type != key.type || found_key.offset < new_size) + goto next_key; + + /* + * if the key for the next leaf isn't a csum key from this objectid, + * we can't be sure there aren't good items inside this leaf. + * Bail out + */ + if (other_key.objectid != inode->i_ino || other_key.type != key.type) + goto out; + + leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]); + leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]); + /* + * it is safe to delete this leaf, it contains only + * csum items from this inode at an offset >= new_size + */ + ret = btrfs_del_leaf(trans, root, path, leaf_start); + BUG_ON(ret); + + if (root->ref_cows && leaf_gen < trans->transid) { + ref = btrfs_alloc_leaf_ref(root, 0); + if (ref) { + ref->root_gen = root->root_key.offset; + ref->bytenr = leaf_start; + ref->owner = 0; + ref->generation = leaf_gen; + ref->nritems = 0; + + ret = btrfs_add_leaf_ref(root, ref, 0); + WARN_ON(ret); + btrfs_free_leaf_ref(root, ref); + } else { + WARN_ON(1); + } + } +next_key: + btrfs_release_path(root, path); + + if (other_key.objectid == inode->i_ino && + other_key.type == key.type && other_key.offset > key.offset) { + key.offset = other_key.offset; + cond_resched(); + goto again; + } + ret = 0; +out: + /* fixup any changes we've made to the path */ + path->lowest_level = 0; + path->keep_locks = 0; + btrfs_release_path(root, path); + return ret; +} + +#endif + +/* + * this can truncate away extent items, csum items and directory items. + * It starts at a high offset and removes keys until it can't find + * any higher than new_size + * + * csum items that cross the new i_size are truncated to the new size + * as well. + * + * min_type is the minimum key type to truncate down to. If set to 0, this + * will kill all the items on this inode, including the INODE_ITEM_KEY. + */ +noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + u64 new_size, u32 min_type) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + u32 found_type; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + u64 extent_start = 0; + u64 extent_num_bytes = 0; + u64 item_end = 0; + u64 root_gen = 0; + u64 root_owner = 0; + int found_extent; + int del_item; + int pending_del_nr = 0; + int pending_del_slot = 0; + int extent_type = -1; + int encoding; + u64 mask = root->sectorsize - 1; + + if (root->ref_cows) + btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); + path = btrfs_alloc_path(); + path->reada = -1; + BUG_ON(!path); + + /* FIXME, add redo link to tree so we don't leak on crash */ + key.objectid = inode->i_ino; + key.offset = (u64)-1; + key.type = (u8)-1; + + btrfs_init_path(path); + +search_again: + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto error; + + if (ret > 0) { + /* there are no items in the tree for us to truncate, we're + * done + */ + if (path->slots[0] == 0) { + ret = 0; + goto error; + } + path->slots[0]--; + } + + while (1) { + fi = NULL; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + encoding = 0; + + if (found_key.objectid != inode->i_ino) + break; + + if (found_type < min_type) + break; + + item_end = found_key.offset; + if (found_type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + encoding = btrfs_file_extent_compression(leaf, fi); + encoding |= btrfs_file_extent_encryption(leaf, fi); + encoding |= btrfs_file_extent_other_encoding(leaf, fi); + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + item_end += + btrfs_file_extent_num_bytes(leaf, fi); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + item_end += btrfs_file_extent_inline_len(leaf, + fi); + } + item_end--; + } + if (item_end < new_size) { + if (found_type == BTRFS_DIR_ITEM_KEY) + found_type = BTRFS_INODE_ITEM_KEY; + else if (found_type == BTRFS_EXTENT_ITEM_KEY) + found_type = BTRFS_EXTENT_DATA_KEY; + else if (found_type == BTRFS_EXTENT_DATA_KEY) + found_type = BTRFS_XATTR_ITEM_KEY; + else if (found_type == BTRFS_XATTR_ITEM_KEY) + found_type = BTRFS_INODE_REF_KEY; + else if (found_type) + found_type--; + else + break; + btrfs_set_key_type(&key, found_type); + goto next; + } + if (found_key.offset >= new_size) + del_item = 1; + else + del_item = 0; + found_extent = 0; + + /* FIXME, shrink the extent if the ref count is only 1 */ + if (found_type != BTRFS_EXTENT_DATA_KEY) + goto delete; + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + u64 num_dec; + extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); + if (!del_item && !encoding) { + u64 orig_num_bytes = + btrfs_file_extent_num_bytes(leaf, fi); + extent_num_bytes = new_size - + found_key.offset + root->sectorsize - 1; + extent_num_bytes = extent_num_bytes & + ~((u64)root->sectorsize - 1); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_num_bytes); + num_dec = (orig_num_bytes - + extent_num_bytes); + if (root->ref_cows && extent_start != 0) + inode_sub_bytes(inode, num_dec); + btrfs_mark_buffer_dirty(leaf); + } else { + extent_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, + fi); + /* FIXME blocksize != 4096 */ + num_dec = btrfs_file_extent_num_bytes(leaf, fi); + if (extent_start != 0) { + found_extent = 1; + if (root->ref_cows) + inode_sub_bytes(inode, num_dec); + } + root_gen = btrfs_header_generation(leaf); + root_owner = btrfs_header_owner(leaf); + } + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + /* + * we can't truncate inline items that have had + * special encodings + */ + if (!del_item && + btrfs_file_extent_compression(leaf, fi) == 0 && + btrfs_file_extent_encryption(leaf, fi) == 0 && + btrfs_file_extent_other_encoding(leaf, fi) == 0) { + u32 size = new_size - found_key.offset; + + if (root->ref_cows) { + inode_sub_bytes(inode, item_end + 1 - + new_size); + } + size = + btrfs_file_extent_calc_inline_size(size); + ret = btrfs_truncate_item(trans, root, path, + size, 1); + BUG_ON(ret); + } else if (root->ref_cows) { + inode_sub_bytes(inode, item_end + 1 - + found_key.offset); + } + } +delete: + if (del_item) { + if (!pending_del_nr) { + /* no pending yet, add ourselves */ + pending_del_slot = path->slots[0]; + pending_del_nr = 1; + } else if (pending_del_nr && + path->slots[0] + 1 == pending_del_slot) { + /* hop on the pending chunk */ + pending_del_nr++; + pending_del_slot = path->slots[0]; + } else { + BUG(); + } + } else { + break; + } + if (found_extent) { + ret = btrfs_free_extent(trans, root, extent_start, + extent_num_bytes, + leaf->start, root_owner, + root_gen, inode->i_ino, 0); + BUG_ON(ret); + } +next: + if (path->slots[0] == 0) { + if (pending_del_nr) + goto del_pending; + btrfs_release_path(root, path); + goto search_again; + } + + path->slots[0]--; + if (pending_del_nr && + path->slots[0] + 1 != pending_del_slot) { + struct btrfs_key debug; +del_pending: + btrfs_item_key_to_cpu(path->nodes[0], &debug, + pending_del_slot); + ret = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + BUG_ON(ret); + pending_del_nr = 0; + btrfs_release_path(root, path); + goto search_again; + } + } + ret = 0; +error: + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, pending_del_slot, + pending_del_nr); + } + btrfs_free_path(path); + inode->i_sb->s_dirt = 1; + return ret; +} + +/* + * taken from block_truncate_page, but does cow as it zeros out + * any bytes left in the last page in the file. + */ +static int btrfs_truncate_page(struct address_space *mapping, loff_t from) +{ + struct inode *inode = mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + char *kaddr; + u32 blocksize = root->sectorsize; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + struct page *page; + int ret = 0; + u64 page_start; + u64 page_end; + + if ((offset & (blocksize - 1)) == 0) + goto out; + + ret = -ENOMEM; +again: + page = grab_cache_page(mapping, index); + if (!page) + goto out; + + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + if (!PageUptodate(page)) { + ret = btrfs_readpage(NULL, page); + lock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } + if (!PageUptodate(page)) { + ret = -EIO; + goto out_unlock; + } + } + wait_on_page_writeback(page); + + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_extent_mapped(page); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + + btrfs_set_extent_delalloc(inode, page_start, page_end); + ret = 0; + if (offset != PAGE_CACHE_SIZE) { + kaddr = kmap(page); + memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + kunmap(page); + } + ClearPageChecked(page); + set_page_dirty(page); + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + +out_unlock: + unlock_page(page); + page_cache_release(page); +out: + return ret; +} + +int btrfs_cont_expand(struct inode *inode, loff_t size) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em; + u64 mask = root->sectorsize - 1; + u64 hole_start = (inode->i_size + mask) & ~mask; + u64 block_end = (size + mask) & ~mask; + u64 last_byte; + u64 cur_offset; + u64 hole_size; + int err; + + if (size <= hole_start) + return 0; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + return err; + + btrfs_truncate_page(inode->i_mapping, inode->i_size); + + while (1) { + struct btrfs_ordered_extent *ordered; + btrfs_wait_ordered_range(inode, hole_start, + block_end - hole_start); + lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(inode, hole_start); + if (!ordered) + break; + unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + btrfs_put_ordered_extent(ordered); + } + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + cur_offset = hole_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + block_end - cur_offset, 0); + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), block_end); + last_byte = (last_byte + mask) & ~mask; + if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { + u64 hint_byte = 0; + hole_size = last_byte - cur_offset; + err = btrfs_drop_extents(trans, root, inode, + cur_offset, + cur_offset + hole_size, + cur_offset, &hint_byte); + if (err) + break; + err = btrfs_insert_file_extent(trans, root, + inode->i_ino, cur_offset, 0, + 0, hole_size, 0, hole_size, + 0, 0, 0); + btrfs_drop_extent_cache(inode, hole_start, + last_byte - 1, 0); + } + free_extent_map(em); + cur_offset = last_byte; + if (err || cur_offset >= block_end) + break; + } + + btrfs_end_transaction(trans, root); + unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + return err; +} + +static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int err; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (S_ISREG(inode->i_mode) && + attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { + err = btrfs_cont_expand(inode, attr->ia_size); + if (err) + return err; + } + + err = inode_setattr(inode, attr); + + if (!err && ((attr->ia_valid & ATTR_MODE))) + err = btrfs_acl_chmod(inode); + return err; +} + +void btrfs_delete_inode(struct inode *inode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long nr; + int ret; + + truncate_inode_pages(&inode->i_data, 0); + if (is_bad_inode(inode)) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + btrfs_wait_ordered_range(inode, 0, (u64)-1); + + btrfs_i_size_write(inode, 0); + trans = btrfs_join_transaction(root, 1); + + btrfs_set_trans_block_group(trans, inode); + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0); + if (ret) { + btrfs_orphan_del(NULL, inode); + goto no_delete_lock; + } + + btrfs_orphan_del(trans, inode); + + nr = trans->blocks_used; + clear_inode(inode); + + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + return; + +no_delete_lock: + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); +no_delete: + clear_inode(inode); +} + +/* + * this returns the key found in the dir entry in the location pointer. + * If no dir entries were found, location->objectid is 0. + */ +static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, + struct btrfs_key *location) +{ + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(dir)->root; + int ret = 0; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name, + namelen, 0); + if (IS_ERR(di)) + ret = PTR_ERR(di); + + if (!di || IS_ERR(di)) + goto out_err; + + btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); +out: + btrfs_free_path(path); + return ret; +out_err: + location->objectid = 0; + goto out; +} + +/* + * when we hit a tree root in a directory, the btrfs part of the inode + * needs to be changed to reflect the root directory of the tree root. This + * is kind of like crossing a mount point. + */ +static int fixup_tree_root_location(struct btrfs_root *root, + struct btrfs_key *location, + struct btrfs_root **sub_root, + struct dentry *dentry) +{ + struct btrfs_root_item *ri; + + if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY) + return 0; + if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) + return 0; + + *sub_root = btrfs_read_fs_root(root->fs_info, location, + dentry->d_name.name, + dentry->d_name.len); + if (IS_ERR(*sub_root)) + return PTR_ERR(*sub_root); + + ri = &(*sub_root)->root_item; + location->objectid = btrfs_root_dirid(ri); + btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + location->offset = 0; + + return 0; +} + +static noinline void init_btrfs_i(struct inode *inode) +{ + struct btrfs_inode *bi = BTRFS_I(inode); + + bi->i_acl = NULL; + bi->i_default_acl = NULL; + + bi->generation = 0; + bi->sequence = 0; + bi->last_trans = 0; + bi->logged_trans = 0; + bi->delalloc_bytes = 0; + bi->disk_i_size = 0; + bi->flags = 0; + bi->index_cnt = (u64)-1; + bi->log_dirty_trans = 0; + extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); + extent_io_tree_init(&BTRFS_I(inode)->io_tree, + inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, + inode->i_mapping, GFP_NOFS); + INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); + btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); + mutex_init(&BTRFS_I(inode)->extent_mutex); + mutex_init(&BTRFS_I(inode)->log_mutex); +} + +static int btrfs_init_locked_inode(struct inode *inode, void *p) +{ + struct btrfs_iget_args *args = p; + inode->i_ino = args->ino; + init_btrfs_i(inode); + BTRFS_I(inode)->root = args->root; + return 0; +} + +static int btrfs_find_actor(struct inode *inode, void *opaque) +{ + struct btrfs_iget_args *args = opaque; + return args->ino == inode->i_ino && + args->root == BTRFS_I(inode)->root; +} + +struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, + struct btrfs_root *root, int wait) +{ + struct inode *inode; + struct btrfs_iget_args args; + args.ino = objectid; + args.root = root; + + if (wait) { + inode = ilookup5(s, objectid, btrfs_find_actor, + (void *)&args); + } else { + inode = ilookup5_nowait(s, objectid, btrfs_find_actor, + (void *)&args); + } + return inode; +} + +struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, + struct btrfs_root *root) +{ + struct inode *inode; + struct btrfs_iget_args args; + args.ino = objectid; + args.root = root; + + inode = iget5_locked(s, objectid, btrfs_find_actor, + btrfs_init_locked_inode, + (void *)&args); + return inode; +} + +/* Get an inode object given its location and corresponding root. + * Returns in *is_new if the inode was read from disk + */ +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *is_new) +{ + struct inode *inode; + + inode = btrfs_iget_locked(s, location->objectid, root); + if (!inode) + return ERR_PTR(-EACCES); + + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + if (is_new) + *is_new = 1; + } else { + if (is_new) + *is_new = 0; + } + + return inode; +} + +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode; + struct btrfs_inode *bi = BTRFS_I(dir); + struct btrfs_root *root = bi->root; + struct btrfs_root *sub_root = root; + struct btrfs_key location; + int ret, new; + + if (dentry->d_name.len > BTRFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ret = btrfs_inode_by_name(dir, dentry, &location); + + if (ret < 0) + return ERR_PTR(ret); + + inode = NULL; + if (location.objectid) { + ret = fixup_tree_root_location(root, &location, &sub_root, + dentry); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return ERR_PTR(-ENOENT); + inode = btrfs_iget(dir->i_sb, &location, sub_root, &new); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + return inode; +} + +static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + + if (dentry->d_name.len > BTRFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return d_splice_alias(inode, dentry); +} + +static unsigned char btrfs_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int btrfs_real_readdir(struct file *filp, void *dirent, + filldir_t filldir) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_item *item; + struct btrfs_dir_item *di; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + int ret; + u32 nritems; + struct extent_buffer *leaf; + int slot; + int advance; + unsigned char d_type; + int over = 0; + u32 di_cur; + u32 di_total; + u32 di_len; + int key_type = BTRFS_DIR_INDEX_KEY; + char tmp_name[32]; + char *name_ptr; + int name_len; + + /* FIXME, use a real flag for deciding about the key type */ + if (root->fs_info->tree_root == root) + key_type = BTRFS_DIR_ITEM_KEY; + + /* special case for "." */ + if (filp->f_pos == 0) { + over = filldir(dirent, ".", 1, + 1, inode->i_ino, + DT_DIR); + if (over) + return 0; + filp->f_pos = 1; + } + /* special case for .., just use the back ref */ + if (filp->f_pos == 1) { + u64 pino = parent_ino(filp->f_path.dentry); + over = filldir(dirent, "..", 2, + 2, pino, DT_DIR); + if (over) + return 0; + filp->f_pos = 2; + } + path = btrfs_alloc_path(); + path->reada = 2; + + btrfs_set_key_type(&key, key_type); + key.offset = filp->f_pos; + key.objectid = inode->i_ino; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + advance = 0; + + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + if (advance || slot >= nritems) { + if (slot >= nritems - 1) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + } else { + slot++; + path->slots[0]++; + } + } + + advance = 1; + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid != key.objectid) + break; + if (btrfs_key_type(&found_key) != key_type) + break; + if (found_key.offset < filp->f_pos) + continue; + + filp->f_pos = found_key.offset; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + di_cur = 0; + di_total = btrfs_item_size(leaf, item); + + while (di_cur < di_total) { + struct btrfs_key location; + + name_len = btrfs_dir_name_len(leaf, di); + if (name_len <= sizeof(tmp_name)) { + name_ptr = tmp_name; + } else { + name_ptr = kmalloc(name_len, GFP_NOFS); + if (!name_ptr) { + ret = -ENOMEM; + goto err; + } + } + read_extent_buffer(leaf, name_ptr, + (unsigned long)(di + 1), name_len); + + d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; + btrfs_dir_item_key_to_cpu(leaf, di, &location); + + /* is this a reference to our own snapshot? If so + * skip it + */ + if (location.type == BTRFS_ROOT_ITEM_KEY && + location.objectid == root->root_key.objectid) { + over = 0; + goto skip; + } + over = filldir(dirent, name_ptr, name_len, + found_key.offset, location.objectid, + d_type); + +skip: + if (name_ptr != tmp_name) + kfree(name_ptr); + + if (over) + goto nopos; + di_len = btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di) + sizeof(*di); + di_cur += di_len; + di = (struct btrfs_dir_item *)((char *)di + di_len); + } + } + + /* Reached end of directory/root. Bump pos past the last item. */ + if (key_type == BTRFS_DIR_INDEX_KEY) + filp->f_pos = INT_LIMIT(typeof(filp->f_pos)); + else + filp->f_pos++; +nopos: + ret = 0; +err: + btrfs_free_path(path); + return ret; +} + +int btrfs_write_inode(struct inode *inode, int wait) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (root->fs_info->btree_inode == inode) + return 0; + + if (wait) { + trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + ret = btrfs_commit_transaction(trans, root); + } + return ret; +} + +/* + * This is somewhat expensive, updating the tree every time the + * inode changes. But, it is most likely to find the inode in cache. + * FIXME, needs more benchmarking...there are no reasons other than performance + * to keep or drop this code. + */ +void btrfs_dirty_inode(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + + trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + btrfs_update_inode(trans, root, inode); + btrfs_end_transaction(trans, root); +} + +/* + * find the highest existing sequence number in a directory + * and then set the in-memory index_cnt variable to reflect + * free sequence numbers + */ +static int btrfs_set_inode_index_count(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key key, found_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + key.objectid = inode->i_ino; + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + /* FIXME: we should be able to handle this */ + if (ret == 0) + goto out; + ret = 0; + + /* + * MAGIC NUMBER EXPLANATION: + * since we search a directory based on f_pos we have to start at 2 + * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody + * else has to start at 2 + */ + if (path->slots[0] == 0) { + BTRFS_I(inode)->index_cnt = 2; + goto out; + } + + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != inode->i_ino || + btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { + BTRFS_I(inode)->index_cnt = 2; + goto out; + } + + BTRFS_I(inode)->index_cnt = found_key.offset + 1; +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to find a free sequence number in a given directory. This current + * code is very simple, later versions will do smarter things in the btree + */ +int btrfs_set_inode_index(struct inode *dir, u64 *index) +{ + int ret = 0; + + if (BTRFS_I(dir)->index_cnt == (u64)-1) { + ret = btrfs_set_inode_index_count(dir); + if (ret) + return ret; + } + + *index = BTRFS_I(dir)->index_cnt; + BTRFS_I(dir)->index_cnt++; + + return ret; +} + +static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, + const char *name, int name_len, + u64 ref_objectid, u64 objectid, + u64 alloc_hint, int mode, u64 *index) +{ + struct inode *inode; + struct btrfs_inode_item *inode_item; + struct btrfs_key *location; + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + struct btrfs_key key[2]; + u32 sizes[2]; + unsigned long ptr; + int ret; + int owner; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + inode = new_inode(root->fs_info->sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (dir) { + ret = btrfs_set_inode_index(dir, index); + if (ret) + return ERR_PTR(ret); + } + /* + * index_cnt is ignored for everything but a dir, + * btrfs_get_inode_index_count has an explanation for the magic + * number + */ + init_btrfs_i(inode); + BTRFS_I(inode)->index_cnt = 2; + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->generation = trans->transid; + + if (mode & S_IFDIR) + owner = 0; + else + owner = 1; + BTRFS_I(inode)->block_group = + btrfs_find_block_group(root, 0, alloc_hint, owner); + if ((mode & S_IFREG)) { + if (btrfs_test_opt(root, NODATASUM)) + btrfs_set_flag(inode, NODATASUM); + if (btrfs_test_opt(root, NODATACOW)) + btrfs_set_flag(inode, NODATACOW); + } + + key[0].objectid = objectid; + btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); + key[0].offset = 0; + + key[1].objectid = objectid; + btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); + key[1].offset = ref_objectid; + + sizes[0] = sizeof(struct btrfs_inode_item); + sizes[1] = name_len + sizeof(*ref); + + ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); + if (ret != 0) + goto fail; + + if (objectid > root->highest_inode) + root->highest_inode = objectid; + + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_mode = mode; + inode->i_ino = objectid; + inode_set_bytes(inode, 0); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + fill_inode_item(trans, path->nodes[0], inode_item, inode); + + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, *index); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(path->nodes[0], name, ptr, name_len); + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + + location = &BTRFS_I(inode)->location; + location->objectid = objectid; + location->offset = 0; + btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + + insert_inode_hash(inode); + return inode; +fail: + if (dir) + BTRFS_I(dir)->index_cnt--; + btrfs_free_path(path); + return ERR_PTR(ret); +} + +static inline u8 btrfs_inode_type(struct inode *inode) +{ + return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; +} + +/* + * utility function to add 'inode' into 'parent_inode' with + * a give name and a given sequence number. + * if 'add_backref' is true, also insert a backref from the + * inode to the parent directory. + */ +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index) +{ + int ret; + struct btrfs_key key; + struct btrfs_root *root = BTRFS_I(parent_inode)->root; + + key.objectid = inode->i_ino; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + ret = btrfs_insert_dir_item(trans, root, name, name_len, + parent_inode->i_ino, + &key, btrfs_inode_type(inode), + index); + if (ret == 0) { + if (add_backref) { + ret = btrfs_insert_inode_ref(trans, root, + name, name_len, + inode->i_ino, + parent_inode->i_ino, + index); + } + btrfs_i_size_write(parent_inode, parent_inode->i_size + + name_len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, parent_inode); + } + return ret; +} + +static int btrfs_add_nondir(struct btrfs_trans_handle *trans, + struct dentry *dentry, struct inode *inode, + int backref, u64 index) +{ + int err = btrfs_add_link(trans, dentry->d_parent->d_inode, + inode, dentry->d_name.name, + dentry->d_name.len, backref, index); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + if (err > 0) + err = -EEXIST; + return err; +} + +static int btrfs_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + u64 objectid; + unsigned long nr = 0; + u64 index = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto fail; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, objectid, + BTRFS_I(dir)->block_group, mode, &index); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_unlock; + + err = btrfs_init_acl(inode, dir); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + btrfs_set_trans_block_group(trans, inode); + err = btrfs_add_nondir(trans, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + btrfs_update_inode(trans, root, inode); + } + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_create(struct inode *dir, struct dentry *dentry, + int mode, struct nameidata *nd) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + unsigned long nr = 0; + u64 objectid; + u64 index = 0; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto fail; + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, + objectid, BTRFS_I(dir)->block_group, mode, + &index); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_unlock; + + err = btrfs_init_acl(inode, dir); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + btrfs_set_trans_block_group(trans, inode); + err = btrfs_add_nondir(trans, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + } + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = old_dentry->d_inode; + u64 index; + unsigned long nr = 0; + int err; + int drop_inode = 0; + + if (inode->i_nlink == 0) + return -ENOENT; + + btrfs_inc_nlink(inode); + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto fail; + err = btrfs_set_inode_index(dir, &index); + if (err) + goto fail; + + trans = btrfs_start_transaction(root, 1); + + btrfs_set_trans_block_group(trans, dir); + atomic_inc(&inode->i_count); + + err = btrfs_add_nondir(trans, dentry, inode, 1, index); + + if (err) + drop_inode = 1; + + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, dir); + err = btrfs_update_inode(trans, root, inode); + + if (err) + drop_inode = 1; + + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + int err = 0; + int drop_on_err = 0; + u64 objectid = 0; + u64 index = 0; + unsigned long nr = 1; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto out_unlock; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_unlock; + } + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, objectid, + BTRFS_I(dir)->block_group, S_IFDIR | mode, + &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_fail; + } + + drop_on_err = 1; + + err = btrfs_init_acl(inode, dir); + if (err) + goto out_fail; + + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + btrfs_set_trans_block_group(trans, inode); + + btrfs_i_size_write(inode, 0); + err = btrfs_update_inode(trans, root, inode); + if (err) + goto out_fail; + + err = btrfs_add_link(trans, dentry->d_parent->d_inode, + inode, dentry->d_name.name, + dentry->d_name.len, 0, index); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); + drop_on_err = 0; + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); + +out_fail: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + +out_unlock: + if (drop_on_err) + iput(inode); + btrfs_btree_balance_dirty(root, nr); + return err; +} + +/* helper for btfs_get_extent. Given an existing extent in the tree, + * and an extent that you want to insert, deal with overlap and insert + * the new extent into the tree. + */ +static int merge_extent_mapping(struct extent_map_tree *em_tree, + struct extent_map *existing, + struct extent_map *em, + u64 map_start, u64 map_len) +{ + u64 start_diff; + + BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); + start_diff = map_start - em->start; + em->start = map_start; + em->len = map_len; + if (em->block_start < EXTENT_MAP_LAST_BYTE && + !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + em->block_start += start_diff; + em->block_len -= start_diff; + } + return add_extent_mapping(em_tree, em); +} + +static noinline int uncompress_inline(struct btrfs_path *path, + struct inode *inode, struct page *page, + size_t pg_offset, u64 extent_offset, + struct btrfs_file_extent_item *item) +{ + int ret; + struct extent_buffer *leaf = path->nodes[0]; + char *tmp; + size_t max_size; + unsigned long inline_size; + unsigned long ptr; + + WARN_ON(pg_offset != 0); + max_size = btrfs_file_extent_ram_bytes(leaf, item); + inline_size = btrfs_file_extent_inline_item_len(leaf, + btrfs_item_nr(leaf, path->slots[0])); + tmp = kmalloc(inline_size, GFP_NOFS); + ptr = btrfs_file_extent_inline_start(item); + + read_extent_buffer(leaf, tmp, ptr, inline_size); + + max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); + ret = btrfs_zlib_decompress(tmp, page, extent_offset, + inline_size, max_size); + if (ret) { + char *kaddr = kmap_atomic(page, KM_USER0); + unsigned long copy_size = min_t(u64, + PAGE_CACHE_SIZE - pg_offset, + max_size - extent_offset); + memset(kaddr + pg_offset, 0, copy_size); + kunmap_atomic(kaddr, KM_USER0); + } + kfree(tmp); + return 0; +} + +/* + * a bit scary, this does extent mapping from logical file offset to the disk. + * the ugly parts come from merging extents from the disk with the in-ram + * representation. This gets more complex because of the data=ordered code, + * where the in-ram extents might be locked pending data=ordered completion. + * + * This also copies inline extents directly into the page. + */ + +struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) +{ + int ret; + int err = 0; + u64 bytenr; + u64 extent_start = 0; + u64 extent_end = 0; + u64 objectid = inode->i_ino; + u32 found_type; + struct btrfs_path *path = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *item; + struct extent_buffer *leaf; + struct btrfs_key found_key; + struct extent_map *em = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_trans_handle *trans = NULL; + int compressed; + +again: + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) + em->bdev = root->fs_info->fs_devices->latest_bdev; + spin_unlock(&em_tree->lock); + + if (em) { + if (em->start > start || em->start + em->len <= start) + free_extent_map(em); + else if (em->block_start == EXTENT_MAP_INLINE && page) + free_extent_map(em); + else + goto out; + } + em = alloc_extent_map(GFP_NOFS); + if (!em) { + err = -ENOMEM; + goto out; + } + em->bdev = root->fs_info->fs_devices->latest_bdev; + em->start = EXTENT_MAP_HOLE; + em->orig_start = EXTENT_MAP_HOLE; + em->len = (u64)-1; + em->block_len = (u64)-1; + + if (!path) { + path = btrfs_alloc_path(); + BUG_ON(!path); + } + + ret = btrfs_lookup_file_extent(trans, root, path, + objectid, start, trans != NULL); + if (ret < 0) { + err = ret; + goto out; + } + + if (ret != 0) { + if (path->slots[0] == 0) + goto not_found; + path->slots[0]--; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + /* are we inside the extent that was found? */ + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + if (found_key.objectid != objectid || + found_type != BTRFS_EXTENT_DATA_KEY) { + goto not_found; + } + + found_type = btrfs_file_extent_type(leaf, item); + extent_start = found_key.offset; + compressed = btrfs_file_extent_compression(leaf, item); + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = extent_start + + btrfs_file_extent_num_bytes(leaf, item); + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size_t size; + size = btrfs_file_extent_inline_len(leaf, item); + extent_end = (extent_start + size + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + } + + if (start >= extent_end) { + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + goto not_found; + leaf = path->nodes[0]; + } + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != objectid || + found_key.type != BTRFS_EXTENT_DATA_KEY) + goto not_found; + if (start + len <= found_key.offset) + goto not_found; + em->start = start; + em->len = found_key.offset - start; + goto not_found_em; + } + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + em->start = extent_start; + em->len = extent_end - extent_start; + em->orig_start = extent_start - + btrfs_file_extent_offset(leaf, item); + bytenr = btrfs_file_extent_disk_bytenr(leaf, item); + if (bytenr == 0) { + em->block_start = EXTENT_MAP_HOLE; + goto insert; + } + if (compressed) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->block_start = bytenr; + em->block_len = btrfs_file_extent_disk_num_bytes(leaf, + item); + } else { + bytenr += btrfs_file_extent_offset(leaf, item); + em->block_start = bytenr; + em->block_len = em->len; + if (found_type == BTRFS_FILE_EXTENT_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } + goto insert; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + unsigned long ptr; + char *map; + size_t size; + size_t extent_offset; + size_t copy_size; + + em->block_start = EXTENT_MAP_INLINE; + if (!page || create) { + em->start = extent_start; + em->len = extent_end - extent_start; + goto out; + } + + size = btrfs_file_extent_inline_len(leaf, item); + extent_offset = page_offset(page) + pg_offset - extent_start; + copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, + size - extent_offset); + em->start = extent_start + extent_offset; + em->len = (copy_size + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + em->orig_start = EXTENT_MAP_INLINE; + if (compressed) + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + ptr = btrfs_file_extent_inline_start(item) + extent_offset; + if (create == 0 && !PageUptodate(page)) { + if (btrfs_file_extent_compression(leaf, item) == + BTRFS_COMPRESS_ZLIB) { + ret = uncompress_inline(path, inode, page, + pg_offset, + extent_offset, item); + BUG_ON(ret); + } else { + map = kmap(page); + read_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); + kunmap(page); + } + flush_dcache_page(page); + } else if (create && PageUptodate(page)) { + if (!trans) { + kunmap(page); + free_extent_map(em); + em = NULL; + btrfs_release_path(root, path); + trans = btrfs_join_transaction(root, 1); + goto again; + } + map = kmap(page); + write_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); + kunmap(page); + btrfs_mark_buffer_dirty(leaf); + } + set_extent_uptodate(io_tree, em->start, + extent_map_end(em) - 1, GFP_NOFS); + goto insert; + } else { + printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); + WARN_ON(1); + } +not_found: + em->start = start; + em->len = len; +not_found_em: + em->block_start = EXTENT_MAP_HOLE; + set_bit(EXTENT_FLAG_VACANCY, &em->flags); +insert: + btrfs_release_path(root, path); + if (em->start > start || extent_map_end(em) <= start) { + printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " + "[%llu %llu]\n", (unsigned long long)em->start, + (unsigned long long)em->len, + (unsigned long long)start, + (unsigned long long)len); + err = -EIO; + goto out; + } + + err = 0; + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + /* it is possible that someone inserted the extent into the tree + * while we had the lock dropped. It is also possible that + * an overlapping map exists in the tree + */ + if (ret == -EEXIST) { + struct extent_map *existing; + + ret = 0; + + existing = lookup_extent_mapping(em_tree, start, len); + if (existing && (existing->start > start || + existing->start + existing->len <= start)) { + free_extent_map(existing); + existing = NULL; + } + if (!existing) { + existing = lookup_extent_mapping(em_tree, em->start, + em->len); + if (existing) { + err = merge_extent_mapping(em_tree, existing, + em, start, + root->sectorsize); + free_extent_map(existing); + if (err) { + free_extent_map(em); + em = NULL; + } + } else { + err = -EIO; + free_extent_map(em); + em = NULL; + } + } else { + free_extent_map(em); + em = existing; + err = 0; + } + } + spin_unlock(&em_tree->lock); +out: + if (path) + btrfs_free_path(path); + if (trans) { + ret = btrfs_end_transaction(trans, root); + if (!err) + err = ret; + } + if (err) { + free_extent_map(em); + WARN_ON(1); + return ERR_PTR(err); + } + return em; +} + +static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + return -EINVAL; +} + +static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) +{ + return extent_bmap(mapping, iblock, btrfs_get_extent); +} + +int btrfs_readpage(struct file *file, struct page *page) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_read_full_page(tree, page, btrfs_get_extent); +} + +static int btrfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + + + if (current->flags & PF_MEMALLOC) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_write_full_page(tree, page, btrfs_get_extent, wbc); +} + +int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + + tree = &BTRFS_I(mapping->host)->io_tree; + return extent_writepages(tree, mapping, btrfs_get_extent, wbc); +} + +static int +btrfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; + return extent_readpages(tree, mapping, pages, nr_pages, + btrfs_get_extent); +} +static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *map; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + ret = try_release_extent_mapping(map, tree, page, gfp_flags); + if (ret == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } + return ret; +} + +static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) +{ + if (PageWriteback(page) || PageDirty(page)) + return 0; + return __btrfs_releasepage(page, gfp_flags); +} + +static void btrfs_invalidatepage(struct page *page, unsigned long offset) +{ + struct extent_io_tree *tree; + struct btrfs_ordered_extent *ordered; + u64 page_start = page_offset(page); + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + + wait_on_page_writeback(page); + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (offset) { + btrfs_releasepage(page, GFP_NOFS); + return; + } + + lock_extent(tree, page_start, page_end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(page->mapping->host, + page_offset(page)); + if (ordered) { + /* + * IO on this page will never be started, so we need + * to account for any ordered extents now + */ + clear_extent_bit(tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_LOCKED, 1, 0, GFP_NOFS); + btrfs_finish_ordered_io(page->mapping->host, + page_start, page_end); + btrfs_put_ordered_extent(ordered); + lock_extent(tree, page_start, page_end, GFP_NOFS); + } + clear_extent_bit(tree, page_start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_ORDERED, + 1, 1, GFP_NOFS); + __btrfs_releasepage(page, GFP_NOFS); + + ClearPageChecked(page); + if (PagePrivate(page)) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } +} + +/* + * btrfs_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. + * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF. Because + * vmtruncate() writes the inode size before removing pages, once we have the + * page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + */ +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + struct inode *inode = fdentry(vma->vm_file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + char *kaddr; + unsigned long zero_start; + loff_t size; + int ret; + u64 page_start; + u64 page_end; + + ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0); + if (ret) + goto out; + + ret = -EINVAL; +again: + lock_page(page); + size = i_size_read(inode); + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + if ((page->mapping != inode->i_mapping) || + (page_start >= size)) { + /* page got truncated out from underneath us */ + goto out_unlock; + } + wait_on_page_writeback(page); + + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_extent_mapped(page); + + /* + * we can't set the delalloc bits if there are pending ordered + * extents. Drop our locks and wait for them to finish + */ + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + + btrfs_set_extent_delalloc(inode, page_start, page_end); + ret = 0; + + /* page is wholly or partially inside EOF */ + if (page_start + PAGE_CACHE_SIZE > size) + zero_start = size & ~PAGE_CACHE_MASK; + else + zero_start = PAGE_CACHE_SIZE; + + if (zero_start != PAGE_CACHE_SIZE) { + kaddr = kmap(page); + memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); + flush_dcache_page(page); + kunmap(page); + } + ClearPageChecked(page); + set_page_dirty(page); + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + +out_unlock: + unlock_page(page); +out: + return ret; +} + +static void btrfs_truncate(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + struct btrfs_trans_handle *trans; + unsigned long nr; + u64 mask = root->sectorsize - 1; + + if (!S_ISREG(inode->i_mode)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + + btrfs_truncate_page(inode->i_mapping, inode->i_size); + btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + btrfs_i_size_write(inode, inode->i_size); + + ret = btrfs_orphan_add(trans, inode); + if (ret) + goto out; + /* FIXME, add redo link to tree so we don't leak on crash */ + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, + BTRFS_EXTENT_DATA_KEY); + btrfs_update_inode(trans, root, inode); + + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + +out: + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + BUG_ON(ret); + btrfs_btree_balance_dirty(root, nr); +} + +/* + * create a new subvolume directory/inode (helper for the ioctl). + */ +int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + struct btrfs_root *new_root, struct dentry *dentry, + u64 new_dirid, u64 alloc_hint) +{ + struct inode *inode; + int error; + u64 index = 0; + + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, + new_dirid, alloc_hint, S_IFDIR | 0700, &index); + if (IS_ERR(inode)) + return PTR_ERR(inode); + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + + inode->i_nlink = 1; + btrfs_i_size_write(inode, 0); + + error = btrfs_update_inode(trans, new_root, inode); + if (error) + return error; + + d_instantiate(dentry, inode); + return 0; +} + +/* helper function for file defrag and space balancing. This + * forces readahead on a given range of bytes in an inode + */ +unsigned long btrfs_force_ra(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + pgoff_t offset, pgoff_t last_index) +{ + pgoff_t req_size = last_index - offset + 1; + + page_cache_sync_readahead(mapping, ra, file, offset, req_size); + return offset + req_size; +} + +struct inode *btrfs_alloc_inode(struct super_block *sb) +{ + struct btrfs_inode *ei; + + ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + ei->last_trans = 0; + ei->logged_trans = 0; + btrfs_ordered_inode_tree_init(&ei->ordered_tree); + ei->i_acl = BTRFS_ACL_NOT_CACHED; + ei->i_default_acl = BTRFS_ACL_NOT_CACHED; + INIT_LIST_HEAD(&ei->i_orphan); + return &ei->vfs_inode; +} + +void btrfs_destroy_inode(struct inode *inode) +{ + struct btrfs_ordered_extent *ordered; + WARN_ON(!list_empty(&inode->i_dentry)); + WARN_ON(inode->i_data.nrpages); + + if (BTRFS_I(inode)->i_acl && + BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED) + posix_acl_release(BTRFS_I(inode)->i_acl); + if (BTRFS_I(inode)->i_default_acl && + BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) + posix_acl_release(BTRFS_I(inode)->i_default_acl); + + spin_lock(&BTRFS_I(inode)->root->list_lock); + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" + " list\n", inode->i_ino); + dump_stack(); + } + spin_unlock(&BTRFS_I(inode)->root->list_lock); + + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); + if (!ordered) + break; + else { + printk(KERN_ERR "btrfs found ordered " + "extent %llu %llu on inode cleanup\n", + (unsigned long long)ordered->file_offset, + (unsigned long long)ordered->len); + btrfs_remove_ordered_extent(inode, ordered); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + } + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} + +static void init_once(void *foo) +{ + struct btrfs_inode *ei = (struct btrfs_inode *) foo; + + inode_init_once(&ei->vfs_inode); +} + +void btrfs_destroy_cachep(void) +{ + if (btrfs_inode_cachep) + kmem_cache_destroy(btrfs_inode_cachep); + if (btrfs_trans_handle_cachep) + kmem_cache_destroy(btrfs_trans_handle_cachep); + if (btrfs_transaction_cachep) + kmem_cache_destroy(btrfs_transaction_cachep); + if (btrfs_bit_radix_cachep) + kmem_cache_destroy(btrfs_bit_radix_cachep); + if (btrfs_path_cachep) + kmem_cache_destroy(btrfs_path_cachep); +} + +struct kmem_cache *btrfs_cache_create(const char *name, size_t size, + unsigned long extra_flags, + void (*ctor)(void *)) +{ + return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD | extra_flags), ctor); +} + +int btrfs_init_cachep(void) +{ + btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache", + sizeof(struct btrfs_inode), + 0, init_once); + if (!btrfs_inode_cachep) + goto fail; + btrfs_trans_handle_cachep = + btrfs_cache_create("btrfs_trans_handle_cache", + sizeof(struct btrfs_trans_handle), + 0, NULL); + if (!btrfs_trans_handle_cachep) + goto fail; + btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache", + sizeof(struct btrfs_transaction), + 0, NULL); + if (!btrfs_transaction_cachep) + goto fail; + btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache", + sizeof(struct btrfs_path), + 0, NULL); + if (!btrfs_path_cachep) + goto fail; + btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256, + SLAB_DESTROY_BY_RCU, NULL); + if (!btrfs_bit_radix_cachep) + goto fail; + return 0; +fail: + btrfs_destroy_cachep(); + return -ENOMEM; +} + +static int btrfs_getattr(struct vfsmount *mnt, + struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + generic_fillattr(inode, stat); + stat->dev = BTRFS_I(inode)->root->anon_super.s_dev; + stat->blksize = PAGE_CACHE_SIZE; + stat->blocks = (inode_get_bytes(inode) + + BTRFS_I(inode)->delalloc_bytes) >> 9; + return 0; +} + +static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(old_dir)->root; + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct timespec ctime = CURRENT_TIME; + u64 index = 0; + int ret; + + /* we're not allowed to rename between subvolumes */ + if (BTRFS_I(old_inode)->root->root_key.objectid != + BTRFS_I(new_dir)->root->root_key.objectid) + return -EXDEV; + + if (S_ISDIR(old_inode->i_mode) && new_inode && + new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) { + return -ENOTEMPTY; + } + + /* to rename a snapshot or subvolume, we need to juggle the + * backrefs. This isn't coded yet + */ + if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + return -EXDEV; + + ret = btrfs_check_free_space(root, 1, 0); + if (ret) + goto out_unlock; + + trans = btrfs_start_transaction(root, 1); + + btrfs_set_trans_block_group(trans, new_dir); + + btrfs_inc_nlink(old_dentry->d_inode); + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; + + ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + if (ret) + goto out_fail; + + if (new_inode) { + new_inode->i_ctime = CURRENT_TIME; + ret = btrfs_unlink_inode(trans, root, new_dir, + new_dentry->d_inode, + new_dentry->d_name.name, + new_dentry->d_name.len); + if (ret) + goto out_fail; + if (new_inode->i_nlink == 0) { + ret = btrfs_orphan_add(trans, new_dentry->d_inode); + if (ret) + goto out_fail; + } + + } + ret = btrfs_set_inode_index(new_dir, &index); + if (ret) + goto out_fail; + + ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode, + old_inode, new_dentry->d_name.name, + new_dentry->d_name.len, 1, index); + if (ret) + goto out_fail; + +out_fail: + btrfs_end_transaction_throttle(trans, root); +out_unlock: + return ret; +} + +/* + * some fairly slow code that needs optimization. This walks the list + * of all the inodes with pending delalloc and forces them to disk. + */ +int btrfs_start_delalloc_inodes(struct btrfs_root *root) +{ + struct list_head *head = &root->fs_info->delalloc_inodes; + struct btrfs_inode *binode; + struct inode *inode; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + spin_lock(&root->fs_info->delalloc_lock); + while (!list_empty(head)) { + binode = list_entry(head->next, struct btrfs_inode, + delalloc_inodes); + inode = igrab(&binode->vfs_inode); + if (!inode) + list_del_init(&binode->delalloc_inodes); + spin_unlock(&root->fs_info->delalloc_lock); + if (inode) { + filemap_flush(inode->i_mapping); + iput(inode); + } + cond_resched(); + spin_lock(&root->fs_info->delalloc_lock); + } + spin_unlock(&root->fs_info->delalloc_lock); + + /* the filemap_flush will queue IO into the worker threads, but + * we have to make sure the IO is actually started and that + * ordered extents get created before we return + */ + atomic_inc(&root->fs_info->async_submit_draining); + while (atomic_read(&root->fs_info->nr_async_submits) || + atomic_read(&root->fs_info->async_delalloc_pages)) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->nr_async_submits) == 0 && + atomic_read(&root->fs_info->async_delalloc_pages) == 0)); + } + atomic_dec(&root->fs_info->async_submit_draining); + return 0; +} + +static int btrfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct btrfs_key key; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + u64 objectid; + u64 index = 0 ; + int name_len; + int datasize; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + struct extent_buffer *leaf; + unsigned long nr = 0; + + name_len = strlen(symname) + 1; + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) + return -ENAMETOOLONG; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto out_fail; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, objectid, + BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, + &index); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_unlock; + + err = btrfs_init_acl(inode, dir); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + btrfs_set_trans_block_group(trans, inode); + err = btrfs_add_nondir(trans, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + } + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); + if (drop_inode) + goto out_unlock; + + path = btrfs_alloc_path(); + BUG_ON(!path); + key.objectid = inode->i_ino; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + datasize = btrfs_file_extent_calc_inline_size(name_len); + err = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + if (err) { + drop_inode = 1; + goto out_unlock; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, + BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_compression(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); + + ptr = btrfs_file_extent_inline_start(ei); + write_extent_buffer(leaf, symname, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + inode->i_op = &btrfs_symlink_inode_operations; + inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode_set_bytes(inode, name_len); + btrfs_i_size_write(inode, name_len - 1); + err = btrfs_update_inode(trans, root, inode); + if (err) + drop_inode = 1; + +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +out_fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int prealloc_file_range(struct inode *inode, u64 start, u64 end, + u64 alloc_hint, int mode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key ins; + u64 alloc_size; + u64 cur_offset = start; + u64 num_bytes = end - start; + int ret = 0; + + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + btrfs_set_trans_block_group(trans, inode); + + while (num_bytes > 0) { + alloc_size = min(num_bytes, root->fs_info->max_extent); + ret = btrfs_reserve_extent(trans, root, alloc_size, + root->sectorsize, 0, alloc_hint, + (u64)-1, &ins, 1); + if (ret) { + WARN_ON(1); + goto out; + } + ret = insert_reserved_file_extent(trans, inode, + cur_offset, ins.objectid, + ins.offset, ins.offset, + ins.offset, 0, 0, 0, + BTRFS_FILE_EXTENT_PREALLOC); + BUG_ON(ret); + num_bytes -= ins.offset; + cur_offset += ins.offset; + alloc_hint = ins.objectid + ins.offset; + } +out: + if (cur_offset > start) { + inode->i_ctime = CURRENT_TIME; + btrfs_set_flag(inode, PREALLOC); + if (!(mode & FALLOC_FL_KEEP_SIZE) && + cur_offset > i_size_read(inode)) + btrfs_i_size_write(inode, cur_offset); + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + } + + btrfs_end_transaction(trans, root); + return ret; +} + +static long btrfs_fallocate(struct inode *inode, int mode, + loff_t offset, loff_t len) +{ + u64 cur_offset; + u64 last_byte; + u64 alloc_start; + u64 alloc_end; + u64 alloc_hint = 0; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + struct extent_map *em; + int ret; + + alloc_start = offset & ~mask; + alloc_end = (offset + len + mask) & ~mask; + + mutex_lock(&inode->i_mutex); + if (alloc_start > inode->i_size) { + ret = btrfs_cont_expand(inode, alloc_start); + if (ret) + goto out; + } + + while (1) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, + alloc_end - 1, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + alloc_end - 1); + if (ordered && + ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset < alloc_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent(&BTRFS_I(inode)->io_tree, + alloc_start, alloc_end - 1, GFP_NOFS); + btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + } else { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + } + + cur_offset = alloc_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + alloc_end - cur_offset, 0); + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), alloc_end); + last_byte = (last_byte + mask) & ~mask; + if (em->block_start == EXTENT_MAP_HOLE) { + ret = prealloc_file_range(inode, cur_offset, + last_byte, alloc_hint, mode); + if (ret < 0) { + free_extent_map(em); + break; + } + } + if (em->block_start <= EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + free_extent_map(em); + + cur_offset = last_byte; + if (cur_offset >= alloc_end) { + ret = 0; + break; + } + } + unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, + GFP_NOFS); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +static int btrfs_set_page_dirty(struct page *page) +{ + return __set_page_dirty_nobuffers(page); +} + +static int btrfs_permission(struct inode *inode, int mask) +{ + if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE)) + return -EACCES; + return generic_permission(inode, mask, btrfs_check_acl); +} + +static struct inode_operations btrfs_dir_inode_operations = { + .getattr = btrfs_getattr, + .lookup = btrfs_lookup, + .create = btrfs_create, + .unlink = btrfs_unlink, + .link = btrfs_link, + .mkdir = btrfs_mkdir, + .rmdir = btrfs_rmdir, + .rename = btrfs_rename, + .symlink = btrfs_symlink, + .setattr = btrfs_setattr, + .mknod = btrfs_mknod, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .permission = btrfs_permission, +}; +static struct inode_operations btrfs_dir_ro_inode_operations = { + .lookup = btrfs_lookup, + .permission = btrfs_permission, +}; +static struct file_operations btrfs_dir_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = btrfs_real_readdir, + .unlocked_ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_ioctl, +#endif + .release = btrfs_release_file, + .fsync = btrfs_sync_file, +}; + +static struct extent_io_ops btrfs_extent_io_ops = { + .fill_delalloc = run_delalloc_range, + .submit_bio_hook = btrfs_submit_bio_hook, + .merge_bio_hook = btrfs_merge_bio_hook, + .readpage_end_io_hook = btrfs_readpage_end_io_hook, + .writepage_end_io_hook = btrfs_writepage_end_io_hook, + .writepage_start_hook = btrfs_writepage_start_hook, + .readpage_io_failed_hook = btrfs_io_failed_hook, + .set_bit_hook = btrfs_set_bit_hook, + .clear_bit_hook = btrfs_clear_bit_hook, +}; + +static struct address_space_operations btrfs_aops = { + .readpage = btrfs_readpage, + .writepage = btrfs_writepage, + .writepages = btrfs_writepages, + .readpages = btrfs_readpages, + .sync_page = block_sync_page, + .bmap = btrfs_bmap, + .direct_IO = btrfs_direct_IO, + .invalidatepage = btrfs_invalidatepage, + .releasepage = btrfs_releasepage, + .set_page_dirty = btrfs_set_page_dirty, +}; + +static struct address_space_operations btrfs_symlink_aops = { + .readpage = btrfs_readpage, + .writepage = btrfs_writepage, + .invalidatepage = btrfs_invalidatepage, + .releasepage = btrfs_releasepage, +}; + +static struct inode_operations btrfs_file_inode_operations = { + .truncate = btrfs_truncate, + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .permission = btrfs_permission, + .fallocate = btrfs_fallocate, +}; +static struct inode_operations btrfs_special_inode_operations = { + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .permission = btrfs_permission, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, +}; +static struct inode_operations btrfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .permission = btrfs_permission, +}; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c new file mode 100644 index 00000000000..c2aa33e3feb --- /dev/null +++ b/fs/btrfs/ioctl.c @@ -0,0 +1,1132 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/fsnotify.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mount.h> +#include <linux/mpage.h> +#include <linux/namei.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/bit_spinlock.h> +#include <linux/security.h> +#include <linux/version.h> +#include <linux/xattr.h> +#include <linux/vmalloc.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "volumes.h" +#include "locking.h" + + + +static noinline int create_subvol(struct btrfs_root *root, + struct dentry *dentry, + char *name, int namelen) +{ + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + struct btrfs_root *new_root = root; + struct inode *dir; + int ret; + int err; + u64 objectid; + u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; + u64 index = 0; + unsigned long nr = 1; + + ret = btrfs_check_free_space(root, 1, 0); + if (ret) + goto fail_commit; + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root, + 0, &objectid); + if (ret) + goto fail; + + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + objectid, trans->transid, 0, 0, 0); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + goto fail; + } + + btrfs_set_header_nritems(leaf, 0); + btrfs_set_header_level(leaf, 0); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_owner(leaf, objectid); + + write_extent_buffer(leaf, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(leaf), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + inode_item = &root_item.inode; + memset(inode_item, 0, sizeof(*inode_item)); + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nbytes = cpu_to_le64(root->leafsize); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + btrfs_set_root_bytenr(&root_item, leaf->start); + btrfs_set_root_generation(&root_item, trans->transid); + btrfs_set_root_level(&root_item, 0); + btrfs_set_root_refs(&root_item, 1); + btrfs_set_root_used(&root_item, 0); + btrfs_set_root_last_snapshot(&root_item, 0); + + memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); + root_item.drop_level = 0; + + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + leaf = NULL; + + btrfs_set_root_dirid(&root_item, new_dirid); + + key.objectid = objectid; + key.offset = 1; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, + &root_item); + if (ret) + goto fail; + + /* + * insert the directory item + */ + key.offset = (u64)-1; + dir = dentry->d_parent->d_inode; + ret = btrfs_set_inode_index(dir, &index); + BUG_ON(ret); + + ret = btrfs_insert_dir_item(trans, root, + name, namelen, dir->i_ino, &key, + BTRFS_FT_DIR, index); + if (ret) + goto fail; + + btrfs_i_size_write(dir, dir->i_size + namelen * 2); + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + + /* add the backref first */ + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + objectid, BTRFS_ROOT_BACKREF_KEY, + root->root_key.objectid, + dir->i_ino, index, name, namelen); + + BUG_ON(ret); + + /* now add the forward ref */ + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + root->root_key.objectid, BTRFS_ROOT_REF_KEY, + objectid, + dir->i_ino, index, name, namelen); + + BUG_ON(ret); + + ret = btrfs_commit_transaction(trans, root); + if (ret) + goto fail_commit; + + new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); + BUG_ON(!new_root); + + trans = btrfs_start_transaction(new_root, 1); + BUG_ON(!trans); + + ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid, + BTRFS_I(dir)->block_group); + if (ret) + goto fail; + +fail: + nr = trans->blocks_used; + err = btrfs_commit_transaction(trans, new_root); + if (err && !ret) + ret = err; +fail_commit: + btrfs_btree_balance_dirty(root, nr); + return ret; +} + +static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, + char *name, int namelen) +{ + struct btrfs_pending_snapshot *pending_snapshot; + struct btrfs_trans_handle *trans; + int ret = 0; + int err; + unsigned long nr = 0; + + if (!root->ref_cows) + return -EINVAL; + + ret = btrfs_check_free_space(root, 1, 0); + if (ret) + goto fail_unlock; + + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) { + ret = -ENOMEM; + goto fail_unlock; + } + pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); + if (!pending_snapshot->name) { + ret = -ENOMEM; + kfree(pending_snapshot); + goto fail_unlock; + } + memcpy(pending_snapshot->name, name, namelen); + pending_snapshot->name[namelen] = '\0'; + pending_snapshot->dentry = dentry; + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + pending_snapshot->root = root; + list_add(&pending_snapshot->list, + &trans->transaction->pending_snapshots); + err = btrfs_commit_transaction(trans, root); + +fail_unlock: + btrfs_btree_balance_dirty(root, nr); + return ret; +} + +/* copy of may_create in fs/namei.c() */ +static inline int btrfs_may_create(struct inode *dir, struct dentry *child) +{ + if (child->d_inode) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + return inode_permission(dir, MAY_WRITE | MAY_EXEC); +} + +/* + * Create a new subvolume below @parent. This is largely modeled after + * sys_mkdirat and vfs_mkdir, but we only do a single component lookup + * inside this filesystem so it's quite a bit simpler. + */ +static noinline int btrfs_mksubvol(struct path *parent, char *name, + int mode, int namelen, + struct btrfs_root *snap_src) +{ + struct dentry *dentry; + int error; + + mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + + dentry = lookup_one_len(name, parent->dentry, namelen); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_unlock; + + error = -EEXIST; + if (dentry->d_inode) + goto out_dput; + + if (!IS_POSIXACL(parent->dentry->d_inode)) + mode &= ~current->fs->umask; + + error = mnt_want_write(parent->mnt); + if (error) + goto out_dput; + + error = btrfs_may_create(parent->dentry->d_inode, dentry); + if (error) + goto out_drop_write; + + /* + * Actually perform the low-level subvolume creation after all + * this VFS fuzz. + * + * Eventually we want to pass in an inode under which we create this + * subvolume, but for now all are under the filesystem root. + * + * Also we should pass on the mode eventually to allow creating new + * subvolume with specific mode bits. + */ + if (snap_src) { + struct dentry *dir = dentry->d_parent; + struct dentry *test = dir->d_parent; + struct btrfs_path *path = btrfs_alloc_path(); + int ret; + u64 test_oid; + u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid; + + test_oid = snap_src->root_key.objectid; + + ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, + path, parent_oid, test_oid); + if (ret == 0) + goto create; + btrfs_release_path(snap_src->fs_info->tree_root, path); + + /* we need to make sure we aren't creating a directory loop + * by taking a snapshot of something that has our current + * subvol in its directory tree. So, this loops through + * the dentries and checks the forward refs for each subvolume + * to see if is references the subvolume where we are + * placing this new snapshot. + */ + while (1) { + if (!test || + dir == snap_src->fs_info->sb->s_root || + test == snap_src->fs_info->sb->s_root || + test->d_inode->i_sb != snap_src->fs_info->sb) { + break; + } + if (S_ISLNK(test->d_inode->i_mode)) { + printk(KERN_INFO "Btrfs symlink in snapshot " + "path, failed\n"); + error = -EMLINK; + btrfs_free_path(path); + goto out_drop_write; + } + test_oid = + BTRFS_I(test->d_inode)->root->root_key.objectid; + ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, + path, test_oid, parent_oid); + if (ret == 0) { + printk(KERN_INFO "Btrfs snapshot creation " + "failed, looping\n"); + error = -EMLINK; + btrfs_free_path(path); + goto out_drop_write; + } + btrfs_release_path(snap_src->fs_info->tree_root, path); + test = test->d_parent; + } +create: + btrfs_free_path(path); + error = create_snapshot(snap_src, dentry, name, namelen); + } else { + error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, + dentry, name, namelen); + } + if (error) + goto out_drop_write; + + fsnotify_mkdir(parent->dentry->d_inode, dentry); +out_drop_write: + mnt_drop_write(parent->mnt); +out_dput: + dput(dentry); +out_unlock: + mutex_unlock(&parent->dentry->d_inode->i_mutex); + return error; +} + + +static int btrfs_defrag_file(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + struct page *page; + unsigned long last_index; + unsigned long ra_pages = root->fs_info->bdi.ra_pages; + unsigned long total_read = 0; + u64 page_start; + u64 page_end; + unsigned long i; + int ret; + + ret = btrfs_check_free_space(root, inode->i_size, 0); + if (ret) + return -ENOSPC; + + mutex_lock(&inode->i_mutex); + last_index = inode->i_size >> PAGE_CACHE_SHIFT; + for (i = 0; i <= last_index; i++) { + if (total_read % ra_pages == 0) { + btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, + min(last_index, i + ra_pages - 1)); + } + total_read++; +again: + page = grab_cache_page(inode->i_mapping, i); + if (!page) + goto out_unlock; + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + goto out_unlock; + } + } + + wait_on_page_writeback(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + set_page_extent_mapped(page); + + /* + * this makes sure page_mkwrite is called on the + * page if it is dirtied again later + */ + clear_page_dirty_for_io(page); + + btrfs_set_extent_delalloc(inode, page_start, page_end); + + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); + } + +out_unlock: + mutex_unlock(&inode->i_mutex); + return 0; +} + +/* + * Called inside transaction, so use GFP_NOFS + */ + +static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) +{ + u64 new_size; + u64 old_size; + u64 devid = 1; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_trans_handle *trans; + struct btrfs_device *device = NULL; + char *sizestr; + char *devstr = NULL; + int ret = 0; + int namelen; + int mod = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + namelen = strlen(vol_args->name); + + mutex_lock(&root->fs_info->volume_mutex); + sizestr = vol_args->name; + devstr = strchr(sizestr, ':'); + if (devstr) { + char *end; + sizestr = devstr + 1; + *devstr = '\0'; + devstr = vol_args->name; + devid = simple_strtoull(devstr, &end, 10); + printk(KERN_INFO "resizing devid %llu\n", devid); + } + device = btrfs_find_device(root, devid, NULL, NULL); + if (!device) { + printk(KERN_INFO "resizer unable to find device %llu\n", devid); + ret = -EINVAL; + goto out_unlock; + } + if (!strcmp(sizestr, "max")) + new_size = device->bdev->bd_inode->i_size; + else { + if (sizestr[0] == '-') { + mod = -1; + sizestr++; + } else if (sizestr[0] == '+') { + mod = 1; + sizestr++; + } + new_size = btrfs_parse_size(sizestr); + if (new_size == 0) { + ret = -EINVAL; + goto out_unlock; + } + } + + old_size = device->total_bytes; + + if (mod < 0) { + if (new_size > old_size) { + ret = -EINVAL; + goto out_unlock; + } + new_size = old_size - new_size; + } else if (mod > 0) { + new_size = old_size + new_size; + } + + if (new_size < 256 * 1024 * 1024) { + ret = -EINVAL; + goto out_unlock; + } + if (new_size > device->bdev->bd_inode->i_size) { + ret = -EFBIG; + goto out_unlock; + } + + do_div(new_size, root->sectorsize); + new_size *= root->sectorsize; + + printk(KERN_INFO "new size for %s is %llu\n", + device->name, (unsigned long long)new_size); + + if (new_size > old_size) { + trans = btrfs_start_transaction(root, 1); + ret = btrfs_grow_device(trans, device, new_size); + btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_shrink_device(device, new_size); + } + +out_unlock: + mutex_unlock(&root->fs_info->volume_mutex); +out: + kfree(vol_args); + return ret; +} + +static noinline int btrfs_ioctl_snap_create(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct file *src_file; + u64 root_dirid; + int namelen; + int ret = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + namelen = strlen(vol_args->name); + if (strchr(vol_args->name, '/')) { + ret = -EINVAL; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + root_dirid = root->fs_info->sb->s_root->d_inode->i_ino, + di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, + path, root_dirid, + vol_args->name, namelen, 0); + btrfs_free_path(path); + + if (di && !IS_ERR(di)) { + ret = -EEXIST; + goto out; + } + + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + if (subvol) { + ret = btrfs_mksubvol(&file->f_path, vol_args->name, + file->f_path.dentry->d_inode->i_mode, + namelen, NULL); + } else { + struct inode *src_inode; + src_file = fget(vol_args->fd); + if (!src_file) { + ret = -EINVAL; + goto out; + } + + src_inode = src_file->f_path.dentry->d_inode; + if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) { + printk(KERN_INFO "btrfs: Snapshot src from " + "another FS\n"); + ret = -EINVAL; + fput(src_file); + goto out; + } + ret = btrfs_mksubvol(&file->f_path, vol_args->name, + file->f_path.dentry->d_inode->i_mode, + namelen, BTRFS_I(src_inode)->root); + fput(src_file); + } + +out: + kfree(vol_args); + return ret; +} + +static int btrfs_ioctl_defrag(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; + + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + btrfs_defrag_root(root, 0); + btrfs_defrag_root(root->fs_info->extent_root, 0); + break; + case S_IFREG: + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EINVAL; + goto out; + } + btrfs_defrag_file(file); + break; + } +out: + mnt_drop_write(file->f_path.mnt); + return ret; +} + +static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_init_new_device(root, vol_args->name); + +out: + kfree(vol_args); + return ret; +} + +static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_rm_device(root, vol_args->name); + +out: + kfree(vol_args); + return ret; +} + +static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct file *src_file; + struct inode *src; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct extent_buffer *leaf; + char *buf; + struct btrfs_key key; + u32 nritems; + int slot; + int ret; + u64 len = olen; + u64 bs = root->fs_info->sb->s_blocksize; + u64 hint_byte; + + /* + * TODO: + * - split compressed inline extents. annoying: we need to + * decompress into destination's address_space (the file offset + * may change, so source mapping won't do), then recompress (or + * otherwise reinsert) a subrange. + * - allow ranges within the same file to be cloned (provided + * they don't overlap)? + */ + + /* the destination must be opened for writing */ + if (!(file->f_mode & FMODE_WRITE)) + return -EINVAL; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; + + src_file = fget(srcfd); + if (!src_file) { + ret = -EBADF; + goto out_drop_write; + } + src = src_file->f_dentry->d_inode; + + ret = -EINVAL; + if (src == inode) + goto out_fput; + + ret = -EISDIR; + if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) + goto out_fput; + + ret = -EXDEV; + if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root) + goto out_fput; + + ret = -ENOMEM; + buf = vmalloc(btrfs_level_size(root, 0)); + if (!buf) + goto out_fput; + + path = btrfs_alloc_path(); + if (!path) { + vfree(buf); + goto out_fput; + } + path->reada = 2; + + if (inode < src) { + mutex_lock(&inode->i_mutex); + mutex_lock(&src->i_mutex); + } else { + mutex_lock(&src->i_mutex); + mutex_lock(&inode->i_mutex); + } + + /* determine range to clone */ + ret = -EINVAL; + if (off >= src->i_size || off + len > src->i_size) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - off; + /* if we extend to eof, continue to block boundary */ + if (off + len == src->i_size) + len = ((src->i_size + bs-1) & ~(bs-1)) + - off; + + /* verify the end result is block aligned */ + if ((off & (bs-1)) || + ((off + len) & (bs-1))) + goto out_unlock; + + /* do any pending delalloc/csum calc on src, one way or + another, and lock file content */ + while (1) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, off+len); + if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) + break; + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + if (ordered) + btrfs_put_ordered_extent(ordered); + btrfs_wait_ordered_range(src, off, off+len); + } + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + /* punch hole in destination first */ + btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte); + + /* clone data */ + key.objectid = src->i_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + + while (1) { + /* + * note the key will change type as we walk through the + * tree. + */ + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + nritems = btrfs_header_nritems(path->nodes[0]); + } + leaf = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || + key.objectid != src->i_ino) + break; + + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + struct btrfs_file_extent_item *extent; + int type; + u32 size; + struct btrfs_key new_key; + u64 disko = 0, diskl = 0; + u64 datao = 0, datal = 0; + u8 comp; + + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + comp = btrfs_file_extent_compression(leaf, extent); + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG) { + disko = btrfs_file_extent_disk_bytenr(leaf, + extent); + diskl = btrfs_file_extent_disk_num_bytes(leaf, + extent); + datao = btrfs_file_extent_offset(leaf, extent); + datal = btrfs_file_extent_num_bytes(leaf, + extent); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + /* take upper bound, may be compressed */ + datal = btrfs_file_extent_ram_bytes(leaf, + extent); + } + btrfs_release_path(root, path); + + if (key.offset + datal < off || + key.offset >= off+len) + goto next; + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.objectid = inode->i_ino; + new_key.offset = key.offset + destoff - off; + + if (type == BTRFS_FILE_EXTENT_REG) { + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + if (ret) + goto out; + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; + } + if (key.offset + datao + datal + key.offset > + off + len) + datal = off + len - key.offset - datao; + /* disko == 0 means it's a hole */ + if (!disko) + datao = 0; + + btrfs_set_file_extent_offset(leaf, extent, + datao); + btrfs_set_file_extent_num_bytes(leaf, extent, + datal); + if (disko) { + inode_add_bytes(inode, datal); + ret = btrfs_inc_extent_ref(trans, root, + disko, diskl, leaf->start, + root->root_key.objectid, + trans->transid, + inode->i_ino); + BUG_ON(ret); + } + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 skip = 0; + u64 trim = 0; + if (off > key.offset) { + skip = off - key.offset; + new_key.offset += skip; + } + + if (key.offset + datal > off+len) + trim = key.offset + datal - (off+len); + + if (comp && (skip || trim)) { + ret = -EINVAL; + goto out; + } + size -= skip + trim; + datal -= skip + trim; + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + if (ret) + goto out; + + if (skip) { + u32 start = + btrfs_file_extent_calc_inline_size(0); + memmove(buf+start, buf+start+skip, + datal); + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + inode_add_bytes(inode, datal); + } + + btrfs_mark_buffer_dirty(leaf); + } + +next: + btrfs_release_path(root, path); + key.offset++; + } + ret = 0; +out: + btrfs_release_path(root, path); + if (ret == 0) { + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (destoff + olen > inode->i_size) + btrfs_i_size_write(inode, destoff + olen); + BTRFS_I(inode)->flags = BTRFS_I(src)->flags; + ret = btrfs_update_inode(trans, root, inode); + } + btrfs_end_transaction(trans, root); + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + if (ret) + vmtruncate(inode, 0); +out_unlock: + mutex_unlock(&src->i_mutex); + mutex_unlock(&inode->i_mutex); + vfree(buf); + btrfs_free_path(path); +out_fput: + fput(src_file); +out_drop_write: + mnt_drop_write(file->f_path.mnt); + return ret; +} + +static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) +{ + struct btrfs_ioctl_clone_range_args args; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + return btrfs_ioctl_clone(file, args.src_fd, args.src_offset, + args.src_length, args.dest_offset); +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. + */ +static long btrfs_ioctl_trans_start(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (file->private_data) { + ret = -EINPROGRESS; + goto out; + } + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + goto out; + + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans++; + mutex_unlock(&root->fs_info->trans_mutex); + + trans = btrfs_start_ioctl_transaction(root, 0); + if (trans) + file->private_data = trans; + else + ret = -ENOMEM; + /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ +out: + return ret; +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. + */ +long btrfs_ioctl_trans_end(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + trans = file->private_data; + if (!trans) { + ret = -EINVAL; + goto out; + } + btrfs_end_transaction(trans, root); + file->private_data = NULL; + + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans--; + mutex_unlock(&root->fs_info->trans_mutex); + + mnt_drop_write(file->f_path.mnt); + +out: + return ret; +} + +long btrfs_ioctl(struct file *file, unsigned int + cmd, unsigned long arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case BTRFS_IOC_SNAP_CREATE: + return btrfs_ioctl_snap_create(file, argp, 0); + case BTRFS_IOC_SUBVOL_CREATE: + return btrfs_ioctl_snap_create(file, argp, 1); + case BTRFS_IOC_DEFRAG: + return btrfs_ioctl_defrag(file); + case BTRFS_IOC_RESIZE: + return btrfs_ioctl_resize(root, argp); + case BTRFS_IOC_ADD_DEV: + return btrfs_ioctl_add_dev(root, argp); + case BTRFS_IOC_RM_DEV: + return btrfs_ioctl_rm_dev(root, argp); + case BTRFS_IOC_BALANCE: + return btrfs_balance(root->fs_info->dev_root); + case BTRFS_IOC_CLONE: + return btrfs_ioctl_clone(file, arg, 0, 0, 0); + case BTRFS_IOC_CLONE_RANGE: + return btrfs_ioctl_clone_range(file, argp); + case BTRFS_IOC_TRANS_START: + return btrfs_ioctl_trans_start(file); + case BTRFS_IOC_TRANS_END: + return btrfs_ioctl_trans_end(file); + case BTRFS_IOC_SYNC: + btrfs_sync_fs(file->f_dentry->d_sb, 1); + return 0; + } + + return -ENOTTY; +} diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h new file mode 100644 index 00000000000..78049ea208d --- /dev/null +++ b/fs/btrfs/ioctl.h @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __IOCTL_ +#define __IOCTL_ +#include <linux/ioctl.h> + +#define BTRFS_IOCTL_MAGIC 0x94 +#define BTRFS_VOL_NAME_MAX 255 +#define BTRFS_PATH_NAME_MAX 3072 + +struct btrfs_ioctl_vol_args { + __s64 fd; + char name[BTRFS_PATH_NAME_MAX + 1]; +}; + +#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ + struct btrfs_ioctl_vol_args) +/* trans start and trans end are dangerous, and only for + * use by applications that know how to avoid the + * resulting deadlocks + */ +#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) +#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) +#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) + +#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) +#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ + struct btrfs_ioctl_vol_args) +struct btrfs_ioctl_clone_range_args { + __s64 src_fd; + __u64 src_offset, src_length; + __u64 dest_offset; +}; + +#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ + struct btrfs_ioctl_clone_range_args) + +#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ + struct btrfs_ioctl_vol_args) + +#endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c new file mode 100644 index 00000000000..39bae7761db --- /dev/null +++ b/fs/btrfs/locking.c @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/gfp.h> +#include <linux/pagemap.h> +#include <linux/spinlock.h> +#include <linux/page-flags.h> +#include <asm/bug.h> +#include "ctree.h" +#include "extent_io.h" +#include "locking.h" + +/* + * locks the per buffer mutex in an extent buffer. This uses adaptive locks + * and the spin is not tuned very extensively. The spinning does make a big + * difference in almost every workload, but spinning for the right amount of + * time needs some help. + * + * In general, we want to spin as long as the lock holder is doing btree + * searches, and we should give up if they are in more expensive code. + */ + +int btrfs_tree_lock(struct extent_buffer *eb) +{ + int i; + + if (mutex_trylock(&eb->mutex)) + return 0; + for (i = 0; i < 512; i++) { + cpu_relax(); + if (mutex_trylock(&eb->mutex)) + return 0; + } + cpu_relax(); + mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb)); + return 0; +} + +int btrfs_try_tree_lock(struct extent_buffer *eb) +{ + return mutex_trylock(&eb->mutex); +} + +int btrfs_tree_unlock(struct extent_buffer *eb) +{ + mutex_unlock(&eb->mutex); + return 0; +} + +int btrfs_tree_locked(struct extent_buffer *eb) +{ + return mutex_is_locked(&eb->mutex); +} + +/* + * btrfs_search_slot uses this to decide if it should drop its locks + * before doing something expensive like allocating free blocks for cow. + */ +int btrfs_path_lock_waiting(struct btrfs_path *path, int level) +{ + int i; + struct extent_buffer *eb; + for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { + eb = path->nodes[i]; + if (!eb) + break; + smp_mb(); + if (!list_empty(&eb->mutex.wait_list)) + return 1; + } + return 0; +} + diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h new file mode 100644 index 00000000000..bc1faef1251 --- /dev/null +++ b/fs/btrfs/locking.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_LOCKING_ +#define __BTRFS_LOCKING_ + +int btrfs_tree_lock(struct extent_buffer *eb); +int btrfs_tree_unlock(struct extent_buffer *eb); +int btrfs_tree_locked(struct extent_buffer *eb); +int btrfs_try_tree_lock(struct extent_buffer *eb); +int btrfs_path_lock_waiting(struct btrfs_path *path, int level); +#endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c new file mode 100644 index 00000000000..a2094017027 --- /dev/null +++ b/fs/btrfs/ordered-data.c @@ -0,0 +1,730 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include "ctree.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "extent_io.h" + +static u64 entry_end(struct btrfs_ordered_extent *entry) +{ + if (entry->file_offset + entry->len < entry->file_offset) + return (u64)-1; + return entry->file_offset + entry->len; +} + +/* returns NULL if the insertion worked, or it returns the node it did find + * in the tree + */ +static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_ordered_extent *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node); + + if (file_offset < entry->file_offset) + p = &(*p)->rb_left; + else if (file_offset >= entry_end(entry)) + p = &(*p)->rb_right; + else + return parent; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * look for a given offset in the tree, and if it can't be found return the + * first lesser offset + */ +static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, + struct rb_node **prev_ret) +{ + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *test; + struct btrfs_ordered_extent *entry; + struct btrfs_ordered_extent *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_ordered_extent, rb_node); + prev = n; + prev_entry = entry; + + if (file_offset < entry->file_offset) + n = n->rb_left; + else if (file_offset >= entry_end(entry)) + n = n->rb_right; + else + return n; + } + if (!prev_ret) + return NULL; + + while (prev && file_offset >= entry_end(prev_entry)) { + test = rb_next(prev); + if (!test) + break; + prev_entry = rb_entry(test, struct btrfs_ordered_extent, + rb_node); + if (file_offset < entry_end(prev_entry)) + break; + + prev = test; + } + if (prev) + prev_entry = rb_entry(prev, struct btrfs_ordered_extent, + rb_node); + while (prev && file_offset < entry_end(prev_entry)) { + test = rb_prev(prev); + if (!test) + break; + prev_entry = rb_entry(test, struct btrfs_ordered_extent, + rb_node); + prev = test; + } + *prev_ret = prev; + return NULL; +} + +/* + * helper to check if a given offset is inside a given entry + */ +static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) +{ + if (file_offset < entry->file_offset || + entry->file_offset + entry->len <= file_offset) + return 0; + return 1; +} + +/* + * look find the first ordered struct that has this offset, otherwise + * the first one less than this offset + */ +static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, + u64 file_offset) +{ + struct rb_root *root = &tree->tree; + struct rb_node *prev; + struct rb_node *ret; + struct btrfs_ordered_extent *entry; + + if (tree->last) { + entry = rb_entry(tree->last, struct btrfs_ordered_extent, + rb_node); + if (offset_in_entry(entry, file_offset)) + return tree->last; + } + ret = __tree_search(root, file_offset, &prev); + if (!ret) + ret = prev; + if (ret) + tree->last = ret; + return ret; +} + +/* allocate and add a new ordered_extent into the per-inode tree. + * file_offset is the logical offset in the file + * + * start is the disk block number of an extent already reserved in the + * extent allocation tree + * + * len is the length of the extent + * + * This also sets the EXTENT_ORDERED bit on the range in the inode. + * + * The tree is given a single reference on the ordered extent that was + * inserted. + */ +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry; + + tree = &BTRFS_I(inode)->ordered_tree; + entry = kzalloc(sizeof(*entry), GFP_NOFS); + if (!entry) + return -ENOMEM; + + mutex_lock(&tree->mutex); + entry->file_offset = file_offset; + entry->start = start; + entry->len = len; + entry->disk_len = disk_len; + entry->inode = inode; + if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) + set_bit(type, &entry->flags); + + /* one ref for the tree */ + atomic_set(&entry->refs, 1); + init_waitqueue_head(&entry->wait); + INIT_LIST_HEAD(&entry->list); + INIT_LIST_HEAD(&entry->root_extent_list); + + node = tree_insert(&tree->tree, file_offset, + &entry->rb_node); + BUG_ON(node); + + set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset, + entry_end(entry) - 1, GFP_NOFS); + + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_add_tail(&entry->root_extent_list, + &BTRFS_I(inode)->root->fs_info->ordered_extents); + spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + + mutex_unlock(&tree->mutex); + BUG_ON(node); + return 0; +} + +/* + * Add a struct btrfs_ordered_sum into the list of checksums to be inserted + * when an ordered extent is finished. If the list covers more than one + * ordered extent, it is split across multiples. + */ +int btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum) +{ + struct btrfs_ordered_inode_tree *tree; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + list_add_tail(&sum->list, &entry->list); + mutex_unlock(&tree->mutex); + return 0; +} + +/* + * this is used to account for finished IO across a given range + * of the file. The IO should not span ordered extents. If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + */ +int btrfs_dec_test_ordered_pending(struct inode *inode, + u64 file_offset, u64 io_size) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int ret; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1, + GFP_NOFS); + node = tree_search(tree, file_offset); + if (!node) { + ret = 1; + goto out; + } + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, file_offset)) { + ret = 1; + goto out; + } + + ret = test_range_bit(io_tree, entry->file_offset, + entry->file_offset + entry->len - 1, + EXTENT_ORDERED, 0); + if (ret == 0) + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); +out: + mutex_unlock(&tree->mutex); + return ret == 0; +} + +/* + * used to drop a reference on an ordered extent. This will free + * the extent if the last reference is dropped + */ +int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) +{ + struct list_head *cur; + struct btrfs_ordered_sum *sum; + + if (atomic_dec_and_test(&entry->refs)) { + while (!list_empty(&entry->list)) { + cur = entry->list.next; + sum = list_entry(cur, struct btrfs_ordered_sum, list); + list_del(&sum->list); + kfree(sum); + } + kfree(entry); + } + return 0; +} + +/* + * remove an ordered extent from the tree. No references are dropped + * but, anyone waiting on this extent is woken up. + */ +int btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + node = &entry->rb_node; + rb_erase(node, &tree->tree); + tree->last = NULL; + set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_del_init(&entry->root_extent_list); + spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + + mutex_unlock(&tree->mutex); + wake_up(&entry->wait); + return 0; +} + +/* + * wait for all the ordered extents in a root. This is done when balancing + * space between drives. + */ +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) +{ + struct list_head splice; + struct list_head *cur; + struct btrfs_ordered_extent *ordered; + struct inode *inode; + + INIT_LIST_HEAD(&splice); + + spin_lock(&root->fs_info->ordered_extent_lock); + list_splice_init(&root->fs_info->ordered_extents, &splice); + while (!list_empty(&splice)) { + cur = splice.next; + ordered = list_entry(cur, struct btrfs_ordered_extent, + root_extent_list); + if (nocow_only && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { + list_move(&ordered->root_extent_list, + &root->fs_info->ordered_extents); + cond_resched_lock(&root->fs_info->ordered_extent_lock); + continue; + } + + list_del_init(&ordered->root_extent_list); + atomic_inc(&ordered->refs); + + /* + * the inode may be getting freed (in sys_unlink path). + */ + inode = igrab(ordered->inode); + + spin_unlock(&root->fs_info->ordered_extent_lock); + + if (inode) { + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + iput(inode); + } else { + btrfs_put_ordered_extent(ordered); + } + + spin_lock(&root->fs_info->ordered_extent_lock); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + return 0; +} + +/* + * Used to start IO or wait for a given ordered extent to finish. + * + * If wait is one, this effectively waits on page writeback for all the pages + * in the extent, and it waits on the io completion code to insert + * metadata into the btree corresponding to the extent + */ +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, + int wait) +{ + u64 start = entry->file_offset; + u64 end = start + entry->len - 1; + + /* + * pages in the range can be dirty, clean or writeback. We + * start IO on any dirty ones so the wait doesn't stall waiting + * for pdflush to find them + */ + btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL); + if (wait) { + wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, + &entry->flags)); + } +} + +/* + * Used to wait on ordered extents across a large range of bytes. + */ +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +{ + u64 end; + u64 orig_end; + u64 wait_end; + struct btrfs_ordered_extent *ordered; + + if (start + len < start) { + orig_end = INT_LIMIT(loff_t); + } else { + orig_end = start + len - 1; + if (orig_end > INT_LIMIT(loff_t)) + orig_end = INT_LIMIT(loff_t); + } + wait_end = orig_end; +again: + /* start IO across the range first to instantiate any delalloc + * extents + */ + btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); + + /* The compression code will leave pages locked but return from + * writepage without setting the page writeback. Starting again + * with WB_SYNC_ALL will end up waiting for the IO to actually start. + */ + btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); + + btrfs_wait_on_page_writeback_range(inode->i_mapping, + start >> PAGE_CACHE_SHIFT, + orig_end >> PAGE_CACHE_SHIFT); + + end = orig_end; + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, end); + if (!ordered) + break; + if (ordered->file_offset > orig_end) { + btrfs_put_ordered_extent(ordered); + break; + } + if (ordered->file_offset + ordered->len < start) { + btrfs_put_ordered_extent(ordered); + break; + } + btrfs_start_ordered_extent(inode, ordered, 1); + end = ordered->file_offset; + btrfs_put_ordered_extent(ordered); + if (end == 0 || end == start) + break; + end--; + } + if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, + EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { + schedule_timeout(1); + goto again; + } + return 0; +} + +/* + * find an ordered extent corresponding to file_offset. return NULL if + * nothing is found, otherwise take a reference on the extent and return it + */ +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, + u64 file_offset) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, file_offset)) + entry = NULL; + if (entry) + atomic_inc(&entry->refs); +out: + mutex_unlock(&tree->mutex); + return entry; +} + +/* + * lookup and return any extent before 'file_offset'. NULL is returned + * if none is found + */ +struct btrfs_ordered_extent * +btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + atomic_inc(&entry->refs); +out: + mutex_unlock(&tree->mutex); + return entry; +} + +/* + * After an extent is done, call this to conditionally update the on disk + * i_size. i_size is updated to cover any fully written part of the file. + */ +int btrfs_ordered_update_i_size(struct inode *inode, + struct btrfs_ordered_extent *ordered) +{ + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + u64 disk_i_size; + u64 new_i_size; + u64 i_size_test; + struct rb_node *node; + struct btrfs_ordered_extent *test; + + mutex_lock(&tree->mutex); + disk_i_size = BTRFS_I(inode)->disk_i_size; + + /* + * if the disk i_size is already at the inode->i_size, or + * this ordered extent is inside the disk i_size, we're done + */ + if (disk_i_size >= inode->i_size || + ordered->file_offset + ordered->len <= disk_i_size) { + goto out; + } + + /* + * we can't update the disk_isize if there are delalloc bytes + * between disk_i_size and this ordered extent + */ + if (test_range_bit(io_tree, disk_i_size, + ordered->file_offset + ordered->len - 1, + EXTENT_DELALLOC, 0)) { + goto out; + } + /* + * walk backward from this ordered extent to disk_i_size. + * if we find an ordered extent then we can't update disk i_size + * yet + */ + node = &ordered->rb_node; + while (1) { + node = rb_prev(node); + if (!node) + break; + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (test->file_offset + test->len <= disk_i_size) + break; + if (test->file_offset >= inode->i_size) + break; + if (test->file_offset >= disk_i_size) + goto out; + } + new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode)); + + /* + * at this point, we know we can safely update i_size to at least + * the offset from this ordered extent. But, we need to + * walk forward and see if ios from higher up in the file have + * finished. + */ + node = rb_next(&ordered->rb_node); + i_size_test = 0; + if (node) { + /* + * do we have an area where IO might have finished + * between our ordered extent and the next one. + */ + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (test->file_offset > entry_end(ordered)) + i_size_test = test->file_offset; + } else { + i_size_test = i_size_read(inode); + } + + /* + * i_size_test is the end of a region after this ordered + * extent where there are no ordered extents. As long as there + * are no delalloc bytes in this area, it is safe to update + * disk_i_size to the end of the region. + */ + if (i_size_test > entry_end(ordered) && + !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, + EXTENT_DELALLOC, 0)) { + new_i_size = min_t(u64, i_size_test, i_size_read(inode)); + } + BTRFS_I(inode)->disk_i_size = new_i_size; +out: + mutex_unlock(&tree->mutex); + return 0; +} + +/* + * search the ordered extents for one corresponding to 'offset' and + * try to find a checksum. This is used because we allow pages to + * be reclaimed before their checksum is actually put into the btree + */ +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, + u32 *sum) +{ + struct btrfs_ordered_sum *ordered_sum; + struct btrfs_sector_sum *sector_sums; + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct list_head *cur; + unsigned long num_sectors; + unsigned long i; + u32 sectorsize = BTRFS_I(inode)->root->sectorsize; + int ret = 1; + + ordered = btrfs_lookup_ordered_extent(inode, offset); + if (!ordered) + return 1; + + mutex_lock(&tree->mutex); + list_for_each_prev(cur, &ordered->list) { + ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list); + if (disk_bytenr >= ordered_sum->bytenr) { + num_sectors = ordered_sum->len / sectorsize; + sector_sums = ordered_sum->sums; + for (i = 0; i < num_sectors; i++) { + if (sector_sums[i].bytenr == disk_bytenr) { + *sum = sector_sums[i].sum; + ret = 0; + goto out; + } + } + } + } +out: + mutex_unlock(&tree->mutex); + btrfs_put_ordered_extent(ordered); + return ret; +} + + +/** + * taken from mm/filemap.c because it isn't exported + * + * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range + * @mapping: address space structure to write + * @start: offset in bytes where the range starts + * @end: offset in bytes where the range ends (inclusive) + * @sync_mode: enable synchronous operation + * + * Start writeback against all of a mapping's dirty pages that lie + * within the byte offsets <start, end> inclusive. + * + * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as + * opposed to a regular memory cleansing writeback. The difference between + * these two operations is that if a dirty page/buffer is encountered, it must + * be waited upon, and not just skipped over. + */ +int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end, int sync_mode) +{ + struct writeback_control wbc = { + .sync_mode = sync_mode, + .nr_to_write = mapping->nrpages * 2, + .range_start = start, + .range_end = end, + .for_writepages = 1, + }; + return btrfs_writepages(mapping, &wbc); +} + +/** + * taken from mm/filemap.c because it isn't exported + * + * wait_on_page_writeback_range - wait for writeback to complete + * @mapping: target address_space + * @start: beginning page index + * @end: ending page index + * + * Wait for writeback to complete against pages indexed by start->end + * inclusive + */ +int btrfs_wait_on_page_writeback_range(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + struct pagevec pvec; + int nr_pages; + int ret = 0; + pgoff_t index; + + if (end < start) + return 0; + + pagevec_init(&pvec, 0); + index = start; + while ((index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + unsigned i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* until radix tree lookup accepts end_index */ + if (page->index > end) + continue; + + wait_on_page_writeback(page); + if (PageError(page)) + ret = -EIO; + } + pagevec_release(&pvec); + cond_resched(); + } + + /* Check for outstanding write errors */ + if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + ret = -ENOSPC; + if (test_and_clear_bit(AS_EIO, &mapping->flags)) + ret = -EIO; + + return ret; +} diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h new file mode 100644 index 00000000000..ab66d5e8d6d --- /dev/null +++ b/fs/btrfs/ordered-data.h @@ -0,0 +1,158 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ORDERED_DATA__ +#define __BTRFS_ORDERED_DATA__ + +/* one of these per inode */ +struct btrfs_ordered_inode_tree { + struct mutex mutex; + struct rb_root tree; + struct rb_node *last; +}; + +/* + * these are used to collect checksums done just before bios submission. + * They are attached via a list into the ordered extent, and + * checksum items are inserted into the tree after all the blocks in + * the ordered extent are on disk + */ +struct btrfs_sector_sum { + /* bytenr on disk */ + u64 bytenr; + u32 sum; +}; + +struct btrfs_ordered_sum { + /* bytenr is the start of this extent on disk */ + u64 bytenr; + + /* + * this is the length in bytes covered by the sums array below. + */ + unsigned long len; + struct list_head list; + /* last field is a variable length array of btrfs_sector_sums */ + struct btrfs_sector_sum sums[]; +}; + +/* + * bits for the flags field: + * + * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written. + * It is used to make sure metadata is inserted into the tree only once + * per extent. + * + * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the + * rbtree, just before waking any waiters. It is used to indicate the + * IO is done and any metadata is inserted into the tree. + */ +#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */ + +#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */ + +#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ + +#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ + +#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ + +struct btrfs_ordered_extent { + /* logical offset in the file */ + u64 file_offset; + + /* disk byte number */ + u64 start; + + /* ram length of the extent in bytes */ + u64 len; + + /* extent length on disk */ + u64 disk_len; + + /* flags (described above) */ + unsigned long flags; + + /* reference count */ + atomic_t refs; + + /* the inode we belong to */ + struct inode *inode; + + /* list of checksums for insertion when the extent io is done */ + struct list_head list; + + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ + wait_queue_head_t wait; + + /* our friendly rbtree entry */ + struct rb_node rb_node; + + /* a per root list of all the pending ordered extents */ + struct list_head root_extent_list; +}; + + +/* + * calculates the total size you need to allocate for an ordered sum + * structure spanning 'bytes' in the file + */ +static inline int btrfs_ordered_sum_size(struct btrfs_root *root, + unsigned long bytes) +{ + unsigned long num_sectors = (bytes + root->sectorsize - 1) / + root->sectorsize; + num_sectors++; + return sizeof(struct btrfs_ordered_sum) + + num_sectors * sizeof(struct btrfs_sector_sum); +} + +static inline void +btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) +{ + mutex_init(&t->mutex); + t->tree.rb_node = NULL; + t->last = NULL; +} + +int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); +int btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry); +int btrfs_dec_test_ordered_pending(struct inode *inode, + u64 file_offset, u64 io_size); +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int tyep); +int btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum); +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, + u64 file_offset); +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, int wait); +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); +struct btrfs_ordered_extent * +btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +int btrfs_ordered_update_i_size(struct inode *inode, + struct btrfs_ordered_extent *ordered); +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); +int btrfs_wait_on_page_writeback_range(struct address_space *mapping, + pgoff_t start, pgoff_t end); +int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end, int sync_mode); +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); +#endif diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c new file mode 100644 index 00000000000..3c0d52af4f8 --- /dev/null +++ b/fs/btrfs/orphan.c @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2008 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" + +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_free_path(path); + return ret; +} + +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + ret = btrfs_del_item(trans, root, path); + +out: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c new file mode 100644 index 00000000000..5f8f218c100 --- /dev/null +++ b/fs/btrfs/print-tree.c @@ -0,0 +1,216 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" + +static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) +{ + int num_stripes = btrfs_chunk_num_stripes(eb, chunk); + int i; + printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu " + "num_stripes %d\n", + (unsigned long long)btrfs_chunk_length(eb, chunk), + (unsigned long long)btrfs_chunk_owner(eb, chunk), + (unsigned long long)btrfs_chunk_type(eb, chunk), + num_stripes); + for (i = 0 ; i < num_stripes ; i++) { + printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i, + (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i), + (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i)); + } +} +static void print_dev_item(struct extent_buffer *eb, + struct btrfs_dev_item *dev_item) +{ + printk(KERN_INFO "\t\tdev item devid %llu " + "total_bytes %llu bytes used %llu\n", + (unsigned long long)btrfs_device_id(eb, dev_item), + (unsigned long long)btrfs_device_total_bytes(eb, dev_item), + (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); +} +void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) +{ + int i; + u32 nr = btrfs_header_nritems(l); + struct btrfs_item *item; + struct btrfs_extent_item *ei; + struct btrfs_root_item *ri; + struct btrfs_dir_item *di; + struct btrfs_inode_item *ii; + struct btrfs_block_group_item *bi; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_extent_ref *ref; + struct btrfs_dev_extent *dev_extent; + u32 type; + + printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", + (unsigned long long)btrfs_header_bytenr(l), nr, + btrfs_leaf_free_space(root, l)); + for (i = 0 ; i < nr ; i++) { + item = btrfs_item_nr(l, i); + btrfs_item_key_to_cpu(l, &key, i); + type = btrfs_key_type(&key); + printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d " + "itemsize %d\n", + i, + (unsigned long long)key.objectid, type, + (unsigned long long)key.offset, + btrfs_item_offset(l, item), btrfs_item_size(l, item)); + switch (type) { + case BTRFS_INODE_ITEM_KEY: + ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); + printk(KERN_INFO "\t\tinode generation %llu size %llu " + "mode %o\n", + (unsigned long long) + btrfs_inode_generation(l, ii), + (unsigned long long)btrfs_inode_size(l, ii), + btrfs_inode_mode(l, ii)); + break; + case BTRFS_DIR_ITEM_KEY: + di = btrfs_item_ptr(l, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(l, di, &found_key); + printk(KERN_INFO "\t\tdir oid %llu type %u\n", + (unsigned long long)found_key.objectid, + btrfs_dir_type(l, di)); + break; + case BTRFS_ROOT_ITEM_KEY: + ri = btrfs_item_ptr(l, i, struct btrfs_root_item); + printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n", + (unsigned long long) + btrfs_disk_root_bytenr(l, ri), + btrfs_disk_root_refs(l, ri)); + break; + case BTRFS_EXTENT_ITEM_KEY: + ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); + printk(KERN_INFO "\t\textent data refs %u\n", + btrfs_extent_refs(l, ei)); + break; + case BTRFS_EXTENT_REF_KEY: + ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref); + printk(KERN_INFO "\t\textent back ref root %llu " + "gen %llu owner %llu num_refs %lu\n", + (unsigned long long)btrfs_ref_root(l, ref), + (unsigned long long)btrfs_ref_generation(l, ref), + (unsigned long long)btrfs_ref_objectid(l, ref), + (unsigned long)btrfs_ref_num_refs(l, ref)); + break; + + case BTRFS_EXTENT_DATA_KEY: + fi = btrfs_item_ptr(l, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(l, fi) == + BTRFS_FILE_EXTENT_INLINE) { + printk(KERN_INFO "\t\tinline extent data " + "size %u\n", + btrfs_file_extent_inline_len(l, fi)); + break; + } + printk(KERN_INFO "\t\textent data disk bytenr %llu " + "nr %llu\n", + (unsigned long long) + btrfs_file_extent_disk_bytenr(l, fi), + (unsigned long long) + btrfs_file_extent_disk_num_bytes(l, fi)); + printk(KERN_INFO "\t\textent data offset %llu " + "nr %llu ram %llu\n", + (unsigned long long) + btrfs_file_extent_offset(l, fi), + (unsigned long long) + btrfs_file_extent_num_bytes(l, fi), + (unsigned long long) + btrfs_file_extent_ram_bytes(l, fi)); + break; + case BTRFS_BLOCK_GROUP_ITEM_KEY: + bi = btrfs_item_ptr(l, i, + struct btrfs_block_group_item); + printk(KERN_INFO "\t\tblock group used %llu\n", + (unsigned long long) + btrfs_disk_block_group_used(l, bi)); + break; + case BTRFS_CHUNK_ITEM_KEY: + print_chunk(l, btrfs_item_ptr(l, i, + struct btrfs_chunk)); + break; + case BTRFS_DEV_ITEM_KEY: + print_dev_item(l, btrfs_item_ptr(l, i, + struct btrfs_dev_item)); + break; + case BTRFS_DEV_EXTENT_KEY: + dev_extent = btrfs_item_ptr(l, i, + struct btrfs_dev_extent); + printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n" + "\t\tchunk objectid %llu chunk offset %llu " + "length %llu\n", + (unsigned long long) + btrfs_dev_extent_chunk_tree(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_chunk_objectid(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_chunk_offset(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_length(l, dev_extent)); + }; + } +} + +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) +{ + int i; u32 nr; + struct btrfs_key key; + int level; + + if (!c) + return; + nr = btrfs_header_nritems(c); + level = btrfs_header_level(c); + if (level == 0) { + btrfs_print_leaf(root, c); + return; + } + printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", + (unsigned long long)btrfs_header_bytenr(c), + btrfs_header_level(c), nr, + (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); + for (i = 0; i < nr; i++) { + btrfs_node_key_to_cpu(c, &key, i); + printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n", + i, + (unsigned long long)key.objectid, + key.type, + (unsigned long long)key.offset, + (unsigned long long)btrfs_node_blockptr(c, i)); + } + for (i = 0; i < nr; i++) { + struct extent_buffer *next = read_tree_block(root, + btrfs_node_blockptr(c, i), + btrfs_level_size(root, level - 1), + btrfs_node_ptr_generation(c, i)); + if (btrfs_is_leaf(next) && + btrfs_header_level(c) != 1) + BUG(); + if (btrfs_header_level(next) != + btrfs_header_level(c) - 1) + BUG(); + btrfs_print_tree(root, next); + free_extent_buffer(next); + } +} diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h new file mode 100644 index 00000000000..da75efe534d --- /dev/null +++ b/fs/btrfs/print-tree.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __PRINT_TREE_ +#define __PRINT_TREE_ +void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l); +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t); +#endif diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c new file mode 100644 index 00000000000..6f0acc4c9ea --- /dev/null +++ b/fs/btrfs/ref-cache.c @@ -0,0 +1,230 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "ref-cache.h" +#include "transaction.h" + +/* + * leaf refs are used to cache the information about which extents + * a given leaf has references on. This allows us to process that leaf + * in btrfs_drop_snapshot without needing to read it back from disk. + */ + +/* + * kmalloc a leaf reference struct and update the counters for the + * total ref cache size + */ +struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, + int nr_extents) +{ + struct btrfs_leaf_ref *ref; + size_t size = btrfs_leaf_ref_size(nr_extents); + + ref = kmalloc(size, GFP_NOFS); + if (ref) { + spin_lock(&root->fs_info->ref_cache_lock); + root->fs_info->total_ref_cache_size += size; + spin_unlock(&root->fs_info->ref_cache_lock); + + memset(ref, 0, sizeof(*ref)); + atomic_set(&ref->usage, 1); + INIT_LIST_HEAD(&ref->list); + } + return ref; +} + +/* + * free a leaf reference struct and update the counters for the + * total ref cache size + */ +void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) +{ + if (!ref) + return; + WARN_ON(atomic_read(&ref->usage) == 0); + if (atomic_dec_and_test(&ref->usage)) { + size_t size = btrfs_leaf_ref_size(ref->nritems); + + BUG_ON(ref->in_tree); + kfree(ref); + + spin_lock(&root->fs_info->ref_cache_lock); + root->fs_info->total_ref_cache_size -= size; + spin_unlock(&root->fs_info->ref_cache_lock); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_leaf_ref *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node); + + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct btrfs_leaf_ref, rb_node); + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n = root->rb_node; + struct btrfs_leaf_ref *entry; + + while (n) { + entry = rb_entry(n, struct btrfs_leaf_ref, rb_node); + WARN_ON(!entry->in_tree); + + if (bytenr < entry->bytenr) + n = n->rb_left; + else if (bytenr > entry->bytenr) + n = n->rb_right; + else + return n; + } + return NULL; +} + +int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, + int shared) +{ + struct btrfs_leaf_ref *ref = NULL; + struct btrfs_leaf_ref_tree *tree = root->ref_tree; + + if (shared) + tree = &root->fs_info->shared_ref_tree; + if (!tree) + return 0; + + spin_lock(&tree->lock); + while (!list_empty(&tree->list)) { + ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list); + BUG_ON(ref->tree != tree); + if (ref->root_gen > max_root_gen) + break; + if (!xchg(&ref->in_tree, 0)) { + cond_resched_lock(&tree->lock); + continue; + } + + rb_erase(&ref->rb_node, &tree->root); + list_del_init(&ref->list); + + spin_unlock(&tree->lock); + btrfs_free_leaf_ref(root, ref); + cond_resched(); + spin_lock(&tree->lock); + } + spin_unlock(&tree->lock); + return 0; +} + +/* + * find the leaf ref for a given extent. This returns the ref struct with + * a usage reference incremented + */ +struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, + u64 bytenr) +{ + struct rb_node *rb; + struct btrfs_leaf_ref *ref = NULL; + struct btrfs_leaf_ref_tree *tree = root->ref_tree; +again: + if (tree) { + spin_lock(&tree->lock); + rb = tree_search(&tree->root, bytenr); + if (rb) + ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node); + if (ref) + atomic_inc(&ref->usage); + spin_unlock(&tree->lock); + if (ref) + return ref; + } + if (tree != &root->fs_info->shared_ref_tree) { + tree = &root->fs_info->shared_ref_tree; + goto again; + } + return NULL; +} + +/* + * add a fully filled in leaf ref struct + * remove all the refs older than a given root generation + */ +int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, + int shared) +{ + int ret = 0; + struct rb_node *rb; + struct btrfs_leaf_ref_tree *tree = root->ref_tree; + + if (shared) + tree = &root->fs_info->shared_ref_tree; + + spin_lock(&tree->lock); + rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node); + if (rb) { + ret = -EEXIST; + } else { + atomic_inc(&ref->usage); + ref->tree = tree; + ref->in_tree = 1; + list_add_tail(&ref->list, &tree->list); + } + spin_unlock(&tree->lock); + return ret; +} + +/* + * remove a single leaf ref from the tree. This drops the ref held by the tree + * only + */ +int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) +{ + struct btrfs_leaf_ref_tree *tree; + + if (!xchg(&ref->in_tree, 0)) + return 0; + + tree = ref->tree; + spin_lock(&tree->lock); + + rb_erase(&ref->rb_node, &tree->root); + list_del_init(&ref->list); + + spin_unlock(&tree->lock); + + btrfs_free_leaf_ref(root, ref); + return 0; +} diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h new file mode 100644 index 00000000000..16f3183d7c5 --- /dev/null +++ b/fs/btrfs/ref-cache.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#ifndef __REFCACHE__ +#define __REFCACHE__ + +struct btrfs_extent_info { + /* bytenr and num_bytes find the extent in the extent allocation tree */ + u64 bytenr; + u64 num_bytes; + + /* objectid and offset find the back reference for the file */ + u64 objectid; + u64 offset; +}; + +struct btrfs_leaf_ref { + struct rb_node rb_node; + struct btrfs_leaf_ref_tree *tree; + int in_tree; + atomic_t usage; + + u64 root_gen; + u64 bytenr; + u64 owner; + u64 generation; + int nritems; + + struct list_head list; + struct btrfs_extent_info extents[]; +}; + +static inline size_t btrfs_leaf_ref_size(int nr_extents) +{ + return sizeof(struct btrfs_leaf_ref) + + sizeof(struct btrfs_extent_info) * nr_extents; +} + +static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree) +{ + tree->root.rb_node = NULL; + INIT_LIST_HEAD(&tree->list); + spin_lock_init(&tree->lock); +} + +static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree) +{ + return RB_EMPTY_ROOT(&tree->root); +} + +void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree); +struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, + int nr_extents); +void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); +struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, + u64 bytenr); +int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, + int shared); +int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, + int shared); +int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); + +#endif diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c new file mode 100644 index 00000000000..b48650de447 --- /dev/null +++ b/fs/btrfs/root-tree.c @@ -0,0 +1,366 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "print-tree.h" + +/* + * search forward for a root, starting with objectid 'search_start' + * if a root key is found, the objectid we find is filled into 'found_objectid' + * and 0 is returned. < 0 is returned on error, 1 if there is nothing + * left in the tree. + */ +int btrfs_search_root(struct btrfs_root *root, u64 search_start, + u64 *found_objectid) +{ + struct btrfs_path *path; + struct btrfs_key search_key; + int ret; + + root = root->fs_info->tree_root; + search_key.objectid = search_start; + search_key.type = (u8)-1; + search_key.offset = (u64)-1; + + path = btrfs_alloc_path(); + BUG_ON(!path); +again: + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; + if (ret == 0) { + ret = 1; + goto out; + } + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + } + btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]); + if (search_key.type != BTRFS_ROOT_ITEM_KEY) { + search_key.offset++; + btrfs_release_path(root, path); + goto again; + } + ret = 0; + *found_objectid = search_key.objectid; + +out: + btrfs_free_path(path); + return ret; +} + +/* + * lookup the root with the highest offset for a given objectid. The key we do + * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 + * on error. + */ +int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, + struct btrfs_root_item *item, struct btrfs_key *key) +{ + struct btrfs_path *path; + struct btrfs_key search_key; + struct btrfs_key found_key; + struct extent_buffer *l; + int ret; + int slot; + + search_key.objectid = objectid; + search_key.type = BTRFS_ROOT_ITEM_KEY; + search_key.offset = (u64)-1; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; + + BUG_ON(ret == 0); + l = path->nodes[0]; + BUG_ON(path->slots[0] == 0); + slot = path->slots[0] - 1; + btrfs_item_key_to_cpu(l, &found_key, slot); + if (found_key.objectid != objectid) { + ret = 1; + goto out; + } + read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), + sizeof(*item)); + memcpy(key, &found_key, sizeof(found_key)); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +/* + * copy the data in 'item' into the btree + */ +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item) +{ + struct btrfs_path *path; + struct extent_buffer *l; + int ret; + int slot; + unsigned long ptr; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + if (ret < 0) + goto out; + + if (ret != 0) { + btrfs_print_leaf(root, path->nodes[0]); + printk(KERN_CRIT "unable to update root key %llu %u %llu\n", + (unsigned long long)key->objectid, key->type, + (unsigned long long)key->offset); + BUG_ON(1); + } + + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr_offset(l, slot); + write_extent_buffer(l, item, ptr, sizeof(*item)); + btrfs_mark_buffer_dirty(path->nodes[0]); +out: + btrfs_release_path(root, path); + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item) +{ + int ret; + ret = btrfs_insert_item(trans, root, key, item, sizeof(*item)); + return ret; +} + +/* + * at mount time we want to find all the old transaction snapshots that were in + * the process of being deleted if we crashed. This is any root item with an + * offset lower than the latest root. They need to be queued for deletion to + * finish what was happening when we crashed. + */ +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, + struct btrfs_root *latest) +{ + struct btrfs_root *dead_root; + struct btrfs_item *item; + struct btrfs_root_item *ri; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + int ret; + u32 nritems; + struct extent_buffer *leaf; + int slot; + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = 0; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +again: + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + if (slot >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + } + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) + goto next; + + if (key.objectid < objectid) + goto next; + + if (key.objectid > objectid) + break; + + ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item); + if (btrfs_disk_root_refs(leaf, ri) != 0) + goto next; + + memcpy(&found_key, &key, sizeof(key)); + key.offset++; + btrfs_release_path(root, path); + dead_root = + btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &found_key); + if (IS_ERR(dead_root)) { + ret = PTR_ERR(dead_root); + goto err; + } + + if (objectid == BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_add_dead_reloc_root(dead_root); + else + ret = btrfs_add_dead_root(dead_root, latest); + if (ret) + goto err; + goto again; +next: + slot++; + path->slots[0]++; + } + ret = 0; +err: + btrfs_free_path(path); + return ret; +} + +/* drop the root item for 'key' from 'root' */ +int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key) +{ + struct btrfs_path *path; + int ret; + u32 refs; + struct btrfs_root_item *ri; + struct extent_buffer *leaf; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, root, key, path, -1, 1); + if (ret < 0) + goto out; + + BUG_ON(ret != 0); + leaf = path->nodes[0]; + ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); + + refs = btrfs_disk_root_refs(leaf, ri); + BUG_ON(refs != 0); + ret = btrfs_del_item(trans, root, path); +out: + btrfs_release_path(root, path); + btrfs_free_path(path); + return ret; +} + +#if 0 /* this will get used when snapshot deletion is implemented */ +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u8 type, u64 ref_id) +{ + struct btrfs_key key; + int ret; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + + key.objectid = root_id; + key.type = type; + key.offset = ref_id; + + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + BUG_ON(ret); + + ret = btrfs_del_item(trans, tree_root, path); + BUG_ON(ret); + + btrfs_free_path(path); + return ret; +} +#endif + +int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, + u64 root_id, u64 ref_id) +{ + struct btrfs_key key; + int ret; + + key.objectid = root_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = ref_id; + + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + return ret; +} + + +/* + * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY + * or BTRFS_ROOT_BACKREF_KEY. + * + * The dirid, sequence, name and name_len refer to the directory entry + * that is referencing the root. + * + * For a forward ref, the root_id is the id of the tree referencing + * the root and ref_id is the id of the subvol or snapshot. + * + * For a back ref the root_id is the id of the subvol or snapshot and + * ref_id is the id of the tree referencing it. + */ +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u8 type, u64 ref_id, + u64 dirid, u64 sequence, + const char *name, int name_len) +{ + struct btrfs_key key; + int ret; + struct btrfs_path *path; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + unsigned long ptr; + + + path = btrfs_alloc_path(); + + key.objectid = root_id; + key.type = type; + key.offset = ref_id; + + ret = btrfs_insert_empty_item(trans, tree_root, path, &key, + sizeof(*ref) + name_len); + BUG_ON(ret); + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + btrfs_set_root_ref_dirid(leaf, ref, dirid); + btrfs_set_root_ref_sequence(leaf, ref, sequence); + btrfs_set_root_ref_name_len(leaf, ref, name_len); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(leaf, name, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c new file mode 100644 index 00000000000..c0f7ecaf1e7 --- /dev/null +++ b/fs/btrfs/struct-funcs.c @@ -0,0 +1,139 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/highmem.h> + +/* this is some deeply nasty code. ctree.h has a different + * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef + * + * The end result is that anyone who #includes ctree.h gets a + * declaration for the btrfs_set_foo functions and btrfs_foo functions + * + * This file declares the macros and then #includes ctree.h, which results + * in cpp creating the function here based on the template below. + * + * These setget functions do all the extent_buffer related mapping + * required to efficiently read and write specific fields in the extent + * buffers. Every pointer to metadata items in btrfs is really just + * an unsigned long offset into the extent buffer which has been + * cast to a specific type. This gives us all the gcc type checking. + * + * The extent buffer api is used to do all the kmapping and page + * spanning work required to get extent buffers in highmem and have + * a metadata blocksize different from the page size. + * + * The macro starts with a simple function prototype declaration so that + * sparse won't complain about it being static. + */ + +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \ +u##bits btrfs_##name(struct extent_buffer *eb, \ + type *s) \ +{ \ + unsigned long part_offset = (unsigned long)s; \ + unsigned long offset = part_offset + offsetof(type, member); \ + type *p; \ + /* ugly, but we want the fast path here */ \ + if (eb->map_token && offset >= eb->map_start && \ + offset + sizeof(((type *)0)->member) <= eb->map_start + \ + eb->map_len) { \ + p = (type *)(eb->kaddr + part_offset - eb->map_start); \ + return le##bits##_to_cpu(p->member); \ + } \ + { \ + int err; \ + char *map_token; \ + char *kaddr; \ + int unmap_on_exit = (eb->map_token == NULL); \ + unsigned long map_start; \ + unsigned long map_len; \ + u##bits res; \ + err = map_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &map_token, &kaddr, \ + &map_start, &map_len, KM_USER1); \ + if (err) { \ + __le##bits leres; \ + read_eb_member(eb, s, type, member, &leres); \ + return le##bits##_to_cpu(leres); \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + res = le##bits##_to_cpu(p->member); \ + if (unmap_on_exit) \ + unmap_extent_buffer(eb, map_token, KM_USER1); \ + return res; \ + } \ +} \ +void btrfs_set_##name(struct extent_buffer *eb, \ + type *s, u##bits val) \ +{ \ + unsigned long part_offset = (unsigned long)s; \ + unsigned long offset = part_offset + offsetof(type, member); \ + type *p; \ + /* ugly, but we want the fast path here */ \ + if (eb->map_token && offset >= eb->map_start && \ + offset + sizeof(((type *)0)->member) <= eb->map_start + \ + eb->map_len) { \ + p = (type *)(eb->kaddr + part_offset - eb->map_start); \ + p->member = cpu_to_le##bits(val); \ + return; \ + } \ + { \ + int err; \ + char *map_token; \ + char *kaddr; \ + int unmap_on_exit = (eb->map_token == NULL); \ + unsigned long map_start; \ + unsigned long map_len; \ + err = map_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &map_token, &kaddr, \ + &map_start, &map_len, KM_USER1); \ + if (err) { \ + __le##bits val2; \ + val2 = cpu_to_le##bits(val); \ + write_eb_member(eb, s, type, member, &val2); \ + return; \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + p->member = cpu_to_le##bits(val); \ + if (unmap_on_exit) \ + unmap_extent_buffer(eb, map_token, KM_USER1); \ + } \ +} + +#include "ctree.h" + +void btrfs_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr = btrfs_node_key_ptr_offset(nr); + if (eb->map_token && ptr >= eb->map_start && + ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) { + memcpy(disk_key, eb->kaddr + ptr - eb->map_start, + sizeof(*disk_key)); + return; + } else if (eb->map_token) { + unmap_extent_buffer(eb, eb->map_token, KM_USER1); + eb->map_token = NULL; + } + read_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c new file mode 100644 index 00000000000..b4c101d9322 --- /dev/null +++ b/fs/btrfs/super.c @@ -0,0 +1,720 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mount.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/parser.h> +#include <linux/ctype.h> +#include <linux/namei.h> +#include <linux/miscdevice.h> +#include <linux/version.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "xattr.h" +#include "volumes.h" +#include "version.h" +#include "export.h" +#include "compression.h" + +#define BTRFS_SUPER_MAGIC 0x9123683E + +static struct super_operations btrfs_super_ops; + +static void btrfs_put_super(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + ret = close_ctree(root); + sb->s_fs_info = NULL; +} + +enum { + Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, + Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, + Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, +}; + +static match_table_t tokens = { + {Opt_degraded, "degraded"}, + {Opt_subvol, "subvol=%s"}, + {Opt_device, "device=%s"}, + {Opt_nodatasum, "nodatasum"}, + {Opt_nodatacow, "nodatacow"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_max_extent, "max_extent=%s"}, + {Opt_max_inline, "max_inline=%s"}, + {Opt_alloc_start, "alloc_start=%s"}, + {Opt_thread_pool, "thread_pool=%d"}, + {Opt_compress, "compress"}, + {Opt_ssd, "ssd"}, + {Opt_noacl, "noacl"}, + {Opt_err, NULL}, +}; + +u64 btrfs_parse_size(char *str) +{ + u64 res; + int mult = 1; + char *end; + char last; + + res = simple_strtoul(str, &end, 10); + + last = end[0]; + if (isalpha(last)) { + last = tolower(last); + switch (last) { + case 'g': + mult *= 1024; + case 'm': + mult *= 1024; + case 'k': + mult *= 1024; + } + res = res * mult; + } + return res; +} + +/* + * Regular mount options parser. Everything that is needed only when + * reading in a new superblock is parsed here. + */ +int btrfs_parse_options(struct btrfs_root *root, char *options) +{ + struct btrfs_fs_info *info = root->fs_info; + substring_t args[MAX_OPT_ARGS]; + char *p, *num; + int intarg; + + if (!options) + return 0; + + /* + * strsep changes the string, duplicate it because parse_options + * gets called twice + */ + options = kstrdup(options, GFP_NOFS); + if (!options) + return -ENOMEM; + + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_degraded: + printk(KERN_INFO "btrfs: allowing degraded mounts\n"); + btrfs_set_opt(info->mount_opt, DEGRADED); + break; + case Opt_subvol: + case Opt_device: + /* + * These are parsed by btrfs_parse_early_options + * and can be happily ignored here. + */ + break; + case Opt_nodatasum: + printk(KERN_INFO "btrfs: setting nodatacsum\n"); + btrfs_set_opt(info->mount_opt, NODATASUM); + break; + case Opt_nodatacow: + printk(KERN_INFO "btrfs: setting nodatacow\n"); + btrfs_set_opt(info->mount_opt, NODATACOW); + btrfs_set_opt(info->mount_opt, NODATASUM); + break; + case Opt_compress: + printk(KERN_INFO "btrfs: use compression\n"); + btrfs_set_opt(info->mount_opt, COMPRESS); + break; + case Opt_ssd: + printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); + btrfs_set_opt(info->mount_opt, SSD); + break; + case Opt_nobarrier: + printk(KERN_INFO "btrfs: turning off barriers\n"); + btrfs_set_opt(info->mount_opt, NOBARRIER); + break; + case Opt_thread_pool: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->thread_pool_size = intarg; + printk(KERN_INFO "btrfs: thread pool %d\n", + info->thread_pool_size); + } + break; + case Opt_max_extent: + num = match_strdup(&args[0]); + if (num) { + info->max_extent = btrfs_parse_size(num); + kfree(num); + + info->max_extent = max_t(u64, + info->max_extent, root->sectorsize); + printk(KERN_INFO "btrfs: max_extent at %llu\n", + info->max_extent); + } + break; + case Opt_max_inline: + num = match_strdup(&args[0]); + if (num) { + info->max_inline = btrfs_parse_size(num); + kfree(num); + + if (info->max_inline) { + info->max_inline = max_t(u64, + info->max_inline, + root->sectorsize); + } + printk(KERN_INFO "btrfs: max_inline at %llu\n", + info->max_inline); + } + break; + case Opt_alloc_start: + num = match_strdup(&args[0]); + if (num) { + info->alloc_start = btrfs_parse_size(num); + kfree(num); + printk(KERN_INFO + "btrfs: allocations start at %llu\n", + info->alloc_start); + } + break; + case Opt_noacl: + root->fs_info->sb->s_flags &= ~MS_POSIXACL; + break; + default: + break; + } + } + kfree(options); + return 0; +} + +/* + * Parse mount options that are required early in the mount process. + * + * All other options will be parsed on much later in the mount process and + * only when we need to allocate a new super block. + */ +static int btrfs_parse_early_options(const char *options, fmode_t flags, + void *holder, char **subvol_name, + struct btrfs_fs_devices **fs_devices) +{ + substring_t args[MAX_OPT_ARGS]; + char *opts, *p; + int error = 0; + + if (!options) + goto out; + + /* + * strsep changes the string, duplicate it because parse_options + * gets called twice + */ + opts = kstrdup(options, GFP_KERNEL); + if (!opts) + return -ENOMEM; + + while ((p = strsep(&opts, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_subvol: + *subvol_name = match_strdup(&args[0]); + break; + case Opt_device: + error = btrfs_scan_one_device(match_strdup(&args[0]), + flags, holder, fs_devices); + if (error) + goto out_free_opts; + break; + default: + break; + } + } + + out_free_opts: + kfree(opts); + out: + /* + * If no subvolume name is specified we use the default one. Allocate + * a copy of the string "." here so that code later in the + * mount path doesn't care if it's the default volume or another one. + */ + if (!*subvol_name) { + *subvol_name = kstrdup(".", GFP_KERNEL); + if (!*subvol_name) + return -ENOMEM; + } + return error; +} + +static int btrfs_fill_super(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + void *data, int silent) +{ + struct inode *inode; + struct dentry *root_dentry; + struct btrfs_super_block *disk_super; + struct btrfs_root *tree_root; + struct btrfs_inode *bi; + int err; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_magic = BTRFS_SUPER_MAGIC; + sb->s_op = &btrfs_super_ops; + sb->s_export_op = &btrfs_export_ops; + sb->s_xattr = btrfs_xattr_handlers; + sb->s_time_gran = 1; + sb->s_flags |= MS_POSIXACL; + + tree_root = open_ctree(sb, fs_devices, (char *)data); + + if (IS_ERR(tree_root)) { + printk("btrfs: open_ctree failed\n"); + return PTR_ERR(tree_root); + } + sb->s_fs_info = tree_root; + disk_super = &tree_root->fs_info->super_copy; + inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID, + tree_root->fs_info->fs_root); + bi = BTRFS_I(inode); + bi->location.objectid = inode->i_ino; + bi->location.offset = 0; + bi->root = tree_root->fs_info->fs_root; + + btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY); + + if (!inode) { + err = -ENOMEM; + goto fail_close; + } + if (inode->i_state & I_NEW) { + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } + + root_dentry = d_alloc_root(inode); + if (!root_dentry) { + iput(inode); + err = -ENOMEM; + goto fail_close; + } +#if 0 + /* this does the super kobj at the same time */ + err = btrfs_sysfs_add_super(tree_root->fs_info); + if (err) + goto fail_close; +#endif + + sb->s_root = root_dentry; + + save_mount_options(sb, data); + return 0; + +fail_close: + close_ctree(tree_root); + return err; +} + +int btrfs_sync_fs(struct super_block *sb, int wait) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + int ret; + root = btrfs_sb(sb); + + if (sb->s_flags & MS_RDONLY) + return 0; + + sb->s_dirt = 0; + if (!wait) { + filemap_flush(root->fs_info->btree_inode->i_mapping); + return 0; + } + + btrfs_start_delalloc_inodes(root); + btrfs_wait_ordered_extents(root, 0); + + btrfs_clean_old_snapshots(root); + trans = btrfs_start_transaction(root, 1); + ret = btrfs_commit_transaction(trans, root); + sb->s_dirt = 0; + return ret; +} + +static void btrfs_write_super(struct super_block *sb) +{ + sb->s_dirt = 0; +} + +static int btrfs_test_super(struct super_block *s, void *data) +{ + struct btrfs_fs_devices *test_fs_devices = data; + struct btrfs_root *root = btrfs_sb(s); + + return root->fs_info->fs_devices == test_fs_devices; +} + +/* + * Find a superblock for the given device / mount point. + * + * Note: This is based on get_sb_bdev from fs/super.c with a few additions + * for multiple device setup. Make sure to keep it in sync. + */ +static int btrfs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ + char *subvol_name = NULL; + struct block_device *bdev = NULL; + struct super_block *s; + struct dentry *root; + struct btrfs_fs_devices *fs_devices = NULL; + fmode_t mode = FMODE_READ; + int error = 0; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + error = btrfs_parse_early_options(data, mode, fs_type, + &subvol_name, &fs_devices); + if (error) + return error; + + error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices); + if (error) + goto error_free_subvol_name; + + error = btrfs_open_devices(fs_devices, mode, fs_type); + if (error) + goto error_free_subvol_name; + + if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + error = -EACCES; + goto error_close_devices; + } + + bdev = fs_devices->latest_bdev; + s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); + if (IS_ERR(s)) + goto error_s; + + if (s->s_root) { + if ((flags ^ s->s_flags) & MS_RDONLY) { + up_write(&s->s_umount); + deactivate_super(s); + error = -EBUSY; + goto error_close_devices; + } + + btrfs_close_devices(fs_devices); + } else { + char b[BDEVNAME_SIZE]; + + s->s_flags = flags; + strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + error = btrfs_fill_super(s, fs_devices, data, + flags & MS_SILENT ? 1 : 0); + if (error) { + up_write(&s->s_umount); + deactivate_super(s); + goto error_free_subvol_name; + } + + btrfs_sb(s)->fs_info->bdev_holder = fs_type; + s->s_flags |= MS_ACTIVE; + } + + if (!strcmp(subvol_name, ".")) + root = dget(s->s_root); + else { + mutex_lock(&s->s_root->d_inode->i_mutex); + root = lookup_one_len(subvol_name, s->s_root, + strlen(subvol_name)); + mutex_unlock(&s->s_root->d_inode->i_mutex); + + if (IS_ERR(root)) { + up_write(&s->s_umount); + deactivate_super(s); + error = PTR_ERR(root); + goto error_free_subvol_name; + } + if (!root->d_inode) { + dput(root); + up_write(&s->s_umount); + deactivate_super(s); + error = -ENXIO; + goto error_free_subvol_name; + } + } + + mnt->mnt_sb = s; + mnt->mnt_root = root; + + kfree(subvol_name); + return 0; + +error_s: + error = PTR_ERR(s); +error_close_devices: + btrfs_close_devices(fs_devices); +error_free_subvol_name: + kfree(subvol_name); + return error; +} + +static int btrfs_remount(struct super_block *sb, int *flags, char *data) +{ + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + + if (*flags & MS_RDONLY) { + sb->s_flags |= MS_RDONLY; + + ret = btrfs_commit_super(root); + WARN_ON(ret); + } else { + if (root->fs_info->fs_devices->rw_devices == 0) + return -EACCES; + + if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) + return -EINVAL; + + ret = btrfs_cleanup_reloc_trees(root); + WARN_ON(ret); + + ret = btrfs_cleanup_fs_roots(root->fs_info); + WARN_ON(ret); + + sb->s_flags &= ~MS_RDONLY; + } + + return 0; +} + +static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct btrfs_root *root = btrfs_sb(dentry->d_sb); + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + int bits = dentry->d_sb->s_blocksize_bits; + __be32 *fsid = (__be32 *)root->fs_info->fsid; + + buf->f_namelen = BTRFS_NAME_LEN; + buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; + buf->f_bfree = buf->f_blocks - + (btrfs_super_bytes_used(disk_super) >> bits); + buf->f_bavail = buf->f_bfree; + buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_type = BTRFS_SUPER_MAGIC; + + /* We treat it as constant endianness (it doesn't matter _which_) + because we want the fsid to come out the same whether mounted + on a big-endian or little-endian host */ + buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); + buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); + /* Mask in the root object ID too, to disambiguate subvols */ + buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32; + buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid; + + return 0; +} + +static struct file_system_type btrfs_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .get_sb = btrfs_get_sb, + .kill_sb = kill_anon_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +/* + * used by btrfsctl to scan devices when no FS is mounted + */ +static long btrfs_control_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct btrfs_ioctl_vol_args *vol; + struct btrfs_fs_devices *fs_devices; + int ret = 0; + int len; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol = kmalloc(sizeof(*vol), GFP_KERNEL); + if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { + ret = -EFAULT; + goto out; + } + len = strnlen(vol->name, BTRFS_PATH_NAME_MAX); + switch (cmd) { + case BTRFS_IOC_SCAN_DEV: + ret = btrfs_scan_one_device(vol->name, FMODE_READ, + &btrfs_fs_type, &fs_devices); + break; + } +out: + kfree(vol); + return ret; +} + +static void btrfs_write_super_lockfs(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + mutex_lock(&root->fs_info->cleaner_mutex); +} + +static void btrfs_unlockfs(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + mutex_unlock(&root->fs_info->cleaner_mutex); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); +} + +static struct super_operations btrfs_super_ops = { + .delete_inode = btrfs_delete_inode, + .put_super = btrfs_put_super, + .write_super = btrfs_write_super, + .sync_fs = btrfs_sync_fs, + .show_options = generic_show_options, + .write_inode = btrfs_write_inode, + .dirty_inode = btrfs_dirty_inode, + .alloc_inode = btrfs_alloc_inode, + .destroy_inode = btrfs_destroy_inode, + .statfs = btrfs_statfs, + .remount_fs = btrfs_remount, + .write_super_lockfs = btrfs_write_super_lockfs, + .unlockfs = btrfs_unlockfs, +}; + +static const struct file_operations btrfs_ctl_fops = { + .unlocked_ioctl = btrfs_control_ioctl, + .compat_ioctl = btrfs_control_ioctl, + .owner = THIS_MODULE, +}; + +static struct miscdevice btrfs_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "btrfs-control", + .fops = &btrfs_ctl_fops +}; + +static int btrfs_interface_init(void) +{ + return misc_register(&btrfs_misc); +} + +static void btrfs_interface_exit(void) +{ + if (misc_deregister(&btrfs_misc) < 0) + printk(KERN_INFO "misc_deregister failed for control device"); +} + +static int __init init_btrfs_fs(void) +{ + int err; + + err = btrfs_init_sysfs(); + if (err) + return err; + + err = btrfs_init_cachep(); + if (err) + goto free_sysfs; + + err = extent_io_init(); + if (err) + goto free_cachep; + + err = extent_map_init(); + if (err) + goto free_extent_io; + + err = btrfs_interface_init(); + if (err) + goto free_extent_map; + + err = register_filesystem(&btrfs_fs_type); + if (err) + goto unregister_ioctl; + + printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION); + return 0; + +unregister_ioctl: + btrfs_interface_exit(); +free_extent_map: + extent_map_exit(); +free_extent_io: + extent_io_exit(); +free_cachep: + btrfs_destroy_cachep(); +free_sysfs: + btrfs_exit_sysfs(); + return err; +} + +static void __exit exit_btrfs_fs(void) +{ + btrfs_destroy_cachep(); + extent_map_exit(); + extent_io_exit(); + btrfs_interface_exit(); + unregister_filesystem(&btrfs_fs_type); + btrfs_exit_sysfs(); + btrfs_cleanup_fs_uuids(); + btrfs_zlib_exit(); +} + +module_init(init_btrfs_fs) +module_exit(exit_btrfs_fs) + +MODULE_LICENSE("GPL"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c new file mode 100644 index 00000000000..a240b6fa81d --- /dev/null +++ b/fs/btrfs/sysfs.c @@ -0,0 +1,269 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/kobject.h> + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_root_used(&root->root_item)); +} + +static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_root_limit(&root->root_item)); +} + +static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf) +{ + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_super_bytes_used(&fs->super_copy)); +} + +static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_super_total_bytes(&fs->super_copy)); +} + +static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_super_sectorsize(&fs->super_copy)); +} + +/* this is for root attrs (subvols/snapshots) */ +struct btrfs_root_attr { + struct attribute attr; + ssize_t (*show)(struct btrfs_root *, char *); + ssize_t (*store)(struct btrfs_root *, const char *, size_t); +}; + +#define ROOT_ATTR(name, mode, show, store) \ +static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \ + show, store) + +ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL); +ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL); + +static struct attribute *btrfs_root_attrs[] = { + &btrfs_root_attr_blocks_used.attr, + &btrfs_root_attr_block_limit.attr, + NULL, +}; + +/* this is for super attrs (actual full fs) */ +struct btrfs_super_attr { + struct attribute attr; + ssize_t (*show)(struct btrfs_fs_info *, char *); + ssize_t (*store)(struct btrfs_fs_info *, const char *, size_t); +}; + +#define SUPER_ATTR(name, mode, show, store) \ +static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \ + show, store) + +SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL); +SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL); +SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL); + +static struct attribute *btrfs_super_attrs[] = { + &btrfs_super_attr_blocks_used.attr, + &btrfs_super_attr_total_blocks.attr, + &btrfs_super_attr_blocksize.attr, + NULL, +}; + +static ssize_t btrfs_super_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, + super_kobj); + struct btrfs_super_attr *a = container_of(attr, + struct btrfs_super_attr, + attr); + + return a->show ? a->show(fs, buf) : 0; +} + +static ssize_t btrfs_super_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, + super_kobj); + struct btrfs_super_attr *a = container_of(attr, + struct btrfs_super_attr, + attr); + + return a->store ? a->store(fs, buf, len) : 0; +} + +static ssize_t btrfs_root_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct btrfs_root *root = container_of(kobj, struct btrfs_root, + root_kobj); + struct btrfs_root_attr *a = container_of(attr, + struct btrfs_root_attr, + attr); + + return a->show ? a->show(root, buf) : 0; +} + +static ssize_t btrfs_root_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct btrfs_root *root = container_of(kobj, struct btrfs_root, + root_kobj); + struct btrfs_root_attr *a = container_of(attr, + struct btrfs_root_attr, + attr); + return a->store ? a->store(root, buf, len) : 0; +} + +static void btrfs_super_release(struct kobject *kobj) +{ + struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, + super_kobj); + complete(&fs->kobj_unregister); +} + +static void btrfs_root_release(struct kobject *kobj) +{ + struct btrfs_root *root = container_of(kobj, struct btrfs_root, + root_kobj); + complete(&root->kobj_unregister); +} + +static struct sysfs_ops btrfs_super_attr_ops = { + .show = btrfs_super_attr_show, + .store = btrfs_super_attr_store, +}; + +static struct sysfs_ops btrfs_root_attr_ops = { + .show = btrfs_root_attr_show, + .store = btrfs_root_attr_store, +}; + +static struct kobj_type btrfs_root_ktype = { + .default_attrs = btrfs_root_attrs, + .sysfs_ops = &btrfs_root_attr_ops, + .release = btrfs_root_release, +}; + +static struct kobj_type btrfs_super_ktype = { + .default_attrs = btrfs_super_attrs, + .sysfs_ops = &btrfs_super_attr_ops, + .release = btrfs_super_release, +}; + +/* /sys/fs/btrfs/ entry */ +static struct kset *btrfs_kset; + +int btrfs_sysfs_add_super(struct btrfs_fs_info *fs) +{ + int error; + char *name; + char c; + int len = strlen(fs->sb->s_id) + 1; + int i; + + name = kmalloc(len, GFP_NOFS); + if (!name) { + error = -ENOMEM; + goto fail; + } + + for (i = 0; i < len; i++) { + c = fs->sb->s_id[i]; + if (c == '/' || c == '\\') + c = '!'; + name[i] = c; + } + name[len] = '\0'; + + fs->super_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype, + NULL, "%s", name); + kfree(name); + if (error) + goto fail; + + return 0; + +fail: + printk(KERN_ERR "btrfs: sysfs creation for super failed\n"); + return error; +} + +int btrfs_sysfs_add_root(struct btrfs_root *root) +{ + int error; + + error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype, + &root->fs_info->super_kobj, + "%s", root->name); + if (error) + goto fail; + + return 0; + +fail: + printk(KERN_ERR "btrfs: sysfs creation for root failed\n"); + return error; +} + +void btrfs_sysfs_del_root(struct btrfs_root *root) +{ + kobject_put(&root->root_kobj); + wait_for_completion(&root->kobj_unregister); +} + +void btrfs_sysfs_del_super(struct btrfs_fs_info *fs) +{ + kobject_put(&fs->super_kobj); + wait_for_completion(&fs->kobj_unregister); +} + +int btrfs_init_sysfs(void) +{ + btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); + if (!btrfs_kset) + return -ENOMEM; + return 0; +} + +void btrfs_exit_sysfs(void) +{ + kset_unregister(btrfs_kset); +} + diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c new file mode 100644 index 00000000000..8a08f944334 --- /dev/null +++ b/fs/btrfs/transaction.c @@ -0,0 +1,1097 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/sched.h> +#include <linux/writeback.h> +#include <linux/pagemap.h> +#include <linux/blkdev.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "locking.h" +#include "ref-cache.h" +#include "tree-log.h" + +#define BTRFS_ROOT_TRANS_TAG 0 + +static noinline void put_transaction(struct btrfs_transaction *transaction) +{ + WARN_ON(transaction->use_count == 0); + transaction->use_count--; + if (transaction->use_count == 0) { + list_del_init(&transaction->list); + memset(transaction, 0, sizeof(*transaction)); + kmem_cache_free(btrfs_transaction_cachep, transaction); + } +} + +/* + * either allocate a new transaction or hop into the existing one + */ +static noinline int join_transaction(struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans; + cur_trans = root->fs_info->running_transaction; + if (!cur_trans) { + cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, + GFP_NOFS); + BUG_ON(!cur_trans); + root->fs_info->generation++; + root->fs_info->last_alloc = 0; + root->fs_info->last_data_alloc = 0; + cur_trans->num_writers = 1; + cur_trans->num_joined = 0; + cur_trans->transid = root->fs_info->generation; + init_waitqueue_head(&cur_trans->writer_wait); + init_waitqueue_head(&cur_trans->commit_wait); + cur_trans->in_commit = 0; + cur_trans->blocked = 0; + cur_trans->use_count = 1; + cur_trans->commit_done = 0; + cur_trans->start_time = get_seconds(); + INIT_LIST_HEAD(&cur_trans->pending_snapshots); + list_add_tail(&cur_trans->list, &root->fs_info->trans_list); + extent_io_tree_init(&cur_trans->dirty_pages, + root->fs_info->btree_inode->i_mapping, + GFP_NOFS); + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = cur_trans; + spin_unlock(&root->fs_info->new_trans_lock); + } else { + cur_trans->num_writers++; + cur_trans->num_joined++; + } + + return 0; +} + +/* + * this does all the record keeping required to make sure that a reference + * counted root is properly recorded in a given transaction. This is required + * to make sure the old root from before we joined the transaction is deleted + * when the transaction commits + */ +noinline int btrfs_record_root_in_trans(struct btrfs_root *root) +{ + struct btrfs_dirty_root *dirty; + u64 running_trans_id = root->fs_info->running_transaction->transid; + if (root->ref_cows && root->last_trans < running_trans_id) { + WARN_ON(root == root->fs_info->extent_root); + if (root->root_item.refs != 0) { + radix_tree_tag_set(&root->fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + + dirty = kmalloc(sizeof(*dirty), GFP_NOFS); + BUG_ON(!dirty); + dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS); + BUG_ON(!dirty->root); + dirty->latest_root = root; + INIT_LIST_HEAD(&dirty->list); + + root->commit_root = btrfs_root_node(root); + + memcpy(dirty->root, root, sizeof(*root)); + spin_lock_init(&dirty->root->node_lock); + spin_lock_init(&dirty->root->list_lock); + mutex_init(&dirty->root->objectid_mutex); + mutex_init(&dirty->root->log_mutex); + INIT_LIST_HEAD(&dirty->root->dead_list); + dirty->root->node = root->commit_root; + dirty->root->commit_root = NULL; + + spin_lock(&root->list_lock); + list_add(&dirty->root->dead_list, &root->dead_list); + spin_unlock(&root->list_lock); + + root->dirty_root = dirty; + } else { + WARN_ON(1); + } + root->last_trans = running_trans_id; + } + return 0; +} + +/* wait for commit against the current transaction to become unblocked + * when this is done, it is safe to start a new transaction, but the current + * transaction might not be fully on disk. + */ +static void wait_current_trans(struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans; + + cur_trans = root->fs_info->running_transaction; + if (cur_trans && cur_trans->blocked) { + DEFINE_WAIT(wait); + cur_trans->use_count++; + while (1) { + prepare_to_wait(&root->fs_info->transaction_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (cur_trans->blocked) { + mutex_unlock(&root->fs_info->trans_mutex); + schedule(); + mutex_lock(&root->fs_info->trans_mutex); + finish_wait(&root->fs_info->transaction_wait, + &wait); + } else { + finish_wait(&root->fs_info->transaction_wait, + &wait); + break; + } + } + put_transaction(cur_trans); + } +} + +static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, + int num_blocks, int wait) +{ + struct btrfs_trans_handle *h = + kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + int ret; + + mutex_lock(&root->fs_info->trans_mutex); + if (!root->fs_info->log_root_recovering && + ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2)) + wait_current_trans(root); + ret = join_transaction(root); + BUG_ON(ret); + + btrfs_record_root_in_trans(root); + h->transid = root->fs_info->running_transaction->transid; + h->transaction = root->fs_info->running_transaction; + h->blocks_reserved = num_blocks; + h->blocks_used = 0; + h->block_group = 0; + h->alloc_exclude_nr = 0; + h->alloc_exclude_start = 0; + root->fs_info->running_transaction->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + return h; +} + +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + int num_blocks) +{ + return start_transaction(root, num_blocks, 1); +} +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, + int num_blocks) +{ + return start_transaction(root, num_blocks, 0); +} + +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, + int num_blocks) +{ + return start_transaction(r, num_blocks, 2); +} + +/* wait for a transaction commit to be fully complete */ +static noinline int wait_for_commit(struct btrfs_root *root, + struct btrfs_transaction *commit) +{ + DEFINE_WAIT(wait); + mutex_lock(&root->fs_info->trans_mutex); + while (!commit->commit_done) { + prepare_to_wait(&commit->commit_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (commit->commit_done) + break; + mutex_unlock(&root->fs_info->trans_mutex); + schedule(); + mutex_lock(&root->fs_info->trans_mutex); + } + mutex_unlock(&root->fs_info->trans_mutex); + finish_wait(&commit->commit_wait, &wait); + return 0; +} + +/* + * rate limit against the drop_snapshot code. This helps to slow down new + * operations if the drop_snapshot code isn't able to keep up. + */ +static void throttle_on_drops(struct btrfs_root *root) +{ + struct btrfs_fs_info *info = root->fs_info; + int harder_count = 0; + +harder: + if (atomic_read(&info->throttles)) { + DEFINE_WAIT(wait); + int thr; + thr = atomic_read(&info->throttle_gen); + + do { + prepare_to_wait(&info->transaction_throttle, + &wait, TASK_UNINTERRUPTIBLE); + if (!atomic_read(&info->throttles)) { + finish_wait(&info->transaction_throttle, &wait); + break; + } + schedule(); + finish_wait(&info->transaction_throttle, &wait); + } while (thr == atomic_read(&info->throttle_gen)); + harder_count++; + + if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 && + harder_count < 2) + goto harder; + + if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 && + harder_count < 10) + goto harder; + + if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 && + harder_count < 20) + goto harder; + } +} + +void btrfs_throttle(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->trans_mutex); + if (!root->fs_info->open_ioctl_trans) + wait_current_trans(root); + mutex_unlock(&root->fs_info->trans_mutex); + + throttle_on_drops(root); +} + +static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int throttle) +{ + struct btrfs_transaction *cur_trans; + struct btrfs_fs_info *info = root->fs_info; + + mutex_lock(&info->trans_mutex); + cur_trans = info->running_transaction; + WARN_ON(cur_trans != trans->transaction); + WARN_ON(cur_trans->num_writers < 1); + cur_trans->num_writers--; + + if (waitqueue_active(&cur_trans->writer_wait)) + wake_up(&cur_trans->writer_wait); + put_transaction(cur_trans); + mutex_unlock(&info->trans_mutex); + memset(trans, 0, sizeof(*trans)); + kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (throttle) + throttle_on_drops(root); + + return 0; +} + +int btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_end_transaction(trans, root, 0); +} + +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_end_transaction(trans, root, 1); +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. This is used to make sure all of + * those extents are on disk for transaction or log commit + */ +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) +{ + int ret; + int err = 0; + int werr = 0; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + u64 start = 0; + u64 end; + unsigned long index; + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + while (start <= end) { + cond_resched(); + + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + + btree_lock_page_hook(page); + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + continue; + } + + if (PageWriteback(page)) { + if (PageDirty(page)) + wait_on_page_writeback(page); + else { + unlock_page(page); + page_cache_release(page); + continue; + } + } + err = write_one_page(page, 0); + if (err) + werr = err; + page_cache_release(page); + } + } + while (1) { + ret = find_first_extent_bit(dirty_pages, 0, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + + clear_extent_dirty(dirty_pages, start, end, GFP_NOFS); + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + if (PageDirty(page)) { + btree_lock_page_hook(page); + wait_on_page_writeback(page); + err = write_one_page(page, 0); + if (err) + werr = err; + } + wait_on_page_writeback(page); + page_cache_release(page); + cond_resched(); + } + } + if (err) + werr = err; + return werr; +} + +int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!trans || !trans->transaction) { + struct inode *btree_inode; + btree_inode = root->fs_info->btree_inode; + return filemap_write_and_wait(btree_inode->i_mapping); + } + return btrfs_write_and_wait_marked_extents(root, + &trans->transaction->dirty_pages); +} + +/* + * this is used to update the root pointer in the tree of tree roots. + * + * But, in the case of the extent allocation tree, updating the root + * pointer may allocate blocks which may change the root of the extent + * allocation tree. + * + * So, this loops and repeats and makes sure the cowonly root didn't + * change while the root pointer was being updated in the metadata. + */ +static int update_cowonly_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + u64 old_root_bytenr; + struct btrfs_root *tree_root = root->fs_info->tree_root; + + btrfs_extent_post_op(trans, root); + btrfs_write_dirty_block_groups(trans, root); + btrfs_extent_post_op(trans, root); + + while (1) { + old_root_bytenr = btrfs_root_bytenr(&root->root_item); + if (old_root_bytenr == root->node->start) + break; + btrfs_set_root_bytenr(&root->root_item, + root->node->start); + btrfs_set_root_level(&root->root_item, + btrfs_header_level(root->node)); + btrfs_set_root_generation(&root->root_item, trans->transid); + + btrfs_extent_post_op(trans, root); + + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + &root->root_item); + BUG_ON(ret); + btrfs_write_dirty_block_groups(trans, root); + btrfs_extent_post_op(trans, root); + } + return 0; +} + +/* + * update all the cowonly tree roots on disk + */ +int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head *next; + struct extent_buffer *eb; + + btrfs_extent_post_op(trans, fs_info->tree_root); + + eb = btrfs_lock_root_node(fs_info->tree_root); + btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + btrfs_extent_post_op(trans, fs_info->tree_root); + + while (!list_empty(&fs_info->dirty_cowonly_roots)) { + next = fs_info->dirty_cowonly_roots.next; + list_del_init(next); + root = list_entry(next, struct btrfs_root, dirty_list); + + update_cowonly_root(trans, root); + } + return 0; +} + +/* + * dead roots are old snapshots that need to be deleted. This allocates + * a dirty root struct and adds it into the list of dead roots that need to + * be deleted + */ +int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) +{ + struct btrfs_dirty_root *dirty; + + dirty = kmalloc(sizeof(*dirty), GFP_NOFS); + if (!dirty) + return -ENOMEM; + dirty->root = root; + dirty->latest_root = latest; + + mutex_lock(&root->fs_info->trans_mutex); + list_add(&dirty->list, &latest->fs_info->dead_roots); + mutex_unlock(&root->fs_info->trans_mutex); + return 0; +} + +/* + * at transaction commit time we need to schedule the old roots for + * deletion via btrfs_drop_snapshot. This runs through all the + * reference counted roots that were modified in the current + * transaction and puts them into the drop list + */ +static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, + struct radix_tree_root *radix, + struct list_head *list) +{ + struct btrfs_dirty_root *dirty; + struct btrfs_root *gang[8]; + struct btrfs_root *root; + int i; + int ret; + int err = 0; + u32 refs; + + while (1) { + ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0, + ARRAY_SIZE(gang), + BTRFS_ROOT_TRANS_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + root = gang[i]; + radix_tree_tag_clear(radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + + BUG_ON(!root->ref_tree); + dirty = root->dirty_root; + + btrfs_free_log(trans, root); + btrfs_free_reloc_root(trans, root); + + if (root->commit_root == root->node) { + WARN_ON(root->node->start != + btrfs_root_bytenr(&root->root_item)); + + free_extent_buffer(root->commit_root); + root->commit_root = NULL; + root->dirty_root = NULL; + + spin_lock(&root->list_lock); + list_del_init(&dirty->root->dead_list); + spin_unlock(&root->list_lock); + + kfree(dirty->root); + kfree(dirty); + + /* make sure to update the root on disk + * so we get any updates to the block used + * counts + */ + err = btrfs_update_root(trans, + root->fs_info->tree_root, + &root->root_key, + &root->root_item); + continue; + } + + memset(&root->root_item.drop_progress, 0, + sizeof(struct btrfs_disk_key)); + root->root_item.drop_level = 0; + root->commit_root = NULL; + root->dirty_root = NULL; + root->root_key.offset = root->fs_info->generation; + btrfs_set_root_bytenr(&root->root_item, + root->node->start); + btrfs_set_root_level(&root->root_item, + btrfs_header_level(root->node)); + btrfs_set_root_generation(&root->root_item, + root->root_key.offset); + + err = btrfs_insert_root(trans, root->fs_info->tree_root, + &root->root_key, + &root->root_item); + if (err) + break; + + refs = btrfs_root_refs(&dirty->root->root_item); + btrfs_set_root_refs(&dirty->root->root_item, refs - 1); + err = btrfs_update_root(trans, root->fs_info->tree_root, + &dirty->root->root_key, + &dirty->root->root_item); + + BUG_ON(err); + if (refs == 1) { + list_add(&dirty->list, list); + } else { + WARN_ON(1); + free_extent_buffer(dirty->root->node); + kfree(dirty->root); + kfree(dirty); + } + } + } + return err; +} + +/* + * defrag a given btree. If cacheonly == 1, this won't read from the disk, + * otherwise every leaf in the btree is read and defragged. + */ +int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) +{ + struct btrfs_fs_info *info = root->fs_info; + int ret; + struct btrfs_trans_handle *trans; + unsigned long nr; + + smp_mb(); + if (root->defrag_running) + return 0; + trans = btrfs_start_transaction(root, 1); + while (1) { + root->defrag_running = 1; + ret = btrfs_defrag_leaves(trans, root, cacheonly); + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(info->tree_root, nr); + cond_resched(); + + trans = btrfs_start_transaction(root, 1); + if (root->fs_info->closing || ret != -EAGAIN) + break; + } + root->defrag_running = 0; + smp_mb(); + btrfs_end_transaction(trans, root); + return 0; +} + +/* + * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on + * all of them + */ +static noinline int drop_dirty_roots(struct btrfs_root *tree_root, + struct list_head *list) +{ + struct btrfs_dirty_root *dirty; + struct btrfs_trans_handle *trans; + unsigned long nr; + u64 num_bytes; + u64 bytes_used; + u64 max_useless; + int ret = 0; + int err; + + while (!list_empty(list)) { + struct btrfs_root *root; + + dirty = list_entry(list->prev, struct btrfs_dirty_root, list); + list_del_init(&dirty->list); + + num_bytes = btrfs_root_used(&dirty->root->root_item); + root = dirty->latest_root; + atomic_inc(&root->fs_info->throttles); + + while (1) { + trans = btrfs_start_transaction(tree_root, 1); + mutex_lock(&root->fs_info->drop_mutex); + ret = btrfs_drop_snapshot(trans, dirty->root); + if (ret != -EAGAIN) + break; + mutex_unlock(&root->fs_info->drop_mutex); + + err = btrfs_update_root(trans, + tree_root, + &dirty->root->root_key, + &dirty->root->root_item); + if (err) + ret = err; + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, tree_root); + BUG_ON(ret); + + btrfs_btree_balance_dirty(tree_root, nr); + cond_resched(); + } + BUG_ON(ret); + atomic_dec(&root->fs_info->throttles); + wake_up(&root->fs_info->transaction_throttle); + + num_bytes -= btrfs_root_used(&dirty->root->root_item); + bytes_used = btrfs_root_used(&root->root_item); + if (num_bytes) { + btrfs_record_root_in_trans(root); + btrfs_set_root_used(&root->root_item, + bytes_used - num_bytes); + } + + ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key); + if (ret) { + BUG(); + break; + } + mutex_unlock(&root->fs_info->drop_mutex); + + spin_lock(&root->list_lock); + list_del_init(&dirty->root->dead_list); + if (!list_empty(&root->dead_list)) { + struct btrfs_root *oldest; + oldest = list_entry(root->dead_list.prev, + struct btrfs_root, dead_list); + max_useless = oldest->root_key.offset - 1; + } else { + max_useless = root->root_key.offset - 1; + } + spin_unlock(&root->list_lock); + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, tree_root); + BUG_ON(ret); + + ret = btrfs_remove_leaf_refs(root, max_useless, 0); + BUG_ON(ret); + + free_extent_buffer(dirty->root->node); + kfree(dirty->root); + kfree(dirty); + + btrfs_btree_balance_dirty(tree_root, nr); + cond_resched(); + } + return ret; +} + +/* + * new snapshots need to be created at a very specific time in the + * transaction commit. This does the actual creation + */ +static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_key key; + struct btrfs_root_item *new_root_item; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *root = pending->root; + struct extent_buffer *tmp; + struct extent_buffer *old; + int ret; + u64 objectid; + + new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); + if (!new_root_item) { + ret = -ENOMEM; + goto fail; + } + ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); + if (ret) + goto fail; + + btrfs_record_root_in_trans(root); + btrfs_set_root_last_snapshot(&root->root_item, trans->transid); + memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + + key.objectid = objectid; + key.offset = trans->transid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + + old = btrfs_lock_root_node(root); + btrfs_cow_block(trans, root, old, NULL, 0, &old, 0); + + btrfs_copy_root(trans, root, old, &tmp, objectid); + btrfs_tree_unlock(old); + free_extent_buffer(old); + + btrfs_set_root_bytenr(new_root_item, tmp->start); + btrfs_set_root_level(new_root_item, btrfs_header_level(tmp)); + btrfs_set_root_generation(new_root_item, trans->transid); + ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, + new_root_item); + btrfs_tree_unlock(tmp); + free_extent_buffer(tmp); + if (ret) + goto fail; + + key.offset = (u64)-1; + memcpy(&pending->root_key, &key, sizeof(key)); +fail: + kfree(new_root_item); + return ret; +} + +static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info, + struct btrfs_pending_snapshot *pending) +{ + int ret; + int namelen; + u64 index = 0; + struct btrfs_trans_handle *trans; + struct inode *parent_inode; + struct inode *inode; + struct btrfs_root *parent_root; + + parent_inode = pending->dentry->d_parent->d_inode; + parent_root = BTRFS_I(parent_inode)->root; + trans = btrfs_join_transaction(parent_root, 1); + + /* + * insert the directory item + */ + namelen = strlen(pending->name); + ret = btrfs_set_inode_index(parent_inode, &index); + ret = btrfs_insert_dir_item(trans, parent_root, + pending->name, namelen, + parent_inode->i_ino, + &pending->root_key, BTRFS_FT_DIR, index); + + if (ret) + goto fail; + + btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); + ret = btrfs_update_inode(trans, parent_root, parent_inode); + BUG_ON(ret); + + /* add the backref first */ + ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, + pending->root_key.objectid, + BTRFS_ROOT_BACKREF_KEY, + parent_root->root_key.objectid, + parent_inode->i_ino, index, pending->name, + namelen); + + BUG_ON(ret); + + /* now add the forward ref */ + ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, + parent_root->root_key.objectid, + BTRFS_ROOT_REF_KEY, + pending->root_key.objectid, + parent_inode->i_ino, index, pending->name, + namelen); + + inode = btrfs_lookup_dentry(parent_inode, pending->dentry); + d_instantiate(pending->dentry, inode); +fail: + btrfs_end_transaction(trans, fs_info->fs_root); + return ret; +} + +/* + * create all the snapshots we've scheduled for creation + */ +static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_pending_snapshot *pending; + struct list_head *head = &trans->transaction->pending_snapshots; + struct list_head *cur; + int ret; + + list_for_each(cur, head) { + pending = list_entry(cur, struct btrfs_pending_snapshot, list); + ret = create_pending_snapshot(trans, fs_info, pending); + BUG_ON(ret); + } + return 0; +} + +static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_pending_snapshot *pending; + struct list_head *head = &trans->transaction->pending_snapshots; + int ret; + + while (!list_empty(head)) { + pending = list_entry(head->next, + struct btrfs_pending_snapshot, list); + ret = finish_pending_snapshot(fs_info, pending); + BUG_ON(ret); + list_del(&pending->list); + kfree(pending->name); + kfree(pending); + } + return 0; +} + +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + unsigned long joined = 0; + unsigned long timeout = 1; + struct btrfs_transaction *cur_trans; + struct btrfs_transaction *prev_trans = NULL; + struct btrfs_root *chunk_root = root->fs_info->chunk_root; + struct list_head dirty_fs_roots; + struct extent_io_tree *pinned_copy; + DEFINE_WAIT(wait); + int ret; + + INIT_LIST_HEAD(&dirty_fs_roots); + mutex_lock(&root->fs_info->trans_mutex); + if (trans->transaction->in_commit) { + cur_trans = trans->transaction; + trans->transaction->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + btrfs_end_transaction(trans, root); + + ret = wait_for_commit(root, cur_trans); + BUG_ON(ret); + + mutex_lock(&root->fs_info->trans_mutex); + put_transaction(cur_trans); + mutex_unlock(&root->fs_info->trans_mutex); + + return 0; + } + + pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS); + if (!pinned_copy) + return -ENOMEM; + + extent_io_tree_init(pinned_copy, + root->fs_info->btree_inode->i_mapping, GFP_NOFS); + + trans->transaction->in_commit = 1; + trans->transaction->blocked = 1; + cur_trans = trans->transaction; + if (cur_trans->list.prev != &root->fs_info->trans_list) { + prev_trans = list_entry(cur_trans->list.prev, + struct btrfs_transaction, list); + if (!prev_trans->commit_done) { + prev_trans->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + + wait_for_commit(root, prev_trans); + + mutex_lock(&root->fs_info->trans_mutex); + put_transaction(prev_trans); + } + } + + do { + int snap_pending = 0; + joined = cur_trans->num_joined; + if (!list_empty(&trans->transaction->pending_snapshots)) + snap_pending = 1; + + WARN_ON(cur_trans != trans->transaction); + prepare_to_wait(&cur_trans->writer_wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (cur_trans->num_writers > 1) + timeout = MAX_SCHEDULE_TIMEOUT; + else + timeout = 1; + + mutex_unlock(&root->fs_info->trans_mutex); + + if (snap_pending) { + ret = btrfs_wait_ordered_extents(root, 1); + BUG_ON(ret); + } + + schedule_timeout(timeout); + + mutex_lock(&root->fs_info->trans_mutex); + finish_wait(&cur_trans->writer_wait, &wait); + } while (cur_trans->num_writers > 1 || + (cur_trans->num_joined != joined)); + + ret = create_pending_snapshots(trans, root->fs_info); + BUG_ON(ret); + + WARN_ON(cur_trans != trans->transaction); + + /* btrfs_commit_tree_roots is responsible for getting the + * various roots consistent with each other. Every pointer + * in the tree of tree roots has to point to the most up to date + * root for every subvolume and other tree. So, we have to keep + * the tree logging code from jumping in and changing any + * of the trees. + * + * At this point in the commit, there can't be any tree-log + * writers, but a little lower down we drop the trans mutex + * and let new people in. By holding the tree_log_mutex + * from now until after the super is written, we avoid races + * with the tree-log code. + */ + mutex_lock(&root->fs_info->tree_log_mutex); + /* + * keep tree reloc code from adding new reloc trees + */ + mutex_lock(&root->fs_info->tree_reloc_mutex); + + + ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix, + &dirty_fs_roots); + BUG_ON(ret); + + /* add_dirty_roots gets rid of all the tree log roots, it is now + * safe to free the root of tree log roots + */ + btrfs_free_log_root_tree(trans, root->fs_info); + + ret = btrfs_commit_tree_roots(trans, root); + BUG_ON(ret); + + cur_trans = root->fs_info->running_transaction; + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->new_trans_lock); + btrfs_set_super_generation(&root->fs_info->super_copy, + cur_trans->transid); + btrfs_set_super_root(&root->fs_info->super_copy, + root->fs_info->tree_root->node->start); + btrfs_set_super_root_level(&root->fs_info->super_copy, + btrfs_header_level(root->fs_info->tree_root->node)); + + btrfs_set_super_chunk_root(&root->fs_info->super_copy, + chunk_root->node->start); + btrfs_set_super_chunk_root_level(&root->fs_info->super_copy, + btrfs_header_level(chunk_root->node)); + btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy, + btrfs_header_generation(chunk_root->node)); + + if (!root->fs_info->log_root_recovering) { + btrfs_set_super_log_root(&root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); + } + + memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, + sizeof(root->fs_info->super_copy)); + + btrfs_copy_pinned(root, pinned_copy); + + trans->transaction->blocked = 0; + wake_up(&root->fs_info->transaction_throttle); + wake_up(&root->fs_info->transaction_wait); + + mutex_unlock(&root->fs_info->trans_mutex); + ret = btrfs_write_and_wait_transaction(trans, root); + BUG_ON(ret); + write_ctree_super(trans, root, 0); + + /* + * the super is written, we can safely allow the tree-loggers + * to go about their business + */ + mutex_unlock(&root->fs_info->tree_log_mutex); + + btrfs_finish_extent_commit(trans, root, pinned_copy); + kfree(pinned_copy); + + btrfs_drop_dead_reloc_roots(root); + mutex_unlock(&root->fs_info->tree_reloc_mutex); + + /* do the directory inserts of any pending snapshot creations */ + finish_pending_snapshots(trans, root->fs_info); + + mutex_lock(&root->fs_info->trans_mutex); + + cur_trans->commit_done = 1; + root->fs_info->last_trans_committed = cur_trans->transid; + wake_up(&cur_trans->commit_wait); + + put_transaction(cur_trans); + put_transaction(cur_trans); + + list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots); + if (root->fs_info->closing) + list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots); + + mutex_unlock(&root->fs_info->trans_mutex); + + kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (root->fs_info->closing) + drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots); + return ret; +} + +/* + * interface function to delete all the snapshots we have scheduled for deletion + */ +int btrfs_clean_old_snapshots(struct btrfs_root *root) +{ + struct list_head dirty_roots; + INIT_LIST_HEAD(&dirty_roots); +again: + mutex_lock(&root->fs_info->trans_mutex); + list_splice_init(&root->fs_info->dead_roots, &dirty_roots); + mutex_unlock(&root->fs_info->trans_mutex); + + if (!list_empty(&dirty_roots)) { + drop_dirty_roots(root, &dirty_roots); + goto again; + } + return 0; +} diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h new file mode 100644 index 00000000000..ea292117f88 --- /dev/null +++ b/fs/btrfs/transaction.h @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_TRANSACTION__ +#define __BTRFS_TRANSACTION__ +#include "btrfs_inode.h" + +struct btrfs_transaction { + u64 transid; + unsigned long num_writers; + unsigned long num_joined; + int in_commit; + int use_count; + int commit_done; + int blocked; + struct list_head list; + struct extent_io_tree dirty_pages; + unsigned long start_time; + wait_queue_head_t writer_wait; + wait_queue_head_t commit_wait; + struct list_head pending_snapshots; +}; + +struct btrfs_trans_handle { + u64 transid; + unsigned long blocks_reserved; + unsigned long blocks_used; + struct btrfs_transaction *transaction; + u64 block_group; + u64 alloc_exclude_start; + u64 alloc_exclude_nr; +}; + +struct btrfs_pending_snapshot { + struct dentry *dentry; + struct btrfs_root *root; + char *name; + struct btrfs_key root_key; + struct list_head list; +}; + +struct btrfs_dirty_root { + struct list_head list; + struct btrfs_root *root; + struct btrfs_root *latest_root; +}; + +static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + trans->block_group = BTRFS_I(inode)->block_group; +} + +static inline void btrfs_update_inode_block_group( + struct btrfs_trans_handle *trans, + struct inode *inode) +{ + BTRFS_I(inode)->block_group = trans->block_group; +} + +static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + BTRFS_I(inode)->last_trans = trans->transaction->transid; +} + +int btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + int num_blocks); +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, + int num_blocks); +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, + int num_blocks); +int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + +int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest); +int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); +int btrfs_clean_old_snapshots(struct btrfs_root *root); +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +void btrfs_throttle(struct btrfs_root *root); +int btrfs_record_root_in_trans(struct btrfs_root *root); +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); +#endif diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c new file mode 100644 index 00000000000..3e8358c3616 --- /dev/null +++ b/fs/btrfs/tree-defrag.c @@ -0,0 +1,147 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "locking.h" + +/* defrag all the leaves in a given btree. If cache_only == 1, don't read + * things from disk, otherwise read all the leaves and try to get key order to + * better reflect disk order + */ + +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int cache_only) +{ + struct btrfs_path *path = NULL; + struct btrfs_key key; + int ret = 0; + int wret; + int level; + int orig_level; + int is_extent = 0; + int next_key_ret = 0; + u64 last_ret = 0; + u64 min_trans = 0; + + if (cache_only) + goto out; + + if (root->fs_info->extent_root == root) { + /* + * there's recursion here right now in the tree locking, + * we can't defrag the extent root without deadlock + */ + goto out; + } + + if (root->ref_cows == 0 && !is_extent) + goto out; + + if (btrfs_test_opt(root, SSD)) + goto out; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + level = btrfs_header_level(root->node); + orig_level = level; + + if (level == 0) + goto out; + + if (root->defrag_progress.objectid == 0) { + struct extent_buffer *root_node; + u32 nritems; + + root_node = btrfs_lock_root_node(root); + nritems = btrfs_header_nritems(root_node); + root->defrag_max.objectid = 0; + /* from above we know this is not a leaf */ + btrfs_node_key_to_cpu(root_node, &root->defrag_max, + nritems - 1); + btrfs_tree_unlock(root_node); + free_extent_buffer(root_node); + memset(&key, 0, sizeof(key)); + } else { + memcpy(&key, &root->defrag_progress, sizeof(key)); + } + + path->keep_locks = 1; + if (cache_only) + min_trans = root->defrag_trans_start; + + ret = btrfs_search_forward(root, &key, NULL, path, + cache_only, min_trans); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + goto out; + } + btrfs_release_path(root, path); + wret = btrfs_search_slot(trans, root, &key, path, 0, 1); + + if (wret < 0) { + ret = wret; + goto out; + } + if (!path->nodes[1]) { + ret = 0; + goto out; + } + path->slots[1] = btrfs_header_nritems(path->nodes[1]); + next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, + min_trans); + ret = btrfs_realloc_node(trans, root, + path->nodes[1], 0, + cache_only, &last_ret, + &root->defrag_progress); + WARN_ON(ret && ret != -EAGAIN); + if (next_key_ret == 0) { + memcpy(&root->defrag_progress, &key, sizeof(key)); + ret = -EAGAIN; + } + + btrfs_release_path(root, path); + if (is_extent) + btrfs_extent_post_op(trans, root); +out: + if (path) + btrfs_free_path(path); + if (ret == -EAGAIN) { + if (root->defrag_max.objectid > root->defrag_progress.objectid) + goto done; + if (root->defrag_max.type > root->defrag_progress.type) + goto done; + if (root->defrag_max.offset > root->defrag_progress.offset) + goto done; + ret = 0; + } +done: + if (ret != -EAGAIN) { + memset(&root->defrag_progress, 0, + sizeof(root->defrag_progress)); + root->defrag_trans_start = trans->transid; + } + return ret; +} diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c new file mode 100644 index 00000000000..d81cda2e077 --- /dev/null +++ b/fs/btrfs/tree-log.c @@ -0,0 +1,2898 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "locking.h" +#include "print-tree.h" +#include "compat.h" +#include "tree-log.h" + +/* magic values for the inode_only field in btrfs_log_inode: + * + * LOG_INODE_ALL means to log everything + * LOG_INODE_EXISTS means to log just enough to recreate the inode + * during log replay + */ +#define LOG_INODE_ALL 0 +#define LOG_INODE_EXISTS 1 + +/* + * stages for the tree walking. The first + * stage (0) is to only pin down the blocks we find + * the second stage (1) is to make sure that all the inodes + * we find in the log are created in the subvolume. + * + * The last stage is to deal with directories and links and extents + * and all the other fun semantics + */ +#define LOG_WALK_PIN_ONLY 0 +#define LOG_WALK_REPLAY_INODES 1 +#define LOG_WALK_REPLAY_ALL 2 + +static int __btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only); +static int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); + +/* + * tree logging is a special write ahead log used to make sure that + * fsyncs and O_SYNCs can happen without doing full tree commits. + * + * Full tree commits are expensive because they require commonly + * modified blocks to be recowed, creating many dirty pages in the + * extent tree an 4x-6x higher write load than ext3. + * + * Instead of doing a tree commit on every fsync, we use the + * key ranges and transaction ids to find items for a given file or directory + * that have changed in this transaction. Those items are copied into + * a special tree (one per subvolume root), that tree is written to disk + * and then the fsync is considered complete. + * + * After a crash, items are copied out of the log-tree back into the + * subvolume tree. Any file data extents found are recorded in the extent + * allocation tree, and the log-tree freed. + * + * The log tree is read three times, once to pin down all the extents it is + * using in ram and once, once to create all the inodes logged in the tree + * and once to do all the other items. + */ + +/* + * btrfs_add_log_tree adds a new per-subvolume log tree into the + * tree of log tree roots. This must be called with a tree log transaction + * running (see start_log_trans). + */ +static int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_key key; + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + struct btrfs_root *new_root = root; + int ret; + u64 objectid = root->root_key.objectid; + + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + BTRFS_TREE_LOG_OBJECTID, + trans->transid, 0, 0, 0); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + return ret; + } + + btrfs_set_header_nritems(leaf, 0); + btrfs_set_header_level(leaf, 0); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); + + write_extent_buffer(leaf, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(leaf), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + inode_item = &root_item.inode; + memset(inode_item, 0, sizeof(*inode_item)); + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nbytes = cpu_to_le64(root->leafsize); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + btrfs_set_root_bytenr(&root_item, leaf->start); + btrfs_set_root_generation(&root_item, trans->transid); + btrfs_set_root_level(&root_item, 0); + btrfs_set_root_refs(&root_item, 0); + btrfs_set_root_used(&root_item, 0); + + memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); + root_item.drop_level = 0; + + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + leaf = NULL; + + btrfs_set_root_dirid(&root_item, 0); + + key.objectid = BTRFS_TREE_LOG_OBJECTID; + key.offset = objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key, + &root_item); + if (ret) + goto fail; + + new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree, + &key); + BUG_ON(!new_root); + + WARN_ON(root->log_root); + root->log_root = new_root; + + /* + * log trees do not get reference counted because they go away + * before a real commit is actually done. They do store pointers + * to file data extents, and those reference counts still get + * updated (along with back refs to the log tree). + */ + new_root->ref_cows = 0; + new_root->last_trans = trans->transid; + + /* + * we need to make sure the root block for this new tree + * is marked as dirty in the dirty_log_pages tree. This + * is how it gets flushed down to disk at tree log commit time. + * + * the tree logging mutex keeps others from coming in and changing + * the new_root->node, so we can safely access it here + */ + set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start, + new_root->node->start + new_root->node->len - 1, + GFP_NOFS); + +fail: + return ret; +} + +/* + * start a sub transaction and setup the log tree + * this increments the log tree writer count to make the people + * syncing the tree wait for us to finish + */ +static int start_log_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + mutex_lock(&root->fs_info->tree_log_mutex); + if (!root->fs_info->log_root_tree) { + ret = btrfs_init_log_root_tree(trans, root->fs_info); + BUG_ON(ret); + } + if (!root->log_root) { + ret = btrfs_add_log_tree(trans, root); + BUG_ON(ret); + } + atomic_inc(&root->fs_info->tree_log_writers); + root->fs_info->tree_log_batch++; + mutex_unlock(&root->fs_info->tree_log_mutex); + return 0; +} + +/* + * returns 0 if there was a log transaction running and we were able + * to join, or returns -ENOENT if there were not transactions + * in progress + */ +static int join_running_log_trans(struct btrfs_root *root) +{ + int ret = -ENOENT; + + smp_mb(); + if (!root->log_root) + return -ENOENT; + + mutex_lock(&root->fs_info->tree_log_mutex); + if (root->log_root) { + ret = 0; + atomic_inc(&root->fs_info->tree_log_writers); + root->fs_info->tree_log_batch++; + } + mutex_unlock(&root->fs_info->tree_log_mutex); + return ret; +} + +/* + * indicate we're done making changes to the log tree + * and wake up anyone waiting to do a sync + */ +static int end_log_trans(struct btrfs_root *root) +{ + atomic_dec(&root->fs_info->tree_log_writers); + smp_mb(); + if (waitqueue_active(&root->fs_info->tree_log_wait)) + wake_up(&root->fs_info->tree_log_wait); + return 0; +} + + +/* + * the walk control struct is used to pass state down the chain when + * processing the log tree. The stage field tells us which part + * of the log tree processing we are currently doing. The others + * are state fields used for that specific part + */ +struct walk_control { + /* should we free the extent on disk when done? This is used + * at transaction commit time while freeing a log tree + */ + int free; + + /* should we write out the extent buffer? This is used + * while flushing the log tree to disk during a sync + */ + int write; + + /* should we wait for the extent buffer io to finish? Also used + * while flushing the log tree to disk for a sync + */ + int wait; + + /* pin only walk, we record which extents on disk belong to the + * log trees + */ + int pin; + + /* what stage of the replay code we're currently in */ + int stage; + + /* the root we are currently replaying */ + struct btrfs_root *replay_dest; + + /* the trans handle for the current replay */ + struct btrfs_trans_handle *trans; + + /* the function that gets used to process blocks we find in the + * tree. Note the extent_buffer might not be up to date when it is + * passed in, and it must be checked or read if you need the data + * inside it + */ + int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen); +}; + +/* + * process_func used to pin down extents, write them or wait on them + */ +static int process_one_buffer(struct btrfs_root *log, + struct extent_buffer *eb, + struct walk_control *wc, u64 gen) +{ + if (wc->pin) { + mutex_lock(&log->fs_info->pinned_mutex); + btrfs_update_pinned_extents(log->fs_info->extent_root, + eb->start, eb->len, 1); + mutex_unlock(&log->fs_info->pinned_mutex); + } + + if (btrfs_buffer_uptodate(eb, gen)) { + if (wc->write) + btrfs_write_tree_block(eb); + if (wc->wait) + btrfs_wait_tree_block_writeback(eb); + } + return 0; +} + +/* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. + */ +static noinline int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size; + u64 saved_i_size = 0; + int save_old_i_size = 0; + unsigned long src_ptr; + unsigned long dst_ptr; + int overwrite_root = 0; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + overwrite_root = 1; + + item_size = btrfs_item_size_nr(eb, slot); + src_ptr = btrfs_item_ptr_offset(eb, slot); + + /* look for the key in the destination tree */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret == 0) { + char *src_copy; + char *dst_copy; + u32 dst_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (dst_size != item_size) + goto insert; + + if (item_size == 0) { + btrfs_release_path(root, path); + return 0; + } + dst_copy = kmalloc(item_size, GFP_NOFS); + src_copy = kmalloc(item_size, GFP_NOFS); + + read_extent_buffer(eb, src_copy, src_ptr, item_size); + + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, + item_size); + ret = memcmp(dst_copy, src_copy, item_size); + + kfree(dst_copy); + kfree(src_copy); + /* + * they have the same contents, just return, this saves + * us from cowing blocks in the destination tree and doing + * extra writes that may not have been done by a previous + * sync + */ + if (ret == 0) { + btrfs_release_path(root, path); + return 0; + } + + } +insert: + btrfs_release_path(root, path); + /* try to insert the key into the destination tree */ + ret = btrfs_insert_empty_item(trans, root, path, + key, item_size); + + /* make sure any existing item is the correct size */ + if (ret == -EEXIST) { + u32 found_size; + found_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (found_size > item_size) { + btrfs_truncate_item(trans, root, path, item_size, 1); + } else if (found_size < item_size) { + ret = btrfs_extend_item(trans, root, path, + item_size - found_size); + BUG_ON(ret); + } + } else if (ret) { + BUG(); + } + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + + /* don't overwrite an existing inode if the generation number + * was logged as zero. This is done when the tree logging code + * is just logging an inode to make sure it exists after recovery. + * + * Also, don't overwrite i_size on directories during replay. + * log replay inserts and removes directory items based on the + * state of the tree found in the subvolume, and i_size is modified + * as it goes + */ + if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { + struct btrfs_inode_item *src_item; + struct btrfs_inode_item *dst_item; + + src_item = (struct btrfs_inode_item *)src_ptr; + dst_item = (struct btrfs_inode_item *)dst_ptr; + + if (btrfs_inode_generation(eb, src_item) == 0) + goto no_copy; + + if (overwrite_root && + S_ISDIR(btrfs_inode_mode(eb, src_item)) && + S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { + save_old_i_size = 1; + saved_i_size = btrfs_inode_size(path->nodes[0], + dst_item); + } + } + + copy_extent_buffer(path->nodes[0], eb, dst_ptr, + src_ptr, item_size); + + if (save_old_i_size) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); + } + + /* make sure the generation is filled in */ + if (key->type == BTRFS_INODE_ITEM_KEY) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { + btrfs_set_inode_generation(path->nodes[0], dst_item, + trans->transid); + } + } +no_copy: + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(root, path); + return 0; +} + +/* + * simple helper to read an inode off the disk from a given root + * This can only be called for subvolume roots and not for the log + */ +static noinline struct inode *read_one_inode(struct btrfs_root *root, + u64 objectid) +{ + struct inode *inode; + inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->location.objectid = objectid; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + + } + if (is_bad_inode(inode)) { + iput(inode); + inode = NULL; + } + return inode; +} + +/* replays a single extent in 'eb' at 'slot' with 'key' into the + * subvolume 'root'. path is released on entry and should be released + * on exit. + * + * extents in the log tree have not been allocated out of the extent + * tree yet. So, this completes the allocation, taking a reference + * as required if the extent already exists or creating a new extent + * if it isn't in the extent allocation tree yet. + * + * The extent is inserted into the file, dropping any existing extents + * from the file that overlap the new one. + */ +static noinline int replay_one_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int found_type; + u64 mask = root->sectorsize - 1; + u64 extent_end; + u64 alloc_hint; + u64 start = key->offset; + u64 saved_nbytes; + struct btrfs_file_extent_item *item; + struct inode *inode = NULL; + unsigned long size; + int ret = 0; + + item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) + extent_end = start + btrfs_file_extent_num_bytes(eb, item); + else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size = btrfs_file_extent_inline_len(eb, item); + extent_end = (start + size + mask) & ~mask; + } else { + ret = 0; + goto out; + } + + inode = read_one_inode(root, key->objectid); + if (!inode) { + ret = -EIO; + goto out; + } + + /* + * first check to see if we already have this extent in the + * file. This must be done before the btrfs_drop_extents run + * so we don't try to drop this extent. + */ + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + start, 0); + + if (ret == 0 && + (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC)) { + struct btrfs_file_extent_item cmp1; + struct btrfs_file_extent_item cmp2; + struct btrfs_file_extent_item *existing; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + existing = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + read_extent_buffer(eb, &cmp1, (unsigned long)item, + sizeof(cmp1)); + read_extent_buffer(leaf, &cmp2, (unsigned long)existing, + sizeof(cmp2)); + + /* + * we already have a pointer to this exact extent, + * we don't have to do anything + */ + if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { + btrfs_release_path(root, path); + goto out; + } + } + btrfs_release_path(root, path); + + saved_nbytes = inode_get_bytes(inode); + /* drop any overlapping extents */ + ret = btrfs_drop_extents(trans, root, inode, + start, extent_end, start, &alloc_hint); + BUG_ON(ret); + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + unsigned long dest_offset; + struct btrfs_key ins; + + ret = btrfs_insert_empty_item(trans, root, path, key, + sizeof(*item)); + BUG_ON(ret); + dest_offset = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + copy_extent_buffer(path->nodes[0], eb, dest_offset, + (unsigned long)item, sizeof(*item)); + + ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); + ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); + ins.type = BTRFS_EXTENT_ITEM_KEY; + + if (ins.objectid > 0) { + u64 csum_start; + u64 csum_end; + LIST_HEAD(ordered_sums); + /* + * is this extent already allocated in the extent + * allocation tree? If so, just add a reference + */ + ret = btrfs_lookup_extent(root, ins.objectid, + ins.offset); + if (ret == 0) { + ret = btrfs_inc_extent_ref(trans, root, + ins.objectid, ins.offset, + path->nodes[0]->start, + root->root_key.objectid, + trans->transid, key->objectid); + } else { + /* + * insert the extent pointer in the extent + * allocation tree + */ + ret = btrfs_alloc_logged_extent(trans, root, + path->nodes[0]->start, + root->root_key.objectid, + trans->transid, key->objectid, + &ins); + BUG_ON(ret); + } + btrfs_release_path(root, path); + + if (btrfs_file_extent_compression(eb, item)) { + csum_start = ins.objectid; + csum_end = csum_start + ins.offset; + } else { + csum_start = ins.objectid + + btrfs_file_extent_offset(eb, item); + csum_end = csum_start + + btrfs_file_extent_num_bytes(eb, item); + } + + ret = btrfs_lookup_csums_range(root->log_root, + csum_start, csum_end - 1, + &ordered_sums); + BUG_ON(ret); + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums; + sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + ret = btrfs_csum_file_blocks(trans, + root->fs_info->csum_root, + sums); + BUG_ON(ret); + list_del(&sums->list); + kfree(sums); + } + } else { + btrfs_release_path(root, path); + } + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + /* inline extents are easy, we just overwrite them */ + ret = overwrite_item(trans, root, path, eb, slot, key); + BUG_ON(ret); + } + + inode_set_bytes(inode, saved_nbytes); + btrfs_update_inode(trans, root, inode); +out: + if (inode) + iput(inode); + return ret; +} + +/* + * when cleaning up conflicts between the directory names in the + * subvolume, directory names in the log and directory names in the + * inode back references, we may have to unlink inodes from directories. + * + * This is a helper function to do the unlink of a specific directory + * item + */ +static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct inode *dir, + struct btrfs_dir_item *di) +{ + struct inode *inode; + char *name; + int name_len; + struct extent_buffer *leaf; + struct btrfs_key location; + int ret; + + leaf = path->nodes[0]; + + btrfs_dir_item_key_to_cpu(leaf, di, &location); + name_len = btrfs_dir_name_len(leaf, di); + name = kmalloc(name_len, GFP_NOFS); + read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); + btrfs_release_path(root, path); + + inode = read_one_inode(root, location.objectid); + BUG_ON(!inode); + + ret = link_to_fixup_dir(trans, root, path, location.objectid); + BUG_ON(ret); + ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + BUG_ON(ret); + kfree(name); + + iput(inode); + return ret; +} + +/* + * helper function to see if a given name and sequence number found + * in an inode back reference are already in a directory and correctly + * point to this inode + */ +static noinline int inode_in_dir(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, u64 objectid, u64 index, + const char *name, int name_len) +{ + struct btrfs_dir_item *di; + struct btrfs_key location; + int match = 0; + + di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, + index, name, name_len, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else + goto out; + btrfs_release_path(root, path); + + di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else + goto out; + match = 1; +out: + btrfs_release_path(root, path); + return match; +} + +/* + * helper function to check a log tree for a named back reference in + * an inode. This is used to decide if a back reference that is + * found in the subvolume conflicts with what we find in the log. + * + * inode backreferences may have multiple refs in a single item, + * during replay we process one reference at a time, and we don't + * want to delete valid links to a file from the subvolume if that + * link is also in the log. + */ +static noinline int backref_in_log(struct btrfs_root *log, + struct btrfs_key *key, + char *name, int namelen) +{ + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + unsigned long ptr; + unsigned long ptr_end; + unsigned long name_ptr; + int found_name_len; + int item_size; + int ret; + int match = 0; + + path = btrfs_alloc_path(); + ret = btrfs_search_slot(NULL, log, key, path, 0, 0); + if (ret != 0) + goto out; + + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + ref = (struct btrfs_inode_ref *)ptr; + found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); + if (found_name_len == namelen) { + name_ptr = (unsigned long)(ref + 1); + ret = memcmp_extent_buffer(path->nodes[0], name, + name_ptr, namelen); + if (ret == 0) { + match = 1; + goto out; + } + } + ptr = (unsigned long)(ref + 1) + found_name_len; + } +out: + btrfs_free_path(path); + return match; +} + + +/* + * replay one inode back reference item found in the log tree. + * eb, slot and key refer to the buffer and key found in the log tree. + * root is the destination we are replaying into, and path is for temp + * use by this function. (it should be released on return). + */ +static noinline int add_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + struct inode *dir; + int ret; + struct btrfs_key location; + struct btrfs_inode_ref *ref; + struct btrfs_dir_item *di; + struct inode *inode; + char *name; + int namelen; + unsigned long ref_ptr; + unsigned long ref_end; + + location.objectid = key->objectid; + location.type = BTRFS_INODE_ITEM_KEY; + location.offset = 0; + + /* + * it is possible that we didn't log all the parent directories + * for a given inode. If we don't find the dir, just don't + * copy the back ref in. The link count fixup code will take + * care of the rest + */ + dir = read_one_inode(root, key->offset); + if (!dir) + return -ENOENT; + + inode = read_one_inode(root, key->objectid); + BUG_ON(!dir); + + ref_ptr = btrfs_item_ptr_offset(eb, slot); + ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + +again: + ref = (struct btrfs_inode_ref *)ref_ptr; + + namelen = btrfs_inode_ref_name_len(eb, ref); + name = kmalloc(namelen, GFP_NOFS); + BUG_ON(!name); + + read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); + + /* if we already have a perfect match, we're done */ + if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, + btrfs_inode_ref_index(eb, ref), + name, namelen)) { + goto out; + } + + /* + * look for a conflicting back reference in the metadata. + * if we find one we have to unlink that name of the file + * before we add our new link. Later on, we overwrite any + * existing back reference, and we don't want to create + * dangling pointers in the directory. + */ +conflict_again: + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret == 0) { + char *victim_name; + int victim_name_len; + struct btrfs_inode_ref *victim_ref; + unsigned long ptr; + unsigned long ptr_end; + struct extent_buffer *leaf = path->nodes[0]; + + /* are we trying to overwrite a back ref for the root directory + * if so, just jump out, we're done + */ + if (key->objectid == key->offset) + goto out_nowrite; + + /* check all the names in this back reference to see + * if they are in the log. if so, we allow them to stay + * otherwise they must be unlinked as a conflict + */ + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); + while (ptr < ptr_end) { + victim_ref = (struct btrfs_inode_ref *)ptr; + victim_name_len = btrfs_inode_ref_name_len(leaf, + victim_ref); + victim_name = kmalloc(victim_name_len, GFP_NOFS); + BUG_ON(!victim_name); + + read_extent_buffer(leaf, victim_name, + (unsigned long)(victim_ref + 1), + victim_name_len); + + if (!backref_in_log(log, key, victim_name, + victim_name_len)) { + btrfs_inc_nlink(inode); + btrfs_release_path(root, path); + ret = btrfs_unlink_inode(trans, root, dir, + inode, victim_name, + victim_name_len); + kfree(victim_name); + btrfs_release_path(root, path); + goto conflict_again; + } + kfree(victim_name); + ptr = (unsigned long)(victim_ref + 1) + victim_name_len; + } + BUG_ON(ret); + } + btrfs_release_path(root, path); + + /* look for a conflicting sequence number */ + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, + btrfs_inode_ref_index(eb, ref), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(root, path); + + + /* look for a conflicting name */ + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(root, path); + + /* insert our name */ + ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, + btrfs_inode_ref_index(eb, ref)); + BUG_ON(ret); + + btrfs_update_inode(trans, root, inode); + +out: + ref_ptr = (unsigned long)(ref + 1) + namelen; + kfree(name); + if (ref_ptr < ref_end) + goto again; + + /* finally write the back reference in the inode */ + ret = overwrite_item(trans, root, path, eb, slot, key); + BUG_ON(ret); + +out_nowrite: + btrfs_release_path(root, path); + iput(dir); + iput(inode); + return 0; +} + +/* + * There are a few corners where the link count of the file can't + * be properly maintained during replay. So, instead of adding + * lots of complexity to the log code, we just scan the backrefs + * for any file that has been through replay. + * + * The scan will update the link count on the inode to reflect the + * number of back refs found. If it goes down to zero, the iput + * will free the inode. + */ +static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + u64 nlink = 0; + unsigned long ptr; + unsigned long ptr_end; + int name_len; + + key.objectid = inode->i_ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid != inode->i_ino || + key.type != BTRFS_INODE_REF_KEY) + break; + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + while (ptr < ptr_end) { + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ptr; + name_len = btrfs_inode_ref_name_len(path->nodes[0], + ref); + ptr = (unsigned long)(ref + 1) + name_len; + nlink++; + } + + if (key.offset == 0) + break; + key.offset--; + btrfs_release_path(root, path); + } + btrfs_free_path(path); + if (nlink != inode->i_nlink) { + inode->i_nlink = nlink; + btrfs_update_inode(trans, root, inode); + } + BTRFS_I(inode)->index_cnt = (u64)-1; + + return 0; +} + +static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + struct inode *inode; + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = (u64)-1; + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + break; + + if (ret == 1) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || + key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + + btrfs_release_path(root, path); + inode = read_one_inode(root, key.offset); + BUG_ON(!inode); + + ret = fixup_inode_link_count(trans, root, inode); + BUG_ON(ret); + + iput(inode); + + if (key.offset == 0) + break; + key.offset--; + } + btrfs_release_path(root, path); + return 0; +} + + +/* + * record a given inode in the fixup dir so we can check its link + * count when replay is done. The link count is incremented here + * so the inode won't go away until we check it + */ +static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid) +{ + struct btrfs_key key; + int ret = 0; + struct inode *inode; + + inode = read_one_inode(root, objectid); + BUG_ON(!inode); + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = objectid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_release_path(root, path); + if (ret == 0) { + btrfs_inc_nlink(inode); + btrfs_update_inode(trans, root, inode); + } else if (ret == -EEXIST) { + ret = 0; + } else { + BUG(); + } + iput(inode); + + return ret; +} + +/* + * when replaying the log for a directory, we only insert names + * for inodes that actually exist. This means an fsync on a directory + * does not implicitly fsync all the new files in it + */ +static noinline int insert_one_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, u64 index, + char *name, int name_len, u8 type, + struct btrfs_key *location) +{ + struct inode *inode; + struct inode *dir; + int ret; + + inode = read_one_inode(root, location->objectid); + if (!inode) + return -ENOENT; + + dir = read_one_inode(root, dirid); + if (!dir) { + iput(inode); + return -EIO; + } + ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); + + /* FIXME, put inode into FIXUP list */ + + iput(inode); + iput(dir); + return ret; +} + +/* + * take a single entry in a log directory item and replay it into + * the subvolume. + * + * if a conflicting item exists in the subdirectory already, + * the inode it points to is unlinked and put into the link count + * fix up tree. + * + * If a name from the log points to a file or directory that does + * not exist in the FS, it is skipped. fsyncs on directories + * do not force down inodes inside that directory, just changes to the + * names or unlinks in a directory. + */ +static noinline int replay_one_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, + struct btrfs_dir_item *di, + struct btrfs_key *key) +{ + char *name; + int name_len; + struct btrfs_dir_item *dst_di; + struct btrfs_key found_key; + struct btrfs_key log_key; + struct inode *dir; + u8 log_type; + int exists; + int ret; + + dir = read_one_inode(root, key->objectid); + BUG_ON(!dir); + + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + log_type = btrfs_dir_type(eb, di); + read_extent_buffer(eb, name, (unsigned long)(di + 1), + name_len); + + btrfs_dir_item_key_to_cpu(eb, di, &log_key); + exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); + if (exists == 0) + exists = 1; + else + exists = 0; + btrfs_release_path(root, path); + + if (key->type == BTRFS_DIR_ITEM_KEY) { + dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, + name, name_len, 1); + } else if (key->type == BTRFS_DIR_INDEX_KEY) { + dst_di = btrfs_lookup_dir_index_item(trans, root, path, + key->objectid, + key->offset, name, + name_len, 1); + } else { + BUG(); + } + if (!dst_di || IS_ERR(dst_di)) { + /* we need a sequence number to insert, so we only + * do inserts for the BTRFS_DIR_INDEX_KEY types + */ + if (key->type != BTRFS_DIR_INDEX_KEY) + goto out; + goto insert; + } + + btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); + /* the existing item matches the logged item */ + if (found_key.objectid == log_key.objectid && + found_key.type == log_key.type && + found_key.offset == log_key.offset && + btrfs_dir_type(path->nodes[0], dst_di) == log_type) { + goto out; + } + + /* + * don't drop the conflicting directory entry if the inode + * for the new entry doesn't exist + */ + if (!exists) + goto out; + + ret = drop_one_dir_item(trans, root, path, dir, dst_di); + BUG_ON(ret); + + if (key->type == BTRFS_DIR_INDEX_KEY) + goto insert; +out: + btrfs_release_path(root, path); + kfree(name); + iput(dir); + return 0; + +insert: + btrfs_release_path(root, path); + ret = insert_one_name(trans, root, path, key->objectid, key->offset, + name, name_len, log_type, &log_key); + + if (ret && ret != -ENOENT) + BUG(); + goto out; +} + +/* + * find all the names in a directory item and reconcile them into + * the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than + * one name in a directory item, but the same code gets used for + * both directory index types + */ +static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size = btrfs_item_size_nr(eb, slot); + struct btrfs_dir_item *di; + int name_len; + unsigned long ptr; + unsigned long ptr_end; + + ptr = btrfs_item_ptr_offset(eb, slot); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + di = (struct btrfs_dir_item *)ptr; + name_len = btrfs_dir_name_len(eb, di); + ret = replay_one_name(trans, root, path, eb, di, key); + BUG_ON(ret); + ptr = (unsigned long)(di + 1); + ptr += name_len; + } + return 0; +} + +/* + * directory replay has two parts. There are the standard directory + * items in the log copied from the subvolume, and range items + * created in the log while the subvolume was logged. + * + * The range items tell us which parts of the key space the log + * is authoritative for. During replay, if a key in the subvolume + * directory is in a logged range item, but not actually in the log + * that means it was deleted from the directory before the fsync + * and should be removed. + */ +static noinline int find_dir_range(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, int key_type, + u64 *start_ret, u64 *end_ret) +{ + struct btrfs_key key; + u64 found_end; + struct btrfs_dir_log_item *item; + int ret; + int nritems; + + if (*start_ret == (u64)-1) + return 1; + + key.objectid = dirid; + key.type = key_type; + key.offset = *start_ret; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + if (ret != 0) + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != key_type || key.objectid != dirid) { + ret = 1; + goto next; + } + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + found_end = btrfs_dir_log_end(path->nodes[0], item); + + if (*start_ret >= key.offset && *start_ret <= found_end) { + ret = 0; + *start_ret = key.offset; + *end_ret = found_end; + goto out; + } + ret = 1; +next: + /* check the next slot in the tree to see if it is a valid item */ + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + } else { + path->slots[0]++; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != key_type || key.objectid != dirid) { + ret = 1; + goto out; + } + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + found_end = btrfs_dir_log_end(path->nodes[0], item); + *start_ret = key.offset; + *end_ret = found_end; + ret = 0; +out: + btrfs_release_path(root, path); + return ret; +} + +/* + * this looks for a given directory item in the log. If the directory + * item is not in the log, the item is removed and the inode it points + * to is unlinked + */ +static noinline int check_item_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct btrfs_path *log_path, + struct inode *dir, + struct btrfs_key *dir_key) +{ + int ret; + struct extent_buffer *eb; + int slot; + u32 item_size; + struct btrfs_dir_item *di; + struct btrfs_dir_item *log_di; + int name_len; + unsigned long ptr; + unsigned long ptr_end; + char *name; + struct inode *inode; + struct btrfs_key location; + +again: + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size_nr(eb, slot); + ptr = btrfs_item_ptr_offset(eb, slot); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + di = (struct btrfs_dir_item *)ptr; + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(eb, name, (unsigned long)(di + 1), + name_len); + log_di = NULL; + if (dir_key->type == BTRFS_DIR_ITEM_KEY) { + log_di = btrfs_lookup_dir_item(trans, log, log_path, + dir_key->objectid, + name, name_len, 0); + } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { + log_di = btrfs_lookup_dir_index_item(trans, log, + log_path, + dir_key->objectid, + dir_key->offset, + name, name_len, 0); + } + if (!log_di || IS_ERR(log_di)) { + btrfs_dir_item_key_to_cpu(eb, di, &location); + btrfs_release_path(root, path); + btrfs_release_path(log, log_path); + inode = read_one_inode(root, location.objectid); + BUG_ON(!inode); + + ret = link_to_fixup_dir(trans, root, + path, location.objectid); + BUG_ON(ret); + btrfs_inc_nlink(inode); + ret = btrfs_unlink_inode(trans, root, dir, inode, + name, name_len); + BUG_ON(ret); + kfree(name); + iput(inode); + + /* there might still be more names under this key + * check and repeat if required + */ + ret = btrfs_search_slot(NULL, root, dir_key, path, + 0, 0); + if (ret == 0) + goto again; + ret = 0; + goto out; + } + btrfs_release_path(log, log_path); + kfree(name); + + ptr = (unsigned long)(di + 1); + ptr += name_len; + } + ret = 0; +out: + btrfs_release_path(root, path); + btrfs_release_path(log, log_path); + return ret; +} + +/* + * deletion replay happens before we copy any new directory items + * out of the log or out of backreferences from inodes. It + * scans the log to find ranges of keys that log is authoritative for, + * and then scans the directory to find items in those ranges that are + * not present in the log. + * + * Anything we don't find in the log is unlinked and removed from the + * directory. + */ +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid) +{ + u64 range_start; + u64 range_end; + int key_type = BTRFS_DIR_LOG_ITEM_KEY; + int ret = 0; + struct btrfs_key dir_key; + struct btrfs_key found_key; + struct btrfs_path *log_path; + struct inode *dir; + + dir_key.objectid = dirid; + dir_key.type = BTRFS_DIR_ITEM_KEY; + log_path = btrfs_alloc_path(); + if (!log_path) + return -ENOMEM; + + dir = read_one_inode(root, dirid); + /* it isn't an error if the inode isn't there, that can happen + * because we replay the deletes before we copy in the inode item + * from the log + */ + if (!dir) { + btrfs_free_path(log_path); + return 0; + } +again: + range_start = 0; + range_end = 0; + while (1) { + ret = find_dir_range(log, path, dirid, key_type, + &range_start, &range_end); + if (ret != 0) + break; + + dir_key.offset = range_start; + while (1) { + int nritems; + ret = btrfs_search_slot(NULL, root, &dir_key, path, + 0, 0); + if (ret < 0) + goto out; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != dirid || + found_key.type != dir_key.type) + goto next_type; + + if (found_key.offset > range_end) + break; + + ret = check_item_in_log(trans, root, log, path, + log_path, dir, &found_key); + BUG_ON(ret); + if (found_key.offset == (u64)-1) + break; + dir_key.offset = found_key.offset + 1; + } + btrfs_release_path(root, path); + if (range_end == (u64)-1) + break; + range_start = range_end + 1; + } + +next_type: + ret = 0; + if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { + key_type = BTRFS_DIR_LOG_INDEX_KEY; + dir_key.type = BTRFS_DIR_INDEX_KEY; + btrfs_release_path(root, path); + goto again; + } +out: + btrfs_release_path(root, path); + btrfs_free_path(log_path); + iput(dir); + return ret; +} + +/* + * the process_func used to replay items from the log tree. This + * gets called in two different stages. The first stage just looks + * for inodes and makes sure they are all copied into the subvolume. + * + * The second stage copies all the other item types from the log into + * the subvolume. The two stage approach is slower, but gets rid of + * lots of complexity around inodes referencing other inodes that exist + * only in the log (references come from either directory items or inode + * back refs). + */ +static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen) +{ + int nritems; + struct btrfs_path *path; + struct btrfs_root *root = wc->replay_dest; + struct btrfs_key key; + u32 item_size; + int level; + int i; + int ret; + + btrfs_read_buffer(eb, gen); + + level = btrfs_header_level(eb); + + if (level != 0) + return 0; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + nritems = btrfs_header_nritems(eb); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(eb, &key, i); + item_size = btrfs_item_size_nr(eb, i); + + /* inode keys are done during the first stage */ + if (key.type == BTRFS_INODE_ITEM_KEY && + wc->stage == LOG_WALK_REPLAY_INODES) { + struct inode *inode; + struct btrfs_inode_item *inode_item; + u32 mode; + + inode_item = btrfs_item_ptr(eb, i, + struct btrfs_inode_item); + mode = btrfs_inode_mode(eb, inode_item); + if (S_ISDIR(mode)) { + ret = replay_dir_deletes(wc->trans, + root, log, path, key.objectid); + BUG_ON(ret); + } + ret = overwrite_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + + /* for regular files, truncate away + * extents past the new EOF + */ + if (S_ISREG(mode)) { + inode = read_one_inode(root, + key.objectid); + BUG_ON(!inode); + + ret = btrfs_truncate_inode_items(wc->trans, + root, inode, inode->i_size, + BTRFS_EXTENT_DATA_KEY); + BUG_ON(ret); + iput(inode); + } + ret = link_to_fixup_dir(wc->trans, root, + path, key.objectid); + BUG_ON(ret); + } + if (wc->stage < LOG_WALK_REPLAY_ALL) + continue; + + /* these keys are simply copied */ + if (key.type == BTRFS_XATTR_ITEM_KEY) { + ret = overwrite_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } else if (key.type == BTRFS_INODE_REF_KEY) { + ret = add_inode_ref(wc->trans, root, log, path, + eb, i, &key); + BUG_ON(ret && ret != -ENOENT); + } else if (key.type == BTRFS_EXTENT_DATA_KEY) { + ret = replay_one_extent(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } else if (key.type == BTRFS_DIR_ITEM_KEY || + key.type == BTRFS_DIR_INDEX_KEY) { + ret = replay_one_dir_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } + } + btrfs_free_path(path); + return 0; +} + +static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + struct walk_control *wc) +{ + u64 root_owner; + u64 root_gen; + u64 bytenr; + u64 ptr_gen; + struct extent_buffer *next; + struct extent_buffer *cur; + struct extent_buffer *parent; + u32 blocksize; + int ret = 0; + + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + while (*level > 0) { + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + cur = path->nodes[*level]; + + if (btrfs_header_level(cur) != *level) + WARN_ON(1); + + if (path->slots[*level] >= + btrfs_header_nritems(cur)) + break; + + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + blocksize = btrfs_level_size(root, *level - 1); + + parent = path->nodes[*level]; + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + next = btrfs_find_create_tree_block(root, bytenr, blocksize); + + wc->process_func(root, next, wc, ptr_gen); + + if (*level == 1) { + path->slots[*level]++; + if (wc->free) { + btrfs_read_buffer(next, ptr_gen); + + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + ret = btrfs_drop_leaf_ref(trans, root, next); + BUG_ON(ret); + + WARN_ON(root_owner != + BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, + bytenr, blocksize); + BUG_ON(ret); + } + free_extent_buffer(next); + continue; + } + btrfs_read_buffer(next, ptr_gen); + + WARN_ON(*level <= 0); + if (path->nodes[*level-1]) + free_extent_buffer(path->nodes[*level-1]); + path->nodes[*level-1] = next; + *level = btrfs_header_level(next); + path->slots[*level] = 0; + cond_resched(); + } + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; + + bytenr = path->nodes[*level]->start; + + blocksize = btrfs_level_size(root, *level); + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + wc->process_func(root, path->nodes[*level], wc, + btrfs_header_generation(path->nodes[*level])); + + if (wc->free) { + next = path->nodes[*level]; + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, next); + BUG_ON(ret); + } + WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, bytenr, blocksize); + BUG_ON(ret); + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level += 1; + + cond_resched(); + return 0; +} + +static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + struct walk_control *wc) +{ + u64 root_owner; + u64 root_gen; + int i; + int slot; + int ret; + + for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { + slot = path->slots[i]; + if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { + struct extent_buffer *node; + node = path->nodes[i]; + path->slots[i]++; + *level = i; + WARN_ON(*level == 0); + return 0; + } else { + struct extent_buffer *parent; + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; + + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + wc->process_func(root, path->nodes[*level], wc, + btrfs_header_generation(path->nodes[*level])); + if (wc->free) { + struct extent_buffer *next; + + next = path->nodes[*level]; + + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, + next); + BUG_ON(ret); + } + + WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, + path->nodes[*level]->start, + path->nodes[*level]->len); + BUG_ON(ret); + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level = i + 1; + } + } + return 1; +} + +/* + * drop the reference count on the tree rooted at 'snap'. This traverses + * the tree freeing any blocks that have a ref count of zero after being + * decremented. + */ +static int walk_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log, struct walk_control *wc) +{ + int ret = 0; + int wret; + int level; + struct btrfs_path *path; + int i; + int orig_level; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + level = btrfs_header_level(log->node); + orig_level = level; + path->nodes[level] = log->node; + extent_buffer_get(log->node); + path->slots[level] = 0; + + while (1) { + wret = walk_down_log_tree(trans, log, path, &level, wc); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + + wret = walk_up_log_tree(trans, log, path, &level, wc); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + } + + /* was the root node processed? if not, catch it here */ + if (path->nodes[orig_level]) { + wc->process_func(log, path->nodes[orig_level], wc, + btrfs_header_generation(path->nodes[orig_level])); + if (wc->free) { + struct extent_buffer *next; + + next = path->nodes[orig_level]; + + btrfs_tree_lock(next); + clean_tree_block(trans, log, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + if (orig_level == 0) { + ret = btrfs_drop_leaf_ref(trans, log, + next); + BUG_ON(ret); + } + WARN_ON(log->root_key.objectid != + BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(log, next->start, + next->len); + BUG_ON(ret); + } + } + + for (i = 0; i <= orig_level; i++) { + if (path->nodes[i]) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + } + btrfs_free_path(path); + if (wc->free) + free_extent_buffer(log->node); + return ret; +} + +static int wait_log_commit(struct btrfs_root *log) +{ + DEFINE_WAIT(wait); + u64 transid = log->fs_info->tree_log_transid; + + do { + prepare_to_wait(&log->fs_info->tree_log_wait, &wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&log->fs_info->tree_log_mutex); + if (atomic_read(&log->fs_info->tree_log_commit)) + schedule(); + finish_wait(&log->fs_info->tree_log_wait, &wait); + mutex_lock(&log->fs_info->tree_log_mutex); + } while (transid == log->fs_info->tree_log_transid && + atomic_read(&log->fs_info->tree_log_commit)); + return 0; +} + +/* + * btrfs_sync_log does sends a given tree log down to the disk and + * updates the super blocks to record it. When this call is done, + * you know that any inodes previously logged are safely on disk + */ +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + unsigned long batch; + struct btrfs_root *log = root->log_root; + + mutex_lock(&log->fs_info->tree_log_mutex); + if (atomic_read(&log->fs_info->tree_log_commit)) { + wait_log_commit(log); + goto out; + } + atomic_set(&log->fs_info->tree_log_commit, 1); + + while (1) { + batch = log->fs_info->tree_log_batch; + mutex_unlock(&log->fs_info->tree_log_mutex); + schedule_timeout_uninterruptible(1); + mutex_lock(&log->fs_info->tree_log_mutex); + + while (atomic_read(&log->fs_info->tree_log_writers)) { + DEFINE_WAIT(wait); + prepare_to_wait(&log->fs_info->tree_log_wait, &wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&log->fs_info->tree_log_mutex); + if (atomic_read(&log->fs_info->tree_log_writers)) + schedule(); + mutex_lock(&log->fs_info->tree_log_mutex); + finish_wait(&log->fs_info->tree_log_wait, &wait); + } + if (batch == log->fs_info->tree_log_batch) + break; + } + + ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); + BUG_ON(ret); + ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree, + &root->fs_info->log_root_tree->dirty_log_pages); + BUG_ON(ret); + + btrfs_set_super_log_root(&root->fs_info->super_for_commit, + log->fs_info->log_root_tree->node->start); + btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, + btrfs_header_level(log->fs_info->log_root_tree->node)); + + write_ctree_super(trans, log->fs_info->tree_root, 2); + log->fs_info->tree_log_transid++; + log->fs_info->tree_log_batch = 0; + atomic_set(&log->fs_info->tree_log_commit, 0); + smp_mb(); + if (waitqueue_active(&log->fs_info->tree_log_wait)) + wake_up(&log->fs_info->tree_log_wait); +out: + mutex_unlock(&log->fs_info->tree_log_mutex); + return 0; +} + +/* * free all the extents used by the tree log. This should be called + * at commit time of the full transaction + */ +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root) +{ + int ret; + struct btrfs_root *log; + struct key; + u64 start; + u64 end; + struct walk_control wc = { + .free = 1, + .process_func = process_one_buffer + }; + + if (!root->log_root || root->fs_info->log_root_recovering) + return 0; + + log = root->log_root; + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + + while (1) { + ret = find_first_extent_bit(&log->dirty_log_pages, + 0, &start, &end, EXTENT_DIRTY); + if (ret) + break; + + clear_extent_dirty(&log->dirty_log_pages, + start, end, GFP_NOFS); + } + + log = root->log_root; + ret = btrfs_del_root(trans, root->fs_info->log_root_tree, + &log->root_key); + BUG_ON(ret); + root->log_root = NULL; + kfree(root->log_root); + return 0; +} + +/* + * helper function to update the item for a given subvolumes log root + * in the tree of log roots + */ +static int update_log_root(struct btrfs_trans_handle *trans, + struct btrfs_root *log) +{ + u64 bytenr = btrfs_root_bytenr(&log->root_item); + int ret; + + if (log->node->start == bytenr) + return 0; + + btrfs_set_root_bytenr(&log->root_item, log->node->start); + btrfs_set_root_generation(&log->root_item, trans->transid); + btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node)); + ret = btrfs_update_root(trans, log->fs_info->log_root_tree, + &log->root_key, &log->root_item); + BUG_ON(ret); + return ret; +} + +/* + * If both a file and directory are logged, and unlinks or renames are + * mixed in, we have a few interesting corners: + * + * create file X in dir Y + * link file X to X.link in dir Y + * fsync file X + * unlink file X but leave X.link + * fsync dir Y + * + * After a crash we would expect only X.link to exist. But file X + * didn't get fsync'd again so the log has back refs for X and X.link. + * + * We solve this by removing directory entries and inode backrefs from the + * log when a file that was logged in the current transaction is + * unlinked. Any later fsync will include the updated log entries, and + * we'll be able to reconstruct the proper directory items from backrefs. + * + * This optimizations allows us to avoid relogging the entire inode + * or the entire directory. + */ +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *dir, u64 index) +{ + struct btrfs_root *log; + struct btrfs_dir_item *di; + struct btrfs_path *path; + int ret; + int bytes_del = 0; + + if (BTRFS_I(dir)->logged_trans < trans->transid) + return 0; + + ret = join_running_log_trans(root); + if (ret) + return 0; + + mutex_lock(&BTRFS_I(dir)->log_mutex); + + log = root->log_root; + path = btrfs_alloc_path(); + di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, + name, name_len, -1); + if (di && !IS_ERR(di)) { + ret = btrfs_delete_one_dir_name(trans, log, path, di); + bytes_del += name_len; + BUG_ON(ret); + } + btrfs_release_path(log, path); + di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, + index, name, name_len, -1); + if (di && !IS_ERR(di)) { + ret = btrfs_delete_one_dir_name(trans, log, path, di); + bytes_del += name_len; + BUG_ON(ret); + } + + /* update the directory size in the log to reflect the names + * we have removed + */ + if (bytes_del) { + struct btrfs_key key; + + key.objectid = dir->i_ino; + key.offset = 0; + key.type = BTRFS_INODE_ITEM_KEY; + btrfs_release_path(log, path); + + ret = btrfs_search_slot(trans, log, &key, path, 0, 1); + if (ret == 0) { + struct btrfs_inode_item *item; + u64 i_size; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + i_size = btrfs_inode_size(path->nodes[0], item); + if (i_size > bytes_del) + i_size -= bytes_del; + else + i_size = 0; + btrfs_set_inode_size(path->nodes[0], item, i_size); + btrfs_mark_buffer_dirty(path->nodes[0]); + } else + ret = 0; + btrfs_release_path(log, path); + } + + btrfs_free_path(path); + mutex_unlock(&BTRFS_I(dir)->log_mutex); + end_log_trans(root); + + return 0; +} + +/* see comments for btrfs_del_dir_entries_in_log */ +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *inode, u64 dirid) +{ + struct btrfs_root *log; + u64 index; + int ret; + + if (BTRFS_I(inode)->logged_trans < trans->transid) + return 0; + + ret = join_running_log_trans(root); + if (ret) + return 0; + log = root->log_root; + mutex_lock(&BTRFS_I(inode)->log_mutex); + + ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, + dirid, &index); + mutex_unlock(&BTRFS_I(inode)->log_mutex); + end_log_trans(root); + + return ret; +} + +/* + * creates a range item in the log for 'dirid'. first_offset and + * last_offset tell us which parts of the key space the log should + * be considered authoritative for. + */ +static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + int key_type, u64 dirid, + u64 first_offset, u64 last_offset) +{ + int ret; + struct btrfs_key key; + struct btrfs_dir_log_item *item; + + key.objectid = dirid; + key.offset = first_offset; + if (key_type == BTRFS_DIR_ITEM_KEY) + key.type = BTRFS_DIR_LOG_ITEM_KEY; + else + key.type = BTRFS_DIR_LOG_INDEX_KEY; + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); + BUG_ON(ret); + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + btrfs_set_dir_log_end(path->nodes[0], item, last_offset); + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(log, path); + return 0; +} + +/* + * log all the items included in the current transaction for a given + * directory. This also creates the range items in the log tree required + * to replay anything deleted before the fsync + */ +static noinline int log_dir_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path, int key_type, + u64 min_offset, u64 *last_offset_ret) +{ + struct btrfs_key min_key; + struct btrfs_key max_key; + struct btrfs_root *log = root->log_root; + struct extent_buffer *src; + int ret; + int i; + int nritems; + u64 first_offset = min_offset; + u64 last_offset = (u64)-1; + + log = root->log_root; + max_key.objectid = inode->i_ino; + max_key.offset = (u64)-1; + max_key.type = key_type; + + min_key.objectid = inode->i_ino; + min_key.type = key_type; + min_key.offset = min_offset; + + path->keep_locks = 1; + + ret = btrfs_search_forward(root, &min_key, &max_key, + path, 0, trans->transid); + + /* + * we didn't find anything from this transaction, see if there + * is anything at all + */ + if (ret != 0 || min_key.objectid != inode->i_ino || + min_key.type != key_type) { + min_key.objectid = inode->i_ino; + min_key.type = key_type; + min_key.offset = (u64)-1; + btrfs_release_path(root, path); + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret < 0) { + btrfs_release_path(root, path); + return ret; + } + ret = btrfs_previous_item(root, path, inode->i_ino, key_type); + + /* if ret == 0 there are items for this type, + * create a range to tell us the last key of this type. + * otherwise, there are no items in this directory after + * *min_offset, and we create a range to indicate that. + */ + if (ret == 0) { + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, + path->slots[0]); + if (key_type == tmp.type) + first_offset = max(min_offset, tmp.offset) + 1; + } + goto done; + } + + /* go backward to find any previous key */ + ret = btrfs_previous_item(root, path, inode->i_ino, key_type); + if (ret == 0) { + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); + if (key_type == tmp.type) { + first_offset = tmp.offset; + ret = overwrite_item(trans, log, dst_path, + path->nodes[0], path->slots[0], + &tmp); + } + } + btrfs_release_path(root, path); + + /* find the first key from this transaction again */ + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret != 0) { + WARN_ON(1); + goto done; + } + + /* + * we have a block from this transaction, log every item in it + * from our directory + */ + while (1) { + struct btrfs_key tmp; + src = path->nodes[0]; + nritems = btrfs_header_nritems(src); + for (i = path->slots[0]; i < nritems; i++) { + btrfs_item_key_to_cpu(src, &min_key, i); + + if (min_key.objectid != inode->i_ino || + min_key.type != key_type) + goto done; + ret = overwrite_item(trans, log, dst_path, src, i, + &min_key); + BUG_ON(ret); + } + path->slots[0] = nritems; + + /* + * look ahead to the next item and see if it is also + * from this directory and from this transaction + */ + ret = btrfs_next_leaf(root, path); + if (ret == 1) { + last_offset = (u64)-1; + goto done; + } + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); + if (tmp.objectid != inode->i_ino || tmp.type != key_type) { + last_offset = (u64)-1; + goto done; + } + if (btrfs_header_generation(path->nodes[0]) != trans->transid) { + ret = overwrite_item(trans, log, dst_path, + path->nodes[0], path->slots[0], + &tmp); + + BUG_ON(ret); + last_offset = tmp.offset; + goto done; + } + } +done: + *last_offset_ret = last_offset; + btrfs_release_path(root, path); + btrfs_release_path(log, dst_path); + + /* insert the log range keys to indicate where the log is valid */ + ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, + first_offset, last_offset); + BUG_ON(ret); + return 0; +} + +/* + * logging directories is very similar to logging inodes, We find all the items + * from the current transaction and write them to the log. + * + * The recovery code scans the directory in the subvolume, and if it finds a + * key in the range logged that is not present in the log tree, then it means + * that dir entry was unlinked during the transaction. + * + * In order for that scan to work, we must include one key smaller than + * the smallest logged by this transaction and one key larger than the largest + * key logged by this transaction. + */ +static noinline int log_directory_changes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path) +{ + u64 min_key; + u64 max_key; + int ret; + int key_type = BTRFS_DIR_ITEM_KEY; + +again: + min_key = 0; + max_key = 0; + while (1) { + ret = log_dir_items(trans, root, inode, path, + dst_path, key_type, min_key, + &max_key); + BUG_ON(ret); + if (max_key == (u64)-1) + break; + min_key = max_key + 1; + } + + if (key_type == BTRFS_DIR_ITEM_KEY) { + key_type = BTRFS_DIR_INDEX_KEY; + goto again; + } + return 0; +} + +/* + * a helper function to drop items from the log before we relog an + * inode. max_key_type indicates the highest item type to remove. + * This cannot be run for file data extents because it does not + * free the extents they point to. + */ +static int drop_objectid_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + u64 objectid, int max_key_type) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + + key.objectid = objectid; + key.type = max_key_type; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(trans, log, &key, path, -1, 1); + + if (ret != 1) + break; + + if (path->slots[0] == 0) + break; + + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + + if (found_key.objectid != objectid) + break; + + ret = btrfs_del_item(trans, log, path); + BUG_ON(ret); + btrfs_release_path(log, path); + } + btrfs_release_path(log, path); + return 0; +} + +static noinline int copy_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *dst_path, + struct extent_buffer *src, + int start_slot, int nr, int inode_only) +{ + unsigned long src_offset; + unsigned long dst_offset; + struct btrfs_file_extent_item *extent; + struct btrfs_inode_item *inode_item; + int ret; + struct btrfs_key *ins_keys; + u32 *ins_sizes; + char *ins_data; + int i; + struct list_head ordered_sums; + + INIT_LIST_HEAD(&ordered_sums); + + ins_data = kmalloc(nr * sizeof(struct btrfs_key) + + nr * sizeof(u32), GFP_NOFS); + ins_sizes = (u32 *)ins_data; + ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); + + for (i = 0; i < nr; i++) { + ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); + btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); + } + ret = btrfs_insert_empty_items(trans, log, dst_path, + ins_keys, ins_sizes, nr); + BUG_ON(ret); + + for (i = 0; i < nr; i++) { + dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], + dst_path->slots[0]); + + src_offset = btrfs_item_ptr_offset(src, start_slot + i); + + copy_extent_buffer(dst_path->nodes[0], src, dst_offset, + src_offset, ins_sizes[i]); + + if (inode_only == LOG_INODE_EXISTS && + ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { + inode_item = btrfs_item_ptr(dst_path->nodes[0], + dst_path->slots[0], + struct btrfs_inode_item); + btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); + + /* set the generation to zero so the recover code + * can tell the difference between an logging + * just to say 'this inode exists' and a logging + * to say 'update this inode with these values' + */ + btrfs_set_inode_generation(dst_path->nodes[0], + inode_item, 0); + } + /* take a reference on file data extents so that truncates + * or deletes of this inode don't have to relog the inode + * again + */ + if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { + int found_type; + extent = btrfs_item_ptr(src, start_slot + i, + struct btrfs_file_extent_item); + + found_type = btrfs_file_extent_type(src, extent); + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 ds = btrfs_file_extent_disk_bytenr(src, + extent); + u64 dl = btrfs_file_extent_disk_num_bytes(src, + extent); + u64 cs = btrfs_file_extent_offset(src, extent); + u64 cl = btrfs_file_extent_num_bytes(src, + extent);; + if (btrfs_file_extent_compression(src, + extent)) { + cs = 0; + cl = dl; + } + /* ds == 0 is a hole */ + if (ds != 0) { + ret = btrfs_inc_extent_ref(trans, log, + ds, dl, + dst_path->nodes[0]->start, + BTRFS_TREE_LOG_OBJECTID, + trans->transid, + ins_keys[i].objectid); + BUG_ON(ret); + ret = btrfs_lookup_csums_range( + log->fs_info->csum_root, + ds + cs, ds + cs + cl - 1, + &ordered_sums); + BUG_ON(ret); + } + } + } + dst_path->slots[0]++; + } + + btrfs_mark_buffer_dirty(dst_path->nodes[0]); + btrfs_release_path(log, dst_path); + kfree(ins_data); + + /* + * we have to do this after the loop above to avoid changing the + * log tree while trying to change the log tree. + */ + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + ret = btrfs_csum_file_blocks(trans, log, sums); + BUG_ON(ret); + list_del(&sums->list); + kfree(sums); + } + return 0; +} + +/* log a single inode in the tree log. + * At least one parent directory for this inode must exist in the tree + * or be logged already. + * + * Any items from this inode changed by the current transaction are copied + * to the log tree. An extra reference is taken on any extents in this + * file, allowing us to avoid a whole pile of corner cases around logging + * blocks that have been removed from the tree. + * + * See LOG_INODE_ALL and related defines for a description of what inode_only + * does. + * + * This handles both files and directories. + */ +static int __btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only) +{ + struct btrfs_path *path; + struct btrfs_path *dst_path; + struct btrfs_key min_key; + struct btrfs_key max_key; + struct btrfs_root *log = root->log_root; + struct extent_buffer *src = NULL; + u32 size; + int ret; + int nritems; + int ins_start_slot = 0; + int ins_nr; + + log = root->log_root; + + path = btrfs_alloc_path(); + dst_path = btrfs_alloc_path(); + + min_key.objectid = inode->i_ino; + min_key.type = BTRFS_INODE_ITEM_KEY; + min_key.offset = 0; + + max_key.objectid = inode->i_ino; + if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) + max_key.type = BTRFS_XATTR_ITEM_KEY; + else + max_key.type = (u8)-1; + max_key.offset = (u64)-1; + + /* + * if this inode has already been logged and we're in inode_only + * mode, we don't want to delete the things that have already + * been written to the log. + * + * But, if the inode has been through an inode_only log, + * the logged_trans field is not set. This allows us to catch + * any new names for this inode in the backrefs by logging it + * again + */ + if (inode_only == LOG_INODE_EXISTS && + BTRFS_I(inode)->logged_trans == trans->transid) { + btrfs_free_path(path); + btrfs_free_path(dst_path); + goto out; + } + mutex_lock(&BTRFS_I(inode)->log_mutex); + + /* + * a brute force approach to making sure we get the most uptodate + * copies of everything. + */ + if (S_ISDIR(inode->i_mode)) { + int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; + + if (inode_only == LOG_INODE_EXISTS) + max_key_type = BTRFS_XATTR_ITEM_KEY; + ret = drop_objectid_items(trans, log, path, + inode->i_ino, max_key_type); + } else { + ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); + } + BUG_ON(ret); + path->keep_locks = 1; + + while (1) { + ins_nr = 0; + ret = btrfs_search_forward(root, &min_key, &max_key, + path, 0, trans->transid); + if (ret != 0) + break; +again: + /* note, ins_nr might be > 0 here, cleanup outside the loop */ + if (min_key.objectid != inode->i_ino) + break; + if (min_key.type > max_key.type) + break; + + src = path->nodes[0]; + size = btrfs_item_size_nr(src, path->slots[0]); + if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { + ins_nr++; + goto next_slot; + } else if (!ins_nr) { + ins_start_slot = path->slots[0]; + ins_nr = 1; + goto next_slot; + } + + ret = copy_items(trans, log, dst_path, src, ins_start_slot, + ins_nr, inode_only); + BUG_ON(ret); + ins_nr = 1; + ins_start_slot = path->slots[0]; +next_slot: + + nritems = btrfs_header_nritems(path->nodes[0]); + path->slots[0]++; + if (path->slots[0] < nritems) { + btrfs_item_key_to_cpu(path->nodes[0], &min_key, + path->slots[0]); + goto again; + } + if (ins_nr) { + ret = copy_items(trans, log, dst_path, src, + ins_start_slot, + ins_nr, inode_only); + BUG_ON(ret); + ins_nr = 0; + } + btrfs_release_path(root, path); + + if (min_key.offset < (u64)-1) + min_key.offset++; + else if (min_key.type < (u8)-1) + min_key.type++; + else if (min_key.objectid < (u64)-1) + min_key.objectid++; + else + break; + } + if (ins_nr) { + ret = copy_items(trans, log, dst_path, src, + ins_start_slot, + ins_nr, inode_only); + BUG_ON(ret); + ins_nr = 0; + } + WARN_ON(ins_nr); + if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { + btrfs_release_path(root, path); + btrfs_release_path(log, dst_path); + BTRFS_I(inode)->log_dirty_trans = 0; + ret = log_directory_changes(trans, root, inode, path, dst_path); + BUG_ON(ret); + } + BTRFS_I(inode)->logged_trans = trans->transid; + mutex_unlock(&BTRFS_I(inode)->log_mutex); + + btrfs_free_path(path); + btrfs_free_path(dst_path); + + mutex_lock(&root->fs_info->tree_log_mutex); + ret = update_log_root(trans, log); + BUG_ON(ret); + mutex_unlock(&root->fs_info->tree_log_mutex); +out: + return 0; +} + +int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only) +{ + int ret; + + start_log_trans(trans, root); + ret = __btrfs_log_inode(trans, root, inode, inode_only); + end_log_trans(root); + return ret; +} + +/* + * helper function around btrfs_log_inode to make sure newly created + * parent directories also end up in the log. A minimal inode and backref + * only logging is done of any parent directories that are older than + * the last committed transaction + */ +int btrfs_log_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry) +{ + int inode_only = LOG_INODE_ALL; + struct super_block *sb; + int ret; + + start_log_trans(trans, root); + sb = dentry->d_inode->i_sb; + while (1) { + ret = __btrfs_log_inode(trans, root, dentry->d_inode, + inode_only); + BUG_ON(ret); + inode_only = LOG_INODE_EXISTS; + + dentry = dentry->d_parent; + if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) + break; + + if (BTRFS_I(dentry->d_inode)->generation <= + root->fs_info->last_trans_committed) + break; + } + end_log_trans(root); + return 0; +} + +/* + * it is not safe to log dentry if the chunk root has added new + * chunks. This returns 0 if the dentry was logged, and 1 otherwise. + * If this returns 1, you must commit the transaction to safely get your + * data on disk. + */ +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry) +{ + u64 gen; + gen = root->fs_info->last_trans_new_blockgroup; + if (gen > root->fs_info->last_trans_committed) + return 1; + else + return btrfs_log_dentry(trans, root, dentry); +} + +/* + * should be called during mount to recover any replay any log trees + * from the FS + */ +int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) +{ + int ret; + struct btrfs_path *path; + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_key tmp_key; + struct btrfs_root *log; + struct btrfs_fs_info *fs_info = log_root_tree->fs_info; + u64 highest_inode; + struct walk_control wc = { + .process_func = process_one_buffer, + .stage = 0, + }; + + fs_info->log_root_recovering = 1; + path = btrfs_alloc_path(); + BUG_ON(!path); + + trans = btrfs_start_transaction(fs_info->tree_root, 1); + + wc.trans = trans; + wc.pin = 1; + + walk_log_tree(trans, log_root_tree, &wc); + +again: + key.objectid = BTRFS_TREE_LOG_OBJECTID; + key.offset = (u64)-1; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + + while (1) { + ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + btrfs_release_path(log_root_tree, path); + if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) + break; + + log = btrfs_read_fs_root_no_radix(log_root_tree, + &found_key); + BUG_ON(!log); + + + tmp_key.objectid = found_key.offset; + tmp_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_key.offset = (u64)-1; + + wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); + BUG_ON(!wc.replay_dest); + + wc.replay_dest->log_root = log; + btrfs_record_root_in_trans(wc.replay_dest); + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + + if (wc.stage == LOG_WALK_REPLAY_ALL) { + ret = fixup_inode_link_counts(trans, wc.replay_dest, + path); + BUG_ON(ret); + } + ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode); + if (ret == 0) { + wc.replay_dest->highest_inode = highest_inode; + wc.replay_dest->last_inode_alloc = highest_inode; + } + + key.offset = found_key.offset - 1; + wc.replay_dest->log_root = NULL; + free_extent_buffer(log->node); + kfree(log); + + if (found_key.offset == 0) + break; + } + btrfs_release_path(log_root_tree, path); + + /* step one is to pin it all, step two is to replay just inodes */ + if (wc.pin) { + wc.pin = 0; + wc.process_func = replay_one_buffer; + wc.stage = LOG_WALK_REPLAY_INODES; + goto again; + } + /* step three is to replay everything */ + if (wc.stage < LOG_WALK_REPLAY_ALL) { + wc.stage++; + goto again; + } + + btrfs_free_path(path); + + free_extent_buffer(log_root_tree->node); + log_root_tree->log_root = NULL; + fs_info->log_root_recovering = 0; + + /* step 4: commit the transaction, which also unpins the blocks */ + btrfs_commit_transaction(trans, fs_info->tree_root); + + kfree(log_root_tree); + return 0; +} diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h new file mode 100644 index 00000000000..b9409b32ed0 --- /dev/null +++ b/fs/btrfs/tree-log.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __TREE_LOG_ +#define __TREE_LOG_ + +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_log_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry); +int btrfs_recover_log_trees(struct btrfs_root *tree_root); +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry); +int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only); +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *dir, u64 index); +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *inode, u64 dirid); +#endif diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h new file mode 100644 index 00000000000..9bf3946d5ef --- /dev/null +++ b/fs/btrfs/version.h @@ -0,0 +1,4 @@ +#ifndef __BTRFS_VERSION_H +#define __BTRFS_VERSION_H +#define BTRFS_BUILD_VERSION "Btrfs" +#endif diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh new file mode 100644 index 00000000000..1ca1952fd91 --- /dev/null +++ b/fs/btrfs/version.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# +# determine-version -- report a useful version for releases +# +# Copyright 2008, Aron Griffis <agriffis@n01se.net> +# Copyright 2008, Oracle +# Released under the GNU GPLv2 + +v="v0.16" + +which git &> /dev/null +if [ $? == 0 ]; then + git branch >& /dev/null + if [ $? == 0 ]; then + if head=`git rev-parse --verify HEAD 2>/dev/null`; then + if tag=`git describe --tags 2>/dev/null`; then + v="$tag" + fi + + # Are there uncommitted changes? + git update-index --refresh --unmerged > /dev/null + if git diff-index --name-only HEAD | \ + grep -v "^scripts/package" \ + | read dummy; then + v="$v"-dirty + fi + fi + fi +fi + +echo "#ifndef __BUILD_VERSION" > .build-version.h +echo "#define __BUILD_VERSION" >> .build-version.h +echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h +echo "#endif" >> .build-version.h + +diff -q version.h .build-version.h >& /dev/null + +if [ $? == 0 ]; then + rm .build-version.h + exit 0 +fi + +mv .build-version.h version.h diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c new file mode 100644 index 00000000000..b187b537888 --- /dev/null +++ b/fs/btrfs/volumes.c @@ -0,0 +1,3218 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/random.h> +#include <linux/version.h> +#include <asm/div64.h> +#include "compat.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "async-thread.h" + +struct map_lookup { + u64 type; + int io_align; + int io_width; + int stripe_len; + int sector_size; + int num_stripes; + int sub_stripes; + struct btrfs_bio_stripe stripes[]; +}; + +static int init_first_rw_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device); +static int btrfs_relocate_sys_chunks(struct btrfs_root *root); + +#define map_lookup_size(n) (sizeof(struct map_lookup) + \ + (sizeof(struct btrfs_bio_stripe) * (n))) + +static DEFINE_MUTEX(uuid_mutex); +static LIST_HEAD(fs_uuids); + +void btrfs_lock_volumes(void) +{ + mutex_lock(&uuid_mutex); +} + +void btrfs_unlock_volumes(void) +{ + mutex_unlock(&uuid_mutex); +} + +static void lock_chunks(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->chunk_mutex); +} + +static void unlock_chunks(struct btrfs_root *root) +{ + mutex_unlock(&root->fs_info->chunk_mutex); +} + +static void free_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + WARN_ON(fs_devices->opened); + while (!list_empty(&fs_devices->devices)) { + device = list_entry(fs_devices->devices.next, + struct btrfs_device, dev_list); + list_del(&device->dev_list); + kfree(device->name); + kfree(device); + } + kfree(fs_devices); +} + +int btrfs_cleanup_fs_uuids(void) +{ + struct btrfs_fs_devices *fs_devices; + + while (!list_empty(&fs_uuids)) { + fs_devices = list_entry(fs_uuids.next, + struct btrfs_fs_devices, list); + list_del(&fs_devices->list); + free_fs_devices(fs_devices); + } + return 0; +} + +static noinline struct btrfs_device *__find_device(struct list_head *head, + u64 devid, u8 *uuid) +{ + struct btrfs_device *dev; + struct list_head *cur; + + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (dev->devid == devid && + (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { + return dev; + } + } + return NULL; +} + +static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) +{ + struct list_head *cur; + struct btrfs_fs_devices *fs_devices; + + list_for_each(cur, &fs_uuids) { + fs_devices = list_entry(cur, struct btrfs_fs_devices, list); + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) + return fs_devices; + } + return NULL; +} + +/* + * we try to collect pending bios for a device so we don't get a large + * number of procs sending bios down to the same device. This greatly + * improves the schedulers ability to collect and merge the bios. + * + * But, it also turns into a long list of bios to process and that is sure + * to eventually make the worker thread block. The solution here is to + * make some progress and then put this work struct back at the end of + * the list if the block device is congested. This way, multiple devices + * can make progress from a single worker thread. + */ +static noinline int run_scheduled_bios(struct btrfs_device *device) +{ + struct bio *pending; + struct backing_dev_info *bdi; + struct btrfs_fs_info *fs_info; + struct bio *tail; + struct bio *cur; + int again = 0; + unsigned long num_run = 0; + unsigned long limit; + + bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; + fs_info = device->dev_root->fs_info; + limit = btrfs_async_submit_limit(fs_info); + limit = limit * 2 / 3; + +loop: + spin_lock(&device->io_lock); + + /* take all the bios off the list at once and process them + * later on (without the lock held). But, remember the + * tail and other pointers so the bios can be properly reinserted + * into the list if we hit congestion + */ + pending = device->pending_bios; + tail = device->pending_bio_tail; + WARN_ON(pending && !tail); + device->pending_bios = NULL; + device->pending_bio_tail = NULL; + + /* + * if pending was null this time around, no bios need processing + * at all and we can stop. Otherwise it'll loop back up again + * and do an additional check so no bios are missed. + * + * device->running_pending is used to synchronize with the + * schedule_bio code. + */ + if (pending) { + again = 1; + device->running_pending = 1; + } else { + again = 0; + device->running_pending = 0; + } + spin_unlock(&device->io_lock); + + while (pending) { + cur = pending; + pending = pending->bi_next; + cur->bi_next = NULL; + atomic_dec(&fs_info->nr_async_bios); + + if (atomic_read(&fs_info->nr_async_bios) < limit && + waitqueue_active(&fs_info->async_submit_wait)) + wake_up(&fs_info->async_submit_wait); + + BUG_ON(atomic_read(&cur->bi_cnt) == 0); + bio_get(cur); + submit_bio(cur->bi_rw, cur); + bio_put(cur); + num_run++; + + /* + * we made progress, there is more work to do and the bdi + * is now congested. Back off and let other work structs + * run instead + */ + if (pending && bdi_write_congested(bdi) && + fs_info->fs_devices->open_devices > 1) { + struct bio *old_head; + + spin_lock(&device->io_lock); + + old_head = device->pending_bios; + device->pending_bios = pending; + if (device->pending_bio_tail) + tail->bi_next = old_head; + else + device->pending_bio_tail = tail; + + spin_unlock(&device->io_lock); + btrfs_requeue_work(&device->work); + goto done; + } + } + if (again) + goto loop; +done: + return 0; +} + +static void pending_bios_fn(struct btrfs_work *work) +{ + struct btrfs_device *device; + + device = container_of(work, struct btrfs_device, work); + run_scheduled_bios(device); +} + +static noinline int device_list_add(const char *path, + struct btrfs_super_block *disk_super, + u64 devid, struct btrfs_fs_devices **fs_devices_ret) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices; + u64 found_transid = btrfs_super_generation(disk_super); + + fs_devices = find_fsid(disk_super->fsid); + if (!fs_devices) { + fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!fs_devices) + return -ENOMEM; + INIT_LIST_HEAD(&fs_devices->devices); + INIT_LIST_HEAD(&fs_devices->alloc_list); + list_add(&fs_devices->list, &fs_uuids); + memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); + fs_devices->latest_devid = devid; + fs_devices->latest_trans = found_transid; + device = NULL; + } else { + device = __find_device(&fs_devices->devices, devid, + disk_super->dev_item.uuid); + } + if (!device) { + if (fs_devices->opened) + return -EBUSY; + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + /* we can safely leave the fs_devices entry around */ + return -ENOMEM; + } + device->devid = devid; + device->work.func = pending_bios_fn; + memcpy(device->uuid, disk_super->dev_item.uuid, + BTRFS_UUID_SIZE); + device->barriers = 1; + spin_lock_init(&device->io_lock); + device->name = kstrdup(path, GFP_NOFS); + if (!device->name) { + kfree(device); + return -ENOMEM; + } + INIT_LIST_HEAD(&device->dev_alloc_list); + list_add(&device->dev_list, &fs_devices->devices); + device->fs_devices = fs_devices; + fs_devices->num_devices++; + } + + if (found_transid > fs_devices->latest_trans) { + fs_devices->latest_devid = devid; + fs_devices->latest_trans = found_transid; + } + *fs_devices_ret = fs_devices; + return 0; +} + +static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) +{ + struct btrfs_fs_devices *fs_devices; + struct btrfs_device *device; + struct btrfs_device *orig_dev; + + fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!fs_devices) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&fs_devices->devices); + INIT_LIST_HEAD(&fs_devices->alloc_list); + INIT_LIST_HEAD(&fs_devices->list); + fs_devices->latest_devid = orig->latest_devid; + fs_devices->latest_trans = orig->latest_trans; + memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); + + list_for_each_entry(orig_dev, &orig->devices, dev_list) { + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) + goto error; + + device->name = kstrdup(orig_dev->name, GFP_NOFS); + if (!device->name) + goto error; + + device->devid = orig_dev->devid; + device->work.func = pending_bios_fn; + memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); + device->barriers = 1; + spin_lock_init(&device->io_lock); + INIT_LIST_HEAD(&device->dev_list); + INIT_LIST_HEAD(&device->dev_alloc_list); + + list_add(&device->dev_list, &fs_devices->devices); + device->fs_devices = fs_devices; + fs_devices->num_devices++; + } + return fs_devices; +error: + free_fs_devices(fs_devices); + return ERR_PTR(-ENOMEM); +} + +int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) +{ + struct list_head *tmp; + struct list_head *cur; + struct btrfs_device *device; + + mutex_lock(&uuid_mutex); +again: + list_for_each_safe(cur, tmp, &fs_devices->devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (device->in_fs_metadata) + continue; + + if (device->bdev) { + close_bdev_exclusive(device->bdev, device->mode); + device->bdev = NULL; + fs_devices->open_devices--; + } + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + device->writeable = 0; + fs_devices->rw_devices--; + } + list_del_init(&device->dev_list); + fs_devices->num_devices--; + kfree(device->name); + kfree(device); + } + + if (fs_devices->seed) { + fs_devices = fs_devices->seed; + goto again; + } + + mutex_unlock(&uuid_mutex); + return 0; +} + +static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +{ + struct list_head *cur; + struct btrfs_device *device; + + if (--fs_devices->opened > 0) + return 0; + + list_for_each(cur, &fs_devices->devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (device->bdev) { + close_bdev_exclusive(device->bdev, device->mode); + fs_devices->open_devices--; + } + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + device->bdev = NULL; + device->writeable = 0; + device->in_fs_metadata = 0; + } + WARN_ON(fs_devices->open_devices); + WARN_ON(fs_devices->rw_devices); + fs_devices->opened = 0; + fs_devices->seeding = 0; + + return 0; +} + +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_fs_devices *seed_devices = NULL; + int ret; + + mutex_lock(&uuid_mutex); + ret = __btrfs_close_devices(fs_devices); + if (!fs_devices->opened) { + seed_devices = fs_devices->seed; + fs_devices->seed = NULL; + } + mutex_unlock(&uuid_mutex); + + while (seed_devices) { + fs_devices = seed_devices; + seed_devices = fs_devices->seed; + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + } + return ret; +} + +static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder) +{ + struct block_device *bdev; + struct list_head *head = &fs_devices->devices; + struct list_head *cur; + struct btrfs_device *device; + struct block_device *latest_bdev = NULL; + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + u64 latest_devid = 0; + u64 latest_transid = 0; + u64 devid; + int seeding = 1; + int ret = 0; + + list_for_each(cur, head) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (device->bdev) + continue; + if (!device->name) + continue; + + bdev = open_bdev_exclusive(device->name, flags, holder); + if (IS_ERR(bdev)) { + printk(KERN_INFO "open %s failed\n", device->name); + goto error; + } + set_blocksize(bdev, 4096); + + bh = btrfs_read_dev_super(bdev); + if (!bh) + goto error_close; + + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = le64_to_cpu(disk_super->dev_item.devid); + if (devid != device->devid) + goto error_brelse; + + if (memcmp(device->uuid, disk_super->dev_item.uuid, + BTRFS_UUID_SIZE)) + goto error_brelse; + + device->generation = btrfs_super_generation(disk_super); + if (!latest_transid || device->generation > latest_transid) { + latest_devid = devid; + latest_transid = device->generation; + latest_bdev = bdev; + } + + if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { + device->writeable = 0; + } else { + device->writeable = !bdev_read_only(bdev); + seeding = 0; + } + + device->bdev = bdev; + device->in_fs_metadata = 0; + device->mode = flags; + + fs_devices->open_devices++; + if (device->writeable) { + fs_devices->rw_devices++; + list_add(&device->dev_alloc_list, + &fs_devices->alloc_list); + } + continue; + +error_brelse: + brelse(bh); +error_close: + close_bdev_exclusive(bdev, FMODE_READ); +error: + continue; + } + if (fs_devices->open_devices == 0) { + ret = -EIO; + goto out; + } + fs_devices->seeding = seeding; + fs_devices->opened = 1; + fs_devices->latest_bdev = latest_bdev; + fs_devices->latest_devid = latest_devid; + fs_devices->latest_trans = latest_transid; + fs_devices->total_rw_bytes = 0; +out: + return ret; +} + +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder) +{ + int ret; + + mutex_lock(&uuid_mutex); + if (fs_devices->opened) { + fs_devices->opened++; + ret = 0; + } else { + ret = __btrfs_open_devices(fs_devices, flags, holder); + } + mutex_unlock(&uuid_mutex); + return ret; +} + +int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, + struct btrfs_fs_devices **fs_devices_ret) +{ + struct btrfs_super_block *disk_super; + struct block_device *bdev; + struct buffer_head *bh; + int ret; + u64 devid; + u64 transid; + + mutex_lock(&uuid_mutex); + + bdev = open_bdev_exclusive(path, flags, holder); + + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + goto error; + } + + ret = set_blocksize(bdev, 4096); + if (ret) + goto error_close; + bh = btrfs_read_dev_super(bdev); + if (!bh) { + ret = -EIO; + goto error_close; + } + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = le64_to_cpu(disk_super->dev_item.devid); + transid = btrfs_super_generation(disk_super); + if (disk_super->label[0]) + printk(KERN_INFO "device label %s ", disk_super->label); + else { + /* FIXME, make a readl uuid parser */ + printk(KERN_INFO "device fsid %llx-%llx ", + *(unsigned long long *)disk_super->fsid, + *(unsigned long long *)(disk_super->fsid + 8)); + } + printk(KERN_INFO "devid %llu transid %llu %s\n", + (unsigned long long)devid, (unsigned long long)transid, path); + ret = device_list_add(path, disk_super, devid, fs_devices_ret); + + brelse(bh); +error_close: + close_bdev_exclusive(bdev, flags); +error: + mutex_unlock(&uuid_mutex); + return ret; +} + +/* + * this uses a pretty simple search, the expectation is that it is + * called very infrequently and that a given device has a small number + * of extents + */ +static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 num_bytes, u64 *start) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + u64 hole_size = 0; + u64 last_byte = 0; + u64 search_start = 0; + u64 search_end = device->total_bytes; + int ret; + int slot = 0; + int start_found; + struct extent_buffer *l; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + start_found = 0; + + /* FIXME use last free of some kind */ + + /* we don't want to overwrite the superblock on the drive, + * so we make sure to start at an offset of at least 1MB + */ + search_start = max((u64)1024 * 1024, search_start); + + if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) + search_start = max(root->fs_info->alloc_start, search_start); + + key.objectid = device->devid; + key.offset = search_start; + key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + if (ret < 0) + goto error; + ret = btrfs_previous_item(root, path, 0, key.type); + if (ret < 0) + goto error; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; +no_more_items: + if (!start_found) { + if (search_start >= search_end) { + ret = -ENOSPC; + goto error; + } + *start = search_start; + start_found = 1; + goto check_pending; + } + *start = last_byte > search_start ? + last_byte : search_start; + if (search_end <= *start) { + ret = -ENOSPC; + goto error; + } + goto check_pending; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + goto no_more_items; + + if (key.offset >= search_start && key.offset > last_byte && + start_found) { + if (last_byte < search_start) + last_byte = search_start; + hole_size = key.offset - last_byte; + if (key.offset > last_byte && + hole_size >= num_bytes) { + *start = last_byte; + goto check_pending; + } + } + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; + + start_found = 1; + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); +next: + path->slots[0]++; + cond_resched(); + } +check_pending: + /* we have to make sure we didn't find an extent that has already + * been allocated by the map tree or the original allocation + */ + BUG_ON(*start < search_start); + + if (*start + num_bytes > search_end) { + ret = -ENOSPC; + goto error; + } + /* check for pending inserts here */ + ret = 0; + +error: + btrfs_free_path(path); + return ret; +} + +static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 start) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->dev_root; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf = NULL; + struct btrfs_dev_extent *extent = NULL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, + BTRFS_DEV_EXTENT_KEY); + BUG_ON(ret); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + BUG_ON(found_key.offset > start || found_key.offset + + btrfs_dev_extent_length(leaf, extent) < start); + ret = 0; + } else if (ret == 0) { + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + } + BUG_ON(ret); + + if (device->bytes_used > 0) + device->bytes_used -= btrfs_dev_extent_length(leaf, extent); + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + + btrfs_free_path(path); + return ret; +} + +int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 start, u64 num_bytes) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *extent; + struct extent_buffer *leaf; + struct btrfs_key key; + + WARN_ON(!device->in_fs_metadata); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*extent)); + BUG_ON(ret); + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); + btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); + btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); + + write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), + BTRFS_UUID_SIZE); + + btrfs_set_dev_extent_length(leaf, extent, num_bytes); + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + return ret; +} + +static noinline int find_next_chunk(struct btrfs_root *root, + u64 objectid, u64 *offset) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + struct btrfs_chunk *chunk; + struct btrfs_key found_key; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = objectid; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + + BUG_ON(ret == 0); + + ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); + if (ret) { + *offset = 0; + } else { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != objectid) + *offset = 0; + else { + chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_chunk); + *offset = found_key.offset + + btrfs_chunk_length(path->nodes[0], chunk); + } + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + + BUG_ON(ret == 0); + + ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, + BTRFS_DEV_ITEM_KEY); + if (ret) { + *objectid = 1; + } else { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + *objectid = found_key.offset + 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +/* + * the device information is stored in the chunk root + * the btrfs_device struct should be fully filled in + */ +int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*dev_item)); + if (ret) + goto out; + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_generation(leaf, dev_item, 0); + btrfs_set_device_type(leaf, dev_item, device->type); + btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_set_device_group(leaf, dev_item, 0); + btrfs_set_device_seek_speed(leaf, dev_item, 0); + btrfs_set_device_bandwidth(leaf, dev_item, 0); + btrfs_set_device_start_offset(leaf, dev_item, 0); + + ptr = (unsigned long)btrfs_device_uuid(dev_item); + write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); + ptr = (unsigned long)btrfs_device_fsid(dev_item); + write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_rm_dev_item(struct btrfs_root *root, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_trans_handle *trans; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 1); + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + lock_chunks(root); + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; +out: + btrfs_free_path(path); + unlock_chunks(root); + btrfs_commit_transaction(trans, root); + return ret; +} + +int btrfs_rm_device(struct btrfs_root *root, char *device_path) +{ + struct btrfs_device *device; + struct btrfs_device *next_device; + struct block_device *bdev; + struct buffer_head *bh = NULL; + struct btrfs_super_block *disk_super; + u64 all_avail; + u64 devid; + u64 num_devices; + u8 *dev_uuid; + int ret = 0; + + mutex_lock(&uuid_mutex); + mutex_lock(&root->fs_info->volume_mutex); + + all_avail = root->fs_info->avail_data_alloc_bits | + root->fs_info->avail_system_alloc_bits | + root->fs_info->avail_metadata_alloc_bits; + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && + root->fs_info->fs_devices->rw_devices <= 4) { + printk(KERN_ERR "btrfs: unable to go below four devices " + "on raid10\n"); + ret = -EINVAL; + goto out; + } + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && + root->fs_info->fs_devices->rw_devices <= 2) { + printk(KERN_ERR "btrfs: unable to go below two " + "devices on raid1\n"); + ret = -EINVAL; + goto out; + } + + if (strcmp(device_path, "missing") == 0) { + struct list_head *cur; + struct list_head *devices; + struct btrfs_device *tmp; + + device = NULL; + devices = &root->fs_info->fs_devices->devices; + list_for_each(cur, devices) { + tmp = list_entry(cur, struct btrfs_device, dev_list); + if (tmp->in_fs_metadata && !tmp->bdev) { + device = tmp; + break; + } + } + bdev = NULL; + bh = NULL; + disk_super = NULL; + if (!device) { + printk(KERN_ERR "btrfs: no missing devices found to " + "remove\n"); + goto out; + } + } else { + bdev = open_bdev_exclusive(device_path, FMODE_READ, + root->fs_info->bdev_holder); + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + goto out; + } + + set_blocksize(bdev, 4096); + bh = btrfs_read_dev_super(bdev); + if (!bh) { + ret = -EIO; + goto error_close; + } + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = le64_to_cpu(disk_super->dev_item.devid); + dev_uuid = disk_super->dev_item.uuid; + device = btrfs_find_device(root, devid, dev_uuid, + disk_super->fsid); + if (!device) { + ret = -ENOENT; + goto error_brelse; + } + } + + if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { + printk(KERN_ERR "btrfs: unable to remove the only writeable " + "device\n"); + ret = -EINVAL; + goto error_brelse; + } + + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + root->fs_info->fs_devices->rw_devices--; + } + + ret = btrfs_shrink_device(device, 0); + if (ret) + goto error_brelse; + + ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); + if (ret) + goto error_brelse; + + device->in_fs_metadata = 0; + list_del_init(&device->dev_list); + device->fs_devices->num_devices--; + + next_device = list_entry(root->fs_info->fs_devices->devices.next, + struct btrfs_device, dev_list); + if (device->bdev == root->fs_info->sb->s_bdev) + root->fs_info->sb->s_bdev = next_device->bdev; + if (device->bdev == root->fs_info->fs_devices->latest_bdev) + root->fs_info->fs_devices->latest_bdev = next_device->bdev; + + if (device->bdev) { + close_bdev_exclusive(device->bdev, device->mode); + device->bdev = NULL; + device->fs_devices->open_devices--; + } + + num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); + + if (device->fs_devices->open_devices == 0) { + struct btrfs_fs_devices *fs_devices; + fs_devices = root->fs_info->fs_devices; + while (fs_devices) { + if (fs_devices->seed == device->fs_devices) + break; + fs_devices = fs_devices->seed; + } + fs_devices->seed = device->fs_devices->seed; + device->fs_devices->seed = NULL; + __btrfs_close_devices(device->fs_devices); + free_fs_devices(device->fs_devices); + } + + /* + * at this point, the device is zero sized. We want to + * remove it from the devices list and zero out the old super + */ + if (device->writeable) { + /* make sure this device isn't detected as part of + * the FS anymore + */ + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + } + + kfree(device->name); + kfree(device); + ret = 0; + +error_brelse: + brelse(bh); +error_close: + if (bdev) + close_bdev_exclusive(bdev, FMODE_READ); +out: + mutex_unlock(&root->fs_info->volume_mutex); + mutex_unlock(&uuid_mutex); + return ret; +} + +/* + * does all the dirty work required for changing file system's UUID. + */ +static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_fs_devices *old_devices; + struct btrfs_fs_devices *seed_devices; + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_device *device; + u64 super_flags; + + BUG_ON(!mutex_is_locked(&uuid_mutex)); + if (!fs_devices->seeding) + return -EINVAL; + + seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!seed_devices) + return -ENOMEM; + + old_devices = clone_fs_devices(fs_devices); + if (IS_ERR(old_devices)) { + kfree(seed_devices); + return PTR_ERR(old_devices); + } + + list_add(&old_devices->list, &fs_uuids); + + memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); + seed_devices->opened = 1; + INIT_LIST_HEAD(&seed_devices->devices); + INIT_LIST_HEAD(&seed_devices->alloc_list); + list_splice_init(&fs_devices->devices, &seed_devices->devices); + list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); + list_for_each_entry(device, &seed_devices->devices, dev_list) { + device->fs_devices = seed_devices; + } + + fs_devices->seeding = 0; + fs_devices->num_devices = 0; + fs_devices->open_devices = 0; + fs_devices->seed = seed_devices; + + generate_random_uuid(fs_devices->fsid); + memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + super_flags = btrfs_super_flags(disk_super) & + ~BTRFS_SUPER_FLAG_SEEDING; + btrfs_set_super_flags(disk_super, super_flags); + + return 0; +} + +/* + * strore the expected generation for seed devices in device items. + */ +static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dev_item *dev_item; + struct btrfs_device *device; + struct btrfs_key key; + u8 fs_uuid[BTRFS_UUID_SIZE]; + u8 dev_uuid[BTRFS_UUID_SIZE]; + u64 devid; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + root = root->fs_info->chunk_root; + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = BTRFS_DEV_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto error; + + leaf = path->nodes[0]; +next_slot: + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret > 0) + break; + if (ret < 0) + goto error; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(root, path); + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || + key.type != BTRFS_DEV_ITEM_KEY) + break; + + dev_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_item); + devid = btrfs_device_id(leaf, dev_item); + read_extent_buffer(leaf, dev_uuid, + (unsigned long)btrfs_device_uuid(dev_item), + BTRFS_UUID_SIZE); + read_extent_buffer(leaf, fs_uuid, + (unsigned long)btrfs_device_fsid(dev_item), + BTRFS_UUID_SIZE); + device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + BUG_ON(!device); + + if (device->fs_devices->seeding) { + btrfs_set_device_generation(leaf, dev_item, + device->generation); + btrfs_mark_buffer_dirty(leaf); + } + + path->slots[0]++; + goto next_slot; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_init_new_device(struct btrfs_root *root, char *device_path) +{ + struct btrfs_trans_handle *trans; + struct btrfs_device *device; + struct block_device *bdev; + struct list_head *cur; + struct list_head *devices; + struct super_block *sb = root->fs_info->sb; + u64 total_bytes; + int seeding_dev = 0; + int ret = 0; + + if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) + return -EINVAL; + + bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); + if (!bdev) + return -EIO; + + if (root->fs_info->fs_devices->seeding) { + seeding_dev = 1; + down_write(&sb->s_umount); + mutex_lock(&uuid_mutex); + } + + filemap_write_and_wait(bdev->bd_inode->i_mapping); + mutex_lock(&root->fs_info->volume_mutex); + + devices = &root->fs_info->fs_devices->devices; + list_for_each(cur, devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (device->bdev == bdev) { + ret = -EEXIST; + goto error; + } + } + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + /* we can safely leave the fs_devices entry around */ + ret = -ENOMEM; + goto error; + } + + device->name = kstrdup(device_path, GFP_NOFS); + if (!device->name) { + kfree(device); + ret = -ENOMEM; + goto error; + } + + ret = find_next_devid(root, &device->devid); + if (ret) { + kfree(device); + goto error; + } + + trans = btrfs_start_transaction(root, 1); + lock_chunks(root); + + device->barriers = 1; + device->writeable = 1; + device->work.func = pending_bios_fn; + generate_random_uuid(device->uuid); + spin_lock_init(&device->io_lock); + device->generation = trans->transid; + device->io_width = root->sectorsize; + device->io_align = root->sectorsize; + device->sector_size = root->sectorsize; + device->total_bytes = i_size_read(bdev->bd_inode); + device->dev_root = root->fs_info->dev_root; + device->bdev = bdev; + device->in_fs_metadata = 1; + device->mode = 0; + set_blocksize(device->bdev, 4096); + + if (seeding_dev) { + sb->s_flags &= ~MS_RDONLY; + ret = btrfs_prepare_sprout(trans, root); + BUG_ON(ret); + } + + device->fs_devices = root->fs_info->fs_devices; + list_add(&device->dev_list, &root->fs_info->fs_devices->devices); + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + root->fs_info->fs_devices->num_devices++; + root->fs_info->fs_devices->open_devices++; + root->fs_info->fs_devices->rw_devices++; + root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + + total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); + btrfs_set_super_total_bytes(&root->fs_info->super_copy, + total_bytes + device->total_bytes); + + total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); + btrfs_set_super_num_devices(&root->fs_info->super_copy, + total_bytes + 1); + + if (seeding_dev) { + ret = init_first_rw_device(trans, root, device); + BUG_ON(ret); + ret = btrfs_finish_sprout(trans, root); + BUG_ON(ret); + } else { + ret = btrfs_add_device(trans, root, device); + } + + unlock_chunks(root); + btrfs_commit_transaction(trans, root); + + if (seeding_dev) { + mutex_unlock(&uuid_mutex); + up_write(&sb->s_umount); + + ret = btrfs_relocate_sys_chunks(root); + BUG_ON(ret); + } +out: + mutex_unlock(&root->fs_info->volume_mutex); + return ret; +error: + close_bdev_exclusive(bdev, 0); + if (seeding_dev) { + mutex_unlock(&uuid_mutex); + up_write(&sb->s_umount); + } + goto out; +} + +static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; + struct btrfs_key key; + + root = device->dev_root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_type(leaf, dev_item, device->type); + btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_mark_buffer_dirty(leaf); + +out: + btrfs_free_path(path); + return ret; +} + +static int __btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size) +{ + struct btrfs_super_block *super_copy = + &device->dev_root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); + u64 diff = new_size - device->total_bytes; + + if (!device->writeable) + return -EACCES; + if (new_size <= device->total_bytes) + return -EINVAL; + + btrfs_set_super_total_bytes(super_copy, old_total + diff); + device->fs_devices->total_rw_bytes += diff; + + device->total_bytes = new_size; + return btrfs_update_device(trans, device); +} + +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size) +{ + int ret; + lock_chunks(device->dev_root); + ret = __btrfs_grow_device(trans, device, new_size); + unlock_chunks(device->dev_root); + return ret; +} + +static int btrfs_free_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + root = root->fs_info->chunk_root; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = chunk_objectid; + key.offset = chunk_offset; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + BUG_ON(ret); + + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + + btrfs_free_path(path); + return 0; +} + +static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 + chunk_offset) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *ptr; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur; + struct btrfs_key key; + + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; + cur = 0; + + while (cur < array_size) { + disk_key = (struct btrfs_disk_key *)ptr; + btrfs_disk_key_to_cpu(&key, disk_key); + + len = sizeof(*disk_key); + + if (key.type == BTRFS_CHUNK_ITEM_KEY) { + chunk = (struct btrfs_chunk *)(ptr + len); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + len += btrfs_chunk_item_size(num_stripes); + } else { + ret = -EIO; + break; + } + if (key.objectid == chunk_objectid && + key.offset == chunk_offset) { + memmove(ptr, ptr + len, array_size - (cur + len)); + array_size -= len; + btrfs_set_super_sys_array_size(super_copy, array_size); + } else { + ptr += len; + cur += len; + } + } + return ret; +} + +static int btrfs_relocate_chunk(struct btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + struct extent_map_tree *em_tree; + struct btrfs_root *extent_root; + struct btrfs_trans_handle *trans; + struct extent_map *em; + struct map_lookup *map; + int ret; + int i; + + printk(KERN_INFO "btrfs relocating chunk %llu\n", + (unsigned long long)chunk_offset); + root = root->fs_info->chunk_root; + extent_root = root->fs_info->extent_root; + em_tree = &root->fs_info->mapping_tree.map_tree; + + /* step one, relocate all the extents inside this chunk */ + ret = btrfs_relocate_block_group(extent_root, chunk_offset); + BUG_ON(ret); + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + lock_chunks(root); + + /* + * step two, delete the device extents and the + * chunk tree entries + */ + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); + spin_unlock(&em_tree->lock); + + BUG_ON(em->start > chunk_offset || + em->start + em->len < chunk_offset); + map = (struct map_lookup *)em->bdev; + + for (i = 0; i < map->num_stripes; i++) { + ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, + map->stripes[i].physical); + BUG_ON(ret); + + if (map->stripes[i].dev) { + ret = btrfs_update_device(trans, map->stripes[i].dev); + BUG_ON(ret); + } + } + ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, + chunk_offset); + + BUG_ON(ret); + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); + BUG_ON(ret); + } + + ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); + BUG_ON(ret); + + spin_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + + kfree(map); + em->bdev = NULL; + + /* once for the tree */ + free_extent_map(em); + /* once for us */ + free_extent_map(em); + + unlock_chunks(root); + btrfs_end_transaction(trans, root); + return 0; +} + +static int btrfs_relocate_sys_chunks(struct btrfs_root *root) +{ + struct btrfs_root *chunk_root = root->fs_info->chunk_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_chunk *chunk; + struct btrfs_key key; + struct btrfs_key found_key; + u64 chunk_tree = chunk_root->root_key.objectid; + u64 chunk_type; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); + + ret = btrfs_previous_item(chunk_root, path, key.objectid, + key.type); + if (ret < 0) + goto error; + if (ret > 0) + break; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + chunk = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_chunk); + chunk_type = btrfs_chunk_type(leaf, chunk); + btrfs_release_path(chunk_root, path); + + if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_relocate_chunk(chunk_root, chunk_tree, + found_key.objectid, + found_key.offset); + BUG_ON(ret); + } + + if (found_key.offset == 0) + break; + key.offset = found_key.offset - 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +static u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + +int btrfs_balance(struct btrfs_root *dev_root) +{ + int ret; + struct list_head *cur; + struct list_head *devices = &dev_root->fs_info->fs_devices->devices; + struct btrfs_device *device; + u64 old_size; + u64 size_to_free; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_chunk *chunk; + struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; + struct btrfs_trans_handle *trans; + struct btrfs_key found_key; + + if (dev_root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + mutex_lock(&dev_root->fs_info->volume_mutex); + dev_root = dev_root->fs_info->dev_root; + + /* step one make some room on all the devices */ + list_for_each(cur, devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + old_size = device->total_bytes; + size_to_free = div_factor(old_size, 1); + size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); + if (!device->writeable || + device->total_bytes - device->bytes_used > size_to_free) + continue; + + ret = btrfs_shrink_device(device, old_size - size_to_free); + BUG_ON(ret); + + trans = btrfs_start_transaction(dev_root, 1); + BUG_ON(!trans); + + ret = btrfs_grow_device(trans, device, old_size); + BUG_ON(ret); + + btrfs_end_transaction(trans, dev_root); + } + + /* step two, relocate all the chunks */ + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + + /* + * this shouldn't happen, it means the last relocate + * failed + */ + if (ret == 0) + break; + + ret = btrfs_previous_item(chunk_root, path, 0, + BTRFS_CHUNK_ITEM_KEY); + if (ret) + break; + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != key.objectid) + break; + + chunk = btrfs_item_ptr(path->nodes[0], + path->slots[0], + struct btrfs_chunk); + key.offset = found_key.offset; + /* chunk zero is special */ + if (key.offset == 0) + break; + + btrfs_release_path(chunk_root, path); + ret = btrfs_relocate_chunk(chunk_root, + chunk_root->root_key.objectid, + found_key.objectid, + found_key.offset); + BUG_ON(ret); + } + ret = 0; +error: + btrfs_free_path(path); + mutex_unlock(&dev_root->fs_info->volume_mutex); + return ret; +} + +/* + * shrinking a device means finding all of the device extents past + * the new size, and then following the back refs to the chunks. + * The chunk relocation code actually frees the device extent + */ +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + u64 length; + u64 chunk_tree; + u64 chunk_objectid; + u64 chunk_offset; + int ret; + int slot; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); + u64 diff = device->total_bytes - new_size; + + if (new_size >= device->total_bytes) + return -EINVAL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto done; + } + + path->reada = 2; + + lock_chunks(root); + + device->total_bytes = new_size; + if (device->writeable) + device->fs_devices->total_rw_bytes -= diff; + ret = btrfs_update_device(trans, device); + if (ret) { + unlock_chunks(root); + btrfs_end_transaction(trans, root); + goto done; + } + WARN_ON(diff > old_total); + btrfs_set_super_total_bytes(super_copy, old_total - diff); + unlock_chunks(root); + btrfs_end_transaction(trans, root); + + key.objectid = device->devid; + key.offset = (u64)-1; + key.type = BTRFS_DEV_EXTENT_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto done; + + ret = btrfs_previous_item(root, path, 0, key.type); + if (ret < 0) + goto done; + if (ret) { + ret = 0; + goto done; + } + + l = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + + if (key.objectid != device->devid) + goto done; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + length = btrfs_dev_extent_length(l, dev_extent); + + if (key.offset + length <= new_size) + goto done; + + chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); + chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); + chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); + btrfs_release_path(root, path); + + ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, + chunk_offset); + if (ret) + goto done; + } + +done: + btrfs_free_path(path); + return ret; +} + +static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_chunk *chunk, int item_size) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_disk_key disk_key; + u32 array_size; + u8 *ptr; + + array_size = btrfs_super_sys_array_size(super_copy); + if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) + return -EFBIG; + + ptr = super_copy->sys_chunk_array + array_size; + btrfs_cpu_key_to_disk(&disk_key, key); + memcpy(ptr, &disk_key, sizeof(disk_key)); + ptr += sizeof(disk_key); + memcpy(ptr, chunk, item_size); + item_size += sizeof(disk_key); + btrfs_set_super_sys_array_size(super_copy, array_size + item_size); + return 0; +} + +static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, + int num_stripes, int sub_stripes) +{ + if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) + return calc_size; + else if (type & BTRFS_BLOCK_GROUP_RAID10) + return calc_size * (num_stripes / sub_stripes); + else + return calc_size * num_stripes; +} + +static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup **map_ret, + u64 *num_bytes, u64 *stripe_size, + u64 start, u64 type) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_device *device = NULL; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct list_head *cur; + struct map_lookup *map = NULL; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct list_head private_devs; + int min_stripe_size = 1 * 1024 * 1024; + u64 calc_size = 1024 * 1024 * 1024; + u64 max_chunk_size = calc_size; + u64 min_free; + u64 avail; + u64 max_avail = 0; + u64 dev_offset; + int num_stripes = 1; + int min_stripes = 1; + int sub_stripes = 0; + int looped = 0; + int ret; + int index; + int stripe_len = 64 * 1024; + + if ((type & BTRFS_BLOCK_GROUP_RAID1) && + (type & BTRFS_BLOCK_GROUP_DUP)) { + WARN_ON(1); + type &= ~BTRFS_BLOCK_GROUP_DUP; + } + if (list_empty(&fs_devices->alloc_list)) + return -ENOSPC; + + if (type & (BTRFS_BLOCK_GROUP_RAID0)) { + num_stripes = fs_devices->rw_devices; + min_stripes = 2; + } + if (type & (BTRFS_BLOCK_GROUP_DUP)) { + num_stripes = 2; + min_stripes = 2; + } + if (type & (BTRFS_BLOCK_GROUP_RAID1)) { + num_stripes = min_t(u64, 2, fs_devices->rw_devices); + if (num_stripes < 2) + return -ENOSPC; + min_stripes = 2; + } + if (type & (BTRFS_BLOCK_GROUP_RAID10)) { + num_stripes = fs_devices->rw_devices; + if (num_stripes < 4) + return -ENOSPC; + num_stripes &= ~(u32)1; + sub_stripes = 2; + min_stripes = 4; + } + + if (type & BTRFS_BLOCK_GROUP_DATA) { + max_chunk_size = 10 * calc_size; + min_stripe_size = 64 * 1024 * 1024; + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { + max_chunk_size = 4 * calc_size; + min_stripe_size = 32 * 1024 * 1024; + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + calc_size = 8 * 1024 * 1024; + max_chunk_size = calc_size * 2; + min_stripe_size = 1 * 1024 * 1024; + } + + /* we don't want a chunk larger than 10% of writeable space */ + max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), + max_chunk_size); + +again: + if (!map || map->num_stripes != num_stripes) { + kfree(map); + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) + return -ENOMEM; + map->num_stripes = num_stripes; + } + + if (calc_size * num_stripes > max_chunk_size) { + calc_size = max_chunk_size; + do_div(calc_size, num_stripes); + do_div(calc_size, stripe_len); + calc_size *= stripe_len; + } + /* we don't want tiny stripes */ + calc_size = max_t(u64, min_stripe_size, calc_size); + + do_div(calc_size, stripe_len); + calc_size *= stripe_len; + + cur = fs_devices->alloc_list.next; + index = 0; + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_free = calc_size * 2; + else + min_free = calc_size; + + /* + * we add 1MB because we never use the first 1MB of the device, unless + * we've looped, then we are likely allocating the maximum amount of + * space left already + */ + if (!looped) + min_free += 1024 * 1024; + + INIT_LIST_HEAD(&private_devs); + while (index < num_stripes) { + device = list_entry(cur, struct btrfs_device, dev_alloc_list); + BUG_ON(!device->writeable); + if (device->total_bytes > device->bytes_used) + avail = device->total_bytes - device->bytes_used; + else + avail = 0; + cur = cur->next; + + if (device->in_fs_metadata && avail >= min_free) { + ret = find_free_dev_extent(trans, device, + min_free, &dev_offset); + if (ret == 0) { + list_move_tail(&device->dev_alloc_list, + &private_devs); + map->stripes[index].dev = device; + map->stripes[index].physical = dev_offset; + index++; + if (type & BTRFS_BLOCK_GROUP_DUP) { + map->stripes[index].dev = device; + map->stripes[index].physical = + dev_offset + calc_size; + index++; + } + } + } else if (device->in_fs_metadata && avail > max_avail) + max_avail = avail; + if (cur == &fs_devices->alloc_list) + break; + } + list_splice(&private_devs, &fs_devices->alloc_list); + if (index < num_stripes) { + if (index >= min_stripes) { + num_stripes = index; + if (type & (BTRFS_BLOCK_GROUP_RAID10)) { + num_stripes /= sub_stripes; + num_stripes *= sub_stripes; + } + looped = 1; + goto again; + } + if (!looped && max_avail > 0) { + looped = 1; + calc_size = max_avail; + goto again; + } + kfree(map); + return -ENOSPC; + } + map->sector_size = extent_root->sectorsize; + map->stripe_len = stripe_len; + map->io_align = stripe_len; + map->io_width = stripe_len; + map->type = type; + map->num_stripes = num_stripes; + map->sub_stripes = sub_stripes; + + *map_ret = map; + *stripe_size = calc_size; + *num_bytes = chunk_bytes_by_type(type, calc_size, + num_stripes, sub_stripes); + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + kfree(map); + return -ENOMEM; + } + em->bdev = (struct block_device *)map; + em->start = start; + em->len = *num_bytes; + em->block_start = 0; + em->block_len = em->len; + + em_tree = &extent_root->fs_info->mapping_tree.map_tree; + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + BUG_ON(ret); + free_extent_map(em); + + ret = btrfs_make_block_group(trans, extent_root, 0, type, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + start, *num_bytes); + BUG_ON(ret); + + index = 0; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + dev_offset = map->stripes[index].physical; + + ret = btrfs_alloc_dev_extent(trans, device, + info->chunk_root->root_key.objectid, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + start, dev_offset, calc_size); + BUG_ON(ret); + index++; + } + + return 0; +} + +static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup *map, u64 chunk_offset, + u64 chunk_size, u64 stripe_size) +{ + u64 dev_offset; + struct btrfs_key key; + struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; + struct btrfs_device *device; + struct btrfs_chunk *chunk; + struct btrfs_stripe *stripe; + size_t item_size = btrfs_chunk_item_size(map->num_stripes); + int index = 0; + int ret; + + chunk = kzalloc(item_size, GFP_NOFS); + if (!chunk) + return -ENOMEM; + + index = 0; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + device->bytes_used += stripe_size; + ret = btrfs_update_device(trans, device); + BUG_ON(ret); + index++; + } + + index = 0; + stripe = &chunk->stripe; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + dev_offset = map->stripes[index].physical; + + btrfs_set_stack_stripe_devid(stripe, device->devid); + btrfs_set_stack_stripe_offset(stripe, dev_offset); + memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); + stripe++; + index++; + } + + btrfs_set_stack_chunk_length(chunk, chunk_size); + btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); + btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); + btrfs_set_stack_chunk_type(chunk, map->type); + btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); + btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); + btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); + btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); + btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = chunk_offset; + + ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); + BUG_ON(ret); + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, + item_size); + BUG_ON(ret); + } + kfree(chunk); + return 0; +} + +/* + * Chunk allocation falls into two parts. The first part does works + * that make the new allocated chunk useable, but not do any operation + * that modifies the chunk tree. The second part does the works that + * require modifying the chunk tree. This division is important for the + * bootstrap process of adding storage to a seed btrfs. + */ +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 type) +{ + u64 chunk_offset; + u64 chunk_size; + u64 stripe_size; + struct map_lookup *map; + struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; + int ret; + + ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, + &chunk_offset); + if (ret) + return ret; + + ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, + &stripe_size, chunk_offset, type); + if (ret) + return ret; + + ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, + chunk_size, stripe_size); + BUG_ON(ret); + return 0; +} + +static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) +{ + u64 chunk_offset; + u64 sys_chunk_offset; + u64 chunk_size; + u64 sys_chunk_size; + u64 stripe_size; + u64 sys_stripe_size; + u64 alloc_profile; + struct map_lookup *map; + struct map_lookup *sys_map; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *extent_root = fs_info->extent_root; + int ret; + + ret = find_next_chunk(fs_info->chunk_root, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); + BUG_ON(ret); + + alloc_profile = BTRFS_BLOCK_GROUP_METADATA | + (fs_info->metadata_alloc_profile & + fs_info->avail_metadata_alloc_bits); + alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); + + ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, + &stripe_size, chunk_offset, alloc_profile); + BUG_ON(ret); + + sys_chunk_offset = chunk_offset + chunk_size; + + alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | + (fs_info->system_alloc_profile & + fs_info->avail_system_alloc_bits); + alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); + + ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, + &sys_chunk_size, &sys_stripe_size, + sys_chunk_offset, alloc_profile); + BUG_ON(ret); + + ret = btrfs_add_device(trans, fs_info->chunk_root, device); + BUG_ON(ret); + + /* + * Modifying chunk tree needs allocating new blocks from both + * system block group and metadata block group. So we only can + * do operations require modifying the chunk tree after both + * block groups were created. + */ + ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, + chunk_size, stripe_size); + BUG_ON(ret); + + ret = __finish_chunk_alloc(trans, extent_root, sys_map, + sys_chunk_offset, sys_chunk_size, + sys_stripe_size); + BUG_ON(ret); + return 0; +} + +int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + int readonly = 0; + int i; + + spin_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); + spin_unlock(&map_tree->map_tree.lock); + if (!em) + return 1; + + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) { + if (!map->stripes[i].dev->writeable) { + readonly = 1; + break; + } + } + free_extent_map(em); + return readonly; +} + +void btrfs_mapping_init(struct btrfs_mapping_tree *tree) +{ + extent_map_tree_init(&tree->map_tree, GFP_NOFS); +} + +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) +{ + struct extent_map *em; + + while (1) { + spin_lock(&tree->map_tree.lock); + em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); + if (em) + remove_extent_mapping(&tree->map_tree, em); + spin_unlock(&tree->map_tree.lock); + if (!em) + break; + kfree(em->bdev); + /* once for us */ + free_extent_map(em); + /* once for the tree */ + free_extent_map(em); + } +} + +int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + int ret; + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + spin_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) + ret = map->num_stripes; + else if (map->type & BTRFS_BLOCK_GROUP_RAID10) + ret = map->sub_stripes; + else + ret = 1; + free_extent_map(em); + return ret; +} + +static int find_live_mirror(struct map_lookup *map, int first, int num, + int optimal) +{ + int i; + if (map->stripes[optimal].dev->bdev) + return optimal; + for (i = first; i < first + num; i++) { + if (map->stripes[i].dev->bdev) + return i; + } + /* we couldn't find one that doesn't fail. Just return something + * and the io error handling code will clean up eventually + */ + return optimal; +} + +static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, + int mirror_num, struct page *unplug_page) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + u64 offset; + u64 stripe_offset; + u64 stripe_nr; + int stripes_allocated = 8; + int stripes_required = 1; + int stripe_index; + int i; + int num_stripes; + int max_errors = 0; + struct btrfs_multi_bio *multi = NULL; + + if (multi_ret && !(rw & (1 << BIO_RW))) + stripes_allocated = 1; +again: + if (multi_ret) { + multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), + GFP_NOFS); + if (!multi) + return -ENOMEM; + + atomic_set(&multi->error, 0); + } + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, *length); + spin_unlock(&em_tree->lock); + + if (!em && unplug_page) + return 0; + + if (!em) { + printk(KERN_CRIT "unable to find logical %llu len %llu\n", + (unsigned long long)logical, + (unsigned long long)*length); + BUG(); + } + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + offset = logical - em->start; + + if (mirror_num > map->num_stripes) + mirror_num = 0; + + /* if our multi bio struct is too small, back off and try again */ + if (rw & (1 << BIO_RW)) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP)) { + stripes_required = map->num_stripes; + max_errors = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripes_required = map->sub_stripes; + max_errors = 1; + } + } + if (multi_ret && rw == WRITE && + stripes_allocated < stripes_required) { + stripes_allocated = map->num_stripes; + free_extent_map(em); + kfree(multi); + goto again; + } + stripe_nr = offset; + /* + * stripe_nr counts the total number of stripes we have to stride + * to get to this block + */ + do_div(stripe_nr, map->stripe_len); + + stripe_offset = stripe_nr * map->stripe_len; + BUG_ON(offset < stripe_offset); + + /* stripe_offset is the offset of this block in its stripe*/ + stripe_offset = offset - stripe_offset; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) { + /* we limit the length of each bio to what fits in a stripe */ + *length = min_t(u64, em->len - offset, + map->stripe_len - stripe_offset); + } else { + *length = em->len - offset; + } + + if (!multi_ret && !unplug_page) + goto out; + + num_stripes = 1; + stripe_index = 0; + if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + if (unplug_page || (rw & (1 << BIO_RW))) + num_stripes = map->num_stripes; + else if (mirror_num) + stripe_index = mirror_num - 1; + else { + stripe_index = find_live_mirror(map, 0, + map->num_stripes, + current->pid % map->num_stripes); + } + + } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { + if (rw & (1 << BIO_RW)) + num_stripes = map->num_stripes; + else if (mirror_num) + stripe_index = mirror_num - 1; + + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + int factor = map->num_stripes / map->sub_stripes; + + stripe_index = do_div(stripe_nr, factor); + stripe_index *= map->sub_stripes; + + if (unplug_page || (rw & (1 << BIO_RW))) + num_stripes = map->sub_stripes; + else if (mirror_num) + stripe_index += mirror_num - 1; + else { + stripe_index = find_live_mirror(map, stripe_index, + map->sub_stripes, stripe_index + + current->pid % map->sub_stripes); + } + } else { + /* + * after this do_div call, stripe_nr is the number of stripes + * on this device we have to walk to find the data, and + * stripe_index is the number of our device in the stripe array + */ + stripe_index = do_div(stripe_nr, map->num_stripes); + } + BUG_ON(stripe_index >= map->num_stripes); + + for (i = 0; i < num_stripes; i++) { + if (unplug_page) { + struct btrfs_device *device; + struct backing_dev_info *bdi; + + device = map->stripes[stripe_index].dev; + if (device->bdev) { + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi->unplug_io_fn) + bdi->unplug_io_fn(bdi, unplug_page); + } + } else { + multi->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; + multi->stripes[i].dev = map->stripes[stripe_index].dev; + } + stripe_index++; + } + if (multi_ret) { + *multi_ret = multi; + multi->num_stripes = num_stripes; + multi->max_errors = max_errors; + } +out: + free_extent_map(em); + return 0; +} + +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, int mirror_num) +{ + return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, + mirror_num, NULL); +} + +int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 chunk_start, u64 physical, u64 devid, + u64 **logical, int *naddrs, int *stripe_len) +{ + struct extent_map_tree *em_tree = &map_tree->map_tree; + struct extent_map *em; + struct map_lookup *map; + u64 *buf; + u64 bytenr; + u64 length; + u64 stripe_nr; + int i, j, nr = 0; + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_start, 1); + spin_unlock(&em_tree->lock); + + BUG_ON(!em || em->start != chunk_start); + map = (struct map_lookup *)em->bdev; + + length = em->len; + if (map->type & BTRFS_BLOCK_GROUP_RAID10) + do_div(length, map->num_stripes / map->sub_stripes); + else if (map->type & BTRFS_BLOCK_GROUP_RAID0) + do_div(length, map->num_stripes); + + buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); + BUG_ON(!buf); + + for (i = 0; i < map->num_stripes; i++) { + if (devid && map->stripes[i].dev->devid != devid) + continue; + if (map->stripes[i].physical > physical || + map->stripes[i].physical + length <= physical) + continue; + + stripe_nr = physical - map->stripes[i].physical; + do_div(stripe_nr, map->stripe_len); + + if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripe_nr = stripe_nr * map->num_stripes + i; + do_div(stripe_nr, map->sub_stripes); + } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + stripe_nr = stripe_nr * map->num_stripes + i; + } + bytenr = chunk_start + stripe_nr * map->stripe_len; + WARN_ON(nr >= map->num_stripes); + for (j = 0; j < nr; j++) { + if (buf[j] == bytenr) + break; + } + if (j == nr) { + WARN_ON(nr >= map->num_stripes); + buf[nr++] = bytenr; + } + } + + for (i = 0; i > nr; i++) { + struct btrfs_multi_bio *multi; + struct btrfs_bio_stripe *stripe; + int ret; + + length = 1; + ret = btrfs_map_block(map_tree, WRITE, buf[i], + &length, &multi, 0); + BUG_ON(ret); + + stripe = multi->stripes; + for (j = 0; j < multi->num_stripes; j++) { + if (stripe->physical >= physical && + physical < stripe->physical + length) + break; + } + BUG_ON(j >= multi->num_stripes); + kfree(multi); + } + + *logical = buf; + *naddrs = nr; + *stripe_len = map->stripe_len; + + free_extent_map(em); + return 0; +} + +int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, + u64 logical, struct page *page) +{ + u64 length = PAGE_CACHE_SIZE; + return __btrfs_map_block(map_tree, READ, logical, &length, + NULL, 0, page); +} + +static void end_bio_multi_stripe(struct bio *bio, int err) +{ + struct btrfs_multi_bio *multi = bio->bi_private; + int is_orig_bio = 0; + + if (err) + atomic_inc(&multi->error); + + if (bio == multi->orig_bio) + is_orig_bio = 1; + + if (atomic_dec_and_test(&multi->stripes_pending)) { + if (!is_orig_bio) { + bio_put(bio); + bio = multi->orig_bio; + } + bio->bi_private = multi->private; + bio->bi_end_io = multi->end_io; + /* only send an error to the higher layers if it is + * beyond the tolerance of the multi-bio + */ + if (atomic_read(&multi->error) > multi->max_errors) { + err = -EIO; + } else if (err) { + /* + * this bio is actually up to date, we didn't + * go over the max number of errors + */ + set_bit(BIO_UPTODATE, &bio->bi_flags); + err = 0; + } + kfree(multi); + + bio_endio(bio, err); + } else if (!is_orig_bio) { + bio_put(bio); + } +} + +struct async_sched { + struct bio *bio; + int rw; + struct btrfs_fs_info *info; + struct btrfs_work work; +}; + +/* + * see run_scheduled_bios for a description of why bios are collected for + * async submit. + * + * This will add one bio to the pending list for a device and make sure + * the work struct is scheduled. + */ +static noinline int schedule_bio(struct btrfs_root *root, + struct btrfs_device *device, + int rw, struct bio *bio) +{ + int should_queue = 1; + + /* don't bother with additional async steps for reads, right now */ + if (!(rw & (1 << BIO_RW))) { + bio_get(bio); + submit_bio(rw, bio); + bio_put(bio); + return 0; + } + + /* + * nr_async_bios allows us to reliably return congestion to the + * higher layers. Otherwise, the async bio makes it appear we have + * made progress against dirty pages when we've really just put it + * on a queue for later + */ + atomic_inc(&root->fs_info->nr_async_bios); + WARN_ON(bio->bi_next); + bio->bi_next = NULL; + bio->bi_rw |= rw; + + spin_lock(&device->io_lock); + + if (device->pending_bio_tail) + device->pending_bio_tail->bi_next = bio; + + device->pending_bio_tail = bio; + if (!device->pending_bios) + device->pending_bios = bio; + if (device->running_pending) + should_queue = 0; + + spin_unlock(&device->io_lock); + + if (should_queue) + btrfs_queue_worker(&root->fs_info->submit_workers, + &device->work); + return 0; +} + +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, + int mirror_num, int async_submit) +{ + struct btrfs_mapping_tree *map_tree; + struct btrfs_device *dev; + struct bio *first_bio = bio; + u64 logical = (u64)bio->bi_sector << 9; + u64 length = 0; + u64 map_length; + struct btrfs_multi_bio *multi = NULL; + int ret; + int dev_nr = 0; + int total_devs = 1; + + length = bio->bi_size; + map_tree = &root->fs_info->mapping_tree; + map_length = length; + + ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, + mirror_num); + BUG_ON(ret); + + total_devs = multi->num_stripes; + if (map_length < length) { + printk(KERN_CRIT "mapping failed logical %llu bio len %llu " + "len %llu\n", (unsigned long long)logical, + (unsigned long long)length, + (unsigned long long)map_length); + BUG(); + } + multi->end_io = first_bio->bi_end_io; + multi->private = first_bio->bi_private; + multi->orig_bio = first_bio; + atomic_set(&multi->stripes_pending, multi->num_stripes); + + while (dev_nr < total_devs) { + if (total_devs > 1) { + if (dev_nr < total_devs - 1) { + bio = bio_clone(first_bio, GFP_NOFS); + BUG_ON(!bio); + } else { + bio = first_bio; + } + bio->bi_private = multi; + bio->bi_end_io = end_bio_multi_stripe; + } + bio->bi_sector = multi->stripes[dev_nr].physical >> 9; + dev = multi->stripes[dev_nr].dev; + BUG_ON(rw == WRITE && !dev->writeable); + if (dev && dev->bdev) { + bio->bi_bdev = dev->bdev; + if (async_submit) + schedule_bio(root, dev, rw, bio); + else + submit_bio(rw, bio); + } else { + bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; + bio->bi_sector = logical >> 9; + bio_endio(bio, -EIO); + } + dev_nr++; + } + if (total_devs == 1) + kfree(multi); + return 0; +} + +struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, + u8 *uuid, u8 *fsid) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *cur_devices; + + cur_devices = root->fs_info->fs_devices; + while (cur_devices) { + if (!fsid || + !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { + device = __find_device(&cur_devices->devices, + devid, uuid); + if (device) + return device; + } + cur_devices = cur_devices->seed; + } + return NULL; +} + +static struct btrfs_device *add_missing_dev(struct btrfs_root *root, + u64 devid, u8 *dev_uuid) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) + return NULL; + list_add(&device->dev_list, + &fs_devices->devices); + device->barriers = 1; + device->dev_root = root->fs_info->dev_root; + device->devid = devid; + device->work.func = pending_bios_fn; + device->fs_devices = fs_devices; + fs_devices->num_devices++; + spin_lock_init(&device->io_lock); + INIT_LIST_HEAD(&device->dev_alloc_list); + memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); + return device; +} + +static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + u64 logical; + u64 length; + u64 devid; + u8 uuid[BTRFS_UUID_SIZE]; + int num_stripes; + int ret; + int i; + + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + + spin_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); + spin_unlock(&map_tree->map_tree.lock); + + /* already mapped? */ + if (em && em->start <= logical && em->start + em->len > logical) { + free_extent_map(em); + return 0; + } else if (em) { + free_extent_map(em); + } + + map = kzalloc(sizeof(*map), GFP_NOFS); + if (!map) + return -ENOMEM; + + em = alloc_extent_map(GFP_NOFS); + if (!em) + return -ENOMEM; + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + free_extent_map(em); + return -ENOMEM; + } + + em->bdev = (struct block_device *)map; + em->start = logical; + em->len = length; + em->block_start = 0; + em->block_len = em->len; + + map->num_stripes = num_stripes; + map->io_width = btrfs_chunk_io_width(leaf, chunk); + map->io_align = btrfs_chunk_io_align(leaf, chunk); + map->sector_size = btrfs_chunk_sector_size(leaf, chunk); + map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + map->type = btrfs_chunk_type(leaf, chunk); + map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + for (i = 0; i < num_stripes; i++) { + map->stripes[i].physical = + btrfs_stripe_offset_nr(leaf, chunk, i); + devid = btrfs_stripe_devid_nr(leaf, chunk, i); + read_extent_buffer(leaf, uuid, (unsigned long) + btrfs_stripe_dev_uuid_nr(chunk, i), + BTRFS_UUID_SIZE); + map->stripes[i].dev = btrfs_find_device(root, devid, uuid, + NULL); + if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { + kfree(map); + free_extent_map(em); + return -EIO; + } + if (!map->stripes[i].dev) { + map->stripes[i].dev = + add_missing_dev(root, devid, uuid); + if (!map->stripes[i].dev) { + kfree(map); + free_extent_map(em); + return -EIO; + } + } + map->stripes[i].dev->in_fs_metadata = 1; + } + + spin_lock(&map_tree->map_tree.lock); + ret = add_extent_mapping(&map_tree->map_tree, em); + spin_unlock(&map_tree->map_tree.lock); + BUG_ON(ret); + free_extent_map(em); + + return 0; +} + +static int fill_device_from_item(struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item, + struct btrfs_device *device) +{ + unsigned long ptr; + + device->devid = btrfs_device_id(leaf, dev_item); + device->total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); + device->type = btrfs_device_type(leaf, dev_item); + device->io_align = btrfs_device_io_align(leaf, dev_item); + device->io_width = btrfs_device_io_width(leaf, dev_item); + device->sector_size = btrfs_device_sector_size(leaf, dev_item); + + ptr = (unsigned long)btrfs_device_uuid(dev_item); + read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); + + return 0; +} + +static int open_seed_devices(struct btrfs_root *root, u8 *fsid) +{ + struct btrfs_fs_devices *fs_devices; + int ret; + + mutex_lock(&uuid_mutex); + + fs_devices = root->fs_info->fs_devices->seed; + while (fs_devices) { + if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { + ret = 0; + goto out; + } + fs_devices = fs_devices->seed; + } + + fs_devices = find_fsid(fsid); + if (!fs_devices) { + ret = -ENOENT; + goto out; + } + + fs_devices = clone_fs_devices(fs_devices); + if (IS_ERR(fs_devices)) { + ret = PTR_ERR(fs_devices); + goto out; + } + + ret = __btrfs_open_devices(fs_devices, FMODE_READ, + root->fs_info->bdev_holder); + if (ret) + goto out; + + if (!fs_devices->seeding) { + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + ret = -EINVAL; + goto out; + } + + fs_devices->seed = root->fs_info->fs_devices->seed; + root->fs_info->fs_devices->seed = fs_devices; +out: + mutex_unlock(&uuid_mutex); + return ret; +} + +static int read_one_dev(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item) +{ + struct btrfs_device *device; + u64 devid; + int ret; + u8 fs_uuid[BTRFS_UUID_SIZE]; + u8 dev_uuid[BTRFS_UUID_SIZE]; + + devid = btrfs_device_id(leaf, dev_item); + read_extent_buffer(leaf, dev_uuid, + (unsigned long)btrfs_device_uuid(dev_item), + BTRFS_UUID_SIZE); + read_extent_buffer(leaf, fs_uuid, + (unsigned long)btrfs_device_fsid(dev_item), + BTRFS_UUID_SIZE); + + if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { + ret = open_seed_devices(root, fs_uuid); + if (ret && !btrfs_test_opt(root, DEGRADED)) + return ret; + } + + device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + if (!device || !device->bdev) { + if (!btrfs_test_opt(root, DEGRADED)) + return -EIO; + + if (!device) { + printk(KERN_WARNING "warning devid %llu missing\n", + (unsigned long long)devid); + device = add_missing_dev(root, devid, dev_uuid); + if (!device) + return -ENOMEM; + } + } + + if (device->fs_devices != root->fs_info->fs_devices) { + BUG_ON(device->writeable); + if (device->generation != + btrfs_device_generation(leaf, dev_item)) + return -EINVAL; + } + + fill_device_from_item(leaf, dev_item, device); + device->dev_root = root->fs_info->dev_root; + device->in_fs_metadata = 1; + if (device->writeable) + device->fs_devices->total_rw_bytes += device->total_bytes; + ret = 0; + return ret; +} + +int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf) +{ + struct btrfs_dev_item *dev_item; + + dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block, + dev_item); + return read_one_dev(root, buf, dev_item); +} + +int btrfs_read_sys_array(struct btrfs_root *root) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct extent_buffer *sb; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *ptr; + unsigned long sb_ptr; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur; + struct btrfs_key key; + + sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, + BTRFS_SUPER_INFO_SIZE); + if (!sb) + return -ENOMEM; + btrfs_set_buffer_uptodate(sb); + write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; + sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); + cur = 0; + + while (cur < array_size) { + disk_key = (struct btrfs_disk_key *)ptr; + btrfs_disk_key_to_cpu(&key, disk_key); + + len = sizeof(*disk_key); ptr += len; + sb_ptr += len; + cur += len; + + if (key.type == BTRFS_CHUNK_ITEM_KEY) { + chunk = (struct btrfs_chunk *)sb_ptr; + ret = read_one_chunk(root, &key, sb, chunk); + if (ret) + break; + num_stripes = btrfs_chunk_num_stripes(sb, chunk); + len = btrfs_chunk_item_size(num_stripes); + } else { + ret = -EIO; + break; + } + ptr += len; + sb_ptr += len; + cur += len; + } + free_extent_buffer(sb); + return ret; +} + +int btrfs_read_chunk_tree(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + int slot; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* first we search for all of the device items, and then we + * read in all of the chunk items. This way we can create chunk + * mappings that reference all of the devices that are afound + */ + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = 0; +again: + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { + if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) + break; + if (found_key.type == BTRFS_DEV_ITEM_KEY) { + struct btrfs_dev_item *dev_item; + dev_item = btrfs_item_ptr(leaf, slot, + struct btrfs_dev_item); + ret = read_one_dev(root, leaf, dev_item); + if (ret) + goto error; + } + } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { + struct btrfs_chunk *chunk; + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + ret = read_one_chunk(root, &found_key, leaf, chunk); + if (ret) + goto error; + } + path->slots[0]++; + } + if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { + key.objectid = 0; + btrfs_release_path(root, path); + goto again; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h new file mode 100644 index 00000000000..86c44e9ae11 --- /dev/null +++ b/fs/btrfs/volumes.h @@ -0,0 +1,162 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_VOLUMES_ +#define __BTRFS_VOLUMES_ + +#include <linux/bio.h> +#include "async-thread.h" + +struct buffer_head; +struct btrfs_device { + struct list_head dev_list; + struct list_head dev_alloc_list; + struct btrfs_fs_devices *fs_devices; + struct btrfs_root *dev_root; + struct bio *pending_bios; + struct bio *pending_bio_tail; + int running_pending; + u64 generation; + + int barriers; + int writeable; + int in_fs_metadata; + + spinlock_t io_lock; + + struct block_device *bdev; + + /* the mode sent to open_bdev_exclusive */ + fmode_t mode; + + char *name; + + /* the internal btrfs device id */ + u64 devid; + + /* size of the device */ + u64 total_bytes; + + /* bytes used */ + u64 bytes_used; + + /* optimal io alignment for this device */ + u32 io_align; + + /* optimal io width for this device */ + u32 io_width; + + /* minimal io size for this device */ + u32 sector_size; + + /* type and info about this device */ + u64 type; + + /* physical drive uuid (or lvm uuid) */ + u8 uuid[BTRFS_UUID_SIZE]; + + struct btrfs_work work; +}; + +struct btrfs_fs_devices { + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + + /* the device with this id has the most recent coyp of the super */ + u64 latest_devid; + u64 latest_trans; + u64 num_devices; + u64 open_devices; + u64 rw_devices; + u64 total_rw_bytes; + struct block_device *latest_bdev; + /* all of the devices in the FS */ + struct list_head devices; + + /* devices not currently being allocated */ + struct list_head alloc_list; + struct list_head list; + + struct btrfs_fs_devices *seed; + int seeding; + + int opened; +}; + +struct btrfs_bio_stripe { + struct btrfs_device *dev; + u64 physical; +}; + +struct btrfs_multi_bio { + atomic_t stripes_pending; + bio_end_io_t *end_io; + struct bio *orig_bio; + void *private; + atomic_t error; + int max_errors; + int num_stripes; + struct btrfs_bio_stripe stripes[]; +}; + +#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ + (sizeof(struct btrfs_bio_stripe) * (n))) + +int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 start, u64 num_bytes); +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, int mirror_num); +int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 chunk_start, u64 physical, u64 devid, + u64 **logical, int *naddrs, int *stripe_len); +int btrfs_read_sys_array(struct btrfs_root *root); +int btrfs_read_chunk_tree(struct btrfs_root *root); +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 type); +void btrfs_mapping_init(struct btrfs_mapping_tree *tree); +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, + int mirror_num, int async_submit); +int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf); +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder); +int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, + struct btrfs_fs_devices **fs_devices_ret); +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device); +int btrfs_rm_device(struct btrfs_root *root, char *device_path); +int btrfs_cleanup_fs_uuids(void); +int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); +int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, + u64 logical, struct page *page); +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size); +struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, + u8 *uuid, u8 *fsid); +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); +int btrfs_init_new_device(struct btrfs_root *root, char *path); +int btrfs_balance(struct btrfs_root *dev_root); +void btrfs_unlock_volumes(void); +void btrfs_lock_volumes(void); +int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); +#endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c new file mode 100644 index 00000000000..7f332e27089 --- /dev/null +++ b/fs/btrfs/xattr.c @@ -0,0 +1,322 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/rwsem.h> +#include <linux/xattr.h> +#include "ctree.h" +#include "btrfs_inode.h" +#include "transaction.h" +#include "xattr.h" +#include "disk-io.h" + + +ssize_t __btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct btrfs_dir_item *di; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret = 0; + unsigned long data_ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* lookup the xattr by name */ + di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name, + strlen(name), 0); + if (!di || IS_ERR(di)) { + ret = -ENODATA; + goto out; + } + + leaf = path->nodes[0]; + /* if size is 0, that means we want the size of the attr */ + if (!size) { + ret = btrfs_dir_data_len(leaf, di); + goto out; + } + + /* now get the data out of our dir_item */ + if (btrfs_dir_data_len(leaf, di) > size) { + ret = -ERANGE; + goto out; + } + data_ptr = (unsigned long)((char *)(di + 1) + + btrfs_dir_name_len(leaf, di)); + read_extent_buffer(leaf, buffer, data_ptr, + btrfs_dir_data_len(leaf, di)); + ret = btrfs_dir_data_len(leaf, di); + +out: + btrfs_free_path(path); + return ret; +} + +int __btrfs_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct btrfs_dir_item *di; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + int ret = 0, mod = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + /* first lets see if we already have this xattr */ + di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name, + strlen(name), -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + /* ok we already have this xattr, lets remove it */ + if (di) { + /* if we want create only exit */ + if (flags & XATTR_CREATE) { + ret = -EEXIST; + goto out; + } + + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto out; + btrfs_release_path(root, path); + + /* if we don't have a value then we are removing the xattr */ + if (!value) { + mod = 1; + goto out; + } + } else { + btrfs_release_path(root, path); + + if (flags & XATTR_REPLACE) { + /* we couldn't find the attr to replace */ + ret = -ENODATA; + goto out; + } + } + + /* ok we have to create a completely new xattr */ + ret = btrfs_insert_xattr_item(trans, root, name, strlen(name), + value, size, inode->i_ino); + if (ret) + goto out; + mod = 1; + +out: + if (mod) { + inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, inode); + } + + btrfs_end_transaction(trans, root); + btrfs_free_path(path); + return ret; +} + +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct btrfs_key key, found_key; + struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct btrfs_item *item; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + int ret = 0, slot, advance; + size_t total_size = 0, size_left = size; + unsigned long name_ptr; + size_t name_len; + u32 nritems; + + /* + * ok we want all objects associated with this id. + * NOTE: we set key.offset = 0; because we want to start with the + * first xattr that we find and walk forward + */ + key.objectid = inode->i_ino; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + + /* search for our xattrs */ + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + ret = 0; + advance = 0; + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + + /* this is where we start walking through the path */ + if (advance || slot >= nritems) { + /* + * if we've reached the last slot in this leaf we need + * to go to the next leaf and reset everything + */ + if (slot >= nritems-1) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + } else { + /* + * just walking through the slots on this leaf + */ + slot++; + path->slots[0]++; + } + } + advance = 1; + + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* check to make sure this item is what we want */ + if (found_key.objectid != key.objectid) + break; + if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY) + break; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + + name_len = btrfs_dir_name_len(leaf, di); + total_size += name_len + 1; + + /* we are just looking for how big our buffer needs to be */ + if (!size) + continue; + + if (!buffer || (name_len + 1) > size_left) { + ret = -ERANGE; + goto err; + } + + name_ptr = (unsigned long)(di + 1); + read_extent_buffer(leaf, buffer, name_ptr, name_len); + buffer[name_len] = '\0'; + + size_left -= name_len + 1; + buffer += name_len + 1; + } + ret = total_size; + +err: + btrfs_free_path(path); + + return ret; +} + +/* + * List of handlers for synthetic system.* attributes. All real ondisk + * attributes are handled directly. + */ +struct xattr_handler *btrfs_xattr_handlers[] = { +#ifdef CONFIG_FS_POSIX_ACL + &btrfs_xattr_acl_access_handler, + &btrfs_xattr_acl_default_handler, +#endif + NULL, +}; + +/* + * Check if the attribute is in a supported namespace. + * + * This applied after the check for the synthetic attributes in the system + * namespace. + */ +static bool btrfs_is_valid_xattr(const char *name) +{ + return !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || + !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || + !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); +} + +ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, buffer, size); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + return __btrfs_getxattr(dentry->d_inode, name, buffer, size); +} + +int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, size, flags); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + + if (size == 0) + value = ""; /* empty EA, do not remove */ + return __btrfs_setxattr(dentry->d_inode, name, value, size, flags); +} + +int btrfs_removexattr(struct dentry *dentry, const char *name) +{ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); +} diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h new file mode 100644 index 00000000000..5b1d08f8e68 --- /dev/null +++ b/fs/btrfs/xattr.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __XATTR__ +#define __XATTR__ + +#include <linux/xattr.h> + +extern struct xattr_handler btrfs_xattr_acl_access_handler; +extern struct xattr_handler btrfs_xattr_acl_default_handler; +extern struct xattr_handler *btrfs_xattr_handlers[]; + +extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size); +extern int __btrfs_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags); + +extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size); +extern int btrfs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +extern int btrfs_removexattr(struct dentry *dentry, const char *name); + +#endif /* __XATTR__ */ diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c new file mode 100644 index 00000000000..ecfbce836d3 --- /dev/null +++ b/fs/btrfs/zlib.c @@ -0,0 +1,632 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on jffs2 zlib code: + * Copyright © 2001-2007 Red Hat, Inc. + * Created by David Woodhouse <dwmw2@infradead.org> + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/zlib.h> +#include <linux/zutil.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/bio.h> +#include "compression.h" + +/* Plan: call deflate() with avail_in == *sourcelen, + avail_out = *dstlen - 12 and flush == Z_FINISH. + If it doesn't manage to finish, call it again with + avail_in == 0 and avail_out set to the remaining 12 + bytes for it to clean up. + Q: Is 12 bytes sufficient? +*/ +#define STREAM_END_SPACE 12 + +struct workspace { + z_stream inf_strm; + z_stream def_strm; + char *buf; + struct list_head list; +}; + +static LIST_HEAD(idle_workspace); +static DEFINE_SPINLOCK(workspace_lock); +static unsigned long num_workspace; +static atomic_t alloc_workspace = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(workspace_wait); + +/* + * this finds an available zlib workspace or allocates a new one + * NULL or an ERR_PTR is returned if things go bad. + */ +static struct workspace *find_zlib_workspace(void) +{ + struct workspace *workspace; + int ret; + int cpus = num_online_cpus(); + +again: + spin_lock(&workspace_lock); + if (!list_empty(&idle_workspace)) { + workspace = list_entry(idle_workspace.next, struct workspace, + list); + list_del(&workspace->list); + num_workspace--; + spin_unlock(&workspace_lock); + return workspace; + + } + spin_unlock(&workspace_lock); + if (atomic_read(&alloc_workspace) > cpus) { + DEFINE_WAIT(wait); + prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(&alloc_workspace) > cpus) + schedule(); + finish_wait(&workspace_wait, &wait); + goto again; + } + atomic_inc(&alloc_workspace); + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) { + ret = -ENOMEM; + goto fail; + } + + workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); + if (!workspace->def_strm.workspace) { + ret = -ENOMEM; + goto fail; + } + workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); + if (!workspace->inf_strm.workspace) { + ret = -ENOMEM; + goto fail_inflate; + } + workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!workspace->buf) { + ret = -ENOMEM; + goto fail_kmalloc; + } + return workspace; + +fail_kmalloc: + vfree(workspace->inf_strm.workspace); +fail_inflate: + vfree(workspace->def_strm.workspace); +fail: + kfree(workspace); + atomic_dec(&alloc_workspace); + wake_up(&workspace_wait); + return ERR_PTR(ret); +} + +/* + * put a workspace struct back on the list or free it if we have enough + * idle ones sitting around + */ +static int free_workspace(struct workspace *workspace) +{ + spin_lock(&workspace_lock); + if (num_workspace < num_online_cpus()) { + list_add_tail(&workspace->list, &idle_workspace); + num_workspace++; + spin_unlock(&workspace_lock); + if (waitqueue_active(&workspace_wait)) + wake_up(&workspace_wait); + return 0; + } + spin_unlock(&workspace_lock); + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); + + atomic_dec(&alloc_workspace); + if (waitqueue_active(&workspace_wait)) + wake_up(&workspace_wait); + return 0; +} + +/* + * cleanup function for module exit + */ +static void free_workspaces(void) +{ + struct workspace *workspace; + while (!list_empty(&idle_workspace)) { + workspace = list_entry(idle_workspace.next, struct workspace, + list); + list_del(&workspace->list); + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); + atomic_dec(&alloc_workspace); + } +} + +/* + * given an address space and start/len, compress the bytes. + * + * pages are allocated to hold the compressed result and stored + * in 'pages' + * + * out_pages is used to return the number of pages allocated. There + * may be pages allocated even if we return an error + * + * total_in is used to return the number of bytes actually read. It + * may be smaller then len if we had to exit early because we + * ran out of room in the pages array or because we cross the + * max_out threshold. + * + * total_out is used to return the total number of compressed bytes + * + * max_out tells us the max number of bytes that we're allowed to + * stuff into pages + */ +int btrfs_zlib_compress_pages(struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + int ret; + struct workspace *workspace; + char *data_in; + char *cpage_out; + int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + int out_written = 0; + int in_read = 0; + unsigned long bytes_left; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + workspace = find_zlib_workspace(); + if (!workspace) + return -1; + + if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { + printk(KERN_WARNING "deflateInit failed\n"); + ret = -1; + goto out; + } + + workspace->def_strm.total_in = 0; + workspace->def_strm.total_out = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + cpage_out = kmap(out_page); + pages[0] = out_page; + nr_pages = 1; + + workspace->def_strm.next_in = data_in; + workspace->def_strm.next_out = cpage_out; + workspace->def_strm.avail_out = PAGE_CACHE_SIZE; + workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); + + out_written = 0; + in_read = 0; + + while (workspace->def_strm.total_in < len) { + ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); + if (ret != Z_OK) { + printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + ret); + zlib_deflateEnd(&workspace->def_strm); + ret = -1; + goto out; + } + + /* we're making it bigger, give up */ + if (workspace->def_strm.total_in > 8192 && + workspace->def_strm.total_in < + workspace->def_strm.total_out) { + ret = -1; + goto out; + } + /* we need another page for writing out. Test this + * before the total_in so we will pull in a new page for + * the stream end if required + */ + if (workspace->def_strm.avail_out == 0) { + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -1; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + cpage_out = kmap(out_page); + pages[nr_pages] = out_page; + nr_pages++; + workspace->def_strm.avail_out = PAGE_CACHE_SIZE; + workspace->def_strm.next_out = cpage_out; + } + /* we're all done */ + if (workspace->def_strm.total_in >= len) + break; + + /* we've read in a full page, get a new one */ + if (workspace->def_strm.avail_in == 0) { + if (workspace->def_strm.total_out > max_out) + break; + + bytes_left = len - workspace->def_strm.total_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, + start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + workspace->def_strm.avail_in = min(bytes_left, + PAGE_CACHE_SIZE); + workspace->def_strm.next_in = data_in; + } + } + workspace->def_strm.avail_in = 0; + ret = zlib_deflate(&workspace->def_strm, Z_FINISH); + zlib_deflateEnd(&workspace->def_strm); + + if (ret != Z_STREAM_END) { + ret = -1; + goto out; + } + + if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { + ret = -1; + goto out; + } + + ret = 0; + *total_out = workspace->def_strm.total_out; + *total_in = workspace->def_strm.total_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + free_workspace(workspace); + return ret; +} + +/* + * pages_in is an array of pages with compressed data. + * + * disk_start is the starting logical offset of this array in the file + * + * bvec is a bio_vec of pages from the file that we want to decompress into + * + * vcnt is the count of pages in the biovec + * + * srclen is the number of bytes in pages_in + * + * The basic idea is that we have a bio that was created by readpages. + * The pages in the bio are for the uncompressed data, and they may not + * be contiguous. They all correspond to the range of bytes covered by + * the compressed extent. + */ +int btrfs_zlib_decompress_biovec(struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + int ret = 0; + int wbits = MAX_WBITS; + struct workspace *workspace; + char *data_in; + size_t total_out = 0; + unsigned long page_bytes_left; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + struct page *page_out; + unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + unsigned long working_bytes; + unsigned long pg_offset; + unsigned long start_byte; + unsigned long current_buf_start; + char *kaddr; + + workspace = find_zlib_workspace(); + if (!workspace) + return -ENOMEM; + + data_in = kmap(pages_in[page_in_index]); + workspace->inf_strm.next_in = data_in; + workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); + workspace->inf_strm.total_in = 0; + + workspace->inf_strm.total_out = 0; + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + page_out = bvec[page_out_index].bv_page; + page_bytes_left = PAGE_CACHE_SIZE; + pg_offset = 0; + + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. */ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->inf_strm.next_in += 2; + workspace->inf_strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + ret = -1; + goto out; + } + while (workspace->inf_strm.total_in < srclen) { + ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + break; + /* + * buf start is the byte offset we're of the start of + * our workspace buffer + */ + buf_start = total_out; + + /* total_out is the last byte of the workspace buffer */ + total_out = workspace->inf_strm.total_out; + + working_bytes = total_out - buf_start; + + /* + * start byte is the first byte of the page we're currently + * copying into relative to the start of the compressed data. + */ + start_byte = page_offset(page_out) - disk_start; + + if (working_bytes == 0) { + /* we didn't make progress in this inflate + * call, we're done + */ + if (ret != Z_STREAM_END) + ret = -1; + break; + } + + /* we haven't yet hit data corresponding to this page */ + if (total_out <= start_byte) + goto next; + + /* + * the start of the data we care about is offset into + * the middle of our working buffer + */ + if (total_out > start_byte && buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes -= buf_offset; + } else { + buf_offset = 0; + } + current_buf_start = buf_start; + + /* copy bytes from the working buffer into the pages */ + while (working_bytes > 0) { + bytes = min(PAGE_CACHE_SIZE - pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, working_bytes); + kaddr = kmap_atomic(page_out, KM_USER0); + memcpy(kaddr + pg_offset, workspace->buf + buf_offset, + bytes); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page_out); + + pg_offset += bytes; + page_bytes_left -= bytes; + buf_offset += bytes; + working_bytes -= bytes; + current_buf_start += bytes; + + /* check if we need to pick another page */ + if (page_bytes_left == 0) { + page_out_index++; + if (page_out_index >= vcnt) { + ret = 0; + goto done; + } + + page_out = bvec[page_out_index].bv_page; + pg_offset = 0; + page_bytes_left = PAGE_CACHE_SIZE; + start_byte = page_offset(page_out) - disk_start; + + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) + goto next; + + /* the next page in the biovec might not + * be adjacent to the last page, but it + * might still be found inside this working + * buffer. bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + + buf_offset; + } + } + } +next: + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + + if (workspace->inf_strm.avail_in == 0) { + unsigned long tmp; + kunmap(pages_in[page_in_index]); + page_in_index++; + if (page_in_index >= total_pages_in) { + data_in = NULL; + break; + } + data_in = kmap(pages_in[page_in_index]); + workspace->inf_strm.next_in = data_in; + tmp = srclen - workspace->inf_strm.total_in; + workspace->inf_strm.avail_in = min(tmp, + PAGE_CACHE_SIZE); + } + } + if (ret != Z_STREAM_END) + ret = -1; + else + ret = 0; +done: + zlib_inflateEnd(&workspace->inf_strm); + if (data_in) + kunmap(pages_in[page_in_index]); +out: + free_workspace(workspace); + return ret; +} + +/* + * a less complex decompression routine. Our compressed data fits in a + * single page, and we want to read a single page out of it. + * start_byte tells us the offset into the compressed data we're interested in + */ +int btrfs_zlib_decompress(unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + int ret = 0; + int wbits = MAX_WBITS; + struct workspace *workspace; + unsigned long bytes_left = destlen; + unsigned long total_out = 0; + char *kaddr; + + if (destlen > PAGE_CACHE_SIZE) + return -ENOMEM; + + workspace = find_zlib_workspace(); + if (!workspace) + return -ENOMEM; + + workspace->inf_strm.next_in = data_in; + workspace->inf_strm.avail_in = srclen; + workspace->inf_strm.total_in = 0; + + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + workspace->inf_strm.total_out = 0; + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. */ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->inf_strm.next_in += 2; + workspace->inf_strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + ret = -1; + goto out; + } + + while (bytes_left > 0) { + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + unsigned long pg_offset = 0; + + ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + break; + + buf_start = total_out; + total_out = workspace->inf_strm.total_out; + + if (total_out == buf_start) { + ret = -1; + break; + } + + if (total_out <= start_byte) + goto next; + + if (total_out > start_byte && buf_start < start_byte) + buf_offset = start_byte - buf_start; + else + buf_offset = 0; + + bytes = min(PAGE_CACHE_SIZE - pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, bytes_left); + + kaddr = kmap_atomic(dest_page, KM_USER0); + memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes); + kunmap_atomic(kaddr, KM_USER0); + + pg_offset += bytes; + bytes_left -= bytes; +next: + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + } + + if (ret != Z_STREAM_END && bytes_left != 0) + ret = -1; + else + ret = 0; + + zlib_inflateEnd(&workspace->inf_strm); +out: + free_workspace(workspace); + return ret; +} + +void btrfs_zlib_exit(void) +{ + free_workspaces(); +} diff --git a/fs/buffer.c b/fs/buffer.c index 776ae091d3b..c26da785938 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1996,7 +1996,7 @@ int block_write_begin(struct file *file, struct address_space *mapping, page = *pagep; if (page == NULL) { ownpage = 1; - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { status = -ENOMEM; goto out; @@ -2022,7 +2022,6 @@ int block_write_begin(struct file *file, struct address_space *mapping, if (pos + len > inode->i_size) vmtruncate(inode, inode->i_size); } - goto out; } out: @@ -2502,7 +2501,7 @@ int nobh_write_begin(struct file *file, struct address_space *mapping, from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; diff --git a/fs/char_dev.c b/fs/char_dev.c index 700697a7261..38f71222a55 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -120,7 +120,7 @@ __register_chrdev_region(unsigned int major, unsigned int baseminor, cd->major = major; cd->baseminor = baseminor; cd->minorct = minorct; - strncpy(cd->name,name, 64); + strlcpy(cd->name, name, sizeof(cd->name)); i = major_to_index(major); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index b1e1fc6a6e6..12bb656fbe7 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2074,7 +2074,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping, cFYI(1, ("write_begin from %lld len %d", (long long)pos, len)); - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { rc = -ENOMEM; goto out; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index f247da9f4ed..5ab9896fdcb 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1641,7 +1641,7 @@ do_expand: i_size_write(inode, offset); spin_unlock(&inode->i_lock); out_truncate: - if (inode->i_op && inode->i_op->truncate) + if (inode->i_op->truncate) inode->i_op->truncate(inode); return 0; out_sig: diff --git a/fs/coda/file.c b/fs/coda/file.c index 466303db2df..6a347fbc998 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -201,8 +201,7 @@ int coda_release(struct inode *coda_inode, struct file *coda_file) int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) { struct file *host_file; - struct dentry *host_dentry; - struct inode *host_inode, *coda_inode = coda_dentry->d_inode; + struct inode *coda_inode = coda_dentry->d_inode; struct coda_file_info *cfi; int err = 0; @@ -214,14 +213,7 @@ int coda_fsync(struct file *coda_file, struct dentry *coda_dentry, int datasync) BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; - if (host_file->f_op && host_file->f_op->fsync) { - host_dentry = host_file->f_path.dentry; - host_inode = host_dentry->d_inode; - mutex_lock(&host_inode->i_mutex); - err = host_file->f_op->fsync(host_file, host_dentry, datasync); - mutex_unlock(&host_inode->i_mutex); - } - + err = vfs_fsync(host_file, host_file->f_path.dentry, datasync); if ( !err && !datasync ) { lock_kernel(); err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); diff --git a/fs/coda/sysctl.c b/fs/coda/sysctl.c index 81b7771c646..43c96ce2961 100644 --- a/fs/coda/sysctl.c +++ b/fs/coda/sysctl.c @@ -11,7 +11,9 @@ #include "coda_int.h" +#ifdef CONFIG_SYSCTL static struct ctl_table_header *fs_table_header; +#endif static ctl_table coda_table[] = { { @@ -41,6 +43,7 @@ static ctl_table coda_table[] = { {} }; +#ifdef CONFIG_SYSCTL static ctl_table fs_table[] = { { .ctl_name = CTL_UNNUMBERED, @@ -50,7 +53,7 @@ static ctl_table fs_table[] = { }, {} }; - +#endif void coda_sysctl_init(void) { diff --git a/fs/compat.c b/fs/compat.c index d1ece79b641..30f2faa22f5 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1187,6 +1187,9 @@ compat_sys_readv(unsigned long fd, const struct compat_iovec __user *vec, unsign ret = compat_do_readv_writev(READ, file, vec, vlen, &file->f_pos); out: + if (ret > 0) + add_rchar(current, ret); + inc_syscr(current); fput(file); return ret; } @@ -1210,6 +1213,9 @@ compat_sys_writev(unsigned long fd, const struct compat_iovec __user *vec, unsig ret = compat_do_readv_writev(WRITE, file, vec, vlen, &file->f_pos); out: + if (ret > 0) + add_wchar(current, ret); + inc_syscw(current); fput(file); return ret; } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 4803ccc9448..5d349d38e05 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -117,8 +117,6 @@ int configfs_setattr(struct dentry * dentry, struct iattr * iattr) static inline void set_default_inode_attr(struct inode * inode, mode_t mode) { inode->i_mode = mode; - inode->i_uid = 0; - inode->i_gid = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; } @@ -136,7 +134,6 @@ struct inode * configfs_new_inode(mode_t mode, struct configfs_dirent * sd) { struct inode * inode = new_inode(configfs_sb); if (inode) { - inode->i_blocks = 0; inode->i_mapping->a_ops = &configfs_aops; inode->i_mapping->backing_dev_info = &configfs_backing_dev_info; inode->i_op = &configfs_inode_operations; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index f40423eb1a1..a07338d2d14 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -83,8 +83,6 @@ static struct inode *get_cramfs_inode(struct super_block *sb, inode->i_op = &page_symlink_inode_operations; inode->i_data.a_ops = &cramfs_aops; } else { - inode->i_size = 0; - inode->i_blocks = 0; init_special_inode(inode, inode->i_mode, old_decode_dev(cramfs_inode->size)); } diff --git a/fs/dcache.c b/fs/dcache.c index e88c23b85a3..4547f66884a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1567,10 +1567,6 @@ void d_rehash(struct dentry * entry) spin_unlock(&dcache_lock); } -#define do_switch(x,y) do { \ - __typeof__ (x) __tmp = x; \ - x = y; y = __tmp; } while (0) - /* * When switching names, the actual string doesn't strictly have to * be preserved in the target - because we're dropping the target @@ -1589,7 +1585,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target) /* * Both external: swap the pointers */ - do_switch(target->d_name.name, dentry->d_name.name); + swap(target->d_name.name, dentry->d_name.name); } else { /* * dentry:internal, target:external. Steal target's @@ -1620,7 +1616,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target) return; } } - do_switch(dentry->d_name.len, target->d_name.len); + swap(dentry->d_name.len, target->d_name.len); } /* @@ -1680,7 +1676,7 @@ already_unhashed: /* Switch the names.. */ switch_names(dentry, target); - do_switch(dentry->d_name.hash, target->d_name.hash); + swap(dentry->d_name.hash, target->d_name.hash); /* ... and switch the parents */ if (IS_ROOT(dentry)) { @@ -1688,7 +1684,7 @@ already_unhashed: target->d_parent = target; INIT_LIST_HEAD(&target->d_u.d_child); } else { - do_switch(dentry->d_parent, target->d_parent); + swap(dentry->d_parent, target->d_parent); /* And add them back to the (new) parent lists */ list_add(&target->d_u.d_child, &target->d_parent->d_subdirs); @@ -1789,7 +1785,7 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) struct dentry *dparent, *aparent; switch_names(dentry, anon); - do_switch(dentry->d_name.hash, anon->d_name.hash); + swap(dentry->d_name.hash, anon->d_name.hash); dparent = dentry->d_parent; aparent = anon->d_parent; diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 159a5efd6a8..33a90120f6a 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -294,6 +294,38 @@ struct dentry *debugfs_create_x32(const char *name, mode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_x32); + +static int debugfs_size_t_set(void *data, u64 val) +{ + *(size_t *)data = val; + return 0; +} +static int debugfs_size_t_get(void *data, u64 *val) +{ + *val = *(size_t *)data; + return 0; +} +DEFINE_SIMPLE_ATTRIBUTE(fops_size_t, debugfs_size_t_get, debugfs_size_t_set, + "%llu\n"); /* %llu and %zu are more or less the same */ + +/** + * debugfs_create_size_t - create a debugfs file that is used to read and write an size_t value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + */ +struct dentry *debugfs_create_size_t(const char *name, mode_t mode, + struct dentry *parent, size_t *value) +{ + return debugfs_create_file(name, mode, parent, value, &fops_size_t); +} +EXPORT_SYMBOL_GPL(debugfs_create_size_t); + + static ssize_t read_file_bool(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 3dbe2169cf3..81ae9ea3c6e 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -37,9 +37,6 @@ static struct inode *debugfs_get_inode(struct super_block *sb, int mode, dev_t d if (inode) { inode->i_mode = mode; - inode->i_uid = 0; - inode->i_gid = 0; - inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; switch (mode & S_IFMT) { default: diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index fff96e152c0..5f3231b9633 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -189,8 +189,6 @@ static int mknod_ptmx(struct super_block *sb) } inode->i_ino = 2; - inode->i_uid = inode->i_gid = 0; - inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; mode = S_IFCHR|opts->ptmxmode; @@ -300,8 +298,6 @@ devpts_fill_super(struct super_block *s, void *data, int silent) goto free_fsi; inode->i_ino = 1; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; - inode->i_blocks = 0; - inode->i_uid = inode->i_gid = 0; inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; diff --git a/fs/direct-io.c b/fs/direct-io.c index af0558dbe8b..b6d43908ff7 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1209,6 +1209,19 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, retval = direct_io_worker(rw, iocb, inode, iov, offset, nr_segs, blkbits, get_block, end_io, dio); + /* + * In case of error extending write may have instantiated a few + * blocks outside i_size. Trim these off again for DIO_LOCKING. + * NOTE: DIO_NO_LOCK/DIO_OWN_LOCK callers have to handle this by + * it's own meaner. + */ + if (unlikely(retval < 0 && (rw & WRITE))) { + loff_t isize = i_size_read(inode); + + if (end > isize && dio_lock_type == DIO_LOCKING) + vmtruncate(inode, isize); + } + if (rw == READ && dio_lock_type == DIO_LOCKING) release_i_mutex = 0; diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index 8bf31e3fbf0..dc2ad6008b2 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2005 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -33,10 +33,10 @@ void dlm_del_ast(struct dlm_lkb *lkb) spin_unlock(&ast_queue_lock); } -void dlm_add_ast(struct dlm_lkb *lkb, int type) +void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode) { if (lkb->lkb_flags & DLM_IFL_USER) { - dlm_user_add_ast(lkb, type); + dlm_user_add_ast(lkb, type, bastmode); return; } @@ -46,6 +46,8 @@ void dlm_add_ast(struct dlm_lkb *lkb, int type) list_add_tail(&lkb->lkb_astqueue, &ast_queue); } lkb->lkb_ast_type |= type; + if (bastmode) + lkb->lkb_bastmode = bastmode; spin_unlock(&ast_queue_lock); set_bit(WAKE_ASTS, &astd_wakeflags); @@ -59,50 +61,40 @@ static void process_asts(void) struct dlm_lkb *lkb; void (*cast) (void *astparam); void (*bast) (void *astparam, int mode); - int type = 0, found, bmode; - - for (;;) { - found = 0; - spin_lock(&ast_queue_lock); - list_for_each_entry(lkb, &ast_queue, lkb_astqueue) { - r = lkb->lkb_resource; - ls = r->res_ls; - - if (dlm_locking_stopped(ls)) - continue; - - list_del(&lkb->lkb_astqueue); - type = lkb->lkb_ast_type; - lkb->lkb_ast_type = 0; - found = 1; - break; - } - spin_unlock(&ast_queue_lock); + int type = 0, bastmode; + +repeat: + spin_lock(&ast_queue_lock); + list_for_each_entry(lkb, &ast_queue, lkb_astqueue) { + r = lkb->lkb_resource; + ls = r->res_ls; + + if (dlm_locking_stopped(ls)) + continue; - if (!found) - break; + list_del(&lkb->lkb_astqueue); + type = lkb->lkb_ast_type; + lkb->lkb_ast_type = 0; + bastmode = lkb->lkb_bastmode; + spin_unlock(&ast_queue_lock); cast = lkb->lkb_astfn; bast = lkb->lkb_bastfn; - bmode = lkb->lkb_bastmode; if ((type & AST_COMP) && cast) cast(lkb->lkb_astparam); - /* FIXME: Is it safe to look at lkb_grmode here - without doing a lock_rsb() ? - Look at other checks in v1 to avoid basts. */ - if ((type & AST_BAST) && bast) - if (!dlm_modes_compat(lkb->lkb_grmode, bmode)) - bast(lkb->lkb_astparam, bmode); + bast(lkb->lkb_astparam, bastmode); /* this removes the reference added by dlm_add_ast and may result in the lkb being freed */ dlm_put_lkb(lkb); - schedule(); + cond_resched(); + goto repeat; } + spin_unlock(&ast_queue_lock); } static inline int no_asts(void) diff --git a/fs/dlm/ast.h b/fs/dlm/ast.h index 6ee276c74c5..1b5fc5f428f 100644 --- a/fs/dlm/ast.h +++ b/fs/dlm/ast.h @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -13,7 +13,7 @@ #ifndef __ASTD_DOT_H__ #define __ASTD_DOT_H__ -void dlm_add_ast(struct dlm_lkb *lkb, int type); +void dlm_add_ast(struct dlm_lkb *lkb, int type, int bastmode); void dlm_del_ast(struct dlm_lkb *lkb); void dlm_astd_wake(void); diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 8fc24f4507a..2f107d1a6a4 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -1,7 +1,7 @@ /****************************************************************************** ******************************************************************************* ** -** Copyright (C) 2005 Red Hat, Inc. All rights reserved. +** Copyright (C) 2005-2008 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -27,7 +27,7 @@ static struct dentry *dlm_root; struct rsb_iter { int entry; - int locks; + int format; int header; struct dlm_ls *ls; struct list_head *next; @@ -60,8 +60,8 @@ static char *print_lockmode(int mode) } } -static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb, - struct dlm_rsb *res) +static void print_format1_lock(struct seq_file *s, struct dlm_lkb *lkb, + struct dlm_rsb *res) { seq_printf(s, "%08x %s", lkb->lkb_id, print_lockmode(lkb->lkb_grmode)); @@ -83,7 +83,7 @@ static void print_resource_lock(struct seq_file *s, struct dlm_lkb *lkb, seq_printf(s, "\n"); } -static int print_resource(struct dlm_rsb *res, struct seq_file *s) +static int print_format1(struct dlm_rsb *res, struct seq_file *s) { struct dlm_lkb *lkb; int i, lvblen = res->res_ls->ls_lvblen, recover_list, root_list; @@ -134,15 +134,15 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s) /* Print the locks attached to this resource */ seq_printf(s, "Granted Queue\n"); list_for_each_entry(lkb, &res->res_grantqueue, lkb_statequeue) - print_resource_lock(s, lkb, res); + print_format1_lock(s, lkb, res); seq_printf(s, "Conversion Queue\n"); list_for_each_entry(lkb, &res->res_convertqueue, lkb_statequeue) - print_resource_lock(s, lkb, res); + print_format1_lock(s, lkb, res); seq_printf(s, "Waiting Queue\n"); list_for_each_entry(lkb, &res->res_waitqueue, lkb_statequeue) - print_resource_lock(s, lkb, res); + print_format1_lock(s, lkb, res); if (list_empty(&res->res_lookup)) goto out; @@ -160,23 +160,24 @@ static int print_resource(struct dlm_rsb *res, struct seq_file *s) return 0; } -static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb *r) +static void print_format2_lock(struct seq_file *s, struct dlm_lkb *lkb, + struct dlm_rsb *r) { - unsigned int waiting = 0; - uint64_t xid = 0; + u64 xid = 0; + u64 us; if (lkb->lkb_flags & DLM_IFL_USER) { if (lkb->lkb_ua) xid = lkb->lkb_ua->xid; } - if (lkb->lkb_timestamp) - waiting = jiffies_to_msecs(jiffies - lkb->lkb_timestamp); + /* microseconds since lkb was added to current queue */ + us = ktime_to_us(ktime_sub(ktime_get(), lkb->lkb_timestamp)); - /* id nodeid remid pid xid exflags flags sts grmode rqmode time_ms + /* id nodeid remid pid xid exflags flags sts grmode rqmode time_us r_nodeid r_len r_name */ - seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %u %u %d \"%s\"\n", + seq_printf(s, "%x %d %x %u %llu %x %x %d %d %d %llu %u %d \"%s\"\n", lkb->lkb_id, lkb->lkb_nodeid, lkb->lkb_remid, @@ -187,26 +188,114 @@ static void print_lock(struct seq_file *s, struct dlm_lkb *lkb, struct dlm_rsb * lkb->lkb_status, lkb->lkb_grmode, lkb->lkb_rqmode, - waiting, + (unsigned long long)us, r->res_nodeid, r->res_length, r->res_name); } -static int print_locks(struct dlm_rsb *r, struct seq_file *s) +static int print_format2(struct dlm_rsb *r, struct seq_file *s) { struct dlm_lkb *lkb; lock_rsb(r); list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) - print_lock(s, lkb, r); + print_format2_lock(s, lkb, r); list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) - print_lock(s, lkb, r); + print_format2_lock(s, lkb, r); list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) - print_lock(s, lkb, r); + print_format2_lock(s, lkb, r); + + unlock_rsb(r); + return 0; +} + +static void print_format3_lock(struct seq_file *s, struct dlm_lkb *lkb, + int rsb_lookup) +{ + u64 xid = 0; + + if (lkb->lkb_flags & DLM_IFL_USER) { + if (lkb->lkb_ua) + xid = lkb->lkb_ua->xid; + } + + seq_printf(s, "lkb %x %d %x %u %llu %x %x %d %d %d %d %d %d %u %llu %llu\n", + lkb->lkb_id, + lkb->lkb_nodeid, + lkb->lkb_remid, + lkb->lkb_ownpid, + (unsigned long long)xid, + lkb->lkb_exflags, + lkb->lkb_flags, + lkb->lkb_status, + lkb->lkb_grmode, + lkb->lkb_rqmode, + lkb->lkb_highbast, + rsb_lookup, + lkb->lkb_wait_type, + lkb->lkb_lvbseq, + (unsigned long long)ktime_to_ns(lkb->lkb_timestamp), + (unsigned long long)ktime_to_ns(lkb->lkb_time_bast)); +} + +static int print_format3(struct dlm_rsb *r, struct seq_file *s) +{ + struct dlm_lkb *lkb; + int i, lvblen = r->res_ls->ls_lvblen; + int print_name = 1; + + lock_rsb(r); + + seq_printf(s, "rsb %p %d %x %lx %d %d %u %d ", + r, + r->res_nodeid, + r->res_first_lkid, + r->res_flags, + !list_empty(&r->res_root_list), + !list_empty(&r->res_recover_list), + r->res_recover_locks_count, + r->res_length); + + for (i = 0; i < r->res_length; i++) { + if (!isascii(r->res_name[i]) || !isprint(r->res_name[i])) + print_name = 0; + } + + seq_printf(s, "%s", print_name ? "str " : "hex"); + + for (i = 0; i < r->res_length; i++) { + if (print_name) + seq_printf(s, "%c", r->res_name[i]); + else + seq_printf(s, " %02x", (unsigned char)r->res_name[i]); + } + seq_printf(s, "\n"); + + if (!r->res_lvbptr) + goto do_locks; + + seq_printf(s, "lvb %u %d", r->res_lvbseq, lvblen); + + for (i = 0; i < lvblen; i++) + seq_printf(s, " %02x", (unsigned char)r->res_lvbptr[i]); + seq_printf(s, "\n"); + + do_locks: + list_for_each_entry(lkb, &r->res_grantqueue, lkb_statequeue) + print_format3_lock(s, lkb, 0); + + list_for_each_entry(lkb, &r->res_convertqueue, lkb_statequeue) + print_format3_lock(s, lkb, 0); + + list_for_each_entry(lkb, &r->res_waitqueue, lkb_statequeue) + print_format3_lock(s, lkb, 0); + + list_for_each_entry(lkb, &r->res_lookup, lkb_rsb_lookup) + print_format3_lock(s, lkb, 1); unlock_rsb(r); return 0; @@ -231,7 +320,7 @@ static int rsb_iter_next(struct rsb_iter *ri) break; } read_unlock(&ls->ls_rsbtbl[i].lock); - } + } ri->entry = i; if (ri->entry >= ls->ls_rsbtbl_size) @@ -248,7 +337,7 @@ static int rsb_iter_next(struct rsb_iter *ri) read_unlock(&ls->ls_rsbtbl[i].lock); dlm_put_rsb(old); goto top; - } + } ri->rsb = list_entry(ri->next, struct dlm_rsb, res_hashchain); dlm_hold_rsb(ri->rsb); read_unlock(&ls->ls_rsbtbl[i].lock); @@ -274,6 +363,7 @@ static struct rsb_iter *rsb_iter_init(struct dlm_ls *ls) ri->ls = ls; ri->entry = 0; ri->next = NULL; + ri->format = 1; if (rsb_iter_next(ri)) { rsb_iter_free(ri); @@ -325,16 +415,26 @@ static int rsb_seq_show(struct seq_file *file, void *iter_ptr) { struct rsb_iter *ri = iter_ptr; - if (ri->locks) { + switch (ri->format) { + case 1: + print_format1(ri->rsb, file); + break; + case 2: if (ri->header) { - seq_printf(file, "id nodeid remid pid xid exflags flags " - "sts grmode rqmode time_ms r_nodeid " - "r_len r_name\n"); + seq_printf(file, "id nodeid remid pid xid exflags " + "flags sts grmode rqmode time_ms " + "r_nodeid r_len r_name\n"); ri->header = 0; } - print_locks(ri->rsb, file); - } else { - print_resource(ri->rsb, file); + print_format2(ri->rsb, file); + break; + case 3: + if (ri->header) { + seq_printf(file, "version rsb 1.1 lvb 1.1 lkb 1.1\n"); + ri->header = 0; + } + print_format3(ri->rsb, file); + break; } return 0; @@ -385,7 +485,7 @@ static struct rsb_iter *locks_iter_init(struct dlm_ls *ls, loff_t *pos) ri->ls = ls; ri->entry = 0; ri->next = NULL; - ri->locks = 1; + ri->format = 2; if (*pos == 0) ri->header = 1; @@ -448,6 +548,84 @@ static const struct file_operations locks_fops = { }; /* + * Dump all rsb/lvb/lkb state in compact listing, more complete than _locks + * This can replace both formats 1 and 2 eventually. + */ + +static struct rsb_iter *all_iter_init(struct dlm_ls *ls, loff_t *pos) +{ + struct rsb_iter *ri; + + ri = kzalloc(sizeof *ri, GFP_KERNEL); + if (!ri) + return NULL; + + ri->ls = ls; + ri->entry = 0; + ri->next = NULL; + ri->format = 3; + + if (*pos == 0) + ri->header = 1; + + if (rsb_iter_next(ri)) { + rsb_iter_free(ri); + return NULL; + } + + return ri; +} + +static void *all_seq_start(struct seq_file *file, loff_t *pos) +{ + struct rsb_iter *ri; + loff_t n = *pos; + + ri = all_iter_init(file->private, pos); + if (!ri) + return NULL; + + while (n--) { + if (rsb_iter_next(ri)) { + rsb_iter_free(ri); + return NULL; + } + } + + return ri; +} + +static struct seq_operations all_seq_ops = { + .start = all_seq_start, + .next = rsb_seq_next, + .stop = rsb_seq_stop, + .show = rsb_seq_show, +}; + +static int all_open(struct inode *inode, struct file *file) +{ + struct seq_file *seq; + int ret; + + ret = seq_open(file, &all_seq_ops); + if (ret) + return ret; + + seq = file->private_data; + seq->private = inode->i_private; + + return 0; +} + +static const struct file_operations all_fops = { + .owner = THIS_MODULE, + .open = all_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release +}; + +/* * dump lkb's on the ls_waiters list */ @@ -489,30 +667,33 @@ static const struct file_operations waiters_fops = { .read = waiters_read }; +void dlm_delete_debug_file(struct dlm_ls *ls) +{ + if (ls->ls_debug_rsb_dentry) + debugfs_remove(ls->ls_debug_rsb_dentry); + if (ls->ls_debug_waiters_dentry) + debugfs_remove(ls->ls_debug_waiters_dentry); + if (ls->ls_debug_locks_dentry) + debugfs_remove(ls->ls_debug_locks_dentry); + if (ls->ls_debug_all_dentry) + debugfs_remove(ls->ls_debug_all_dentry); +} + int dlm_create_debug_file(struct dlm_ls *ls) { char name[DLM_LOCKSPACE_LEN+8]; + /* format 1 */ + ls->ls_debug_rsb_dentry = debugfs_create_file(ls->ls_name, S_IFREG | S_IRUGO, dlm_root, ls, &rsb_fops); if (!ls->ls_debug_rsb_dentry) - return -ENOMEM; + goto fail; - memset(name, 0, sizeof(name)); - snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name); - - ls->ls_debug_waiters_dentry = debugfs_create_file(name, - S_IFREG | S_IRUGO, - dlm_root, - ls, - &waiters_fops); - if (!ls->ls_debug_waiters_dentry) { - debugfs_remove(ls->ls_debug_rsb_dentry); - return -ENOMEM; - } + /* format 2 */ memset(name, 0, sizeof(name)); snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_locks", ls->ls_name); @@ -522,23 +703,38 @@ int dlm_create_debug_file(struct dlm_ls *ls) dlm_root, ls, &locks_fops); - if (!ls->ls_debug_locks_dentry) { - debugfs_remove(ls->ls_debug_waiters_dentry); - debugfs_remove(ls->ls_debug_rsb_dentry); - return -ENOMEM; - } + if (!ls->ls_debug_locks_dentry) + goto fail; + + /* format 3 */ + + memset(name, 0, sizeof(name)); + snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_all", ls->ls_name); + + ls->ls_debug_all_dentry = debugfs_create_file(name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &all_fops); + if (!ls->ls_debug_all_dentry) + goto fail; + + memset(name, 0, sizeof(name)); + snprintf(name, DLM_LOCKSPACE_LEN+8, "%s_waiters", ls->ls_name); + + ls->ls_debug_waiters_dentry = debugfs_create_file(name, + S_IFREG | S_IRUGO, + dlm_root, + ls, + &waiters_fops); + if (!ls->ls_debug_waiters_dentry) + goto fail; return 0; -} -void dlm_delete_debug_file(struct dlm_ls *ls) -{ - if (ls->ls_debug_rsb_dentry) - debugfs_remove(ls->ls_debug_rsb_dentry); - if (ls->ls_debug_waiters_dentry) - debugfs_remove(ls->ls_debug_waiters_dentry); - if (ls->ls_debug_locks_dentry) - debugfs_remove(ls->ls_debug_locks_dentry); + fail: + dlm_delete_debug_file(ls); + return -ENOMEM; } int __init dlm_register_debugfs(void) diff --git a/fs/dlm/dir.c b/fs/dlm/dir.c index 85defeb64df..92969f879a1 100644 --- a/fs/dlm/dir.c +++ b/fs/dlm/dir.c @@ -374,7 +374,7 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, struct list_head *list; struct dlm_rsb *r; int offset = 0, dir_nodeid; - uint16_t be_namelen; + __be16 be_namelen; down_read(&ls->ls_root_sem); @@ -410,15 +410,15 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, if (offset + sizeof(uint16_t)*2 + r->res_length > outlen) { /* Write end-of-block record */ - be_namelen = 0; - memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); - offset += sizeof(uint16_t); + be_namelen = cpu_to_be16(0); + memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); + offset += sizeof(__be16); goto out; } be_namelen = cpu_to_be16(r->res_length); - memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); - offset += sizeof(uint16_t); + memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); + offset += sizeof(__be16); memcpy(outbuf + offset, r->res_name, r->res_length); offset += r->res_length; } @@ -430,9 +430,9 @@ void dlm_copy_master_names(struct dlm_ls *ls, char *inbuf, int inlen, if ((list == &ls->ls_root_list) && (offset + sizeof(uint16_t) <= outlen)) { - be_namelen = 0xFFFF; - memcpy(outbuf + offset, &be_namelen, sizeof(uint16_t)); - offset += sizeof(uint16_t); + be_namelen = cpu_to_be16(0xFFFF); + memcpy(outbuf + offset, &be_namelen, sizeof(__be16)); + offset += sizeof(__be16); } out: diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 868e4c9ef12..ef2f1e35396 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -245,7 +245,8 @@ struct dlm_lkb { struct list_head lkb_astqueue; /* need ast to be sent */ struct list_head lkb_ownqueue; /* list of locks for a process */ struct list_head lkb_time_list; - unsigned long lkb_timestamp; + ktime_t lkb_time_bast; /* for debugging */ + ktime_t lkb_timestamp; unsigned long lkb_timeout_cs; char *lkb_lvbptr; @@ -481,6 +482,7 @@ struct dlm_ls { struct dentry *ls_debug_rsb_dentry; /* debugfs */ struct dentry *ls_debug_waiters_dentry; /* debugfs */ struct dentry *ls_debug_locks_dentry; /* debugfs */ + struct dentry *ls_debug_all_dentry; /* debugfs */ wait_queue_head_t ls_uevent_wait; /* user part of join/leave */ int ls_uevent_result; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 724ddac9153..6cfe65bbf4a 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -307,7 +307,7 @@ static void queue_cast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rv) lkb->lkb_lksb->sb_status = rv; lkb->lkb_lksb->sb_flags = lkb->lkb_sbflags; - dlm_add_ast(lkb, AST_COMP); + dlm_add_ast(lkb, AST_COMP, 0); } static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) @@ -318,12 +318,12 @@ static inline void queue_cast_overlap(struct dlm_rsb *r, struct dlm_lkb *lkb) static void queue_bast(struct dlm_rsb *r, struct dlm_lkb *lkb, int rqmode) { + lkb->lkb_time_bast = ktime_get(); + if (is_master_copy(lkb)) send_bast(r, lkb, rqmode); - else { - lkb->lkb_bastmode = rqmode; - dlm_add_ast(lkb, AST_BAST); - } + else + dlm_add_ast(lkb, AST_BAST, rqmode); } /* @@ -744,6 +744,8 @@ static void add_lkb(struct dlm_rsb *r, struct dlm_lkb *lkb, int status) DLM_ASSERT(!lkb->lkb_status, dlm_print_lkb(lkb);); + lkb->lkb_timestamp = ktime_get(); + lkb->lkb_status = status; switch (status) { @@ -1013,10 +1015,8 @@ static void add_timeout(struct dlm_lkb *lkb) { struct dlm_ls *ls = lkb->lkb_resource->res_ls; - if (is_master_copy(lkb)) { - lkb->lkb_timestamp = jiffies; + if (is_master_copy(lkb)) return; - } if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) && !(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) { @@ -1031,7 +1031,6 @@ static void add_timeout(struct dlm_lkb *lkb) DLM_ASSERT(list_empty(&lkb->lkb_time_list), dlm_print_lkb(lkb);); mutex_lock(&ls->ls_timeout_mutex); hold_lkb(lkb); - lkb->lkb_timestamp = jiffies; list_add_tail(&lkb->lkb_time_list, &ls->ls_timeout); mutex_unlock(&ls->ls_timeout_mutex); } @@ -1059,6 +1058,7 @@ void dlm_scan_timeout(struct dlm_ls *ls) struct dlm_rsb *r; struct dlm_lkb *lkb; int do_cancel, do_warn; + s64 wait_us; for (;;) { if (dlm_locking_stopped(ls)) @@ -1069,14 +1069,15 @@ void dlm_scan_timeout(struct dlm_ls *ls) mutex_lock(&ls->ls_timeout_mutex); list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) { + wait_us = ktime_to_us(ktime_sub(ktime_get(), + lkb->lkb_timestamp)); + if ((lkb->lkb_exflags & DLM_LKF_TIMEOUT) && - time_after_eq(jiffies, lkb->lkb_timestamp + - lkb->lkb_timeout_cs * HZ/100)) + wait_us >= (lkb->lkb_timeout_cs * 10000)) do_cancel = 1; if ((lkb->lkb_flags & DLM_IFL_WATCH_TIMEWARN) && - time_after_eq(jiffies, lkb->lkb_timestamp + - dlm_config.ci_timewarn_cs * HZ/100)) + wait_us >= dlm_config.ci_timewarn_cs * 10000) do_warn = 1; if (!do_cancel && !do_warn) @@ -1122,12 +1123,12 @@ void dlm_scan_timeout(struct dlm_ls *ls) void dlm_adjust_timeouts(struct dlm_ls *ls) { struct dlm_lkb *lkb; - long adj = jiffies - ls->ls_recover_begin; + u64 adj_us = jiffies_to_usecs(jiffies - ls->ls_recover_begin); ls->ls_recover_begin = 0; mutex_lock(&ls->ls_timeout_mutex); list_for_each_entry(lkb, &ls->ls_timeout, lkb_time_list) - lkb->lkb_timestamp += adj; + lkb->lkb_timestamp = ktime_add_us(lkb->lkb_timestamp, adj_us); mutex_unlock(&ls->ls_timeout_mutex); } diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 3962262f991..103a5ebd137 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -295,6 +295,7 @@ static int add_sock(struct socket *sock, struct connection *con) con->sock->sk->sk_write_space = lowcomms_write_space; con->sock->sk->sk_state_change = lowcomms_state_change; con->sock->sk->sk_user_data = con; + con->sock->sk->sk_allocation = GFP_NOFS; return 0; } @@ -823,7 +824,6 @@ static void sctp_init_assoc(struct connection *con) len = e->len; offset = e->offset; spin_unlock(&con->writequeue_lock); - kmap(e->page); /* Send the first block off the write queue */ iov[0].iov_base = page_address(e->page)+offset; @@ -854,7 +854,6 @@ static void sctp_init_assoc(struct connection *con) if (e->len == 0 && e->users == 0) { list_del(&e->list); - kunmap(e->page); free_entry(e); } spin_unlock(&con->writequeue_lock); @@ -1203,8 +1202,6 @@ void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) if (e) { got_one: - if (users == 0) - kmap(e->page); *ppc = page_address(e->page) + offset; return e; } @@ -1233,7 +1230,6 @@ void dlm_lowcomms_commit_buffer(void *mh) if (users) goto out; e->len = e->end - e->offset; - kunmap(e->page); spin_unlock(&con->writequeue_lock); if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) { @@ -1272,7 +1268,6 @@ static void send_to_sock(struct connection *con) offset = e->offset; BUG_ON(len == 0 && e->users == 0); spin_unlock(&con->writequeue_lock); - kmap(e->page); ret = 0; if (len) { @@ -1294,7 +1289,6 @@ static void send_to_sock(struct connection *con) if (e->len == 0 && e->users == 0) { list_del(&e->list); - kunmap(e->page); free_entry(e); continue; } diff --git a/fs/dlm/memory.c b/fs/dlm/memory.c index 54c14c6d06c..c1775b84eba 100644 --- a/fs/dlm/memory.c +++ b/fs/dlm/memory.c @@ -39,7 +39,7 @@ char *dlm_allocate_lvb(struct dlm_ls *ls) { char *p; - p = kzalloc(ls->ls_lvblen, GFP_KERNEL); + p = kzalloc(ls->ls_lvblen, ls->ls_allocation); return p; } @@ -57,7 +57,7 @@ struct dlm_rsb *dlm_allocate_rsb(struct dlm_ls *ls, int namelen) DLM_ASSERT(namelen <= DLM_RESNAME_MAXLEN,); - r = kzalloc(sizeof(*r) + namelen, GFP_KERNEL); + r = kzalloc(sizeof(*r) + namelen, ls->ls_allocation); return r; } @@ -72,7 +72,7 @@ struct dlm_lkb *dlm_allocate_lkb(struct dlm_ls *ls) { struct dlm_lkb *lkb; - lkb = kmem_cache_zalloc(lkb_cache, GFP_KERNEL); + lkb = kmem_cache_zalloc(lkb_cache, ls->ls_allocation); return lkb; } diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 07ac709f3ed..f3396c622ae 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -112,7 +112,7 @@ int dlm_process_incoming_buffer(int nodeid, const void *base, ordinary messages). */ if (msglen > sizeof(__tmp) && p == &__tmp.p) { - p = kmalloc(dlm_config.ci_buffer_size, GFP_KERNEL); + p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); if (p == NULL) return ret; } diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index aa2a5775a02..ccc9d62c462 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -115,7 +115,6 @@ static void fill_data(struct dlm_lock_data *data, struct dlm_lkb *lkb) data->status = lkb->lkb_status; data->grmode = lkb->lkb_grmode; data->rqmode = lkb->lkb_rqmode; - data->timestamp = lkb->lkb_timestamp; if (lkb->lkb_ua) data->xid = lkb->lkb_ua->xid; if (r) { diff --git a/fs/dlm/user.c b/fs/dlm/user.c index b3832c67194..065149e84f4 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -175,7 +175,7 @@ static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type) /* we could possibly check if the cancel of an orphan has resulted in the lkb being removed and then remove that lkb from the orphans list and free it */ -void dlm_user_add_ast(struct dlm_lkb *lkb, int type) +void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode) { struct dlm_ls *ls; struct dlm_user_args *ua; @@ -208,6 +208,8 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type) ast_type = lkb->lkb_ast_type; lkb->lkb_ast_type |= type; + if (bastmode) + lkb->lkb_bastmode = bastmode; if (!ast_type) { kref_get(&lkb->lkb_ref); diff --git a/fs/dlm/user.h b/fs/dlm/user.h index 35eb6a13d61..1c968649228 100644 --- a/fs/dlm/user.h +++ b/fs/dlm/user.h @@ -9,7 +9,7 @@ #ifndef __USER_DOT_H__ #define __USER_DOT_H__ -void dlm_user_add_ast(struct dlm_lkb *lkb, int type); +void dlm_user_add_ast(struct dlm_lkb *lkb, int type, int bastmode); int dlm_user_init(void); void dlm_user_exit(void); int dlm_device_deregister(struct dlm_ls *ls); diff --git a/fs/dquot.c b/fs/dquot.c index c237ccc8581..48c0571f831 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -211,8 +211,6 @@ static struct hlist_head *dquot_hash; struct dqstats dqstats; -static void dqput(struct dquot *dquot); - static inline unsigned int hashfn(const struct super_block *sb, unsigned int id, int type) { @@ -415,6 +413,17 @@ out_dqlock: return ret; } +void dquot_destroy(struct dquot *dquot) +{ + kmem_cache_free(dquot_cachep, dquot); +} +EXPORT_SYMBOL(dquot_destroy); + +static inline void do_destroy_dquot(struct dquot *dquot) +{ + dquot->dq_sb->dq_op->destroy_dquot(dquot); +} + /* Invalidate all dquots on the list. Note that this function is called after * quota is disabled and pointers from inodes removed so there cannot be new * quota users. There can still be some users of quotas due to inodes being @@ -463,9 +472,44 @@ restart: remove_dquot_hash(dquot); remove_free_dquot(dquot); remove_inuse(dquot); - kmem_cache_free(dquot_cachep, dquot); + do_destroy_dquot(dquot); + } + spin_unlock(&dq_list_lock); +} + +/* Call callback for every active dquot on given filesystem */ +int dquot_scan_active(struct super_block *sb, + int (*fn)(struct dquot *dquot, unsigned long priv), + unsigned long priv) +{ + struct dquot *dquot, *old_dquot = NULL; + int ret = 0; + + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + spin_lock(&dq_list_lock); + list_for_each_entry(dquot, &inuse_list, dq_inuse) { + if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) + continue; + if (dquot->dq_sb != sb) + continue; + /* Now we have active dquot so we can just increase use count */ + atomic_inc(&dquot->dq_count); + dqstats.lookups++; + spin_unlock(&dq_list_lock); + dqput(old_dquot); + old_dquot = dquot; + ret = fn(dquot, priv); + if (ret < 0) + goto out; + spin_lock(&dq_list_lock); + /* We are safe to continue now because our dquot could not + * be moved out of the inuse list while we hold the reference */ } spin_unlock(&dq_list_lock); +out: + dqput(old_dquot); + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return ret; } int vfs_quota_sync(struct super_block *sb, int type) @@ -479,7 +523,7 @@ int vfs_quota_sync(struct super_block *sb, int type) for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; - if (!sb_has_quota_enabled(sb, cnt)) + if (!sb_has_quota_active(sb, cnt)) continue; spin_lock(&dq_list_lock); dirty = &dqopt->info[cnt].dqi_dirty_list; @@ -504,8 +548,8 @@ int vfs_quota_sync(struct super_block *sb, int type) } for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if ((cnt == type || type == -1) && sb_has_quota_enabled(sb, cnt) - && info_dirty(&dqopt->info[cnt])) + if ((cnt == type || type == -1) && sb_has_quota_active(sb, cnt) + && info_dirty(&dqopt->info[cnt])) sb->dq_op->write_info(sb, cnt); spin_lock(&dq_list_lock); dqstats.syncs++; @@ -527,7 +571,7 @@ static void prune_dqcache(int count) remove_dquot_hash(dquot); remove_free_dquot(dquot); remove_inuse(dquot); - kmem_cache_free(dquot_cachep, dquot); + do_destroy_dquot(dquot); count--; head = free_dquots.prev; } @@ -558,7 +602,7 @@ static struct shrinker dqcache_shrinker = { * NOTE: If you change this function please check whether dqput_blocks() works right... * MUST be called with either dqptr_sem or dqonoff_mutex held */ -static void dqput(struct dquot *dquot) +void dqput(struct dquot *dquot) { int ret; @@ -584,7 +628,7 @@ we_slept: /* We have more than one user... nothing to do */ atomic_dec(&dquot->dq_count); /* Releasing dquot during quotaoff phase? */ - if (!sb_has_quota_enabled(dquot->dq_sb, dquot->dq_type) && + if (!sb_has_quota_active(dquot->dq_sb, dquot->dq_type) && atomic_read(&dquot->dq_count) == 1) wake_up(&dquot->dq_wait_unused); spin_unlock(&dq_list_lock); @@ -625,11 +669,17 @@ we_slept: spin_unlock(&dq_list_lock); } +struct dquot *dquot_alloc(struct super_block *sb, int type) +{ + return kmem_cache_zalloc(dquot_cachep, GFP_NOFS); +} +EXPORT_SYMBOL(dquot_alloc); + static struct dquot *get_empty_dquot(struct super_block *sb, int type) { struct dquot *dquot; - dquot = kmem_cache_zalloc(dquot_cachep, GFP_NOFS); + dquot = sb->dq_op->alloc_dquot(sb, type); if(!dquot) return NODQUOT; @@ -647,15 +697,33 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) } /* + * Check whether dquot is in memory. + * MUST be called with either dqptr_sem or dqonoff_mutex held + */ +int dquot_is_cached(struct super_block *sb, unsigned int id, int type) +{ + unsigned int hashent = hashfn(sb, id, type); + int ret = 0; + + if (!sb_has_quota_active(sb, type)) + return 0; + spin_lock(&dq_list_lock); + if (find_dquot(hashent, sb, id, type) != NODQUOT) + ret = 1; + spin_unlock(&dq_list_lock); + return ret; +} + +/* * Get reference to dquot * MUST be called with either dqptr_sem or dqonoff_mutex held */ -static struct dquot *dqget(struct super_block *sb, unsigned int id, int type) +struct dquot *dqget(struct super_block *sb, unsigned int id, int type) { unsigned int hashent = hashfn(sb, id, type); struct dquot *dquot, *empty = NODQUOT; - if (!sb_has_quota_enabled(sb, type)) + if (!sb_has_quota_active(sb, type)) return NODQUOT; we_slept: spin_lock(&dq_list_lock); @@ -682,7 +750,7 @@ we_slept: dqstats.lookups++; spin_unlock(&dq_list_lock); if (empty) - kmem_cache_free(dquot_cachep, empty); + do_destroy_dquot(empty); } /* Wait for dq_lock - after this we know that either dquot_release() is already * finished or it will be canceled due to dq_count > 1 test */ @@ -820,7 +888,7 @@ static void drop_dquot_ref(struct super_block *sb, int type) } } -static inline void dquot_incr_inodes(struct dquot *dquot, unsigned long number) +static inline void dquot_incr_inodes(struct dquot *dquot, qsize_t number) { dquot->dq_dqb.dqb_curinodes += number; } @@ -830,9 +898,10 @@ static inline void dquot_incr_space(struct dquot *dquot, qsize_t number) dquot->dq_dqb.dqb_curspace += number; } -static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number) +static inline void dquot_decr_inodes(struct dquot *dquot, qsize_t number) { - if (dquot->dq_dqb.dqb_curinodes > number) + if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || + dquot->dq_dqb.dqb_curinodes >= number) dquot->dq_dqb.dqb_curinodes -= number; else dquot->dq_dqb.dqb_curinodes = 0; @@ -843,11 +912,12 @@ static inline void dquot_decr_inodes(struct dquot *dquot, unsigned long number) static inline void dquot_decr_space(struct dquot *dquot, qsize_t number) { - if (dquot->dq_dqb.dqb_curspace > number) + if (sb_dqopt(dquot->dq_sb)->flags & DQUOT_NEGATIVE_USAGE || + dquot->dq_dqb.dqb_curspace >= number) dquot->dq_dqb.dqb_curspace -= number; else dquot->dq_dqb.dqb_curspace = 0; - if (toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit) + if (dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) dquot->dq_dqb.dqb_btime = (time_t) 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } @@ -1023,10 +1093,11 @@ static inline char ignore_hardlimit(struct dquot *dquot) } /* needs dq_data_lock */ -static int check_idq(struct dquot *dquot, ulong inodes, char *warntype) +static int check_idq(struct dquot *dquot, qsize_t inodes, char *warntype) { *warntype = QUOTA_NL_NOWARN; - if (inodes <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags)) + if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || + test_bit(DQ_FAKE_B, &dquot->dq_flags)) return QUOTA_OK; if (dquot->dq_dqb.dqb_ihardlimit && @@ -1058,11 +1129,12 @@ static int check_idq(struct dquot *dquot, ulong inodes, char *warntype) static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *warntype) { *warntype = QUOTA_NL_NOWARN; - if (space <= 0 || test_bit(DQ_FAKE_B, &dquot->dq_flags)) + if (!sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type) || + test_bit(DQ_FAKE_B, &dquot->dq_flags)) return QUOTA_OK; if (dquot->dq_dqb.dqb_bhardlimit && - toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bhardlimit && + dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bhardlimit && !ignore_hardlimit(dquot)) { if (!prealloc) *warntype = QUOTA_NL_BHARDWARN; @@ -1070,7 +1142,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war } if (dquot->dq_dqb.dqb_bsoftlimit && - toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit && + dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime && get_seconds() >= dquot->dq_dqb.dqb_btime && !ignore_hardlimit(dquot)) { if (!prealloc) @@ -1079,7 +1151,7 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war } if (dquot->dq_dqb.dqb_bsoftlimit && - toqb(dquot->dq_dqb.dqb_curspace + space) > dquot->dq_dqb.dqb_bsoftlimit && + dquot->dq_dqb.dqb_curspace + space > dquot->dq_dqb.dqb_bsoftlimit && dquot->dq_dqb.dqb_btime == 0) { if (!prealloc) { *warntype = QUOTA_NL_BSOFTWARN; @@ -1096,10 +1168,11 @@ static int check_bdq(struct dquot *dquot, qsize_t space, int prealloc, char *war return QUOTA_OK; } -static int info_idq_free(struct dquot *dquot, ulong inodes) +static int info_idq_free(struct dquot *dquot, qsize_t inodes) { if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || - dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit) + dquot->dq_dqb.dqb_curinodes <= dquot->dq_dqb.dqb_isoftlimit || + !sb_has_quota_limits_enabled(dquot->dq_sb, dquot->dq_type)) return QUOTA_NL_NOWARN; if (dquot->dq_dqb.dqb_curinodes - inodes <= dquot->dq_dqb.dqb_isoftlimit) @@ -1113,15 +1186,13 @@ static int info_idq_free(struct dquot *dquot, ulong inodes) static int info_bdq_free(struct dquot *dquot, qsize_t space) { if (test_bit(DQ_FAKE_B, &dquot->dq_flags) || - toqb(dquot->dq_dqb.dqb_curspace) <= dquot->dq_dqb.dqb_bsoftlimit) + dquot->dq_dqb.dqb_curspace <= dquot->dq_dqb.dqb_bsoftlimit) return QUOTA_NL_NOWARN; - if (toqb(dquot->dq_dqb.dqb_curspace - space) <= - dquot->dq_dqb.dqb_bsoftlimit) + if (dquot->dq_dqb.dqb_curspace - space <= dquot->dq_dqb.dqb_bsoftlimit) return QUOTA_NL_BSOFTBELOW; - if (toqb(dquot->dq_dqb.dqb_curspace) >= dquot->dq_dqb.dqb_bhardlimit && - toqb(dquot->dq_dqb.dqb_curspace - space) < - dquot->dq_dqb.dqb_bhardlimit) + if (dquot->dq_dqb.dqb_curspace >= dquot->dq_dqb.dqb_bhardlimit && + dquot->dq_dqb.dqb_curspace - space < dquot->dq_dqb.dqb_bhardlimit) return QUOTA_NL_BHARDBELOW; return QUOTA_NL_NOWARN; } @@ -1166,17 +1237,23 @@ out_err: * Release all quotas referenced by inode * Transaction must be started at an entry */ -int dquot_drop(struct inode *inode) +int dquot_drop_locked(struct inode *inode) { int cnt; - down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (inode->i_dquot[cnt] != NODQUOT) { dqput(inode->i_dquot[cnt]); inode->i_dquot[cnt] = NODQUOT; } } + return 0; +} + +int dquot_drop(struct inode *inode) +{ + down_write(&sb_dqopt(inode->i_sb)->dqptr_sem); + dquot_drop_locked(inode); up_write(&sb_dqopt(inode->i_sb)->dqptr_sem); return 0; } @@ -1264,7 +1341,7 @@ warn_put_all: /* * This operation can block, but only after everything is updated */ -int dquot_alloc_inode(const struct inode *inode, unsigned long number) +int dquot_alloc_inode(const struct inode *inode, qsize_t number) { int cnt, ret = NO_QUOTA; char warntype[MAXQUOTAS]; @@ -1349,7 +1426,7 @@ out_sub: /* * This operation can block, but only after everything is updated */ -int dquot_free_inode(const struct inode *inode, unsigned long number) +int dquot_free_inode(const struct inode *inode, qsize_t number) { unsigned int cnt; char warntype[MAXQUOTAS]; @@ -1495,7 +1572,7 @@ warn_put_all: /* Wrapper for transferring ownership of an inode */ int vfs_dq_transfer(struct inode *inode, struct iattr *iattr) { - if (sb_any_quota_enabled(inode->i_sb) && !IS_NOQUOTA(inode)) { + if (sb_any_quota_active(inode->i_sb) && !IS_NOQUOTA(inode)) { vfs_dq_init(inode); if (inode->i_sb->dq_op->transfer(inode, iattr) == NO_QUOTA) return 1; @@ -1533,54 +1610,27 @@ struct dquot_operations dquot_operations = { .acquire_dquot = dquot_acquire, .release_dquot = dquot_release, .mark_dirty = dquot_mark_dquot_dirty, - .write_info = dquot_commit_info + .write_info = dquot_commit_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, }; -static inline void set_enable_flags(struct quota_info *dqopt, int type) -{ - switch (type) { - case USRQUOTA: - dqopt->flags |= DQUOT_USR_ENABLED; - dqopt->flags &= ~DQUOT_USR_SUSPENDED; - break; - case GRPQUOTA: - dqopt->flags |= DQUOT_GRP_ENABLED; - dqopt->flags &= ~DQUOT_GRP_SUSPENDED; - break; - } -} - -static inline void reset_enable_flags(struct quota_info *dqopt, int type, - int remount) -{ - switch (type) { - case USRQUOTA: - dqopt->flags &= ~DQUOT_USR_ENABLED; - if (remount) - dqopt->flags |= DQUOT_USR_SUSPENDED; - else - dqopt->flags &= ~DQUOT_USR_SUSPENDED; - break; - case GRPQUOTA: - dqopt->flags &= ~DQUOT_GRP_ENABLED; - if (remount) - dqopt->flags |= DQUOT_GRP_SUSPENDED; - else - dqopt->flags &= ~DQUOT_GRP_SUSPENDED; - break; - } -} - - /* * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) */ -int vfs_quota_off(struct super_block *sb, int type, int remount) +int vfs_quota_disable(struct super_block *sb, int type, unsigned int flags) { int cnt, ret = 0; struct quota_info *dqopt = sb_dqopt(sb); struct inode *toputinode[MAXQUOTAS]; + /* Cannot turn off usage accounting without turning off limits, or + * suspend quotas and simultaneously turn quotas off. */ + if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED)) + || (flags & DQUOT_SUSPENDED && flags & (DQUOT_LIMITS_ENABLED | + DQUOT_USAGE_ENABLED))) + return -EINVAL; + /* We need to serialize quota_off() for device */ mutex_lock(&dqopt->dqonoff_mutex); @@ -1589,7 +1639,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) * sometimes we are called when fill_super() failed and calling * sync_fs() in such cases does no good. */ - if (!sb_any_quota_enabled(sb) && !sb_any_quota_suspended(sb)) { + if (!sb_any_quota_loaded(sb)) { mutex_unlock(&dqopt->dqonoff_mutex); return 0; } @@ -1597,17 +1647,28 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) toputinode[cnt] = NULL; if (type != -1 && cnt != type) continue; - /* If we keep inodes of quota files after remount and quotaoff - * is called, drop kept inodes. */ - if (!remount && sb_has_quota_suspended(sb, cnt)) { - iput(dqopt->files[cnt]); - dqopt->files[cnt] = NULL; - reset_enable_flags(dqopt, cnt, 0); + if (!sb_has_quota_loaded(sb, cnt)) continue; + + if (flags & DQUOT_SUSPENDED) { + dqopt->flags |= + dquot_state_flag(DQUOT_SUSPENDED, cnt); + } else { + dqopt->flags &= ~dquot_state_flag(flags, cnt); + /* Turning off suspended quotas? */ + if (!sb_has_quota_loaded(sb, cnt) && + sb_has_quota_suspended(sb, cnt)) { + dqopt->flags &= ~dquot_state_flag( + DQUOT_SUSPENDED, cnt); + iput(dqopt->files[cnt]); + dqopt->files[cnt] = NULL; + continue; + } } - if (!sb_has_quota_enabled(sb, cnt)) + + /* We still have to keep quota loaded? */ + if (sb_has_quota_loaded(sb, cnt) && !(flags & DQUOT_SUSPENDED)) continue; - reset_enable_flags(dqopt, cnt, remount); /* Note: these are blocking operations */ drop_dquot_ref(sb, cnt); @@ -1623,7 +1684,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) put_quota_format(dqopt->info[cnt].dqi_format); toputinode[cnt] = dqopt->files[cnt]; - if (!remount) + if (!sb_has_quota_loaded(sb, cnt)) dqopt->files[cnt] = NULL; dqopt->info[cnt].dqi_flags = 0; dqopt->info[cnt].dqi_igrace = 0; @@ -1631,6 +1692,11 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) dqopt->ops[cnt] = NULL; } mutex_unlock(&dqopt->dqonoff_mutex); + + /* Skip syncing and setting flags if quota files are hidden */ + if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) + goto put_inodes; + /* Sync the superblock so that buffers with quota data are written to * disk (and so userspace sees correct data afterwards). */ if (sb->s_op->sync_fs) @@ -1646,7 +1712,7 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) mutex_lock(&dqopt->dqonoff_mutex); /* If quota was reenabled in the meantime, we have * nothing to do */ - if (!sb_has_quota_enabled(sb, cnt)) { + if (!sb_has_quota_loaded(sb, cnt)) { mutex_lock_nested(&toputinode[cnt]->i_mutex, I_MUTEX_QUOTA); toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); @@ -1655,26 +1721,43 @@ int vfs_quota_off(struct super_block *sb, int type, int remount) mark_inode_dirty(toputinode[cnt]); } mutex_unlock(&dqopt->dqonoff_mutex); + } + if (sb->s_bdev) + invalidate_bdev(sb->s_bdev); +put_inodes: + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (toputinode[cnt]) { /* On remount RO, we keep the inode pointer so that we - * can reenable quota on the subsequent remount RW. - * But we have better not keep inode pointer when there - * is pending delete on the quota file... */ - if (!remount) + * can reenable quota on the subsequent remount RW. We + * have to check 'flags' variable and not use sb_has_ + * function because another quotaon / quotaoff could + * change global state before we got here. We refuse + * to suspend quotas when there is pending delete on + * the quota file... */ + if (!(flags & DQUOT_SUSPENDED)) iput(toputinode[cnt]); else if (!toputinode[cnt]->i_nlink) ret = -EBUSY; } - if (sb->s_bdev) - invalidate_bdev(sb->s_bdev); return ret; } +int vfs_quota_off(struct super_block *sb, int type, int remount) +{ + return vfs_quota_disable(sb, type, remount ? DQUOT_SUSPENDED : + (DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED)); +} + /* * Turn quotas on on a device */ -/* Helper function when we already have the inode */ -static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) +/* + * Helper function to turn quotas on when we already have the inode of + * quota file and no quota information is loaded. + */ +static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, + unsigned int flags) { struct quota_format_type *fmt = find_quota_format(format_id); struct super_block *sb = inode->i_sb; @@ -1696,27 +1779,37 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) error = -EINVAL; goto out_fmt; } + /* Usage always has to be set... */ + if (!(flags & DQUOT_USAGE_ENABLED)) { + error = -EINVAL; + goto out_fmt; + } - /* As we bypass the pagecache we must now flush the inode so that - * we see all the changes from userspace... */ - write_inode_now(inode, 1); - /* And now flush the block cache so that kernel sees the changes */ - invalidate_bdev(sb->s_bdev); + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { + /* As we bypass the pagecache we must now flush the inode so + * that we see all the changes from userspace... */ + write_inode_now(inode, 1); + /* And now flush the block cache so that kernel sees the + * changes */ + invalidate_bdev(sb->s_bdev); + } mutex_lock(&inode->i_mutex); mutex_lock(&dqopt->dqonoff_mutex); - if (sb_has_quota_enabled(sb, type) || - sb_has_quota_suspended(sb, type)) { + if (sb_has_quota_loaded(sb, type)) { error = -EBUSY; goto out_lock; } - /* We don't want quota and atime on quota files (deadlocks possible) - * Also nobody should write to the file - we use special IO operations - * which ignore the immutable bit. */ - down_write(&dqopt->dqptr_sem); - oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); - inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; - up_write(&dqopt->dqptr_sem); - sb->dq_op->drop(inode); + + if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { + /* We don't want quota and atime on quota files (deadlocks + * possible) Also nobody should write to the file - we use + * special IO operations which ignore the immutable bit. */ + down_write(&dqopt->dqptr_sem); + oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); + inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; + up_write(&dqopt->dqptr_sem); + sb->dq_op->drop(inode); + } error = -EIO; dqopt->files[type] = igrab(inode); @@ -1737,7 +1830,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) } mutex_unlock(&dqopt->dqio_mutex); mutex_unlock(&inode->i_mutex); - set_enable_flags(dqopt, type); + dqopt->flags |= dquot_state_flag(flags, type); add_dquot_ref(sb, type); mutex_unlock(&dqopt->dqonoff_mutex); @@ -1770,20 +1863,23 @@ static int vfs_quota_on_remount(struct super_block *sb, int type) struct quota_info *dqopt = sb_dqopt(sb); struct inode *inode; int ret; + unsigned int flags; mutex_lock(&dqopt->dqonoff_mutex); if (!sb_has_quota_suspended(sb, type)) { mutex_unlock(&dqopt->dqonoff_mutex); return 0; } - BUG_ON(sb_has_quota_enabled(sb, type)); - inode = dqopt->files[type]; dqopt->files[type] = NULL; - reset_enable_flags(dqopt, type, 0); + flags = dqopt->flags & dquot_state_flag(DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED, type); + dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, type); mutex_unlock(&dqopt->dqonoff_mutex); - ret = vfs_quota_on_inode(inode, type, dqopt->info[type].dqi_fmt_id); + flags = dquot_generic_flag(flags, type); + ret = vfs_load_quota_inode(inode, type, dqopt->info[type].dqi_fmt_id, + flags); iput(inode); return ret; @@ -1799,12 +1895,12 @@ int vfs_quota_on_path(struct super_block *sb, int type, int format_id, if (path->mnt->mnt_sb != sb) error = -EXDEV; else - error = vfs_quota_on_inode(path->dentry->d_inode, type, - format_id); + error = vfs_load_quota_inode(path->dentry->d_inode, type, + format_id, DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED); return error; } -/* Actual function called from quotactl() */ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, int remount) { @@ -1823,6 +1919,50 @@ int vfs_quota_on(struct super_block *sb, int type, int format_id, char *name, } /* + * More powerful function for turning on quotas allowing setting + * of individual quota flags + */ +int vfs_quota_enable(struct inode *inode, int type, int format_id, + unsigned int flags) +{ + int ret = 0; + struct super_block *sb = inode->i_sb; + struct quota_info *dqopt = sb_dqopt(sb); + + /* Just unsuspend quotas? */ + if (flags & DQUOT_SUSPENDED) + return vfs_quota_on_remount(sb, type); + if (!flags) + return 0; + /* Just updating flags needed? */ + if (sb_has_quota_loaded(sb, type)) { + mutex_lock(&dqopt->dqonoff_mutex); + /* Now do a reliable test... */ + if (!sb_has_quota_loaded(sb, type)) { + mutex_unlock(&dqopt->dqonoff_mutex); + goto load_quota; + } + if (flags & DQUOT_USAGE_ENABLED && + sb_has_quota_usage_enabled(sb, type)) { + ret = -EBUSY; + goto out_lock; + } + if (flags & DQUOT_LIMITS_ENABLED && + sb_has_quota_limits_enabled(sb, type)) { + ret = -EBUSY; + goto out_lock; + } + sb_dqopt(sb)->flags |= dquot_state_flag(flags, type); +out_lock: + mutex_unlock(&dqopt->dqonoff_mutex); + return ret; + } + +load_quota: + return vfs_load_quota_inode(inode, type, format_id, flags); +} + +/* * This function is used when filesystem needs to initialize quotas * during mount time. */ @@ -1843,7 +1983,8 @@ int vfs_quota_on_mount(struct super_block *sb, char *qf_name, error = security_quota_on(dentry); if (!error) - error = vfs_quota_on_inode(dentry->d_inode, type, format_id); + error = vfs_load_quota_inode(dentry->d_inode, type, format_id, + DQUOT_USAGE_ENABLED | DQUOT_LIMITS_ENABLED); out: dput(dentry); @@ -1866,14 +2007,24 @@ int vfs_dq_quota_on_remount(struct super_block *sb) return ret; } +static inline qsize_t qbtos(qsize_t blocks) +{ + return blocks << QIF_DQBLKSIZE_BITS; +} + +static inline qsize_t stoqb(qsize_t space) +{ + return (space + QIF_DQBLKSIZE - 1) >> QIF_DQBLKSIZE_BITS; +} + /* Generic routine for getting common part of quota structure */ static void do_get_dqblk(struct dquot *dquot, struct if_dqblk *di) { struct mem_dqblk *dm = &dquot->dq_dqb; spin_lock(&dq_data_lock); - di->dqb_bhardlimit = dm->dqb_bhardlimit; - di->dqb_bsoftlimit = dm->dqb_bsoftlimit; + di->dqb_bhardlimit = stoqb(dm->dqb_bhardlimit); + di->dqb_bsoftlimit = stoqb(dm->dqb_bsoftlimit); di->dqb_curspace = dm->dqb_curspace; di->dqb_ihardlimit = dm->dqb_ihardlimit; di->dqb_isoftlimit = dm->dqb_isoftlimit; @@ -1918,28 +2069,38 @@ static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) if (di->dqb_valid & QIF_SPACE) { dm->dqb_curspace = di->dqb_curspace; check_blim = 1; + __set_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); } if (di->dqb_valid & QIF_BLIMITS) { - dm->dqb_bsoftlimit = di->dqb_bsoftlimit; - dm->dqb_bhardlimit = di->dqb_bhardlimit; + dm->dqb_bsoftlimit = qbtos(di->dqb_bsoftlimit); + dm->dqb_bhardlimit = qbtos(di->dqb_bhardlimit); check_blim = 1; + __set_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); } if (di->dqb_valid & QIF_INODES) { dm->dqb_curinodes = di->dqb_curinodes; check_ilim = 1; + __set_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); } if (di->dqb_valid & QIF_ILIMITS) { dm->dqb_isoftlimit = di->dqb_isoftlimit; dm->dqb_ihardlimit = di->dqb_ihardlimit; check_ilim = 1; + __set_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); } - if (di->dqb_valid & QIF_BTIME) + if (di->dqb_valid & QIF_BTIME) { dm->dqb_btime = di->dqb_btime; - if (di->dqb_valid & QIF_ITIME) + check_blim = 1; + __set_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); + } + if (di->dqb_valid & QIF_ITIME) { dm->dqb_itime = di->dqb_itime; + check_ilim = 1; + __set_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); + } if (check_blim) { - if (!dm->dqb_bsoftlimit || toqb(dm->dqb_curspace) < dm->dqb_bsoftlimit) { + if (!dm->dqb_bsoftlimit || dm->dqb_curspace < dm->dqb_bsoftlimit) { dm->dqb_btime = 0; clear_bit(DQ_BLKS_B, &dquot->dq_flags); } @@ -1970,12 +2131,14 @@ int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d int rc; mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - if (!(dquot = dqget(sb, id, type))) { - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); - return -ESRCH; + dquot = dqget(sb, id, type); + if (!dquot) { + rc = -ESRCH; + goto out; } rc = do_set_dqblk(dquot, di); dqput(dquot); +out: mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return rc; } @@ -1986,7 +2149,7 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) struct mem_dqinfo *mi; mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - if (!sb_has_quota_enabled(sb, type)) { + if (!sb_has_quota_active(sb, type)) { mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return -ESRCH; } @@ -2005,11 +2168,12 @@ int vfs_get_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) { struct mem_dqinfo *mi; + int err = 0; mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - if (!sb_has_quota_enabled(sb, type)) { - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); - return -ESRCH; + if (!sb_has_quota_active(sb, type)) { + err = -ESRCH; + goto out; } mi = sb_dqopt(sb)->info + type; spin_lock(&dq_data_lock); @@ -2023,8 +2187,9 @@ int vfs_set_dqinfo(struct super_block *sb, int type, struct if_dqinfo *ii) mark_info_dirty(sb, type); /* Force write to disk */ sb->dq_op->write_info(sb, type); +out: mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); - return 0; + return err; } struct quotactl_ops vfs_quotactl_ops = { @@ -2186,10 +2351,13 @@ EXPORT_SYMBOL(register_quota_format); EXPORT_SYMBOL(unregister_quota_format); EXPORT_SYMBOL(dqstats); EXPORT_SYMBOL(dq_data_lock); +EXPORT_SYMBOL(vfs_quota_enable); EXPORT_SYMBOL(vfs_quota_on); EXPORT_SYMBOL(vfs_quota_on_path); EXPORT_SYMBOL(vfs_quota_on_mount); +EXPORT_SYMBOL(vfs_quota_disable); EXPORT_SYMBOL(vfs_quota_off); +EXPORT_SYMBOL(dquot_scan_active); EXPORT_SYMBOL(vfs_quota_sync); EXPORT_SYMBOL(vfs_get_dqinfo); EXPORT_SYMBOL(vfs_set_dqinfo); @@ -2202,7 +2370,11 @@ EXPORT_SYMBOL(dquot_release); EXPORT_SYMBOL(dquot_mark_dquot_dirty); EXPORT_SYMBOL(dquot_initialize); EXPORT_SYMBOL(dquot_drop); +EXPORT_SYMBOL(dquot_drop_locked); EXPORT_SYMBOL(vfs_dq_drop); +EXPORT_SYMBOL(dqget); +EXPORT_SYMBOL(dqput); +EXPORT_SYMBOL(dquot_is_cached); EXPORT_SYMBOL(dquot_alloc_space); EXPORT_SYMBOL(dquot_alloc_inode); EXPORT_SYMBOL(dquot_free_space); diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 6046239465a..c01e043670e 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -175,8 +175,8 @@ out: * * Returns zero on success; non-zero on error. */ -static int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, - loff_t offset) +int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset) { int rc = 0; char dst[MD5_DIGEST_SIZE]; @@ -924,6 +924,15 @@ static void ecryptfs_copy_mount_wide_flags_to_inode_flags( crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) crypt_stat->flags |= ECRYPTFS_VIEW_AS_ENCRYPTED; + if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { + crypt_stat->flags |= ECRYPTFS_ENCRYPT_FILENAMES; + if (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK) + crypt_stat->flags |= ECRYPTFS_ENCFN_USE_MOUNT_FNEK; + else if (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_FEK) + crypt_stat->flags |= ECRYPTFS_ENCFN_USE_FEK; + } } static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( @@ -1060,7 +1069,8 @@ struct ecryptfs_flag_map_elem { static struct ecryptfs_flag_map_elem ecryptfs_flag_map[] = { {0x00000001, ECRYPTFS_ENABLE_HMAC}, {0x00000002, ECRYPTFS_ENCRYPTED}, - {0x00000004, ECRYPTFS_METADATA_IN_XATTR} + {0x00000004, ECRYPTFS_METADATA_IN_XATTR}, + {0x00000008, ECRYPTFS_ENCRYPT_FILENAMES} }; /** @@ -1149,19 +1159,20 @@ ecryptfs_cipher_code_str_map[] = { /** * ecryptfs_code_for_cipher_string - * @crypt_stat: The cryptographic context + * @cipher_name: The string alias for the cipher + * @key_bytes: Length of key in bytes; used for AES code selection * * Returns zero on no match, or the cipher code on match */ -u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat) +u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes) { int i; u8 code = 0; struct ecryptfs_cipher_code_str_map_elem *map = ecryptfs_cipher_code_str_map; - if (strcmp(crypt_stat->cipher, "aes") == 0) { - switch (crypt_stat->key_size) { + if (strcmp(cipher_name, "aes") == 0) { + switch (key_bytes) { case 16: code = RFC2440_CIPHER_AES_128; break; @@ -1173,7 +1184,7 @@ u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat) } } else { for (i = 0; i < ARRAY_SIZE(ecryptfs_cipher_code_str_map); i++) - if (strcmp(crypt_stat->cipher, map[i].cipher_str) == 0){ + if (strcmp(cipher_name, map[i].cipher_str) == 0) { code = map[i].cipher_code; break; } @@ -1212,6 +1223,8 @@ int ecryptfs_read_and_validate_header_region(char *data, &(ecryptfs_inode_to_private(ecryptfs_inode)->crypt_stat); int rc; + if (crypt_stat->extent_size == 0) + crypt_stat->extent_size = ECRYPTFS_DEFAULT_EXTENT_SIZE; rc = ecryptfs_read_lower(data, 0, crypt_stat->extent_size, ecryptfs_inode); if (rc) { @@ -1221,7 +1234,6 @@ int ecryptfs_read_and_validate_header_region(char *data, } if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) { rc = -EINVAL; - ecryptfs_printk(KERN_DEBUG, "Valid marker not found\n"); } out: return rc; @@ -1628,95 +1640,95 @@ out: } /** - * ecryptfs_encode_filename - converts a plaintext file name to cipher text - * @crypt_stat: The crypt_stat struct associated with the file anem to encode - * @name: The plaintext name - * @length: The length of the plaintext - * @encoded_name: The encypted name + * ecryptfs_encrypt_filename - encrypt filename * - * Encrypts and encodes a filename into something that constitutes a - * valid filename for a filesystem, with printable characters. + * CBC-encrypts the filename. We do not want to encrypt the same + * filename with the same key and IV, which may happen with hard + * links, so we prepend random bits to each filename. * - * We assume that we have a properly initialized crypto context, - * pointed to by crypt_stat->tfm. - * - * TODO: Implement filename decoding and decryption here, in place of - * memcpy. We are keeping the framework around for now to (1) - * facilitate testing of the components needed to implement filename - * encryption and (2) to provide a code base from which other - * developers in the community can easily implement this feature. - * - * Returns the length of encoded filename; negative if error + * Returns zero on success; non-zero otherwise */ -int -ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, - const char *name, int length, char **encoded_name) +static int +ecryptfs_encrypt_filename(struct ecryptfs_filename *filename, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat) { - int error = 0; + int rc = 0; - (*encoded_name) = kmalloc(length + 2, GFP_KERNEL); - if (!(*encoded_name)) { - error = -ENOMEM; + filename->encrypted_filename = NULL; + filename->encrypted_filename_size = 0; + if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) + || (mount_crypt_stat && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { + size_t packet_size; + size_t remaining_bytes; + + rc = ecryptfs_write_tag_70_packet( + NULL, NULL, + &filename->encrypted_filename_size, + mount_crypt_stat, NULL, + filename->filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to get packet " + "size for tag 72; rc = [%d]\n", __func__, + rc); + filename->encrypted_filename_size = 0; + goto out; + } + filename->encrypted_filename = + kmalloc(filename->encrypted_filename_size, GFP_KERNEL); + if (!filename->encrypted_filename) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc [%zd] bytes\n", __func__, + filename->encrypted_filename_size); + rc = -ENOMEM; + goto out; + } + remaining_bytes = filename->encrypted_filename_size; + rc = ecryptfs_write_tag_70_packet(filename->encrypted_filename, + &remaining_bytes, + &packet_size, + mount_crypt_stat, + filename->filename, + filename->filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to generate " + "tag 70 packet; rc = [%d]\n", __func__, + rc); + kfree(filename->encrypted_filename); + filename->encrypted_filename = NULL; + filename->encrypted_filename_size = 0; + goto out; + } + filename->encrypted_filename_size = packet_size; + } else { + printk(KERN_ERR "%s: No support for requested filename " + "encryption method in this release\n", __func__); + rc = -ENOTSUPP; goto out; } - /* TODO: Filename encryption is a scheduled feature for a - * future version of eCryptfs. This function is here only for - * the purpose of providing a framework for other developers - * to easily implement filename encryption. Hint: Replace this - * memcpy() with a call to encrypt and encode the - * filename, the set the length accordingly. */ - memcpy((void *)(*encoded_name), (void *)name, length); - (*encoded_name)[length] = '\0'; - error = length + 1; out: - return error; + return rc; } -/** - * ecryptfs_decode_filename - converts the cipher text name to plaintext - * @crypt_stat: The crypt_stat struct associated with the file - * @name: The filename in cipher text - * @length: The length of the cipher text name - * @decrypted_name: The plaintext name - * - * Decodes and decrypts the filename. - * - * We assume that we have a properly initialized crypto context, - * pointed to by crypt_stat->tfm. - * - * TODO: Implement filename decoding and decryption here, in place of - * memcpy. We are keeping the framework around for now to (1) - * facilitate testing of the components needed to implement filename - * encryption and (2) to provide a code base from which other - * developers in the community can easily implement this feature. - * - * Returns the length of decoded filename; negative if error - */ -int -ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, - const char *name, int length, char **decrypted_name) +static int ecryptfs_copy_filename(char **copied_name, size_t *copied_name_size, + const char *name, size_t name_size) { - int error = 0; + int rc = 0; - (*decrypted_name) = kmalloc(length + 2, GFP_KERNEL); - if (!(*decrypted_name)) { - error = -ENOMEM; + (*copied_name) = kmalloc((name_size + 2), GFP_KERNEL); + if (!(*copied_name)) { + rc = -ENOMEM; goto out; } - /* TODO: Filename encryption is a scheduled feature for a - * future version of eCryptfs. This function is here only for - * the purpose of providing a framework for other developers - * to easily implement filename encryption. Hint: Replace this - * memcpy() with a call to decode and decrypt the - * filename, the set the length accordingly. */ - memcpy((void *)(*decrypted_name), (void *)name, length); - (*decrypted_name)[length + 1] = '\0'; /* Only for convenience + memcpy((void *)(*copied_name), (void *)name, name_size); + (*copied_name)[(name_size)] = '\0'; /* Only for convenience * in printing out the * string in debug * messages */ - error = length; + (*copied_name_size) = (name_size + 1); out: - return error; + return rc; } /** @@ -1740,7 +1752,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, *key_tfm = NULL; if (*key_size > ECRYPTFS_MAX_KEY_BYTES) { rc = -EINVAL; - printk(KERN_ERR "Requested key size is [%Zd] bytes; maximum " + printk(KERN_ERR "Requested key size is [%zd] bytes; maximum " "allowable is [%d]\n", *key_size, ECRYPTFS_MAX_KEY_BYTES); goto out; } @@ -1765,7 +1777,7 @@ ecryptfs_process_key_cipher(struct crypto_blkcipher **key_tfm, get_random_bytes(dummy_key, *key_size); rc = crypto_blkcipher_setkey(*key_tfm, dummy_key, *key_size); if (rc) { - printk(KERN_ERR "Error attempting to set key of size [%Zd] for " + printk(KERN_ERR "Error attempting to set key of size [%zd] for " "cipher [%s]; rc = [%d]\n", *key_size, cipher_name, rc); rc = -EINVAL; goto out; @@ -1910,3 +1922,341 @@ out: mutex_unlock(&key_tfm_list_mutex); return rc; } + +/* 64 characters forming a 6-bit target field */ +static unsigned char *portable_filename_chars = ("-.0123456789ABCD" + "EFGHIJKLMNOPQRST" + "UVWXYZabcdefghij" + "klmnopqrstuvwxyz"); + +/* We could either offset on every reverse map or just pad some 0x00's + * at the front here */ +static const unsigned char filename_rev_map[] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 7 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 15 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 23 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 31 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 39 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, /* 47 */ + 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, /* 55 */ + 0x0A, 0x0B, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 63 */ + 0x00, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, /* 71 */ + 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, /* 79 */ + 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, /* 87 */ + 0x23, 0x24, 0x25, 0x00, 0x00, 0x00, 0x00, 0x00, /* 95 */ + 0x00, 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, /* 103 */ + 0x2D, 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, /* 111 */ + 0x35, 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, /* 119 */ + 0x3D, 0x3E, 0x3F +}; + +/** + * ecryptfs_encode_for_filename + * @dst: Destination location for encoded filename + * @dst_size: Size of the encoded filename in bytes + * @src: Source location for the filename to encode + * @src_size: Size of the source in bytes + */ +void ecryptfs_encode_for_filename(unsigned char *dst, size_t *dst_size, + unsigned char *src, size_t src_size) +{ + size_t num_blocks; + size_t block_num = 0; + size_t dst_offset = 0; + unsigned char last_block[3]; + + if (src_size == 0) { + (*dst_size) = 0; + goto out; + } + num_blocks = (src_size / 3); + if ((src_size % 3) == 0) { + memcpy(last_block, (&src[src_size - 3]), 3); + } else { + num_blocks++; + last_block[2] = 0x00; + switch (src_size % 3) { + case 1: + last_block[0] = src[src_size - 1]; + last_block[1] = 0x00; + break; + case 2: + last_block[0] = src[src_size - 2]; + last_block[1] = src[src_size - 1]; + } + } + (*dst_size) = (num_blocks * 4); + if (!dst) + goto out; + while (block_num < num_blocks) { + unsigned char *src_block; + unsigned char dst_block[4]; + + if (block_num == (num_blocks - 1)) + src_block = last_block; + else + src_block = &src[block_num * 3]; + dst_block[0] = ((src_block[0] >> 2) & 0x3F); + dst_block[1] = (((src_block[0] << 4) & 0x30) + | ((src_block[1] >> 4) & 0x0F)); + dst_block[2] = (((src_block[1] << 2) & 0x3C) + | ((src_block[2] >> 6) & 0x03)); + dst_block[3] = (src_block[2] & 0x3F); + dst[dst_offset++] = portable_filename_chars[dst_block[0]]; + dst[dst_offset++] = portable_filename_chars[dst_block[1]]; + dst[dst_offset++] = portable_filename_chars[dst_block[2]]; + dst[dst_offset++] = portable_filename_chars[dst_block[3]]; + block_num++; + } +out: + return; +} + +/** + * ecryptfs_decode_from_filename + * @dst: If NULL, this function only sets @dst_size and returns. If + * non-NULL, this function decodes the encoded octets in @src + * into the memory that @dst points to. + * @dst_size: Set to the size of the decoded string. + * @src: The encoded set of octets to decode. + * @src_size: The size of the encoded set of octets to decode. + */ +static void +ecryptfs_decode_from_filename(unsigned char *dst, size_t *dst_size, + const unsigned char *src, size_t src_size) +{ + u8 current_bit_offset = 0; + size_t src_byte_offset = 0; + size_t dst_byte_offset = 0; + + if (dst == NULL) { + /* Not exact; conservatively long. Every block of 4 + * encoded characters decodes into a block of 3 + * decoded characters. This segment of code provides + * the caller with the maximum amount of allocated + * space that @dst will need to point to in a + * subsequent call. */ + (*dst_size) = (((src_size + 1) * 3) / 4); + goto out; + } + while (src_byte_offset < src_size) { + unsigned char src_byte = + filename_rev_map[(int)src[src_byte_offset]]; + + switch (current_bit_offset) { + case 0: + dst[dst_byte_offset] = (src_byte << 2); + current_bit_offset = 6; + break; + case 6: + dst[dst_byte_offset++] |= (src_byte >> 4); + dst[dst_byte_offset] = ((src_byte & 0xF) + << 4); + current_bit_offset = 4; + break; + case 4: + dst[dst_byte_offset++] |= (src_byte >> 2); + dst[dst_byte_offset] = (src_byte << 6); + current_bit_offset = 2; + break; + case 2: + dst[dst_byte_offset++] |= (src_byte); + dst[dst_byte_offset] = 0; + current_bit_offset = 0; + break; + } + src_byte_offset++; + } + (*dst_size) = dst_byte_offset; +out: + return; +} + +/** + * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text + * @crypt_stat: The crypt_stat struct associated with the file anem to encode + * @name: The plaintext name + * @length: The length of the plaintext + * @encoded_name: The encypted name + * + * Encrypts and encodes a filename into something that constitutes a + * valid filename for a filesystem, with printable characters. + * + * We assume that we have a properly initialized crypto context, + * pointed to by crypt_stat->tfm. + * + * Returns zero on success; non-zero on otherwise + */ +int ecryptfs_encrypt_and_encode_filename( + char **encoded_name, + size_t *encoded_name_size, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + const char *name, size_t name_size) +{ + size_t encoded_name_no_prefix_size; + int rc = 0; + + (*encoded_name) = NULL; + (*encoded_name_size) = 0; + if ((crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) + || (mount_crypt_stat && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) { + struct ecryptfs_filename *filename; + + filename = kzalloc(sizeof(*filename), GFP_KERNEL); + if (!filename) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kzalloc [%zd] bytes\n", __func__, + sizeof(*filename)); + rc = -ENOMEM; + goto out; + } + filename->filename = (char *)name; + filename->filename_size = name_size; + rc = ecryptfs_encrypt_filename(filename, crypt_stat, + mount_crypt_stat); + if (rc) { + printk(KERN_ERR "%s: Error attempting to encrypt " + "filename; rc = [%d]\n", __func__, rc); + kfree(filename); + goto out; + } + ecryptfs_encode_for_filename( + NULL, &encoded_name_no_prefix_size, + filename->encrypted_filename, + filename->encrypted_filename_size); + if ((crypt_stat && (crypt_stat->flags + & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) + || (mount_crypt_stat + && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) + (*encoded_name_size) = + (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE + + encoded_name_no_prefix_size); + else + (*encoded_name_size) = + (ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE + + encoded_name_no_prefix_size); + (*encoded_name) = kmalloc((*encoded_name_size) + 1, GFP_KERNEL); + if (!(*encoded_name)) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kzalloc [%zd] bytes\n", __func__, + (*encoded_name_size)); + rc = -ENOMEM; + kfree(filename->encrypted_filename); + kfree(filename); + goto out; + } + if ((crypt_stat && (crypt_stat->flags + & ECRYPTFS_ENCFN_USE_MOUNT_FNEK)) + || (mount_crypt_stat + && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK))) { + memcpy((*encoded_name), + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE); + ecryptfs_encode_for_filename( + ((*encoded_name) + + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE), + &encoded_name_no_prefix_size, + filename->encrypted_filename, + filename->encrypted_filename_size); + (*encoded_name_size) = + (ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE + + encoded_name_no_prefix_size); + (*encoded_name)[(*encoded_name_size)] = '\0'; + (*encoded_name_size)++; + } else { + rc = -ENOTSUPP; + } + if (rc) { + printk(KERN_ERR "%s: Error attempting to encode " + "encrypted filename; rc = [%d]\n", __func__, + rc); + kfree((*encoded_name)); + (*encoded_name) = NULL; + (*encoded_name_size) = 0; + } + kfree(filename->encrypted_filename); + kfree(filename); + } else { + rc = ecryptfs_copy_filename(encoded_name, + encoded_name_size, + name, name_size); + } +out: + return rc; +} + +/** + * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext + * @plaintext_name: The plaintext name + * @plaintext_name_size: The plaintext name size + * @ecryptfs_dir_dentry: eCryptfs directory dentry + * @name: The filename in cipher text + * @name_size: The cipher text name size + * + * Decrypts and decodes the filename. + * + * Returns zero on error; non-zero otherwise + */ +int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, + size_t *plaintext_name_size, + struct dentry *ecryptfs_dir_dentry, + const char *name, size_t name_size) +{ + char *decoded_name; + size_t decoded_name_size; + size_t packet_size; + int rc = 0; + + if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) + && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, + ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) { + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dir_dentry->d_sb)->mount_crypt_stat; + const char *orig_name = name; + size_t orig_name_size = name_size; + + name += ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; + name_size -= ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE; + ecryptfs_decode_from_filename(NULL, &decoded_name_size, + name, name_size); + decoded_name = kmalloc(decoded_name_size, GFP_KERNEL); + if (!decoded_name) { + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc [%zd] bytes\n", __func__, + decoded_name_size); + rc = -ENOMEM; + goto out; + } + ecryptfs_decode_from_filename(decoded_name, &decoded_name_size, + name, name_size); + rc = ecryptfs_parse_tag_70_packet(plaintext_name, + plaintext_name_size, + &packet_size, + mount_crypt_stat, + decoded_name, + decoded_name_size); + if (rc) { + printk(KERN_INFO "%s: Could not parse tag 70 packet " + "from filename; copying through filename " + "as-is\n", __func__); + rc = ecryptfs_copy_filename(plaintext_name, + plaintext_name_size, + orig_name, orig_name_size); + goto out_free; + } + } else { + rc = ecryptfs_copy_filename(plaintext_name, + plaintext_name_size, + name, name_size); + goto out; + } +out_free: + kfree(decoded_name); +out: + return rc; +} diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index a75026d35d1..c11fc95714a 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -51,12 +51,16 @@ #define ECRYPTFS_VERSIONING_XATTR 0x00000010 #define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 #define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 +#define ECRYPTFS_VERSIONING_HMAC 0x00000080 +#define ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION 0x00000100 +#define ECRYPTFS_VERSIONING_GCM 0x00000200 #define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ | ECRYPTFS_VERSIONING_PUBKEY \ | ECRYPTFS_VERSIONING_XATTR \ | ECRYPTFS_VERSIONING_MULTKEY \ - | ECRYPTFS_VERSIONING_DEVMISC) + | ECRYPTFS_VERSIONING_DEVMISC \ + | ECRYPTFS_VERSIONING_FILENAME_ENCRYPTION) #define ECRYPTFS_MAX_PASSWORD_LENGTH 64 #define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH #define ECRYPTFS_SALT_SIZE 8 @@ -199,6 +203,7 @@ ecryptfs_get_key_payload_data(struct key *key) #define ECRYPTFS_DEFAULT_CIPHER "aes" #define ECRYPTFS_DEFAULT_KEY_BYTES 16 #define ECRYPTFS_DEFAULT_HASH "md5" +#define ECRYPTFS_TAG_70_DIGEST ECRYPTFS_DEFAULT_HASH #define ECRYPTFS_TAG_1_PACKET_TYPE 0x01 #define ECRYPTFS_TAG_3_PACKET_TYPE 0x8C #define ECRYPTFS_TAG_11_PACKET_TYPE 0xED @@ -206,30 +211,64 @@ ecryptfs_get_key_payload_data(struct key *key) #define ECRYPTFS_TAG_65_PACKET_TYPE 0x41 #define ECRYPTFS_TAG_66_PACKET_TYPE 0x42 #define ECRYPTFS_TAG_67_PACKET_TYPE 0x43 +#define ECRYPTFS_TAG_70_PACKET_TYPE 0x46 /* FNEK-encrypted filename + * as dentry name */ +#define ECRYPTFS_TAG_71_PACKET_TYPE 0x47 /* FNEK-encrypted filename in + * metadata */ +#define ECRYPTFS_TAG_72_PACKET_TYPE 0x48 /* FEK-encrypted filename as + * dentry name */ +#define ECRYPTFS_TAG_73_PACKET_TYPE 0x49 /* FEK-encrypted filename as + * metadata */ +/* Constraint: ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES >= + * ECRYPTFS_MAX_IV_BYTES */ +#define ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES 16 +#define ECRYPTFS_NON_NULL 0x42 /* A reasonable substitute for NULL */ #define MD5_DIGEST_SIZE 16 +#define ECRYPTFS_TAG_70_DIGEST_SIZE MD5_DIGEST_SIZE +#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FEK_ENCRYPTED." +#define ECRYPTFS_FEK_ENCRYPTED_FILENAME_PREFIX_SIZE 23 +#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX "ECRYPTFS_FNEK_ENCRYPTED." +#define ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE 24 +#define ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN (18 + 1 + 4 + 1 + 32) struct ecryptfs_key_sig { struct list_head crypt_stat_list; char keysig[ECRYPTFS_SIG_SIZE_HEX]; }; +struct ecryptfs_filename { + struct list_head crypt_stat_list; +#define ECRYPTFS_FILENAME_CONTAINS_DECRYPTED 0x00000001 + u32 flags; + u32 seq_no; + char *filename; + char *encrypted_filename; + size_t filename_size; + size_t encrypted_filename_size; + char fnek_sig[ECRYPTFS_SIG_SIZE_HEX]; + char dentry_name[ECRYPTFS_ENCRYPTED_DENTRY_NAME_LEN + 1]; +}; + /** * This is the primary struct associated with each encrypted file. * * TODO: cache align/pack? */ struct ecryptfs_crypt_stat { -#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 -#define ECRYPTFS_POLICY_APPLIED 0x00000002 -#define ECRYPTFS_NEW_FILE 0x00000004 -#define ECRYPTFS_ENCRYPTED 0x00000008 -#define ECRYPTFS_SECURITY_WARNING 0x00000010 -#define ECRYPTFS_ENABLE_HMAC 0x00000020 -#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 -#define ECRYPTFS_KEY_VALID 0x00000080 -#define ECRYPTFS_METADATA_IN_XATTR 0x00000100 -#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 -#define ECRYPTFS_KEY_SET 0x00000400 +#define ECRYPTFS_STRUCT_INITIALIZED 0x00000001 +#define ECRYPTFS_POLICY_APPLIED 0x00000002 +#define ECRYPTFS_NEW_FILE 0x00000004 +#define ECRYPTFS_ENCRYPTED 0x00000008 +#define ECRYPTFS_SECURITY_WARNING 0x00000010 +#define ECRYPTFS_ENABLE_HMAC 0x00000020 +#define ECRYPTFS_ENCRYPT_IV_PAGES 0x00000040 +#define ECRYPTFS_KEY_VALID 0x00000080 +#define ECRYPTFS_METADATA_IN_XATTR 0x00000100 +#define ECRYPTFS_VIEW_AS_ENCRYPTED 0x00000200 +#define ECRYPTFS_KEY_SET 0x00000400 +#define ECRYPTFS_ENCRYPT_FILENAMES 0x00000800 +#define ECRYPTFS_ENCFN_USE_MOUNT_FNEK 0x00001000 +#define ECRYPTFS_ENCFN_USE_FEK 0x00002000 u32 flags; unsigned int file_version; size_t iv_bytes; @@ -332,13 +371,20 @@ struct ecryptfs_mount_crypt_stat { #define ECRYPTFS_XATTR_METADATA_ENABLED 0x00000002 #define ECRYPTFS_ENCRYPTED_VIEW_ENABLED 0x00000004 #define ECRYPTFS_MOUNT_CRYPT_STAT_INITIALIZED 0x00000008 +#define ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES 0x00000010 +#define ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK 0x00000020 +#define ECRYPTFS_GLOBAL_ENCFN_USE_FEK 0x00000040 u32 flags; struct list_head global_auth_tok_list; struct mutex global_auth_tok_list_mutex; size_t num_global_auth_toks; size_t global_default_cipher_key_size; + size_t global_default_fn_cipher_key_bytes; unsigned char global_default_cipher_name[ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; + unsigned char global_default_fn_cipher_name[ + ECRYPTFS_MAX_CIPHER_NAME_SIZE + 1]; + char global_default_fnek_sig[ECRYPTFS_SIG_SIZE_HEX + 1]; }; /* superblock private data. */ @@ -571,13 +617,22 @@ struct ecryptfs_open_req { int ecryptfs_interpose(struct dentry *hidden_dentry, struct dentry *this_dentry, struct super_block *sb, u32 flags); +int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, + struct dentry *lower_dentry, + struct ecryptfs_crypt_stat *crypt_stat, + struct inode *ecryptfs_dir_inode, + struct nameidata *ecryptfs_nd); +int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, + size_t *decrypted_name_size, + struct dentry *ecryptfs_dentry, + const char *name, size_t name_size); int ecryptfs_fill_zeros(struct file *file, loff_t new_length); -int ecryptfs_decode_filename(struct ecryptfs_crypt_stat *crypt_stat, - const char *name, int length, - char **decrypted_name); -int ecryptfs_encode_filename(struct ecryptfs_crypt_stat *crypt_stat, - const char *name, int length, - char **encoded_name); +int ecryptfs_encrypt_and_encode_filename( + char **encoded_name, + size_t *encoded_name_size, + struct ecryptfs_crypt_stat *crypt_stat, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + const char *name, size_t name_size); struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); void ecryptfs_dump_hex(char *data, int bytes); int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, @@ -599,7 +654,7 @@ int ecryptfs_read_and_validate_header_region(char *data, struct inode *ecryptfs_inode); int ecryptfs_read_and_validate_xattr_region(char *page_virt, struct dentry *ecryptfs_dentry); -u8 ecryptfs_code_for_cipher_string(struct ecryptfs_crypt_stat *crypt_stat); +u8 ecryptfs_code_for_cipher_string(char *cipher_name, size_t key_bytes); int ecryptfs_cipher_code_to_string(char *str, u8 cipher_code); void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat); int ecryptfs_generate_key_packet_set(char *dest_base, @@ -694,5 +749,17 @@ int ecryptfs_privileged_open(struct file **lower_file, struct vfsmount *lower_mnt, const struct cred *cred); int ecryptfs_init_persistent_file(struct dentry *ecryptfs_dentry); +int +ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *filename, size_t filename_size); +int +ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *data, size_t max_packet_size); +int ecryptfs_derive_iv(char *iv, struct ecryptfs_crypt_stat *crypt_stat, + loff_t offset); #endif /* #ifndef ECRYPTFS_KERNEL_H */ diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index eb3dc4c7ac0..9e944057001 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -77,27 +77,27 @@ struct ecryptfs_getdents_callback { /* Inspired by generic filldir in fs/readdir.c */ static int -ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset, - u64 ino, unsigned int d_type) +ecryptfs_filldir(void *dirent, const char *lower_name, int lower_namelen, + loff_t offset, u64 ino, unsigned int d_type) { - struct ecryptfs_crypt_stat *crypt_stat; struct ecryptfs_getdents_callback *buf = (struct ecryptfs_getdents_callback *)dirent; + size_t name_size; + char *name; int rc; - int decoded_length; - char *decoded_name; - crypt_stat = ecryptfs_dentry_to_private(buf->dentry)->crypt_stat; buf->filldir_called++; - decoded_length = ecryptfs_decode_filename(crypt_stat, name, namelen, - &decoded_name); - if (decoded_length < 0) { - rc = decoded_length; + rc = ecryptfs_decode_and_decrypt_filename(&name, &name_size, + buf->dentry, lower_name, + lower_namelen); + if (rc) { + printk(KERN_ERR "%s: Error attempting to decode and decrypt " + "filename [%s]; rc = [%d]\n", __func__, lower_name, + rc); goto out; } - rc = buf->filldir(buf->dirent, decoded_name, decoded_length, offset, - ino, d_type); - kfree(decoded_name); + rc = buf->filldir(buf->dirent, name, name_size, offset, ino, d_type); + kfree(name); if (rc >= 0) buf->entries_written++; out: @@ -106,8 +106,8 @@ out: /** * ecryptfs_readdir - * @file: The ecryptfs file struct - * @dirent: Directory entry + * @file: The eCryptfs directory file + * @dirent: Directory entry handle * @filldir: The filldir callback function */ static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) @@ -275,18 +275,9 @@ static int ecryptfs_release(struct inode *inode, struct file *file) static int ecryptfs_fsync(struct file *file, struct dentry *dentry, int datasync) { - struct file *lower_file = ecryptfs_file_to_lower(file); - struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - struct inode *lower_inode = lower_dentry->d_inode; - int rc = -EINVAL; - - if (lower_inode->i_fop->fsync) { - mutex_lock(&lower_inode->i_mutex); - rc = lower_inode->i_fop->fsync(lower_file, lower_dentry, - datasync); - mutex_unlock(&lower_inode->i_mutex); - } - return rc; + return vfs_fsync(ecryptfs_file_to_lower(file), + ecryptfs_dentry_to_lower(dentry), + datasync); } static int ecryptfs_fasync(int fd, struct file *file, int flag) diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 5e78fc17988..5697899a168 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -52,8 +52,7 @@ static void unlock_dir(struct dentry *dir) /** * ecryptfs_create_underlying_file * @lower_dir_inode: inode of the parent in the lower fs of the new file - * @lower_dentry: New file's dentry in the lower fs - * @ecryptfs_dentry: New file's dentry in ecryptfs + * @dentry: New file's dentry * @mode: The mode of the new file * @nd: nameidata of ecryptfs' parent's dentry & vfsmount * @@ -228,8 +227,7 @@ ecryptfs_create(struct inode *directory_inode, struct dentry *ecryptfs_dentry, { int rc; - /* ecryptfs_do_create() calls ecryptfs_interpose(), which opens - * the crypt_stat->lower_file (persistent file) */ + /* ecryptfs_do_create() calls ecryptfs_interpose() */ rc = ecryptfs_do_create(directory_inode, ecryptfs_dentry, mode, nd); if (unlikely(rc)) { ecryptfs_printk(KERN_WARNING, "Failed to create file in" @@ -244,141 +242,91 @@ out: } /** - * ecryptfs_lookup - * @dir: inode - * @dentry: The dentry - * @nd: nameidata, may be NULL - * - * Find a file on disk. If the file does not exist, then we'll add it to the - * dentry cache and continue on to read it from the disk. + * ecryptfs_lookup_and_interpose_lower - Perform a lookup */ -static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, - struct nameidata *nd) +int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, + struct dentry *lower_dentry, + struct ecryptfs_crypt_stat *crypt_stat, + struct inode *ecryptfs_dir_inode, + struct nameidata *ecryptfs_nd) { - int rc = 0; struct dentry *lower_dir_dentry; - struct dentry *lower_dentry; struct vfsmount *lower_mnt; - char *encoded_name; - int encoded_namelen; - struct ecryptfs_crypt_stat *crypt_stat = NULL; + struct inode *lower_inode; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; char *page_virt = NULL; - struct inode *lower_inode; u64 file_size; + int rc = 0; - lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); - dentry->d_op = &ecryptfs_dops; - if ((dentry->d_name.len == 1 && !strcmp(dentry->d_name.name, ".")) - || (dentry->d_name.len == 2 - && !strcmp(dentry->d_name.name, ".."))) { - d_drop(dentry); - goto out; - } - encoded_namelen = ecryptfs_encode_filename(crypt_stat, - dentry->d_name.name, - dentry->d_name.len, - &encoded_name); - if (encoded_namelen < 0) { - rc = encoded_namelen; - d_drop(dentry); - goto out; - } - ecryptfs_printk(KERN_DEBUG, "encoded_name = [%s]; encoded_namelen " - "= [%d]\n", encoded_name, encoded_namelen); - lower_dentry = lookup_one_len(encoded_name, lower_dir_dentry, - encoded_namelen - 1); - kfree(encoded_name); - if (IS_ERR(lower_dentry)) { - ecryptfs_printk(KERN_ERR, "ERR from lower_dentry\n"); - rc = PTR_ERR(lower_dentry); - d_drop(dentry); - goto out; - } - lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt(dentry->d_parent)); - ecryptfs_printk(KERN_DEBUG, "lower_dentry = [%p]; lower_dentry->" - "d_name.name = [%s]\n", lower_dentry, - lower_dentry->d_name.name); + lower_dir_dentry = lower_dentry->d_parent; + lower_mnt = mntget(ecryptfs_dentry_to_lower_mnt( + ecryptfs_dentry->d_parent)); lower_inode = lower_dentry->d_inode; - fsstack_copy_attr_atime(dir, lower_dir_dentry->d_inode); + fsstack_copy_attr_atime(ecryptfs_dir_inode, lower_dir_dentry->d_inode); BUG_ON(!atomic_read(&lower_dentry->d_count)); - ecryptfs_set_dentry_private(dentry, + ecryptfs_set_dentry_private(ecryptfs_dentry, kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL)); - if (!ecryptfs_dentry_to_private(dentry)) { + if (!ecryptfs_dentry_to_private(ecryptfs_dentry)) { rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, "Out of memory whilst attempting " - "to allocate ecryptfs_dentry_info struct\n"); + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to allocate ecryptfs_dentry_info struct\n", + __func__); goto out_dput; } - ecryptfs_set_dentry_lower(dentry, lower_dentry); - ecryptfs_set_dentry_lower_mnt(dentry, lower_mnt); + ecryptfs_set_dentry_lower(ecryptfs_dentry, lower_dentry); + ecryptfs_set_dentry_lower_mnt(ecryptfs_dentry, lower_mnt); if (!lower_dentry->d_inode) { /* We want to add because we couldn't find in lower */ - d_add(dentry, NULL); + d_add(ecryptfs_dentry, NULL); goto out; } - rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb, - ECRYPTFS_INTERPOSE_FLAG_D_ADD); + rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, + ecryptfs_dir_inode->i_sb, 1); if (rc) { - ecryptfs_printk(KERN_ERR, "Error interposing\n"); + printk(KERN_ERR "%s: Error interposing; rc = [%d]\n", + __func__, rc); goto out; } - if (S_ISDIR(lower_inode->i_mode)) { - ecryptfs_printk(KERN_DEBUG, "Is a directory; returning\n"); + if (S_ISDIR(lower_inode->i_mode)) goto out; - } - if (S_ISLNK(lower_inode->i_mode)) { - ecryptfs_printk(KERN_DEBUG, "Is a symlink; returning\n"); + if (S_ISLNK(lower_inode->i_mode)) goto out; - } - if (special_file(lower_inode->i_mode)) { - ecryptfs_printk(KERN_DEBUG, "Is a special file; returning\n"); + if (special_file(lower_inode->i_mode)) goto out; - } - if (!nd) { - ecryptfs_printk(KERN_DEBUG, "We have a NULL nd, just leave" - "as we *think* we are about to unlink\n"); + if (!ecryptfs_nd) goto out; - } /* Released in this function */ - page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, - GFP_USER); + page_virt = kmem_cache_zalloc(ecryptfs_header_cache_2, GFP_USER); if (!page_virt) { + printk(KERN_ERR "%s: Cannot kmem_cache_zalloc() a page\n", + __func__); rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, - "Cannot ecryptfs_kmalloc a page\n"); goto out; } - crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; - if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) - ecryptfs_set_default_sizes(crypt_stat); - if (!ecryptfs_inode_to_private(dentry->d_inode)->lower_file) { - rc = ecryptfs_init_persistent_file(dentry); + if (!ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->lower_file) { + rc = ecryptfs_init_persistent_file(ecryptfs_dentry); if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " "the persistent file for the dentry with name " "[%s]; rc = [%d]\n", __func__, - dentry->d_name.name, rc); - goto out; + ecryptfs_dentry->d_name.name, rc); + goto out_free_kmem; } } rc = ecryptfs_read_and_validate_header_region(page_virt, - dentry->d_inode); + ecryptfs_dentry->d_inode); if (rc) { - rc = ecryptfs_read_and_validate_xattr_region(page_virt, dentry); + rc = ecryptfs_read_and_validate_xattr_region(page_virt, + ecryptfs_dentry); if (rc) { - printk(KERN_DEBUG "Valid metadata not found in header " - "region or xattr region; treating file as " - "unencrypted\n"); rc = 0; - kmem_cache_free(ecryptfs_header_cache_2, page_virt); - goto out; + goto out_free_kmem; } crypt_stat->flags |= ECRYPTFS_METADATA_IN_XATTR; } mount_crypt_stat = &ecryptfs_superblock_to_private( - dentry->d_sb)->mount_crypt_stat; + ecryptfs_dentry->d_sb)->mount_crypt_stat; if (mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) { if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) file_size = (crypt_stat->num_header_bytes_at_front @@ -388,14 +336,103 @@ static struct dentry *ecryptfs_lookup(struct inode *dir, struct dentry *dentry, } else { file_size = get_unaligned_be64(page_virt); } - i_size_write(dentry->d_inode, (loff_t)file_size); + i_size_write(ecryptfs_dentry->d_inode, (loff_t)file_size); +out_free_kmem: kmem_cache_free(ecryptfs_header_cache_2, page_virt); goto out; - out_dput: dput(lower_dentry); - d_drop(dentry); + d_drop(ecryptfs_dentry); +out: + return rc; +} + +/** + * ecryptfs_lookup + * @ecryptfs_dir_inode: The eCryptfs directory inode + * @ecryptfs_dentry: The eCryptfs dentry that we are looking up + * @ecryptfs_nd: nameidata; may be NULL + * + * Find a file on disk. If the file does not exist, then we'll add it to the + * dentry cache and continue on to read it from the disk. + */ +static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, + struct dentry *ecryptfs_dentry, + struct nameidata *ecryptfs_nd) +{ + char *encrypted_and_encoded_name = NULL; + size_t encrypted_and_encoded_name_size; + struct ecryptfs_crypt_stat *crypt_stat = NULL; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; + struct ecryptfs_inode_info *inode_info; + struct dentry *lower_dir_dentry, *lower_dentry; + int rc = 0; + + ecryptfs_dentry->d_op = &ecryptfs_dops; + if ((ecryptfs_dentry->d_name.len == 1 + && !strcmp(ecryptfs_dentry->d_name.name, ".")) + || (ecryptfs_dentry->d_name.len == 2 + && !strcmp(ecryptfs_dentry->d_name.name, ".."))) { + goto out_d_drop; + } + lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); + lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, + lower_dir_dentry, + ecryptfs_dentry->d_name.len); + if (IS_ERR(lower_dentry)) { + rc = PTR_ERR(lower_dentry); + printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " + "lower_dentry = [%s]\n", __func__, rc, + ecryptfs_dentry->d_name.name); + goto out_d_drop; + } + if (lower_dentry->d_inode) + goto lookup_and_interpose; + inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); + if (inode_info) { + crypt_stat = &inode_info->crypt_stat; + /* TODO: lock for crypt_stat comparison */ + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) + ecryptfs_set_default_sizes(crypt_stat); + } + if (crypt_stat) + mount_crypt_stat = crypt_stat->mount_crypt_stat; + else + mount_crypt_stat = &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) + && !(mount_crypt_stat && (mount_crypt_stat->flags + & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) + goto lookup_and_interpose; + dput(lower_dentry); + rc = ecryptfs_encrypt_and_encode_filename( + &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, + crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name, + ecryptfs_dentry->d_name.len); + if (rc) { + printk(KERN_ERR "%s: Error attempting to encrypt and encode " + "filename; rc = [%d]\n", __func__, rc); + goto out_d_drop; + } + lower_dentry = lookup_one_len(encrypted_and_encoded_name, + lower_dir_dentry, + encrypted_and_encoded_name_size - 1); + if (IS_ERR(lower_dentry)) { + rc = PTR_ERR(lower_dentry); + printk(KERN_ERR "%s: lookup_one_len() returned [%d] on " + "lower_dentry = [%s]\n", __func__, rc, + encrypted_and_encoded_name); + goto out_d_drop; + } +lookup_and_interpose: + rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, + crypt_stat, ecryptfs_dir_inode, + ecryptfs_nd); + goto out; +out_d_drop: + d_drop(ecryptfs_dentry); out: + kfree(encrypted_and_encoded_name); return ERR_PTR(rc); } @@ -466,19 +503,21 @@ static int ecryptfs_symlink(struct inode *dir, struct dentry *dentry, struct dentry *lower_dentry; struct dentry *lower_dir_dentry; char *encoded_symname; - int encoded_symlen; - struct ecryptfs_crypt_stat *crypt_stat = NULL; + size_t encoded_symlen; + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; lower_dentry = ecryptfs_dentry_to_lower(dentry); dget(lower_dentry); lower_dir_dentry = lock_parent(lower_dentry); - encoded_symlen = ecryptfs_encode_filename(crypt_stat, symname, - strlen(symname), - &encoded_symname); - if (encoded_symlen < 0) { - rc = encoded_symlen; + mount_crypt_stat = &ecryptfs_superblock_to_private( + dir->i_sb)->mount_crypt_stat; + rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname, + &encoded_symlen, + NULL, + mount_crypt_stat, symname, + strlen(symname)); + if (rc) goto out_lock; - } rc = vfs_symlink(lower_dir_dentry->d_inode, lower_dentry, encoded_symname); kfree(encoded_symname); @@ -602,53 +641,54 @@ out_lock: } static int -ecryptfs_readlink(struct dentry *dentry, char __user * buf, int bufsiz) +ecryptfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz) { - int rc; - struct dentry *lower_dentry; - char *decoded_name; char *lower_buf; - mm_segment_t old_fs; + struct dentry *lower_dentry; struct ecryptfs_crypt_stat *crypt_stat; + char *plaintext_name; + size_t plaintext_name_size; + mm_segment_t old_fs; + int rc; lower_dentry = ecryptfs_dentry_to_lower(dentry); - if (!lower_dentry->d_inode->i_op || - !lower_dentry->d_inode->i_op->readlink) { + if (!lower_dentry->d_inode->i_op->readlink) { rc = -EINVAL; goto out; } + crypt_stat = &ecryptfs_inode_to_private(dentry->d_inode)->crypt_stat; /* Released in this function */ lower_buf = kmalloc(bufsiz, GFP_KERNEL); if (lower_buf == NULL) { - ecryptfs_printk(KERN_ERR, "Out of memory\n"); + printk(KERN_ERR "%s: Out of memory whilst attempting to " + "kmalloc [%d] bytes\n", __func__, bufsiz); rc = -ENOMEM; goto out; } old_fs = get_fs(); set_fs(get_ds()); - ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ " - "lower_dentry->d_name.name = [%s]\n", - lower_dentry->d_name.name); rc = lower_dentry->d_inode->i_op->readlink(lower_dentry, (char __user *)lower_buf, bufsiz); set_fs(old_fs); if (rc >= 0) { - crypt_stat = NULL; - rc = ecryptfs_decode_filename(crypt_stat, lower_buf, rc, - &decoded_name); - if (rc == -ENOMEM) + rc = ecryptfs_decode_and_decrypt_filename(&plaintext_name, + &plaintext_name_size, + dentry, lower_buf, + rc); + if (rc) { + printk(KERN_ERR "%s: Error attempting to decode and " + "decrypt filename; rc = [%d]\n", __func__, + rc); goto out_free_lower_buf; - if (rc > 0) { - ecryptfs_printk(KERN_DEBUG, "Copying [%d] bytes " - "to userspace: [%*s]\n", rc, - decoded_name); - if (copy_to_user(buf, decoded_name, rc)) - rc = -EFAULT; } - kfree(decoded_name); - fsstack_copy_attr_atime(dentry->d_inode, - lower_dentry->d_inode); + rc = copy_to_user(buf, plaintext_name, plaintext_name_size); + if (rc) + rc = -EFAULT; + else + rc = plaintext_name_size; + kfree(plaintext_name); + fsstack_copy_attr_atime(dentry->d_inode, lower_dentry->d_inode); } out_free_lower_buf: kfree(lower_buf); @@ -670,8 +710,6 @@ static void *ecryptfs_follow_link(struct dentry *dentry, struct nameidata *nd) } old_fs = get_fs(); set_fs(get_ds()); - ecryptfs_printk(KERN_DEBUG, "Calling readlink w/ " - "dentry->d_name.name = [%s]\n", dentry->d_name.name); rc = dentry->d_inode->i_op->readlink(dentry, (char __user *)buf, len); set_fs(old_fs); if (rc < 0) diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 0d713b69194..ff539420cc6 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -358,7 +358,7 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec, /* verify that everything through the encrypted FEK size is present */ if (message_len < 4) { rc = -EIO; - printk(KERN_ERR "%s: message_len is [%Zd]; minimum acceptable " + printk(KERN_ERR "%s: message_len is [%zd]; minimum acceptable " "message length is [%d]\n", __func__, message_len, 4); goto out; } @@ -385,13 +385,13 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec, i += data_len; if (message_len < (i + key_rec->enc_key_size)) { rc = -EIO; - printk(KERN_ERR "%s: message_len [%Zd]; max len is [%Zd]\n", + printk(KERN_ERR "%s: message_len [%zd]; max len is [%zd]\n", __func__, message_len, (i + key_rec->enc_key_size)); goto out; } if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { rc = -EIO; - printk(KERN_ERR "%s: Encrypted key_size [%Zd] larger than " + printk(KERN_ERR "%s: Encrypted key_size [%zd] larger than " "the maximum key size [%d]\n", __func__, key_rec->enc_key_size, ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); @@ -403,6 +403,580 @@ out: } static int +ecryptfs_find_global_auth_tok_for_sig( + struct ecryptfs_global_auth_tok **global_auth_tok, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) +{ + struct ecryptfs_global_auth_tok *walker; + int rc = 0; + + (*global_auth_tok) = NULL; + mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); + list_for_each_entry(walker, + &mount_crypt_stat->global_auth_tok_list, + mount_crypt_stat_list) { + if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) { + (*global_auth_tok) = walker; + goto out; + } + } + rc = -EINVAL; +out: + mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); + return rc; +} + +/** + * ecryptfs_find_auth_tok_for_sig + * @auth_tok: Set to the matching auth_tok; NULL if not found + * @crypt_stat: inode crypt_stat crypto context + * @sig: Sig of auth_tok to find + * + * For now, this function simply looks at the registered auth_tok's + * linked off the mount_crypt_stat, so all the auth_toks that can be + * used must be registered at mount time. This function could + * potentially try a lot harder to find auth_tok's (e.g., by calling + * out to ecryptfsd to dynamically retrieve an auth_tok object) so + * that static registration of auth_tok's will no longer be necessary. + * + * Returns zero on no error; non-zero on error + */ +static int +ecryptfs_find_auth_tok_for_sig( + struct ecryptfs_auth_tok **auth_tok, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *sig) +{ + struct ecryptfs_global_auth_tok *global_auth_tok; + int rc = 0; + + (*auth_tok) = NULL; + if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, + mount_crypt_stat, sig)) { + struct key *auth_tok_key; + + rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok, + sig); + } else + (*auth_tok) = global_auth_tok->global_auth_tok; + return rc; +} + +/** + * write_tag_70_packet can gobble a lot of stack space. We stuff most + * of the function's parameters in a kmalloc'd struct to help reduce + * eCryptfs' overall stack usage. + */ +struct ecryptfs_write_tag_70_packet_silly_stack { + u8 cipher_code; + size_t max_packet_size; + size_t packet_size_len; + size_t block_aligned_filename_size; + size_t block_size; + size_t i; + size_t j; + size_t num_rand_bytes; + struct mutex *tfm_mutex; + char *block_aligned_filename; + struct ecryptfs_auth_tok *auth_tok; + struct scatterlist src_sg; + struct scatterlist dst_sg; + struct blkcipher_desc desc; + char iv[ECRYPTFS_MAX_IV_BYTES]; + char hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; + char tmp_hash[ECRYPTFS_TAG_70_DIGEST_SIZE]; + struct hash_desc hash_desc; + struct scatterlist hash_sg; +}; + +/** + * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK + * @filename: NULL-terminated filename string + * + * This is the simplest mechanism for achieving filename encryption in + * eCryptfs. It encrypts the given filename with the mount-wide + * filename encryption key (FNEK) and stores it in a packet to @dest, + * which the callee will encode and write directly into the dentry + * name. + */ +int +ecryptfs_write_tag_70_packet(char *dest, size_t *remaining_bytes, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *filename, size_t filename_size) +{ + struct ecryptfs_write_tag_70_packet_silly_stack *s; + int rc = 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " + "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + goto out; + } + s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + (*packet_size) = 0; + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name( + &s->desc.tfm, + &s->tfm_mutex, mount_crypt_stat->global_default_fn_cipher_name); + if (unlikely(rc)) { + printk(KERN_ERR "Internal error whilst attempting to get " + "tfm and mutex for cipher name [%s]; rc = [%d]\n", + mount_crypt_stat->global_default_fn_cipher_name, rc); + goto out; + } + mutex_lock(s->tfm_mutex); + s->block_size = crypto_blkcipher_blocksize(s->desc.tfm); + /* Plus one for the \0 separator between the random prefix + * and the plaintext filename */ + s->num_rand_bytes = (ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + 1); + s->block_aligned_filename_size = (s->num_rand_bytes + filename_size); + if ((s->block_aligned_filename_size % s->block_size) != 0) { + s->num_rand_bytes += (s->block_size + - (s->block_aligned_filename_size + % s->block_size)); + s->block_aligned_filename_size = (s->num_rand_bytes + + filename_size); + } + /* Octet 0: Tag 70 identifier + * Octets 1-N1: Tag 70 packet size (includes cipher identifier + * and block-aligned encrypted filename size) + * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE) + * Octet N2-N3: Cipher identifier (1 octet) + * Octets N3-N4: Block-aligned encrypted filename + * - Consists of a minimum number of random characters, a \0 + * separator, and then the filename */ + s->max_packet_size = (1 /* Tag 70 identifier */ + + 3 /* Max Tag 70 packet size */ + + ECRYPTFS_SIG_SIZE /* FNEK sig */ + + 1 /* Cipher identifier */ + + s->block_aligned_filename_size); + if (dest == NULL) { + (*packet_size) = s->max_packet_size; + goto out_unlock; + } + if (s->max_packet_size > (*remaining_bytes)) { + printk(KERN_WARNING "%s: Require [%zd] bytes to write; only " + "[%zd] available\n", __func__, s->max_packet_size, + (*remaining_bytes)); + rc = -EINVAL; + goto out_unlock; + } + s->block_aligned_filename = kzalloc(s->block_aligned_filename_size, + GFP_KERNEL); + if (!s->block_aligned_filename) { + printk(KERN_ERR "%s: Out of kernel memory whilst attempting to " + "kzalloc [%zd] bytes\n", __func__, + s->block_aligned_filename_size); + rc = -ENOMEM; + goto out_unlock; + } + s->i = 0; + dest[s->i++] = ECRYPTFS_TAG_70_PACKET_TYPE; + rc = ecryptfs_write_packet_length(&dest[s->i], + (ECRYPTFS_SIG_SIZE + + 1 /* Cipher code */ + + s->block_aligned_filename_size), + &s->packet_size_len); + if (rc) { + printk(KERN_ERR "%s: Error generating tag 70 packet " + "header; cannot generate packet length; rc = [%d]\n", + __func__, rc); + goto out_free_unlock; + } + s->i += s->packet_size_len; + ecryptfs_from_hex(&dest[s->i], + mount_crypt_stat->global_default_fnek_sig, + ECRYPTFS_SIG_SIZE); + s->i += ECRYPTFS_SIG_SIZE; + s->cipher_code = ecryptfs_code_for_cipher_string( + mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (s->cipher_code == 0) { + printk(KERN_WARNING "%s: Unable to generate code for " + "cipher [%s] with key bytes [%zd]\n", __func__, + mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + rc = -EINVAL; + goto out_free_unlock; + } + dest[s->i++] = s->cipher_code; + rc = ecryptfs_find_auth_tok_for_sig( + &s->auth_tok, mount_crypt_stat, + mount_crypt_stat->global_default_fnek_sig); + if (rc) { + printk(KERN_ERR "%s: Error attempting to find auth tok for " + "fnek sig [%s]; rc = [%d]\n", __func__, + mount_crypt_stat->global_default_fnek_sig, rc); + goto out_free_unlock; + } + /* TODO: Support other key modules than passphrase for + * filename encryption */ + BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD); + sg_init_one( + &s->hash_sg, + (u8 *)s->auth_tok->token.password.session_key_encryption_key, + s->auth_tok->token.password.session_key_encryption_key_bytes); + s->hash_desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + s->hash_desc.tfm = crypto_alloc_hash(ECRYPTFS_TAG_70_DIGEST, 0, + CRYPTO_ALG_ASYNC); + if (IS_ERR(s->hash_desc.tfm)) { + rc = PTR_ERR(s->hash_desc.tfm); + printk(KERN_ERR "%s: Error attempting to " + "allocate hash crypto context; rc = [%d]\n", + __func__, rc); + goto out_free_unlock; + } + rc = crypto_hash_init(&s->hash_desc); + if (rc) { + printk(KERN_ERR + "%s: Error initializing crypto hash; rc = [%d]\n", + __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_update( + &s->hash_desc, &s->hash_sg, + s->auth_tok->token.password.session_key_encryption_key_bytes); + if (rc) { + printk(KERN_ERR + "%s: Error updating crypto hash; rc = [%d]\n", + __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_final(&s->hash_desc, s->hash); + if (rc) { + printk(KERN_ERR + "%s: Error finalizing crypto hash; rc = [%d]\n", + __func__, rc); + goto out_release_free_unlock; + } + for (s->j = 0; s->j < (s->num_rand_bytes - 1); s->j++) { + s->block_aligned_filename[s->j] = + s->hash[(s->j % ECRYPTFS_TAG_70_DIGEST_SIZE)]; + if ((s->j % ECRYPTFS_TAG_70_DIGEST_SIZE) + == (ECRYPTFS_TAG_70_DIGEST_SIZE - 1)) { + sg_init_one(&s->hash_sg, (u8 *)s->hash, + ECRYPTFS_TAG_70_DIGEST_SIZE); + rc = crypto_hash_init(&s->hash_desc); + if (rc) { + printk(KERN_ERR + "%s: Error initializing crypto hash; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_update(&s->hash_desc, &s->hash_sg, + ECRYPTFS_TAG_70_DIGEST_SIZE); + if (rc) { + printk(KERN_ERR + "%s: Error updating crypto hash; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + rc = crypto_hash_final(&s->hash_desc, s->tmp_hash); + if (rc) { + printk(KERN_ERR + "%s: Error finalizing crypto hash; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + memcpy(s->hash, s->tmp_hash, + ECRYPTFS_TAG_70_DIGEST_SIZE); + } + if (s->block_aligned_filename[s->j] == '\0') + s->block_aligned_filename[s->j] = ECRYPTFS_NON_NULL; + } + memcpy(&s->block_aligned_filename[s->num_rand_bytes], filename, + filename_size); + rc = virt_to_scatterlist(s->block_aligned_filename, + s->block_aligned_filename_size, &s->src_sg, 1); + if (rc != 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert filename memory to scatterlist; " + "expected rc = 1; got rc = [%d]. " + "block_aligned_filename_size = [%zd]\n", __func__, rc, + s->block_aligned_filename_size); + goto out_release_free_unlock; + } + rc = virt_to_scatterlist(&dest[s->i], s->block_aligned_filename_size, + &s->dst_sg, 1); + if (rc != 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert encrypted filename memory to scatterlist; " + "expected rc = 1; got rc = [%d]. " + "block_aligned_filename_size = [%zd]\n", __func__, rc, + s->block_aligned_filename_size); + goto out_release_free_unlock; + } + /* The characters in the first block effectively do the job + * of the IV here, so we just use 0's for the IV. Note the + * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + * >= ECRYPTFS_MAX_IV_BYTES. */ + memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); + s->desc.info = s->iv; + rc = crypto_blkcipher_setkey( + s->desc.tfm, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (rc < 0) { + printk(KERN_ERR "%s: Error setting key for crypto context; " + "rc = [%d]. s->auth_tok->token.password.session_key_" + "encryption_key = [0x%p]; mount_crypt_stat->" + "global_default_fn_cipher_key_bytes = [%zd]\n", __func__, + rc, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + goto out_release_free_unlock; + } + rc = crypto_blkcipher_encrypt_iv(&s->desc, &s->dst_sg, &s->src_sg, + s->block_aligned_filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to encrypt filename; " + "rc = [%d]\n", __func__, rc); + goto out_release_free_unlock; + } + s->i += s->block_aligned_filename_size; + (*packet_size) = s->i; + (*remaining_bytes) -= (*packet_size); +out_release_free_unlock: + crypto_free_hash(s->hash_desc.tfm); +out_free_unlock: + memset(s->block_aligned_filename, 0, s->block_aligned_filename_size); + kfree(s->block_aligned_filename); +out_unlock: + mutex_unlock(s->tfm_mutex); +out: + kfree(s); + return rc; +} + +struct ecryptfs_parse_tag_70_packet_silly_stack { + u8 cipher_code; + size_t max_packet_size; + size_t packet_size_len; + size_t parsed_tag_70_packet_size; + size_t block_aligned_filename_size; + size_t block_size; + size_t i; + struct mutex *tfm_mutex; + char *decrypted_filename; + struct ecryptfs_auth_tok *auth_tok; + struct scatterlist src_sg; + struct scatterlist dst_sg; + struct blkcipher_desc desc; + char fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX + 1]; + char iv[ECRYPTFS_MAX_IV_BYTES]; + char cipher_string[ECRYPTFS_MAX_CIPHER_NAME_SIZE]; +}; + +/** + * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet + * @filename: This function kmalloc's the memory for the filename + * @filename_size: This function sets this to the amount of memory + * kmalloc'd for the filename + * @packet_size: This function sets this to the the number of octets + * in the packet parsed + * @mount_crypt_stat: The mount-wide cryptographic context + * @data: The memory location containing the start of the tag 70 + * packet + * @max_packet_size: The maximum legal size of the packet to be parsed + * from @data + * + * Returns zero on success; non-zero otherwise + */ +int +ecryptfs_parse_tag_70_packet(char **filename, size_t *filename_size, + size_t *packet_size, + struct ecryptfs_mount_crypt_stat *mount_crypt_stat, + char *data, size_t max_packet_size) +{ + struct ecryptfs_parse_tag_70_packet_silly_stack *s; + int rc = 0; + + (*packet_size) = 0; + (*filename_size) = 0; + (*filename) = NULL; + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) { + printk(KERN_ERR "%s: Out of memory whilst trying to kmalloc " + "[%zd] bytes of kernel memory\n", __func__, sizeof(*s)); + goto out; + } + s->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; + if (max_packet_size < (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)) { + printk(KERN_WARNING "%s: max_packet_size is [%zd]; it must be " + "at least [%d]\n", __func__, max_packet_size, + (1 + 1 + ECRYPTFS_SIG_SIZE + 1 + 1)); + rc = -EINVAL; + goto out; + } + /* Octet 0: Tag 70 identifier + * Octets 1-N1: Tag 70 packet size (includes cipher identifier + * and block-aligned encrypted filename size) + * Octets N1-N2: FNEK sig (ECRYPTFS_SIG_SIZE) + * Octet N2-N3: Cipher identifier (1 octet) + * Octets N3-N4: Block-aligned encrypted filename + * - Consists of a minimum number of random numbers, a \0 + * separator, and then the filename */ + if (data[(*packet_size)++] != ECRYPTFS_TAG_70_PACKET_TYPE) { + printk(KERN_WARNING "%s: Invalid packet tag [0x%.2x]; must be " + "tag [0x%.2x]\n", __func__, + data[((*packet_size) - 1)], ECRYPTFS_TAG_70_PACKET_TYPE); + rc = -EINVAL; + goto out; + } + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], + &s->parsed_tag_70_packet_size, + &s->packet_size_len); + if (rc) { + printk(KERN_WARNING "%s: Error parsing packet length; " + "rc = [%d]\n", __func__, rc); + goto out; + } + s->block_aligned_filename_size = (s->parsed_tag_70_packet_size + - ECRYPTFS_SIG_SIZE - 1); + if ((1 + s->packet_size_len + s->parsed_tag_70_packet_size) + > max_packet_size) { + printk(KERN_WARNING "%s: max_packet_size is [%zd]; real packet " + "size is [%zd]\n", __func__, max_packet_size, + (1 + s->packet_size_len + 1 + + s->block_aligned_filename_size)); + rc = -EINVAL; + goto out; + } + (*packet_size) += s->packet_size_len; + ecryptfs_to_hex(s->fnek_sig_hex, &data[(*packet_size)], + ECRYPTFS_SIG_SIZE); + s->fnek_sig_hex[ECRYPTFS_SIG_SIZE_HEX] = '\0'; + (*packet_size) += ECRYPTFS_SIG_SIZE; + s->cipher_code = data[(*packet_size)++]; + rc = ecryptfs_cipher_code_to_string(s->cipher_string, s->cipher_code); + if (rc) { + printk(KERN_WARNING "%s: Cipher code [%d] is invalid\n", + __func__, s->cipher_code); + goto out; + } + rc = ecryptfs_get_tfm_and_mutex_for_cipher_name(&s->desc.tfm, + &s->tfm_mutex, + s->cipher_string); + if (unlikely(rc)) { + printk(KERN_ERR "Internal error whilst attempting to get " + "tfm and mutex for cipher name [%s]; rc = [%d]\n", + s->cipher_string, rc); + goto out; + } + mutex_lock(s->tfm_mutex); + rc = virt_to_scatterlist(&data[(*packet_size)], + s->block_aligned_filename_size, &s->src_sg, 1); + if (rc != 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert encrypted filename memory to scatterlist; " + "expected rc = 1; got rc = [%d]. " + "block_aligned_filename_size = [%zd]\n", __func__, rc, + s->block_aligned_filename_size); + goto out_unlock; + } + (*packet_size) += s->block_aligned_filename_size; + s->decrypted_filename = kmalloc(s->block_aligned_filename_size, + GFP_KERNEL); + if (!s->decrypted_filename) { + printk(KERN_ERR "%s: Out of memory whilst attempting to " + "kmalloc [%zd] bytes\n", __func__, + s->block_aligned_filename_size); + rc = -ENOMEM; + goto out_unlock; + } + rc = virt_to_scatterlist(s->decrypted_filename, + s->block_aligned_filename_size, &s->dst_sg, 1); + if (rc != 1) { + printk(KERN_ERR "%s: Internal error whilst attempting to " + "convert decrypted filename memory to scatterlist; " + "expected rc = 1; got rc = [%d]. " + "block_aligned_filename_size = [%zd]\n", __func__, rc, + s->block_aligned_filename_size); + goto out_free_unlock; + } + /* The characters in the first block effectively do the job of + * the IV here, so we just use 0's for the IV. Note the + * constraint that ECRYPTFS_FILENAME_MIN_RANDOM_PREPEND_BYTES + * >= ECRYPTFS_MAX_IV_BYTES. */ + memset(s->iv, 0, ECRYPTFS_MAX_IV_BYTES); + s->desc.info = s->iv; + rc = ecryptfs_find_auth_tok_for_sig(&s->auth_tok, mount_crypt_stat, + s->fnek_sig_hex); + if (rc) { + printk(KERN_ERR "%s: Error attempting to find auth tok for " + "fnek sig [%s]; rc = [%d]\n", __func__, s->fnek_sig_hex, + rc); + goto out_free_unlock; + } + /* TODO: Support other key modules than passphrase for + * filename encryption */ + BUG_ON(s->auth_tok->token_type != ECRYPTFS_PASSWORD); + rc = crypto_blkcipher_setkey( + s->desc.tfm, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (rc < 0) { + printk(KERN_ERR "%s: Error setting key for crypto context; " + "rc = [%d]. s->auth_tok->token.password.session_key_" + "encryption_key = [0x%p]; mount_crypt_stat->" + "global_default_fn_cipher_key_bytes = [%zd]\n", __func__, + rc, + s->auth_tok->token.password.session_key_encryption_key, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + goto out_free_unlock; + } + rc = crypto_blkcipher_decrypt_iv(&s->desc, &s->dst_sg, &s->src_sg, + s->block_aligned_filename_size); + if (rc) { + printk(KERN_ERR "%s: Error attempting to decrypt filename; " + "rc = [%d]\n", __func__, rc); + goto out_free_unlock; + } + s->i = 0; + while (s->decrypted_filename[s->i] != '\0' + && s->i < s->block_aligned_filename_size) + s->i++; + if (s->i == s->block_aligned_filename_size) { + printk(KERN_WARNING "%s: Invalid tag 70 packet; could not " + "find valid separator between random characters and " + "the filename\n", __func__); + rc = -EINVAL; + goto out_free_unlock; + } + s->i++; + (*filename_size) = (s->block_aligned_filename_size - s->i); + if (!((*filename_size) > 0 && (*filename_size < PATH_MAX))) { + printk(KERN_WARNING "%s: Filename size is [%zd], which is " + "invalid\n", __func__, (*filename_size)); + rc = -EINVAL; + goto out_free_unlock; + } + (*filename) = kmalloc(((*filename_size) + 1), GFP_KERNEL); + if (!(*filename)) { + printk(KERN_ERR "%s: Out of memory whilst attempting to " + "kmalloc [%zd] bytes\n", __func__, + ((*filename_size) + 1)); + rc = -ENOMEM; + goto out_free_unlock; + } + memcpy((*filename), &s->decrypted_filename[s->i], (*filename_size)); + (*filename)[(*filename_size)] = '\0'; +out_free_unlock: + kfree(s->decrypted_filename); +out_unlock: + mutex_unlock(s->tfm_mutex); +out: + if (rc) { + (*packet_size) = 0; + (*filename_size) = 0; + (*filename) = NULL; + } + kfree(s); + return rc; +} + +static int ecryptfs_get_auth_tok_sig(char **sig, struct ecryptfs_auth_tok *auth_tok) { int rc = 0; @@ -897,30 +1471,6 @@ out: return rc; } -static int -ecryptfs_find_global_auth_tok_for_sig( - struct ecryptfs_global_auth_tok **global_auth_tok, - struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig) -{ - struct ecryptfs_global_auth_tok *walker; - int rc = 0; - - (*global_auth_tok) = NULL; - mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); - list_for_each_entry(walker, - &mount_crypt_stat->global_auth_tok_list, - mount_crypt_stat_list) { - if (memcmp(walker->sig, sig, ECRYPTFS_SIG_SIZE_HEX) == 0) { - (*global_auth_tok) = walker; - goto out; - } - } - rc = -EINVAL; -out: - mutex_unlock(&mount_crypt_stat->global_auth_tok_list_mutex); - return rc; -} - /** * ecryptfs_verify_version * @version: The version number to confirm @@ -990,43 +1540,6 @@ out: } /** - * ecryptfs_find_auth_tok_for_sig - * @auth_tok: Set to the matching auth_tok; NULL if not found - * @crypt_stat: inode crypt_stat crypto context - * @sig: Sig of auth_tok to find - * - * For now, this function simply looks at the registered auth_tok's - * linked off the mount_crypt_stat, so all the auth_toks that can be - * used must be registered at mount time. This function could - * potentially try a lot harder to find auth_tok's (e.g., by calling - * out to ecryptfsd to dynamically retrieve an auth_tok object) so - * that static registration of auth_tok's will no longer be necessary. - * - * Returns zero on no error; non-zero on error - */ -static int -ecryptfs_find_auth_tok_for_sig( - struct ecryptfs_auth_tok **auth_tok, - struct ecryptfs_crypt_stat *crypt_stat, char *sig) -{ - struct ecryptfs_mount_crypt_stat *mount_crypt_stat = - crypt_stat->mount_crypt_stat; - struct ecryptfs_global_auth_tok *global_auth_tok; - int rc = 0; - - (*auth_tok) = NULL; - if (ecryptfs_find_global_auth_tok_for_sig(&global_auth_tok, - mount_crypt_stat, sig)) { - struct key *auth_tok_key; - - rc = ecryptfs_keyring_auth_tok_for_sig(&auth_tok_key, auth_tok, - sig); - } else - (*auth_tok) = global_auth_tok->global_auth_tok; - return rc; -} - -/** * decrypt_passphrase_encrypted_session_key - Decrypt the session key with the given auth_tok. * @auth_tok: The passphrase authentication token to use to encrypt the FEK * @crypt_stat: The cryptographic context @@ -1256,7 +1769,8 @@ find_next_matching_auth_tok: rc = -EINVAL; goto out_wipe_list; } - ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, crypt_stat, + ecryptfs_find_auth_tok_for_sig(&matching_auth_tok, + crypt_stat->mount_crypt_stat, candidate_auth_tok_sig); if (matching_auth_tok) { found_auth_tok = 1; @@ -1336,7 +1850,9 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, int rc; rc = write_tag_66_packet(auth_tok->token.private_key.signature, - ecryptfs_code_for_cipher_string(crypt_stat), + ecryptfs_code_for_cipher_string( + crypt_stat->cipher, + crypt_stat->key_size), crypt_stat, &payload, &payload_len); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); @@ -1696,7 +2212,8 @@ encrypted_session_key_set: dest[(*packet_size)++] = 0x04; /* version 4 */ /* TODO: Break from RFC2440 so that arbitrary ciphers can be * specified with strings */ - cipher_code = ecryptfs_code_for_cipher_string(crypt_stat); + cipher_code = ecryptfs_code_for_cipher_string(crypt_stat->cipher, + crypt_stat->key_size); if (cipher_code == 0) { ecryptfs_printk(KERN_WARNING, "Unable to generate code for " "cipher [%s]\n", crypt_stat->cipher); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index fd630713c5c..789cf2e1be1 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -206,7 +206,9 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_cipher, ecryptfs_opt_ecryptfs_cipher, ecryptfs_opt_ecryptfs_key_bytes, ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, - ecryptfs_opt_encrypted_view, ecryptfs_opt_err }; + ecryptfs_opt_encrypted_view, ecryptfs_opt_fnek_sig, + ecryptfs_opt_fn_cipher, ecryptfs_opt_fn_cipher_key_bytes, + ecryptfs_opt_err }; static const match_table_t tokens = { {ecryptfs_opt_sig, "sig=%s"}, @@ -217,6 +219,9 @@ static const match_table_t tokens = { {ecryptfs_opt_passthrough, "ecryptfs_passthrough"}, {ecryptfs_opt_xattr_metadata, "ecryptfs_xattr_metadata"}, {ecryptfs_opt_encrypted_view, "ecryptfs_encrypted_view"}, + {ecryptfs_opt_fnek_sig, "ecryptfs_fnek_sig=%s"}, + {ecryptfs_opt_fn_cipher, "ecryptfs_fn_cipher=%s"}, + {ecryptfs_opt_fn_cipher_key_bytes, "ecryptfs_fn_key_bytes=%u"}, {ecryptfs_opt_err, NULL} }; @@ -281,8 +286,11 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) int rc = 0; int sig_set = 0; int cipher_name_set = 0; + int fn_cipher_name_set = 0; int cipher_key_bytes; int cipher_key_bytes_set = 0; + int fn_cipher_key_bytes; + int fn_cipher_key_bytes_set = 0; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = &ecryptfs_superblock_to_private(sb)->mount_crypt_stat; substring_t args[MAX_OPT_ARGS]; @@ -290,7 +298,12 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) char *sig_src; char *cipher_name_dst; char *cipher_name_src; + char *fn_cipher_name_dst; + char *fn_cipher_name_src; + char *fnek_dst; + char *fnek_src; char *cipher_key_bytes_src; + char *fn_cipher_key_bytes_src; if (!options) { rc = -EINVAL; @@ -322,10 +335,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) global_default_cipher_name; strncpy(cipher_name_dst, cipher_name_src, ECRYPTFS_MAX_CIPHER_NAME_SIZE); - ecryptfs_printk(KERN_DEBUG, - "The mount_crypt_stat " - "global_default_cipher_name set to: " - "[%s]\n", cipher_name_dst); + cipher_name_dst[ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; cipher_name_set = 1; break; case ecryptfs_opt_ecryptfs_key_bytes: @@ -335,11 +345,6 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) &cipher_key_bytes_src, 0); mount_crypt_stat->global_default_cipher_key_size = cipher_key_bytes; - ecryptfs_printk(KERN_DEBUG, - "The mount_crypt_stat " - "global_default_cipher_key_size " - "set to: [%d]\n", mount_crypt_stat-> - global_default_cipher_key_size); cipher_key_bytes_set = 1; break; case ecryptfs_opt_passthrough: @@ -356,11 +361,51 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) mount_crypt_stat->flags |= ECRYPTFS_ENCRYPTED_VIEW_ENABLED; break; + case ecryptfs_opt_fnek_sig: + fnek_src = args[0].from; + fnek_dst = + mount_crypt_stat->global_default_fnek_sig; + strncpy(fnek_dst, fnek_src, ECRYPTFS_SIG_SIZE_HEX); + mount_crypt_stat->global_default_fnek_sig[ + ECRYPTFS_SIG_SIZE_HEX] = '\0'; + rc = ecryptfs_add_global_auth_tok( + mount_crypt_stat, + mount_crypt_stat->global_default_fnek_sig); + if (rc) { + printk(KERN_ERR "Error attempting to register " + "global fnek sig [%s]; rc = [%d]\n", + mount_crypt_stat->global_default_fnek_sig, + rc); + goto out; + } + mount_crypt_stat->flags |= + (ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES + | ECRYPTFS_GLOBAL_ENCFN_USE_MOUNT_FNEK); + break; + case ecryptfs_opt_fn_cipher: + fn_cipher_name_src = args[0].from; + fn_cipher_name_dst = + mount_crypt_stat->global_default_fn_cipher_name; + strncpy(fn_cipher_name_dst, fn_cipher_name_src, + ECRYPTFS_MAX_CIPHER_NAME_SIZE); + mount_crypt_stat->global_default_fn_cipher_name[ + ECRYPTFS_MAX_CIPHER_NAME_SIZE] = '\0'; + fn_cipher_name_set = 1; + break; + case ecryptfs_opt_fn_cipher_key_bytes: + fn_cipher_key_bytes_src = args[0].from; + fn_cipher_key_bytes = + (int)simple_strtol(fn_cipher_key_bytes_src, + &fn_cipher_key_bytes_src, 0); + mount_crypt_stat->global_default_fn_cipher_key_bytes = + fn_cipher_key_bytes; + fn_cipher_key_bytes_set = 1; + break; case ecryptfs_opt_err: default: - ecryptfs_printk(KERN_WARNING, - "eCryptfs: unrecognized option '%s'\n", - p); + printk(KERN_WARNING + "%s: eCryptfs: unrecognized option [%s]\n", + __func__, p); } } if (!sig_set) { @@ -374,33 +419,60 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) int cipher_name_len = strlen(ECRYPTFS_DEFAULT_CIPHER); BUG_ON(cipher_name_len >= ECRYPTFS_MAX_CIPHER_NAME_SIZE); - strcpy(mount_crypt_stat->global_default_cipher_name, ECRYPTFS_DEFAULT_CIPHER); } - if (!cipher_key_bytes_set) { + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !fn_cipher_name_set) + strcpy(mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_cipher_name); + if (!cipher_key_bytes_set) mount_crypt_stat->global_default_cipher_key_size = 0; - } + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !fn_cipher_key_bytes_set) + mount_crypt_stat->global_default_fn_cipher_key_bytes = + mount_crypt_stat->global_default_cipher_key_size; mutex_lock(&key_tfm_list_mutex); if (!ecryptfs_tfm_exists(mount_crypt_stat->global_default_cipher_name, - NULL)) + NULL)) { rc = ecryptfs_add_new_key_tfm( NULL, mount_crypt_stat->global_default_cipher_name, mount_crypt_stat->global_default_cipher_key_size); - mutex_unlock(&key_tfm_list_mutex); - if (rc) { - printk(KERN_ERR "Error attempting to initialize cipher with " - "name = [%s] and key size = [%td]; rc = [%d]\n", - mount_crypt_stat->global_default_cipher_name, - mount_crypt_stat->global_default_cipher_key_size, rc); - rc = -EINVAL; - goto out; + if (rc) { + printk(KERN_ERR "Error attempting to initialize " + "cipher with name = [%s] and key size = [%td]; " + "rc = [%d]\n", + mount_crypt_stat->global_default_cipher_name, + mount_crypt_stat->global_default_cipher_key_size, + rc); + rc = -EINVAL; + mutex_unlock(&key_tfm_list_mutex); + goto out; + } } + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !ecryptfs_tfm_exists( + mount_crypt_stat->global_default_fn_cipher_name, NULL)) { + rc = ecryptfs_add_new_key_tfm( + NULL, mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes); + if (rc) { + printk(KERN_ERR "Error attempting to initialize " + "cipher with name = [%s] and key size = [%td]; " + "rc = [%d]\n", + mount_crypt_stat->global_default_fn_cipher_name, + mount_crypt_stat->global_default_fn_cipher_key_bytes, + rc); + rc = -EINVAL; + mutex_unlock(&key_tfm_list_mutex); + goto out; + } + } + mutex_unlock(&key_tfm_list_mutex); rc = ecryptfs_init_global_auth_toks(mount_crypt_stat); - if (rc) { + if (rc) printk(KERN_WARNING "One or more global auth toks could not " "properly register; rc = [%d]\n", rc); - } out: return rc; } diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 6913f727624..96ef51489e0 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -193,7 +193,7 @@ ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); if (!(*daemon)) { rc = -ENOMEM; - printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " + printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); goto out; } @@ -435,7 +435,7 @@ int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); if (!msg_ctx->msg) { rc = -ENOMEM; - printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " + printk(KERN_ERR "%s: Failed to allocate [%zd] bytes of " "GFP_KERNEL memory\n", __func__, msg_size); goto unlock; } diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index efd95a0ed1e..a67fea655f4 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -199,7 +199,7 @@ int ecryptfs_send_miscdev(char *data, size_t data_size, if (!msg_ctx->msg) { rc = -ENOMEM; printk(KERN_ERR "%s: Out of memory whilst attempting " - "to kmalloc(%Zd, GFP_KERNEL)\n", __func__, + "to kmalloc(%zd, GFP_KERNEL)\n", __func__, (sizeof(*msg_ctx->msg) + data_size)); goto out_unlock; } @@ -322,7 +322,7 @@ check_list: if (count < total_length) { rc = 0; printk(KERN_WARNING "%s: Only given user buffer of " - "size [%Zd], but we need [%Zd] to read the " + "size [%zd], but we need [%zd] to read the " "pending message\n", __func__, count, total_length); goto out_unlock_msg_ctx; } @@ -376,7 +376,7 @@ static int ecryptfs_miscdev_response(char *data, size_t data_size, if ((sizeof(*msg) + msg->data_len) != data_size) { printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = " - "[%Zd]; data_size = [%Zd]. Invalid packet.\n", __func__, + "[%zd]; data_size = [%zd]. Invalid packet.\n", __func__, (sizeof(*msg) + msg->data_len), data_size); rc = -EINVAL; goto out; @@ -421,7 +421,7 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, data = kmalloc(count, GFP_KERNEL); if (!data) { printk(KERN_ERR "%s: Out of memory whilst attempting to " - "kmalloc([%Zd], GFP_KERNEL)\n", __func__, count); + "kmalloc([%zd], GFP_KERNEL)\n", __func__, count); goto out; } rc = copy_from_user(data, buf, count); @@ -436,8 +436,8 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, case ECRYPTFS_MSG_RESPONSE: if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { printk(KERN_WARNING "%s: Minimum acceptable packet " - "size is [%Zd], but amount of data written is " - "only [%Zd]. Discarding response packet.\n", + "size is [%zd], but amount of data written is " + "only [%zd]. Discarding response packet.\n", __func__, (1 + 4 + 1 + sizeof(struct ecryptfs_message)), count); @@ -455,9 +455,9 @@ ecryptfs_miscdev_write(struct file *file, const char __user *buf, } i += packet_size_length; if ((1 + 4 + packet_size_length + packet_size) != count) { - printk(KERN_WARNING "%s: (1 + packet_size_length([%Zd])" - " + packet_size([%Zd]))([%Zd]) != " - "count([%Zd]). Invalid packet format.\n", + printk(KERN_WARNING "%s: (1 + packet_size_length([%zd])" + " + packet_size([%zd]))([%zd]) != " + "count([%zd]). Invalid packet format.\n", __func__, packet_size_length, packet_size, (1 + packet_size_length + packet_size), count); goto out_free; diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 04d7b3fa1ac..46cec2b6979 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -288,7 +288,7 @@ static int ecryptfs_write_begin(struct file *file, loff_t prev_page_end_size; int rc = 0; - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; diff --git a/fs/exec.c b/fs/exec.c index 3ef9cf9b187..71a6efe5d8b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -51,6 +51,7 @@ #include <linux/audit.h> #include <linux/tracehook.h> #include <linux/kmod.h> +#include <linux/fsnotify.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -132,6 +133,8 @@ asmlinkage long sys_uselib(const char __user * library) if (IS_ERR(file)) goto out; + fsnotify_open(file->f_path.dentry); + error = -ENOEXEC; if(file->f_op) { struct linux_binfmt * fmt; @@ -229,13 +232,13 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, static int __bprm_mm_init(struct linux_binprm *bprm) { - int err = -ENOMEM; + int err; struct vm_area_struct *vma = NULL; struct mm_struct *mm = bprm->mm; bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); if (!vma) - goto err; + return -ENOMEM; down_write(&mm->mmap_sem); vma->vm_mm = mm; @@ -248,28 +251,20 @@ static int __bprm_mm_init(struct linux_binprm *bprm) */ vma->vm_end = STACK_TOP_MAX; vma->vm_start = vma->vm_end - PAGE_SIZE; - vma->vm_flags = VM_STACK_FLAGS; vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); err = insert_vm_struct(mm, vma); - if (err) { - up_write(&mm->mmap_sem); + if (err) goto err; - } mm->stack_vm = mm->total_vm = 1; up_write(&mm->mmap_sem); - bprm->p = vma->vm_end - sizeof(void *); - return 0; - err: - if (vma) { - bprm->vma = NULL; - kmem_cache_free(vm_area_cachep, vma); - } - + up_write(&mm->mmap_sem); + bprm->vma = NULL; + kmem_cache_free(vm_area_cachep, vma); return err; } @@ -684,6 +679,8 @@ struct file *open_exec(const char *name) if (IS_ERR(file)) return file; + fsnotify_open(file->f_path.dentry); + err = deny_write_access(file); if (err) { fput(file); @@ -1689,7 +1686,7 @@ int get_dumpable(struct mm_struct *mm) return (ret >= 2) ? 2 : ret; } -int do_coredump(long signr, int exit_code, struct pt_regs * regs) +void do_coredump(long signr, int exit_code, struct pt_regs *regs) { struct core_state core_state; char corename[CORENAME_MAX_SIZE + 1]; @@ -1773,6 +1770,11 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) if (ispipe) { helper_argv = argv_split(GFP_KERNEL, corename+1, &helper_argc); + if (!helper_argv) { + printk(KERN_WARNING "%s failed to allocate memory\n", + __func__); + goto fail_unlock; + } /* Terminate the string before the first option */ delimit = strchr(corename, ' '); if (delimit) @@ -1840,5 +1842,5 @@ fail_unlock: put_cred(cred); coredump_finish(mm); fail: - return retval; + return; } diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index c454d5db28a..66321a877e7 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -565,12 +565,8 @@ got: inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; memset(ei->i_data, 0, sizeof(ei->i_data)); - ei->i_flags = EXT2_I(dir)->i_flags & ~EXT2_BTREE_FL; - if (S_ISLNK(mode)) - ei->i_flags &= ~(EXT2_IMMUTABLE_FL|EXT2_APPEND_FL); - /* dirsync is only applied to directories */ - if (!S_ISDIR(mode)) - ei->i_flags &= ~EXT2_DIRSYNC_FL; + ei->i_flags = + ext2_mask_flags(mode, EXT2_I(dir)->i_flags & EXT2_FL_INHERITED); ei->i_faddr = 0; ei->i_frag_no = 0; ei->i_frag_size = 0; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 02b39a5deb7..23fff2f8778 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -498,8 +498,6 @@ static int ext2_alloc_branch(struct inode *inode, * ext2_splice_branch - splice the allocated branch onto inode. * @inode: owner * @block: (logical) number of block we are adding - * @chain: chain of indirect blocks (with a missing link - see - * ext2_alloc_branch) * @where: location of missing link * @num: number of indirect blocks we are adding * @blks: number of direct blocks we are adding diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index de876fa793e..7cb4badef92 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -50,8 +50,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) goto setflags_out; } - if (!S_ISDIR(inode->i_mode)) - flags &= ~EXT2_DIRSYNC_FL; + flags = ext2_mask_flags(inode->i_mode, flags); mutex_lock(&inode->i_mutex); /* Is it quota file? Do not allow user to mess with it */ diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 647cd888ac8..da8bdeaa2e6 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -132,6 +132,7 @@ static void ext2_put_super (struct super_block * sb) percpu_counter_destroy(&sbi->s_dirs_counter); brelse (sbi->s_sbh); sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); return; @@ -756,6 +757,13 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + return -ENOMEM; + } sb->s_fs_info = sbi; sbi->s_sb_block = sb_block; @@ -983,7 +991,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) printk ("EXT2-fs: not enough memory\n"); goto failed_mount; } - bgl_lock_init(&sbi->s_blockgroup_lock); + bgl_lock_init(sbi->s_blockgroup_lock); sbi->s_debts = kcalloc(sbi->s_groups_count, sizeof(*sbi->s_debts), GFP_KERNEL); if (!sbi->s_debts) { printk ("EXT2-fs: not enough memory\n"); diff --git a/fs/ext3/hash.c b/fs/ext3/hash.c index c30e149fbd2..7d215b4d4f2 100644 --- a/fs/ext3/hash.c +++ b/fs/ext3/hash.c @@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) /* The old legacy hash */ -static __u32 dx_hack_hash (const char *name, int len) +static __u32 dx_hack_hash_unsigned(const char *name, int len) { - __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const unsigned char *ucp = (const unsigned char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static __u32 dx_hack_hash_signed(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const signed char *scp = (const signed char *) name; + while (len--) { - __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); - if (hash & 0x80000000) hash -= 0x7fffffff; + if (hash & 0x80000000) + hash -= 0x7fffffff; hash1 = hash0; hash0 = hash; } - return (hash0 << 1); + return hash0 << 1; } -static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) { __u32 pad, val; int i; + const signed char *scp = (const signed char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) scp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; +} + +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const unsigned char *ucp = (const unsigned char *) msg; pad = (__u32)len | ((__u32)len << 8); pad |= pad << 16; @@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) for (i=0; i < len; i++) { if ((i % 4) == 0) val = pad; - val = msg[i] + (val << 8); + val = ((int) ucp[i]) + (val << 8); if ((i % 4) == 3) { *buf++ = val; val = pad; @@ -95,6 +143,8 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) const char *p; int i; __u32 in[8], buf[4]; + void (*str2hashbuf)(const char *, int, __u32 *, int) = + str2hashbuf_signed; /* Initialize the default seed for the hash checksum functions */ buf[0] = 0x67452301; @@ -113,13 +163,18 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) } switch (hinfo->hash_version) { + case DX_HASH_LEGACY_UNSIGNED: + hash = dx_hack_hash_unsigned(name, len); + break; case DX_HASH_LEGACY: - hash = dx_hack_hash(name, len); + hash = dx_hack_hash_signed(name, len); break; + case DX_HASH_HALF_MD4_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; case DX_HASH_HALF_MD4: p = name; while (len > 0) { - str2hashbuf(p, len, in, 8); + (*str2hashbuf)(p, len, in, 8); half_md4_transform(buf, in); len -= 32; p += 32; @@ -127,10 +182,12 @@ int ext3fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) minor_hash = buf[2]; hash = buf[1]; break; + case DX_HASH_TEA_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; case DX_HASH_TEA: p = name; while (len > 0) { - str2hashbuf(p, len, in, 4); + (*str2hashbuf)(p, len, in, 4); TEA_transform(buf, in); len -= 16; p += 16; diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 5655fbcbd11..8de6c720e51 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -559,12 +559,8 @@ got: ei->i_dir_start_lookup = 0; ei->i_disksize = 0; - ei->i_flags = EXT3_I(dir)->i_flags & ~EXT3_INDEX_FL; - if (S_ISLNK(mode)) - ei->i_flags &= ~(EXT3_IMMUTABLE_FL|EXT3_APPEND_FL); - /* dirsync only applies to directories */ - if (!S_ISDIR(mode)) - ei->i_flags &= ~EXT3_DIRSYNC_FL; + ei->i_flags = + ext3_mask_flags(mode, EXT3_I(dir)->i_flags & EXT3_FL_INHERITED); #ifdef EXT3_FRAGMENTS ei->i_faddr = 0; ei->i_frag_no = 0; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index c4bdccf976b..5fa453b49a6 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1161,7 +1161,7 @@ static int ext3_write_begin(struct file *file, struct address_space *mapping, to = from + len; retry: - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; diff --git a/fs/ext3/ioctl.c b/fs/ext3/ioctl.c index b7394d05ee8..5e86ce9a86e 100644 --- a/fs/ext3/ioctl.c +++ b/fs/ext3/ioctl.c @@ -53,8 +53,7 @@ int ext3_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, goto flags_out; } - if (!S_ISDIR(inode->i_mode)) - flags &= ~EXT3_DIRSYNC_FL; + flags = ext3_mask_flags(inode->i_mode, flags); mutex_lock(&inode->i_mutex); /* Is it quota file? Do not allow user to mess with it */ diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index 297ea8dfac7..69a3d19ca9f 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -74,10 +74,6 @@ static struct buffer_head *ext3_append(handle_t *handle, #define assert(test) J_ASSERT(test) #endif -#ifndef swap -#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) -#endif - #ifdef DX_DEBUG #define dxtrace(command) command #else @@ -368,6 +364,8 @@ dx_probe(struct qstr *entry, struct inode *dir, goto fail; } hinfo->hash_version = root->info.hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT3_SB(dir->i_sb)->s_hash_seed; if (entry) ext3fs_dirhash(entry->name, entry->len, hinfo); @@ -636,6 +634,9 @@ int ext3_htree_fill_tree(struct file *dir_file, __u32 start_hash, dir = dir_file->f_path.dentry->d_inode; if (!(EXT3_I(dir)->i_flags & EXT3_INDEX_FL)) { hinfo.hash_version = EXT3_SB(dir->i_sb)->s_def_hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += + EXT3_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); @@ -1156,9 +1157,9 @@ static struct ext3_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, u32 hash2; struct dx_map_entry *map; char *data1 = (*bh)->b_data, *data2; - unsigned split, move, size, i; + unsigned split, move, size; struct ext3_dir_entry_2 *de = NULL, *de2; - int err = 0; + int err = 0, i; bh2 = ext3_append (handle, dir, &newblock, &err); if (!(bh2)) { @@ -1398,6 +1399,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, /* Initialize as for dx_probe */ hinfo.hash_version = root->info.hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT3_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT3_SB(dir->i_sb)->s_hash_seed; ext3fs_dirhash(name, namelen, &hinfo); frame = frames; @@ -2175,8 +2178,7 @@ retry: * We have a transaction open. All is sweetness. It also sets * i_size in generic_commit_write(). */ - err = __page_symlink(inode, symname, l, - mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + err = __page_symlink(inode, symname, l, 1); if (err) { drop_nlink(inode); unlock_new_inode(inode); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index f6c94f232ec..5d047a030a7 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -439,6 +439,7 @@ static void ext3_put_super (struct super_block * sb) ext3_blkdev_remove(sbi); } sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); return; } @@ -682,6 +683,26 @@ static struct dentry *ext3_fh_to_parent(struct super_block *sb, struct fid *fid, ext3_nfs_get_inode); } +/* + * Try to release metadata pages (indirect blocks, directories) which are + * mapped via the block device. Since these pages could have journal heads + * which would prevent try_to_free_buffers() from freeing them, we must use + * jbd layer's try_to_free_buffers() function to release them. + */ +static int bdev_try_to_free_page(struct super_block *sb, struct page *page, + gfp_t wait) +{ + journal_t *journal = EXT3_SB(sb)->s_journal; + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return journal_try_to_free_buffers(journal, page, + wait & ~__GFP_WAIT); + return try_to_free_buffers(page); +} + #ifdef CONFIG_QUOTA #define QTYPE2NAME(t) ((t)==USRQUOTA?"user":"group") #define QTYPE2MOPT(on, t) ((t)==USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) @@ -713,7 +734,9 @@ static struct dquot_operations ext3_quota_operations = { .acquire_dquot = ext3_acquire_dquot, .release_dquot = ext3_release_dquot, .mark_dirty = ext3_mark_dquot_dirty, - .write_info = ext3_write_info + .write_info = ext3_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, }; static struct quotactl_ops ext3_qctl_operations = { @@ -746,6 +769,7 @@ static const struct super_operations ext3_sops = { .quota_read = ext3_quota_read, .quota_write = ext3_quota_write, #endif + .bdev_try_to_free_page = bdev_try_to_free_page, }; static const struct export_operations ext3_export_ops = { @@ -1035,8 +1059,7 @@ static int parse_options (char *options, struct super_block *sb, case Opt_grpjquota: qtype = GRPQUOTA; set_qf_name: - if ((sb_any_quota_enabled(sb) || - sb_any_quota_suspended(sb)) && + if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { printk(KERN_ERR "EXT3-fs: Cannot change journaled " @@ -1075,8 +1098,7 @@ set_qf_name: case Opt_offgrpjquota: qtype = GRPQUOTA; clear_qf_name: - if ((sb_any_quota_enabled(sb) || - sb_any_quota_suspended(sb)) && + if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { printk(KERN_ERR "EXT3-fs: Cannot change " "journaled quota options when " @@ -1095,8 +1117,7 @@ clear_qf_name: case Opt_jqfmt_vfsv0: qfmt = QFMT_VFS_V0; set_qf_format: - if ((sb_any_quota_enabled(sb) || - sb_any_quota_suspended(sb)) && + if (sb_any_quota_loaded(sb) && sbi->s_jquota_fmt != qfmt) { printk(KERN_ERR "EXT3-fs: Cannot change " "journaled quota options when " @@ -1115,8 +1136,7 @@ set_qf_format: set_opt(sbi->s_mount_opt, GRPQUOTA); break; case Opt_noquota: - if (sb_any_quota_enabled(sb) || - sb_any_quota_suspended(sb)) { + if (sb_any_quota_loaded(sb)) { printk(KERN_ERR "EXT3-fs: Cannot change quota " "options when quota turned on.\n"); return 0; @@ -1548,6 +1568,13 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) return -ENOMEM; + + sbi->s_blockgroup_lock = + kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); + if (!sbi->s_blockgroup_lock) { + kfree(sbi); + return -ENOMEM; + } sb->s_fs_info = sbi; sbi->s_mount_opt = 0; sbi->s_resuid = EXT3_DEF_RESUID; @@ -1744,6 +1771,18 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) for (i=0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + sb->s_dirt = 1; + } if (sbi->s_blocks_per_group > blocksize * 8) { printk (KERN_ERR @@ -1788,7 +1827,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; } - bgl_lock_init(&sbi->s_blockgroup_lock); + bgl_lock_init(sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logic_sb_block, i); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 38b3acf5683..6bba06b09dd 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -20,6 +20,7 @@ #include "ext4.h" #include "ext4_jbd2.h" #include "group.h" +#include "mballoc.h" /* * balloc.c contains the blocks allocation and deallocation routines @@ -100,10 +101,10 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { ext4_error(sb, __func__, - "Checksum bad for group %lu\n", block_group); - gdp->bg_free_blocks_count = 0; - gdp->bg_free_inodes_count = 0; - gdp->bg_itable_unused = 0; + "Checksum bad for group %u", block_group); + ext4_free_blks_set(sb, gdp, 0); + ext4_free_inodes_set(sb, gdp, 0); + ext4_itable_unused_set(sb, gdp, 0); memset(bh->b_data, 0xff, sb->s_blocksize); return 0; } @@ -205,15 +206,15 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, ext4_group_t block_group, struct buffer_head **bh) { - unsigned long group_desc; - unsigned long offset; + unsigned int group_desc; + unsigned int offset; struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); if (block_group >= sbi->s_groups_count) { ext4_error(sb, "ext4_get_group_desc", "block_group >= groups_count - " - "block_group = %lu, groups_count = %lu", + "block_group = %u, groups_count = %u", block_group, sbi->s_groups_count); return NULL; @@ -225,7 +226,7 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, if (!sbi->s_group_desc[group_desc]) { ext4_error(sb, "ext4_get_group_desc", "Group descriptor not loaded - " - "block_group = %lu, group_desc = %lu, desc = %lu", + "block_group = %u, group_desc = %u, desc = %u", block_group, group_desc, offset); return NULL; } @@ -315,29 +316,50 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) if (unlikely(!bh)) { ext4_error(sb, __func__, "Cannot read block bitmap - " - "block_group = %lu, block_bitmap = %llu", + "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return NULL; } - if (buffer_uptodate(bh) && - !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) + + if (bitmap_uptodate(bh)) return bh; lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh, block_group, desc); + set_bitmap_uptodate(bh); set_buffer_uptodate(bh); - unlock_buffer(bh); spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + unlock_buffer(bh); return bh; } spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + return bh; + } + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. + */ + set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); ext4_error(sb, __func__, "Cannot read block bitmap - " - "block_group = %lu, block_bitmap = %llu", + "block_group = %u, block_bitmap = %llu", block_group, bitmap_blk); return NULL; } @@ -350,62 +372,44 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) } /** - * ext4_free_blocks_sb() -- Free given blocks and update quota + * ext4_add_groupblocks() -- Add given blocks to an existing group * @handle: handle to this transaction * @sb: super block - * @block: start physcial block to free + * @block: start physcial block to add to the block group * @count: number of blocks to free - * @pdquot_freed_blocks: pointer to quota * - * XXX This function is only used by the on-line resizing code, which - * should probably be fixed up to call the mballoc variant. There - * this needs to be cleaned up later; in fact, I'm not convinced this - * is 100% correct in the face of the mballoc code. The online resizing - * code needs to be fixed up to more tightly (and correctly) interlock - * with the mballoc code. + * This marks the blocks as free in the bitmap. We ask the + * mballoc to reload the buddy after this by setting group + * EXT4_GROUP_INFO_NEED_INIT_BIT flag */ -void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count, - unsigned long *pdquot_freed_blocks) +void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count) { struct buffer_head *bitmap_bh = NULL; struct buffer_head *gd_bh; ext4_group_t block_group; ext4_grpblk_t bit; - unsigned long i; - unsigned long overflow; + unsigned int i; struct ext4_group_desc *desc; struct ext4_super_block *es; struct ext4_sb_info *sbi; - int err = 0, ret; - ext4_grpblk_t group_freed; + int err = 0, ret, blk_free_count; + ext4_grpblk_t blocks_freed; + struct ext4_group_info *grp; - *pdquot_freed_blocks = 0; sbi = EXT4_SB(sb); es = sbi->s_es; - if (block < le32_to_cpu(es->s_first_data_block) || - block + count < block || - block + count > ext4_blocks_count(es)) { - ext4_error(sb, "ext4_free_blocks", - "Freeing blocks not in datazone - " - "block = %llu, count = %lu", block, count); - goto error_return; - } - - ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1); + ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); -do_more: - overflow = 0; ext4_get_group_no_and_offset(sb, block, &block_group, &bit); + grp = ext4_get_group_info(sb, block_group); /* * Check to see if we are freeing blocks across a group * boundary. */ if (bit + count > EXT4_BLOCKS_PER_GROUP(sb)) { - overflow = bit + count - EXT4_BLOCKS_PER_GROUP(sb); - count -= overflow; + goto error_return; } - brelse(bitmap_bh); bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) goto error_return; @@ -418,18 +422,17 @@ do_more: in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || in_range(block + count - 1, ext4_inode_table(sb, desc), sbi->s_itb_per_group)) { - ext4_error(sb, "ext4_free_blocks", - "Freeing blocks in system zones - " + ext4_error(sb, __func__, + "Adding blocks in system zones - " "Block = %llu, count = %lu", block, count); goto error_return; } /* - * We are about to start releasing blocks in the bitmap, + * We are about to add blocks to the bitmap, * so we need undo access. */ - /* @@@ check errors */ BUFFER_TRACE(bitmap_bh, "getting undo access"); err = ext4_journal_get_undo_access(handle, bitmap_bh); if (err) @@ -444,107 +447,55 @@ do_more: err = ext4_journal_get_write_access(handle, gd_bh); if (err) goto error_return; - - jbd_lock_bh_state(bitmap_bh); - - for (i = 0, group_freed = 0; i < count; i++) { - /* - * An HJ special. This is expensive... - */ -#ifdef CONFIG_JBD2_DEBUG - jbd_unlock_bh_state(bitmap_bh); - { - struct buffer_head *debug_bh; - debug_bh = sb_find_get_block(sb, block + i); - if (debug_bh) { - BUFFER_TRACE(debug_bh, "Deleted!"); - if (!bh2jh(bitmap_bh)->b_committed_data) - BUFFER_TRACE(debug_bh, - "No commited data in bitmap"); - BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap"); - __brelse(debug_bh); - } - } - jbd_lock_bh_state(bitmap_bh); -#endif - if (need_resched()) { - jbd_unlock_bh_state(bitmap_bh); - cond_resched(); - jbd_lock_bh_state(bitmap_bh); - } - /* @@@ This prevents newly-allocated data from being - * freed and then reallocated within the same - * transaction. - * - * Ideally we would want to allow that to happen, but to - * do so requires making jbd2_journal_forget() capable of - * revoking the queued write of a data block, which - * implies blocking on the journal lock. *forget() - * cannot block due to truncate races. - * - * Eventually we can fix this by making jbd2_journal_forget() - * return a status indicating whether or not it was able - * to revoke the buffer. On successful revoke, it is - * safe not to set the allocation bit in the committed - * bitmap, because we know that there is no outstanding - * activity on the buffer any more and so it is safe to - * reallocate it. - */ - BUFFER_TRACE(bitmap_bh, "set in b_committed_data"); - J_ASSERT_BH(bitmap_bh, - bh2jh(bitmap_bh)->b_committed_data != NULL); - ext4_set_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, - bh2jh(bitmap_bh)->b_committed_data); - - /* - * We clear the bit in the bitmap after setting the committed - * data bit, because this is the reverse order to that which - * the allocator uses. - */ + /* + * make sure we don't allow a parallel init on other groups in the + * same buddy cache + */ + down_write(&grp->alloc_sem); + for (i = 0, blocks_freed = 0; i < count; i++) { BUFFER_TRACE(bitmap_bh, "clear bit"); if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, bitmap_bh->b_data)) { - jbd_unlock_bh_state(bitmap_bh); ext4_error(sb, __func__, "bit already cleared for block %llu", (ext4_fsblk_t)(block + i)); - jbd_lock_bh_state(bitmap_bh); BUFFER_TRACE(bitmap_bh, "bit already cleared"); } else { - group_freed++; + blocks_freed++; } } - jbd_unlock_bh_state(bitmap_bh); - spin_lock(sb_bgl_lock(sbi, block_group)); - le16_add_cpu(&desc->bg_free_blocks_count, group_freed); + blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); + ext4_free_blks_set(sb, desc, blk_free_count); desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); spin_unlock(sb_bgl_lock(sbi, block_group)); - percpu_counter_add(&sbi->s_freeblocks_counter, count); + percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, block_group); spin_lock(sb_bgl_lock(sbi, flex_group)); - sbi->s_flex_groups[flex_group].free_blocks += count; + sbi->s_flex_groups[flex_group].free_blocks += blocks_freed; spin_unlock(sb_bgl_lock(sbi, flex_group)); } + /* + * request to reload the buddy with the + * new bitmap information + */ + set_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); + ext4_mb_update_group_info(grp, blocks_freed); + up_write(&grp->alloc_sem); /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_journal_dirty_metadata(handle, bitmap_bh); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); /* And the group descriptor block */ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_journal_dirty_metadata(handle, gd_bh); - if (!err) err = ret; - *pdquot_freed_blocks += group_freed; - - if (overflow && !err) { - block += count; - count = overflow; - goto do_more; - } + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); + if (!err) + err = ret; sb->s_dirt = 1; + error_return: brelse(bitmap_bh); ext4_std_error(sb, err); @@ -614,7 +565,7 @@ int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks) if (dirty_blocks < 0) { printk(KERN_CRIT "Dirty block accounting " "went wrong %lld\n", - dirty_blocks); + (long long)dirty_blocks); } } /* Check whether we have space after @@ -666,101 +617,45 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); } -#define EXT4_META_BLOCK 0x1 - -static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - unsigned long *count, int *errp, int flags) -{ - struct ext4_allocation_request ar; - ext4_fsblk_t ret; - - memset(&ar, 0, sizeof(ar)); - /* Fill with neighbour allocated blocks */ - - ar.inode = inode; - ar.goal = goal; - ar.len = *count; - ar.logical = iblock; - - if (S_ISREG(inode->i_mode) && !(flags & EXT4_META_BLOCK)) - /* enable in-core preallocation for data block allocation */ - ar.flags = EXT4_MB_HINT_DATA; - else - /* disable in-core preallocation for non-regular files */ - ar.flags = 0; - - ret = ext4_mb_new_blocks(handle, &ar, errp); - *count = ar.len; - return ret; -} - /* * ext4_new_meta_blocks() -- allocate block for meta data (indexing) blocks * * @handle: handle to this transaction * @inode: file inode * @goal: given target block(filesystem wide) - * @count: total number of blocks need + * @count: pointer to total number of blocks needed * @errp: error code * - * Return 1st allocated block numberon success, *count stores total account + * Return 1st allocated block number on success, *count stores total account * error stores in errp pointer */ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp) { + struct ext4_allocation_request ar; ext4_fsblk_t ret; - ret = do_blk_alloc(handle, inode, 0, goal, - count, errp, EXT4_META_BLOCK); + + memset(&ar, 0, sizeof(ar)); + /* Fill with neighbour allocated blocks */ + ar.inode = inode; + ar.goal = goal; + ar.len = count ? *count : 1; + + ret = ext4_mb_new_blocks(handle, &ar, errp); + if (count) + *count = ar.len; + /* * Account for the allocated meta blocks */ if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { spin_lock(&EXT4_I(inode)->i_block_reservation_lock); - EXT4_I(inode)->i_allocated_meta_blocks += *count; + EXT4_I(inode)->i_allocated_meta_blocks += ar.len; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); } return ret; } -/* - * ext4_new_meta_block() -- allocate block for meta data (indexing) blocks - * - * @handle: handle to this transaction - * @inode: file inode - * @goal: given target block(filesystem wide) - * @errp: error code - * - * Return allocated block number on success - */ -ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int *errp) -{ - unsigned long count = 1; - return ext4_new_meta_blocks(handle, inode, goal, &count, errp); -} - -/* - * ext4_new_blocks() -- allocate data blocks - * - * @handle: handle to this transaction - * @inode: file inode - * @goal: given target block(filesystem wide) - * @count: total number of blocks need - * @errp: error code - * - * Return 1st allocated block numberon success, *count stores total account - * error stores in errp pointer - */ - -ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - unsigned long *count, int *errp) -{ - return do_blk_alloc(handle, inode, iblock, goal, count, errp, 0); -} - /** * ext4_count_free_blocks() -- count filesystem free blocks * @sb: superblock @@ -776,7 +671,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) #ifdef EXT4FS_DEBUG struct ext4_super_block *es; ext4_fsblk_t bitmap_count; - unsigned long x; + unsigned int x; struct buffer_head *bitmap_bh = NULL; es = EXT4_SB(sb)->s_es; @@ -796,7 +691,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) continue; x = ext4_count_free(bitmap_bh, sb->s_blocksize); - printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", + printk(KERN_DEBUG "group %lu: stored = %d, counted = %u\n", i, le16_to_cpu(gdp->bg_free_blocks_count), x); bitmap_count += x; } @@ -812,7 +707,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += le16_to_cpu(gdp->bg_free_blocks_count); + desc_count += ext4_free_blks_count(sb, gdp); } return desc_count; diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 0a7a6663c19..fa3af81ac56 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -15,10 +15,9 @@ static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; -unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars) +unsigned int ext4_count_free(struct buffer_head *map, unsigned int numchars) { - unsigned int i; - unsigned long sum = 0; + unsigned int i, sum = 0; if (!map) return 0; diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index fed5b610df5..2df2e40b01a 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -64,7 +64,7 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) int ext4_check_dir_entry(const char *function, struct inode *dir, struct ext4_dir_entry_2 *de, struct buffer_head *bh, - unsigned long offset) + unsigned int offset) { const char *error_msg = NULL; const int rlen = ext4_rec_len_from_disk(de->rec_len); @@ -84,9 +84,9 @@ int ext4_check_dir_entry(const char *function, struct inode *dir, if (error_msg != NULL) ext4_error(dir->i_sb, function, "bad entry in directory #%lu: %s - " - "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + "offset=%u, inode=%u, rec_len=%d, name_len=%d", dir->i_ino, error_msg, offset, - (unsigned long) le32_to_cpu(de->inode), + le32_to_cpu(de->inode), rlen, de->name_len); return error_msg == NULL ? 1 : 0; } @@ -95,7 +95,7 @@ static int ext4_readdir(struct file *filp, void *dirent, filldir_t filldir) { int error = 0; - unsigned long offset; + unsigned int offset; int i, stored; struct ext4_dir_entry_2 *de; struct super_block *sb; @@ -405,7 +405,7 @@ static int call_filldir(struct file *filp, void *dirent, sb = inode->i_sb; if (!fname) { - printk(KERN_ERR "ext4: call_filldir: called with " + printk(KERN_ERR "EXT4-fs: call_filldir: called with " "null fname?!?\n"); return 0; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b0537c82702..c668e4377d7 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -19,6 +19,7 @@ #include <linux/types.h> #include <linux/blkdev.h> #include <linux/magic.h> +#include <linux/jbd2.h> #include "ext4_i.h" /* @@ -94,9 +95,9 @@ struct ext4_allocation_request { /* phys. block for ^^^ */ ext4_fsblk_t pright; /* how many blocks we want to allocate */ - unsigned long len; + unsigned int len; /* flags. see above EXT4_MB_HINT_* */ - unsigned long flags; + unsigned int flags; }; /* @@ -156,12 +157,12 @@ struct ext4_group_desc __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ __le32 bg_inode_table_lo; /* Inodes table block */ - __le16 bg_free_blocks_count; /* Free blocks count */ - __le16 bg_free_inodes_count; /* Free inodes count */ - __le16 bg_used_dirs_count; /* Directories count */ + __le16 bg_free_blocks_count_lo;/* Free blocks count */ + __le16 bg_free_inodes_count_lo;/* Free inodes count */ + __le16 bg_used_dirs_count_lo; /* Directories count */ __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ - __le16 bg_itable_unused; /* Unused inodes count */ + __le16 bg_itable_unused_lo; /* Unused inodes count */ __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ @@ -169,7 +170,7 @@ struct ext4_group_desc __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ __le16 bg_used_dirs_count_hi; /* Directories count MSB */ - __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ __u32 bg_reserved2[3]; }; @@ -328,6 +329,7 @@ struct ext4_mount_options { uid_t s_resuid; gid_t s_resgid; unsigned long s_commit_interval; + u32 s_min_batch_time, s_max_batch_time; #ifdef CONFIG_QUOTA int s_jquota_fmt; char *s_qf_names[MAXQUOTAS]; @@ -534,7 +536,6 @@ do { \ #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ -#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ @@ -726,11 +727,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) */ #define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ - (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) + ((EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) != 0) #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ - (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) + ((EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) != 0) #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ - (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) + ((EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) != 0) #define EXT4_SET_COMPAT_FEATURE(sb,mask) \ EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) #define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ @@ -806,6 +807,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) #define EXT4_DEFM_JMODE_WBACK 0x0060 /* + * Default journal batch times + */ +#define EXT4_DEF_MIN_BATCH_TIME 0 +#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ + +/* * Structure of a directory entry */ #define EXT4_NAME_LEN 255 @@ -891,6 +898,9 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len) #define DX_HASH_LEGACY 0 #define DX_HASH_HALF_MD4 1 #define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 #ifdef __KERNEL__ @@ -955,7 +965,7 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) #define ERR_BAD_DX_DIR -75000 void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, - unsigned long *blockgrpp, ext4_grpblk_t *offsetp); + ext4_group_t *blockgrpp, ext4_grpblk_t *offsetp); extern struct proc_dir_entry *ext4_proc_root; @@ -987,6 +997,9 @@ do { \ # define ATTRIB_NORET __attribute__((noreturn)) # define NORET_AND noreturn, +/* bitmap.c */ +extern unsigned int ext4_count_free(struct buffer_head *, unsigned); + /* balloc.c */ extern unsigned int ext4_block_group(struct super_block *sb, ext4_fsblk_t blocknr); @@ -995,20 +1008,14 @@ extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); extern unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group); -extern ext4_fsblk_t ext4_new_meta_block(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, int *errp); extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, unsigned long *count, int *errp); -extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, ext4_fsblk_t goal, - unsigned long *count, int *errp); extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); extern int ext4_has_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); extern void ext4_free_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int metadata); -extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count, - unsigned long *pdquot_freed_blocks); +extern void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count); extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, @@ -1019,7 +1026,7 @@ extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); /* dir.c */ extern int ext4_check_dir_entry(const char *, struct inode *, struct ext4_dir_entry_2 *, - struct buffer_head *, unsigned long); + struct buffer_head *, unsigned int); extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); @@ -1039,7 +1046,6 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); extern void ext4_check_inodes_bitmap(struct super_block *); -extern unsigned long ext4_count_free(struct buffer_head *, unsigned); /* mballoc.c */ extern long ext4_mb_stats; @@ -1054,12 +1060,13 @@ extern int __init init_ext4_mballoc(void); extern void exit_ext4_mballoc(void); extern void ext4_mb_free_blocks(handle_t *, struct inode *, unsigned long, unsigned long, int, unsigned long *); -extern int ext4_mb_add_more_groupinfo(struct super_block *sb, +extern int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t i, struct ext4_group_desc *desc); extern void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add); - - +extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t); +extern void ext4_mb_put_buddy_cache_lock(struct super_block *, + ext4_group_t, int); /* inode.c */ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr); @@ -1069,10 +1076,6 @@ struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int, int *); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, unsigned long maxblocks, - struct buffer_head *bh_result, - int create, int extend_disksize); extern struct inode *ext4_iget(struct super_block *, unsigned long); extern int ext4_write_inode(struct inode *, int); @@ -1123,6 +1126,9 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); extern void ext4_warning(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); +extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, + const char *, const char *, ...) + __attribute__ ((format (printf, 4, 5))); extern void ext4_update_dynamic_rev(struct super_block *sb); extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, __u32 compat); @@ -1136,12 +1142,28 @@ extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, struct ext4_group_desc *bg); extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, struct ext4_group_desc *bg); +extern __u32 ext4_free_blks_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg); +extern __u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg); extern void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); extern void ext4_inode_table_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_free_blks_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); +extern void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count); static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { @@ -1225,11 +1247,11 @@ do { \ } while (0) #ifdef CONFIG_SMP -/* Each CPU can accumulate FBC_BATCH blocks in their local +/* Each CPU can accumulate percpu_counter_batch blocks in their local * counters. So we need to make sure we have free blocks more - * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times. + * than percpu_counter_batch * nr_cpu_ids. Also add a window of 4 times. */ -#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids)) +#define EXT4_FREEBLOCKS_WATERMARK (4 * (percpu_counter_batch * nr_cpu_ids)) #else #define EXT4_FREEBLOCKS_WATERMARK 0 #endif @@ -1246,6 +1268,50 @@ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) return ; } +struct ext4_group_info { + unsigned long bb_state; + struct rb_root bb_free_root; + unsigned short bb_first_free; + unsigned short bb_free; + unsigned short bb_fragments; + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + struct rw_semaphore alloc_sem; + unsigned short bb_counters[]; +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_LOCKED_BIT 1 + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + + bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + + bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); +} + +static inline int ext4_is_group_locked(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + + return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, + &(grinfo->bb_state)); +} + /* * Inodes and files operations */ @@ -1271,18 +1337,38 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk); extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, - unsigned long max_blocks, struct buffer_head *bh_result, - int create, int extend_disksize); + ext4_lblk_t iblock, unsigned int max_blocks, + struct buffer_head *bh_result, + int create, int extend_disksize); extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len); extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, - sector_t block, unsigned long max_blocks, + sector_t block, unsigned int max_blocks, struct buffer_head *bh, int create, int extend_disksize, int flag); +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); + +/* + * Add new method to test wether block and inode bitmaps are properly + * initialized. With uninit_bg reading the block from disk is not enough + * to mark the bitmap uptodate. We need to also zero-out the bitmap + */ +#define BH_BITMAP_UPTODATE BH_JBDPrivateStart + +static inline int bitmap_uptodate(struct buffer_head *bh) +{ + return (buffer_uptodate(bh) && + test_bit(BH_BITMAP_UPTODATE, &(bh)->b_state)); +} +static inline void set_bitmap_uptodate(struct buffer_head *bh) +{ + set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); +} + #endif /* __KERNEL__ */ #endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index bec7ce59fc0..18cb67b2cbb 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -194,11 +194,6 @@ static inline unsigned short ext_depth(struct inode *inode) return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); } -static inline void ext4_ext_tree_changed(struct inode *inode) -{ - EXT4_I(inode)->i_ext_generation++; -} - static inline void ext4_ext_invalidate_cache(struct inode *inode) { diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index 5c124c0ac6d..e69acc16f5c 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h @@ -31,7 +31,7 @@ typedef unsigned long long ext4_fsblk_t; typedef __u32 ext4_lblk_t; /* data type for block group number */ -typedef unsigned long ext4_group_t; +typedef unsigned int ext4_group_t; #define rsv_start rsv_window._rsv_start #define rsv_end rsv_window._rsv_end @@ -100,9 +100,6 @@ struct ext4_inode_info { */ loff_t i_disksize; - /* on-disk additional length */ - __u16 i_extra_isize; - /* * i_data_sem is for serialising ext4_truncate() against * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's @@ -117,7 +114,6 @@ struct ext4_inode_info { struct inode vfs_inode; struct jbd2_inode jinode; - unsigned long i_ext_generation; struct ext4_ext_cache i_cached_extent; /* * File creation time. Its function is same as that of @@ -130,10 +126,14 @@ struct ext4_inode_info { spinlock_t i_prealloc_lock; /* allocation reservation info for delalloc */ - unsigned long i_reserved_data_blocks; - unsigned long i_reserved_meta_blocks; - unsigned long i_allocated_meta_blocks; + unsigned int i_reserved_data_blocks; + unsigned int i_reserved_meta_blocks; + unsigned int i_allocated_meta_blocks; unsigned short i_delalloc_reserved_flag; + + /* on-disk additional length */ + __u16 i_extra_isize; + spinlock_t i_block_reservation_lock; }; diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index c75384b34f2..ad13a84644e 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -7,53 +7,96 @@ int __ext4_journal_get_undo_access(const char *where, handle_t *handle, struct buffer_head *bh) { - int err = jbd2_journal_get_undo_access(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, handle, err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_undo_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } return err; } int __ext4_journal_get_write_access(const char *where, handle_t *handle, struct buffer_head *bh) { - int err = jbd2_journal_get_write_access(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, handle, err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_write_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } return err; } int __ext4_journal_forget(const char *where, handle_t *handle, struct buffer_head *bh) { - int err = jbd2_journal_forget(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, handle, err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_forget(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } return err; } int __ext4_journal_revoke(const char *where, handle_t *handle, ext4_fsblk_t blocknr, struct buffer_head *bh) { - int err = jbd2_journal_revoke(handle, blocknr, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, handle, err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_revoke(handle, blocknr, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } return err; } int __ext4_journal_get_create_access(const char *where, handle_t *handle, struct buffer_head *bh) { - int err = jbd2_journal_get_create_access(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, handle, err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_get_create_access(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } return err; } -int __ext4_journal_dirty_metadata(const char *where, - handle_t *handle, struct buffer_head *bh) +int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, + struct inode *inode, struct buffer_head *bh) { - int err = jbd2_journal_dirty_metadata(handle, bh); - if (err) - ext4_journal_abort_handle(where, __func__, bh, handle, err); + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_dirty_metadata(handle, bh); + if (err) + ext4_journal_abort_handle(where, __func__, bh, + handle, err); + } else { + mark_buffer_dirty(bh); + if (inode && inode_needs_sync(inode)) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + ext4_error(inode->i_sb, __func__, + "IO error syncing inode, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long) bh->b_blocknr); + err = -EIO; + } + } + } return err; } diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index b455c685a98..be2f426f680 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -32,8 +32,8 @@ * 5 levels of tree + root which are stored in the inode. */ #define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ - (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ - || test_opt(sb, EXTENTS) ? 27U : 8U) + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ + ? 27U : 8U) /* Extended attribute operations touch at most two data buffers, * two bitmap buffers, and two group summaries, in addition to the inode @@ -122,12 +122,6 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); * been done yet. */ -static inline void ext4_journal_release_buffer(handle_t *handle, - struct buffer_head *bh) -{ - jbd2_journal_release_buffer(handle, bh); -} - void ext4_journal_abort_handle(const char *caller, const char *err_fn, struct buffer_head *bh, handle_t *handle, int err); @@ -146,8 +140,8 @@ int __ext4_journal_revoke(const char *where, handle_t *handle, int __ext4_journal_get_create_access(const char *where, handle_t *handle, struct buffer_head *bh); -int __ext4_journal_dirty_metadata(const char *where, - handle_t *handle, struct buffer_head *bh); +int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, + struct inode *inode, struct buffer_head *bh); #define ext4_journal_get_undo_access(handle, bh) \ __ext4_journal_get_undo_access(__func__, (handle), (bh)) @@ -157,14 +151,57 @@ int __ext4_journal_dirty_metadata(const char *where, __ext4_journal_revoke(__func__, (handle), (blocknr), (bh)) #define ext4_journal_get_create_access(handle, bh) \ __ext4_journal_get_create_access(__func__, (handle), (bh)) -#define ext4_journal_dirty_metadata(handle, bh) \ - __ext4_journal_dirty_metadata(__func__, (handle), (bh)) #define ext4_journal_forget(handle, bh) \ __ext4_journal_forget(__func__, (handle), (bh)) +#define ext4_handle_dirty_metadata(handle, inode, bh) \ + __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); int __ext4_journal_stop(const char *where, handle_t *handle); +#define EXT4_NOJOURNAL_HANDLE ((handle_t *) 0x1) + +static inline int ext4_handle_valid(handle_t *handle) +{ + if (handle == EXT4_NOJOURNAL_HANDLE) + return 0; + return 1; +} + +static inline void ext4_handle_sync(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + handle->h_sync = 1; +} + +static inline void ext4_handle_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + if (ext4_handle_valid(handle)) + jbd2_journal_release_buffer(handle, bh); +} + +static inline int ext4_handle_is_aborted(handle_t *handle) +{ + if (ext4_handle_valid(handle)) + return is_handle_aborted(handle); + return 0; +} + +static inline int ext4_handle_has_enough_credits(handle_t *handle, int needed) +{ + if (ext4_handle_valid(handle) && handle->h_buffer_credits < needed) + return 0; + return 1; +} + +static inline void ext4_journal_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + if (ext4_handle_valid(handle)) + jbd2_journal_release_buffer(handle, bh); +} + static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) { return ext4_journal_start_sb(inode->i_sb, nblocks); @@ -180,27 +217,37 @@ static inline handle_t *ext4_journal_current_handle(void) static inline int ext4_journal_extend(handle_t *handle, int nblocks) { - return jbd2_journal_extend(handle, nblocks); + if (ext4_handle_valid(handle)) + return jbd2_journal_extend(handle, nblocks); + return 0; } static inline int ext4_journal_restart(handle_t *handle, int nblocks) { - return jbd2_journal_restart(handle, nblocks); + if (ext4_handle_valid(handle)) + return jbd2_journal_restart(handle, nblocks); + return 0; } static inline int ext4_journal_blocks_per_page(struct inode *inode) { - return jbd2_journal_blocks_per_page(inode); + if (EXT4_JOURNAL(inode) != NULL) + return jbd2_journal_blocks_per_page(inode); + return 0; } static inline int ext4_journal_force_commit(journal_t *journal) { - return jbd2_journal_force_commit(journal); + if (journal) + return jbd2_journal_force_commit(journal); + return 0; } static inline int ext4_jbd2_file_inode(handle_t *handle, struct inode *inode) { - return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); + if (ext4_handle_valid(handle)) + return jbd2_journal_file_inode(handle, &EXT4_I(inode)->jinode); + return 0; } /* super.c */ @@ -208,6 +255,8 @@ int ext4_force_commit(struct super_block *sb); static inline int ext4_should_journal_data(struct inode *inode) { + if (EXT4_JOURNAL(inode) == NULL) + return 0; if (!S_ISREG(inode->i_mode)) return 1; if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) @@ -219,6 +268,8 @@ static inline int ext4_should_journal_data(struct inode *inode) static inline int ext4_should_order_data(struct inode *inode) { + if (EXT4_JOURNAL(inode) == NULL) + return 0; if (!S_ISREG(inode->i_mode)) return 0; if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) @@ -230,6 +281,8 @@ static inline int ext4_should_order_data(struct inode *inode) static inline int ext4_should_writeback_data(struct inode *inode) { + if (EXT4_JOURNAL(inode) == NULL) + return 0; if (!S_ISREG(inode->i_mode)) return 0; if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 445fde603df..039b6ea1a04 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -57,6 +57,7 @@ struct ext4_sb_info { u32 s_next_generation; u32 s_hash_seed[4]; int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ struct percpu_counter s_freeblocks_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; @@ -73,6 +74,8 @@ struct ext4_sb_info { struct journal_s *s_journal; struct list_head s_orphan; unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; struct block_device *journal_bdev; #ifdef CONFIG_JBD2_DEBUG struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ @@ -101,7 +104,8 @@ struct ext4_sb_info { spinlock_t s_reserve_lock; spinlock_t s_md_lock; tid_t s_last_transaction; - unsigned short *s_mb_offsets, *s_mb_maxs; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; /* tunables */ unsigned long s_stripe; @@ -146,4 +150,10 @@ struct ext4_sb_info { struct flex_groups *s_flex_groups; }; +static inline spinlock_t * +sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) +{ + return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group); +} + #endif /* _EXT4_SB */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ea2ce3c0ae6..54bf0623a9a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -97,6 +97,8 @@ static int ext4_ext_journal_restart(handle_t *handle, int needed) { int err; + if (!ext4_handle_valid(handle)) + return 0; if (handle->h_buffer_credits > needed) return 0; err = ext4_journal_extend(handle, needed); @@ -134,7 +136,7 @@ static int ext4_ext_dirty(handle_t *handle, struct inode *inode, int err; if (path->p_bh) { /* path points to block */ - err = ext4_journal_dirty_metadata(handle, path->p_bh); + err = ext4_handle_dirty_metadata(handle, inode, path->p_bh); } else { /* path points to leaf/index in inode body */ err = ext4_mark_inode_dirty(handle, inode); @@ -191,7 +193,7 @@ ext4_ext_new_meta_block(handle_t *handle, struct inode *inode, ext4_fsblk_t goal, newblock; goal = ext4_ext_find_goal(inode, path, le32_to_cpu(ex->ee_block)); - newblock = ext4_new_meta_block(handle, inode, goal, err); + newblock = ext4_new_meta_blocks(handle, inode, goal, NULL, err); return newblock; } @@ -780,7 +782,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, set_buffer_uptodate(bh); unlock_buffer(bh); - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); @@ -859,7 +861,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, set_buffer_uptodate(bh); unlock_buffer(bh); - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto cleanup; brelse(bh); @@ -955,7 +957,7 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, set_buffer_uptodate(bh); unlock_buffer(bh); - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto out; @@ -1160,15 +1162,13 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, while (--depth >= 0) { ix = path[depth].p_idx; if (ix != EXT_LAST_INDEX(path[depth].p_hdr)) - break; + goto got_index; } - if (depth < 0) { - /* we've gone up to the root and - * found no index to the right */ - return 0; - } + /* we've gone up to the root and found no index to the right */ + return 0; +got_index: /* we've found index to the right, let's * follow it and find the closest allocated * block to the right */ @@ -1201,7 +1201,6 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, *phys = ext_pblock(ex); put_bh(bh); return 0; - } /* @@ -1622,7 +1621,6 @@ cleanup: ext4_ext_drop_refs(npath); kfree(npath); } - ext4_ext_tree_changed(inode); ext4_ext_invalidate_cache(inode); return err; } @@ -2233,7 +2231,6 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) } } out: - ext4_ext_tree_changed(inode); ext4_ext_drop_refs(path); kfree(path); ext4_journal_stop(handle); @@ -2250,7 +2247,7 @@ void ext4_ext_init(struct super_block *sb) * possible initialization would be here */ - if (test_opt(sb, EXTENTS)) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { printk(KERN_INFO "EXT4-fs: file extents enabled"); #ifdef AGGRESSIVE_TEST printk(", aggressive tests"); @@ -2275,7 +2272,7 @@ void ext4_ext_init(struct super_block *sb) */ void ext4_ext_release(struct super_block *sb) { - if (!test_opt(sb, EXTENTS)) + if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) return; #ifdef EXTENTS_STATS @@ -2380,7 +2377,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t iblock, - unsigned long max_blocks) + unsigned int max_blocks) { struct ext4_extent *ex, newex, orig_ex; struct ext4_extent *ex1 = NULL; @@ -2536,7 +2533,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, */ newdepth = ext_depth(inode); /* - * update the extent length after successfull insert of the + * update the extent length after successful insert of the * split extent */ orig_ex.ee_len = cpu_to_le16(ee_len - @@ -2678,26 +2675,26 @@ fix_extent_len: */ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, - unsigned long max_blocks, struct buffer_head *bh_result, + unsigned int max_blocks, struct buffer_head *bh_result, int create, int extend_disksize) { struct ext4_ext_path *path = NULL; struct ext4_extent_header *eh; struct ext4_extent newex, *ex; - ext4_fsblk_t goal, newblock; - int err = 0, depth, ret; - unsigned long allocated = 0; + ext4_fsblk_t newblock; + int err = 0, depth, ret, cache_type; + unsigned int allocated = 0; struct ext4_allocation_request ar; loff_t disksize; __clear_bit(BH_New, &bh_result->b_state); - ext_debug("blocks %u/%lu requested for inode %u\n", + ext_debug("blocks %u/%u requested for inode %u\n", iblock, max_blocks, inode->i_ino); /* check in cache */ - goal = ext4_ext_in_cache(inode, iblock, &newex); - if (goal) { - if (goal == EXT4_EXT_CACHE_GAP) { + cache_type = ext4_ext_in_cache(inode, iblock, &newex); + if (cache_type) { + if (cache_type == EXT4_EXT_CACHE_GAP) { if (!create) { /* * block isn't allocated yet and @@ -2706,7 +2703,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, goto out2; } /* we should allocate requested block */ - } else if (goal == EXT4_EXT_CACHE_EXTENT) { + } else if (cache_type == EXT4_EXT_CACHE_EXTENT) { /* block is already allocated */ newblock = iblock - le32_to_cpu(newex.ee_block) @@ -2854,7 +2851,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, if (!newblock) goto out2; ext_debug("allocate new block: goal %llu, found %llu/%lu\n", - goal, newblock, allocated); + ar.goal, newblock, allocated); /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock); @@ -2950,7 +2947,7 @@ void ext4_ext_truncate(struct inode *inode) * transaction synchronous. */ if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); out_stop: up_write(&EXT4_I(inode)->i_data_sem); @@ -3004,7 +3001,7 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) handle_t *handle; ext4_lblk_t block; loff_t new_size; - unsigned long max_blocks; + unsigned int max_blocks; int ret = 0; int ret2 = 0; int retries = 0; @@ -3083,7 +3080,7 @@ retry: /* * Callback function called for each extent to gather FIEMAP information. */ -int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, +static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, struct ext4_ext_cache *newex, struct ext4_extent *ex, void *data) { @@ -3152,7 +3149,8 @@ int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, /* fiemap flags we can handle specified here */ #define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) -int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) +static int ext4_xattr_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo) { __u64 physical = 0; __u64 length; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 6bd11fba71f..f731cb545a0 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -140,9 +140,6 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len); - const struct file_operations ext4_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 556ca8eba3d..ac8f168c8ab 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -35,23 +35,71 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) /* The old legacy hash */ -static __u32 dx_hack_hash(const char *name, int len) +static __u32 dx_hack_hash_unsigned(const char *name, int len) { - __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const unsigned char *ucp = (const unsigned char *) name; + + while (len--) { + hash = hash1 + (hash0 ^ (((int) *ucp++) * 7152373)); + + if (hash & 0x80000000) + hash -= 0x7fffffff; + hash1 = hash0; + hash0 = hash; + } + return hash0 << 1; +} + +static __u32 dx_hack_hash_signed(const char *name, int len) +{ + __u32 hash, hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; + const signed char *scp = (const signed char *) name; + while (len--) { - __u32 hash = hash1 + (hash0 ^ (*name++ * 7152373)); + hash = hash1 + (hash0 ^ (((int) *scp++) * 7152373)); - if (hash & 0x80000000) hash -= 0x7fffffff; + if (hash & 0x80000000) + hash -= 0x7fffffff; hash1 = hash0; hash0 = hash; } - return (hash0 << 1); + return hash0 << 1; +} + +static void str2hashbuf_signed(const char *msg, int len, __u32 *buf, int num) +{ + __u32 pad, val; + int i; + const signed char *scp = (const signed char *) msg; + + pad = (__u32)len | ((__u32)len << 8); + pad |= pad << 16; + + val = pad; + if (len > num*4) + len = num * 4; + for (i = 0; i < len; i++) { + if ((i % 4) == 0) + val = pad; + val = ((int) scp[i]) + (val << 8); + if ((i % 4) == 3) { + *buf++ = val; + val = pad; + num--; + } + } + if (--num >= 0) + *buf++ = val; + while (--num >= 0) + *buf++ = pad; } -static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) +static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) { __u32 pad, val; int i; + const unsigned char *ucp = (const unsigned char *) msg; pad = (__u32)len | ((__u32)len << 8); pad |= pad << 16; @@ -62,7 +110,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) for (i = 0; i < len; i++) { if ((i % 4) == 0) val = pad; - val = msg[i] + (val << 8); + val = ((int) ucp[i]) + (val << 8); if ((i % 4) == 3) { *buf++ = val; val = pad; @@ -95,6 +143,8 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) const char *p; int i; __u32 in[8], buf[4]; + void (*str2hashbuf)(const char *, int, __u32 *, int) = + str2hashbuf_signed; /* Initialize the default seed for the hash checksum functions */ buf[0] = 0x67452301; @@ -113,13 +163,18 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) } switch (hinfo->hash_version) { + case DX_HASH_LEGACY_UNSIGNED: + hash = dx_hack_hash_unsigned(name, len); + break; case DX_HASH_LEGACY: - hash = dx_hack_hash(name, len); + hash = dx_hack_hash_signed(name, len); break; + case DX_HASH_HALF_MD4_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; case DX_HASH_HALF_MD4: p = name; while (len > 0) { - str2hashbuf(p, len, in, 8); + (*str2hashbuf)(p, len, in, 8); half_md4_transform(buf, in); len -= 32; p += 32; @@ -127,10 +182,12 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) minor_hash = buf[2]; hash = buf[1]; break; + case DX_HASH_TEA_UNSIGNED: + str2hashbuf = str2hashbuf_unsigned; case DX_HASH_TEA: p = name; while (len > 0) { - str2hashbuf(p, len, in, 4); + (*str2hashbuf)(p, len, in, 4); TEA_transform(buf, in); len -= 16; p += 16; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 6e6052879aa..4fb86a0061d 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -74,17 +74,17 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, __func__, "Checksum bad for group %lu\n", + ext4_error(sb, __func__, "Checksum bad for group %u", block_group); - gdp->bg_free_blocks_count = 0; - gdp->bg_free_inodes_count = 0; - gdp->bg_itable_unused = 0; + ext4_free_blks_set(sb, gdp, 0); + ext4_free_inodes_set(sb, gdp, 0); + ext4_itable_unused_set(sb, gdp, 0); memset(bh->b_data, 0xff, sb->s_blocksize); return 0; } memset(bh->b_data, 0, (EXT4_INODES_PER_GROUP(sb) + 7) / 8); - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); return EXT4_INODES_PER_GROUP(sb); @@ -111,29 +111,49 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) if (unlikely(!bh)) { ext4_error(sb, __func__, "Cannot read inode bitmap - " - "block_group = %lu, inode_bitmap = %llu", + "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); return NULL; } - if (buffer_uptodate(bh) && - !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) + if (bitmap_uptodate(bh)) return bh; lock_buffer(bh); + if (bitmap_uptodate(bh)) { + unlock_buffer(bh); + return bh; + } spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { ext4_init_inode_bitmap(sb, bh, block_group, desc); + set_bitmap_uptodate(bh); set_buffer_uptodate(bh); - unlock_buffer(bh); spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + unlock_buffer(bh); return bh; } spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + if (buffer_uptodate(bh)) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh); + unlock_buffer(bh); + return bh; + } + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. + */ + set_bitmap_uptodate(bh); if (bh_submit_read(bh) < 0) { put_bh(bh); ext4_error(sb, __func__, "Cannot read inode bitmap - " - "block_group = %lu, inode_bitmap = %llu", + "block_group = %u, inode_bitmap = %llu", block_group, bitmap_blk); return NULL; } @@ -168,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_group_desc *gdp; struct ext4_super_block *es; struct ext4_sb_info *sbi; - int fatal = 0, err; + int fatal = 0, err, count; ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { @@ -190,6 +210,11 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) ino = inode->i_ino; ext4_debug("freeing inode %lu\n", ino); + trace_mark(ext4_free_inode, + "dev %s ino %lu mode %d uid %lu gid %lu bocks %llu", + sb->s_id, inode->i_ino, inode->i_mode, + (unsigned long) inode->i_uid, (unsigned long) inode->i_gid, + (unsigned long long) inode->i_blocks); /* * Note: we must free any quota before locking the superblock, @@ -236,9 +261,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (gdp) { spin_lock(sb_bgl_lock(sbi, block_group)); - le16_add_cpu(&gdp->bg_free_inodes_count, 1); - if (is_directory) - le16_add_cpu(&gdp->bg_used_dirs_count, -1); + count = ext4_free_inodes_count(sb, gdp) + 1; + ext4_free_inodes_set(sb, gdp, count); + if (is_directory) { + count = ext4_used_dirs_count(sb, gdp) - 1; + ext4_used_dirs_set(sb, gdp, count); + } gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); spin_unlock(sb_bgl_lock(sbi, block_group)); @@ -253,12 +281,12 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) spin_unlock(sb_bgl_lock(sbi, flex_group)); } } - BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh2); + BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bh2); if (!fatal) fatal = err; } - BUFFER_TRACE(bitmap_bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bitmap_bh); + BUFFER_TRACE(bitmap_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!fatal) fatal = err; sb->s_dirt = 1; @@ -291,13 +319,13 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, for (group = 0; group < ngroups; group++) { desc = ext4_get_group_desc(sb, group, NULL); - if (!desc || !desc->bg_free_inodes_count) + if (!desc || !ext4_free_inodes_count(sb, desc)) continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + if (ext4_free_inodes_count(sb, desc) < avefreei) continue; if (!best_desc || - (le16_to_cpu(desc->bg_free_blocks_count) > - le16_to_cpu(best_desc->bg_free_blocks_count))) { + (ext4_free_blks_count(sb, desc) > + ext4_free_blks_count(sb, best_desc))) { *best_group = group; best_desc = desc; ret = 0; @@ -369,7 +397,7 @@ found_flexbg: for (i = best_flex * flex_size; i < ngroups && i < (best_flex + 1) * flex_size; i++) { desc = ext4_get_group_desc(sb, i, &bh); - if (le16_to_cpu(desc->bg_free_inodes_count)) { + if (ext4_free_inodes_count(sb, desc)) { *best_group = i; goto out; } @@ -443,17 +471,17 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, for (i = 0; i < ngroups; i++) { grp = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, grp, NULL); - if (!desc || !desc->bg_free_inodes_count) + if (!desc || !ext4_free_inodes_count(sb, desc)) continue; - if (le16_to_cpu(desc->bg_used_dirs_count) >= best_ndir) + if (ext4_used_dirs_count(sb, desc) >= best_ndir) continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) + if (ext4_free_inodes_count(sb, desc) < avefreei) continue; - if (le16_to_cpu(desc->bg_free_blocks_count) < avefreeb) + if (ext4_free_blks_count(sb, desc) < avefreeb) continue; *group = grp; ret = 0; - best_ndir = le16_to_cpu(desc->bg_used_dirs_count); + best_ndir = ext4_used_dirs_count(sb, desc); } if (ret == 0) return ret; @@ -479,13 +507,13 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, for (i = 0; i < ngroups; i++) { *group = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, *group, NULL); - if (!desc || !desc->bg_free_inodes_count) + if (!desc || !ext4_free_inodes_count(sb, desc)) continue; - if (le16_to_cpu(desc->bg_used_dirs_count) >= max_dirs) + if (ext4_used_dirs_count(sb, desc) >= max_dirs) continue; - if (le16_to_cpu(desc->bg_free_inodes_count) < min_inodes) + if (ext4_free_inodes_count(sb, desc) < min_inodes) continue; - if (le16_to_cpu(desc->bg_free_blocks_count) < min_blocks) + if (ext4_free_blks_count(sb, desc) < min_blocks) continue; return 0; } @@ -494,8 +522,8 @@ fallback: for (i = 0; i < ngroups; i++) { *group = (parent_group + i) % ngroups; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && desc->bg_free_inodes_count && - le16_to_cpu(desc->bg_free_inodes_count) >= avefreei) + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_inodes_count(sb, desc) >= avefreei) return 0; } @@ -524,8 +552,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, */ *group = parent_group; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count) && - le16_to_cpu(desc->bg_free_blocks_count)) + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_blks_count(sb, desc)) return 0; /* @@ -548,8 +576,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, if (*group >= ngroups) *group -= ngroups; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count) && - le16_to_cpu(desc->bg_free_blocks_count)) + if (desc && ext4_free_inodes_count(sb, desc) && + ext4_free_blks_count(sb, desc)) return 0; } @@ -562,7 +590,7 @@ static int find_group_other(struct super_block *sb, struct inode *parent, if (++*group >= ngroups) *group = 0; desc = ext4_get_group_desc(sb, *group, NULL); - if (desc && le16_to_cpu(desc->bg_free_inodes_count)) + if (desc && ext4_free_inodes_count(sb, desc)) return 0; } @@ -570,6 +598,79 @@ static int find_group_other(struct super_block *sb, struct inode *parent, } /* + * claim the inode from the inode bitmap. If the group + * is uninit we need to take the groups's sb_bgl_lock + * and clear the uninit flag. The inode bitmap update + * and group desc uninit flag clear should be done + * after holding sb_bgl_lock so that ext4_read_inode_bitmap + * doesn't race with the ext4_claim_inode + */ +static int ext4_claim_inode(struct super_block *sb, + struct buffer_head *inode_bitmap_bh, + unsigned long ino, ext4_group_t group, int mode) +{ + int free = 0, retval = 0, count; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); + + spin_lock(sb_bgl_lock(sbi, group)); + if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { + /* not a free inode */ + retval = 1; + goto err_ret; + } + ino++; + if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || + ino > EXT4_INODES_PER_GROUP(sb)) { + spin_unlock(sb_bgl_lock(sbi, group)); + ext4_error(sb, __func__, + "reserved inode or inode > inodes count - " + "block_group = %u, inode=%lu", group, + ino + group * EXT4_INODES_PER_GROUP(sb)); + return 1; + } + /* If we didn't allocate from within the initialized part of the inode + * table then we need to initialize up to this inode. */ + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { + + if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); + /* When marking the block group with + * ~EXT4_BG_INODE_UNINIT we don't want to depend + * on the value of bg_itable_unused even though + * mke2fs could have initialized the same for us. + * Instead we calculated the value below + */ + + free = 0; + } else { + free = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + } + + /* + * Check the relative inode number against the last used + * relative inode number in this group. if it is greater + * we need to update the bg_itable_unused count + * + */ + if (ino > free) + ext4_itable_unused_set(sb, gdp, + (EXT4_INODES_PER_GROUP(sb) - ino)); + } + count = ext4_free_inodes_count(sb, gdp) - 1; + ext4_free_inodes_set(sb, gdp, count); + if (S_ISDIR(mode)) { + count = ext4_used_dirs_count(sb, gdp) + 1; + ext4_used_dirs_set(sb, gdp, count); + } + gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); +err_ret: + spin_unlock(sb_bgl_lock(sbi, group)); + return retval; +} + +/* * There are two policies for allocating an inode. If the new inode is * a directory, then a forward search is made for a block group with both * free space and a low directory-to-inode ratio; if that fails, then of @@ -582,8 +683,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) { struct super_block *sb; - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *bh2; + struct buffer_head *inode_bitmap_bh = NULL; + struct buffer_head *group_desc_bh; ext4_group_t group = 0; unsigned long ino = 0; struct inode *inode; @@ -602,6 +703,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) return ERR_PTR(-EPERM); sb = dir->i_sb; + trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, + dir->i_ino, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); @@ -631,40 +734,52 @@ got_group: for (i = 0; i < sbi->s_groups_count; i++) { err = -EIO; - gdp = ext4_get_group_desc(sb, group, &bh2); + gdp = ext4_get_group_desc(sb, group, &group_desc_bh); if (!gdp) goto fail; - brelse(bitmap_bh); - bitmap_bh = ext4_read_inode_bitmap(sb, group); - if (!bitmap_bh) + brelse(inode_bitmap_bh); + inode_bitmap_bh = ext4_read_inode_bitmap(sb, group); + if (!inode_bitmap_bh) goto fail; ino = 0; repeat_in_this_group: ino = ext4_find_next_zero_bit((unsigned long *) - bitmap_bh->b_data, EXT4_INODES_PER_GROUP(sb), ino); + inode_bitmap_bh->b_data, + EXT4_INODES_PER_GROUP(sb), ino); + if (ino < EXT4_INODES_PER_GROUP(sb)) { - BUFFER_TRACE(bitmap_bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, bitmap_bh); + BUFFER_TRACE(inode_bitmap_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, + inode_bitmap_bh); if (err) goto fail; - if (!ext4_set_bit_atomic(sb_bgl_lock(sbi, group), - ino, bitmap_bh->b_data)) { + BUFFER_TRACE(group_desc_bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, + group_desc_bh); + if (err) + goto fail; + if (!ext4_claim_inode(sb, inode_bitmap_bh, + ino, group, mode)) { /* we won it */ - BUFFER_TRACE(bitmap_bh, - "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, - bitmap_bh); + BUFFER_TRACE(inode_bitmap_bh, + "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, + inode, + inode_bitmap_bh); if (err) goto fail; + /* zero bit is inode number 1*/ + ino++; goto got; } /* we lost it */ - jbd2_journal_release_buffer(handle, bitmap_bh); + ext4_handle_release_buffer(handle, inode_bitmap_bh); + ext4_handle_release_buffer(handle, group_desc_bh); if (++ino < EXT4_INODES_PER_GROUP(sb)) goto repeat_in_this_group; @@ -684,30 +799,16 @@ repeat_in_this_group: goto out; got: - ino++; - if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || - ino > EXT4_INODES_PER_GROUP(sb)) { - ext4_error(sb, __func__, - "reserved inode or inode > inodes count - " - "block_group = %lu, inode=%lu", group, - ino + group * EXT4_INODES_PER_GROUP(sb)); - err = -EIO; - goto fail; - } - - BUFFER_TRACE(bh2, "get_write_access"); - err = ext4_journal_get_write_access(handle, bh2); - if (err) goto fail; - /* We may have to initialize the block bitmap if it isn't already */ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM) && gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - struct buffer_head *block_bh = ext4_read_block_bitmap(sb, group); + struct buffer_head *block_bitmap_bh; - BUFFER_TRACE(block_bh, "get block bitmap access"); - err = ext4_journal_get_write_access(handle, block_bh); + block_bitmap_bh = ext4_read_block_bitmap(sb, group); + BUFFER_TRACE(block_bitmap_bh, "get block bitmap access"); + err = ext4_journal_get_write_access(handle, block_bitmap_bh); if (err) { - brelse(block_bh); + brelse(block_bitmap_bh); goto fail; } @@ -715,9 +816,9 @@ got: spin_lock(sb_bgl_lock(sbi, group)); /* recheck and clear flag under lock if we still need to */ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); free = ext4_free_blocks_after_init(sb, group, gdp); - gdp->bg_free_blocks_count = cpu_to_le16(free); + gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); + ext4_free_blks_set(sb, gdp, free); gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); } @@ -725,55 +826,19 @@ got: /* Don't need to dirty bitmap block if we didn't change it */ if (free) { - BUFFER_TRACE(block_bh, "dirty block bitmap"); - err = ext4_journal_dirty_metadata(handle, block_bh); + BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap"); + err = ext4_handle_dirty_metadata(handle, + NULL, block_bitmap_bh); } - brelse(block_bh); + brelse(block_bitmap_bh); if (err) goto fail; } - - spin_lock(sb_bgl_lock(sbi, group)); - /* If we didn't allocate from within the initialized part of the inode - * table then we need to initialize up to this inode. */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) { - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT); - - /* When marking the block group with - * ~EXT4_BG_INODE_UNINIT we don't want to depend - * on the value of bg_itable_unused even though - * mke2fs could have initialized the same for us. - * Instead we calculated the value below - */ - - free = 0; - } else { - free = EXT4_INODES_PER_GROUP(sb) - - le16_to_cpu(gdp->bg_itable_unused); - } - - /* - * Check the relative inode number against the last used - * relative inode number in this group. if it is greater - * we need to update the bg_itable_unused count - * - */ - if (ino > free) - gdp->bg_itable_unused = - cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino); - } - - le16_add_cpu(&gdp->bg_free_inodes_count, -1); - if (S_ISDIR(mode)) { - le16_add_cpu(&gdp->bg_used_dirs_count, 1); - } - gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); - spin_unlock(sb_bgl_lock(sbi, group)); - BUFFER_TRACE(bh2, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh2); - if (err) goto fail; + BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh); + if (err) + goto fail; percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) @@ -825,7 +890,7 @@ got: ext4_set_inode_flags(inode); if (IS_DIRSYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); if (insert_inode_locked(inode) < 0) { err = -EINVAL; goto fail_drop; @@ -852,7 +917,7 @@ got: if (err) goto fail_free_drop; - if (test_opt(sb, EXTENTS)) { + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) { /* set extent flag only for directory, file and normal symlink*/ if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; @@ -867,6 +932,8 @@ got: } ext4_debug("allocating inode %lu\n", inode->i_ino); + trace_mark(ext4_allocate_inode, "dev %s ino %lu dir %lu mode %d", + sb->s_id, inode->i_ino, dir->i_ino, mode); goto really_out; fail: ext4_std_error(sb, err); @@ -874,7 +941,7 @@ out: iput(inode); ret = ERR_PTR(err); really_out: - brelse(bitmap_bh); + brelse(inode_bitmap_bh); return ret; fail_free_drop: @@ -886,7 +953,7 @@ fail_drop: inode->i_nlink = 0; unlock_new_inode(inode); iput(inode); - brelse(bitmap_bh); + brelse(inode_bitmap_bh); return ERR_PTR(err); } @@ -985,7 +1052,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + desc_count += ext4_free_inodes_count(sb, gdp); brelse(bitmap_bh); bitmap_bh = ext4_read_inode_bitmap(sb, i); if (!bitmap_bh) @@ -993,7 +1060,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) x = ext4_count_free(bitmap_bh, EXT4_INODES_PER_GROUP(sb) / 8); printk(KERN_DEBUG "group %lu: stored = %d, counted = %lu\n", - i, le16_to_cpu(gdp->bg_free_inodes_count), x); + i, ext4_free_inodes_count(sb, gdp), x); bitmap_count += x; } brelse(bitmap_bh); @@ -1007,7 +1074,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - desc_count += le16_to_cpu(gdp->bg_free_inodes_count); + desc_count += ext4_free_inodes_count(sb, gdp); cond_resched(); } return desc_count; @@ -1024,8 +1091,7 @@ unsigned long ext4_count_dirs(struct super_block * sb) struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; - count += le16_to_cpu(gdp->bg_used_dirs_count); + count += ext4_used_dirs_count(sb, gdp); } return count; } - diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7c3325e0b00..a6444cee0c7 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -72,12 +72,17 @@ static int ext4_inode_is_fast_symlink(struct inode *inode) * "bh" may be NULL: a metadata block may have been freed from memory * but there may still be a record of it in the journal, and that record * still needs to be revoked. + * + * If the handle isn't valid we're not journaling so there's nothing to do. */ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, struct buffer_head *bh, ext4_fsblk_t blocknr) { int err; + if (!ext4_handle_valid(handle)) + return 0; + might_sleep(); BUFFER_TRACE(bh, "enter"); @@ -170,7 +175,9 @@ static handle_t *start_transaction(struct inode *inode) */ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) { - if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) + if (!ext4_handle_valid(handle)) + return 0; + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) return 0; if (!ext4_journal_extend(handle, blocks_for_truncate(inode))) return 0; @@ -184,6 +191,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) */ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) { + BUG_ON(EXT4_JOURNAL(inode) == NULL); jbd_debug(2, "restarting handle %p\n", handle); return ext4_journal_restart(handle, blocks_for_truncate(inode)); } @@ -216,7 +224,7 @@ void ext4_delete_inode(struct inode *inode) } if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode->i_size = 0; err = ext4_mark_inode_dirty(handle, inode); if (err) { @@ -233,7 +241,7 @@ void ext4_delete_inode(struct inode *inode) * enough credits left in the handle to remove the inode from * the orphan list and set the dtime field. */ - if (handle->h_buffer_credits < 3) { + if (!ext4_handle_has_enough_credits(handle, 3)) { err = ext4_journal_extend(handle, 3); if (err > 0) err = ext4_journal_restart(handle, 3); @@ -506,10 +514,10 @@ static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, * return the total number of blocks to be allocate, including the * direct and indirect blocks. */ -static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned long blks, +static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, int blocks_to_boundary) { - unsigned long count = 0; + unsigned int count = 0; /* * Simple case, [t,d]Indirect block(s) has not allocated yet @@ -547,6 +555,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, int indirect_blks, int blks, ext4_fsblk_t new_blocks[4], int *err) { + struct ext4_allocation_request ar; int target, i; unsigned long count = 0, blk_allocated = 0; int index = 0; @@ -595,10 +604,17 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, if (!target) goto allocated; /* Now allocate data blocks */ - count = target; - /* allocating blocks for data blocks */ - current_block = ext4_new_blocks(handle, inode, iblock, - goal, &count, err); + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.goal = goal; + ar.len = target; + ar.logical = iblock; + if (S_ISREG(inode->i_mode)) + /* enable in-core preallocation only for regular files */ + ar.flags = EXT4_MB_HINT_DATA; + + current_block = ext4_mb_new_blocks(handle, &ar, err); + if (*err && (target == blks)) { /* * if the allocation failed and we didn't allocate @@ -614,7 +630,7 @@ static int ext4_alloc_blocks(handle_t *handle, struct inode *inode, */ new_blocks[index] = current_block; } - blk_allocated += count; + blk_allocated += ar.len; } allocated: /* total number of blocks allocated for direct blocks */ @@ -709,8 +725,8 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, set_buffer_uptodate(bh); unlock_buffer(bh); - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (err) goto failed; } @@ -792,8 +808,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, * generic_commit_write->__mark_inode_dirty->ext4_dirty_inode. */ jbd_debug(5, "splicing indirect only\n"); - BUFFER_TRACE(where->bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, where->bh); + BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, where->bh); if (err) goto err_out; } else { @@ -840,10 +856,10 @@ err_out: * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) */ -int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, unsigned long maxblocks, - struct buffer_head *bh_result, - int create, int extend_disksize) +static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, unsigned int maxblocks, + struct buffer_head *bh_result, + int create, int extend_disksize) { int err = -EIO; ext4_lblk_t offsets[4]; @@ -1045,7 +1061,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) * It returns the error in case of allocation failure. */ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, - unsigned long max_blocks, struct buffer_head *bh, + unsigned int max_blocks, struct buffer_head *bh, int create, int extend_disksize, int flag) { int retval; @@ -1221,8 +1237,8 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, set_buffer_uptodate(bh); } unlock_buffer(bh); - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, inode, bh); if (!fatal) fatal = err; } else { @@ -1335,6 +1351,10 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; + trace_mark(ext4_write_begin, + "dev %s ino %lu pos %llu len %u flags %u", + inode->i_sb->s_id, inode->i_ino, + (unsigned long long) pos, len, flags); index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1346,7 +1366,7 @@ retry: goto out; } - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { ext4_journal_stop(handle); ret = -ENOMEM; @@ -1387,7 +1407,7 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh) if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; set_buffer_uptodate(bh); - return ext4_journal_dirty_metadata(handle, bh); + return ext4_handle_dirty_metadata(handle, NULL, bh); } /* @@ -1406,6 +1426,10 @@ static int ext4_ordered_write_end(struct file *file, struct inode *inode = mapping->host; int ret = 0, ret2; + trace_mark(ext4_ordered_write_end, + "dev %s ino %lu pos %llu len %u copied %u", + inode->i_sb->s_id, inode->i_ino, + (unsigned long long) pos, len, copied); ret = ext4_jbd2_file_inode(handle, inode); if (ret == 0) { @@ -1444,6 +1468,10 @@ static int ext4_writeback_write_end(struct file *file, int ret = 0, ret2; loff_t new_i_size; + trace_mark(ext4_writeback_write_end, + "dev %s ino %lu pos %llu len %u copied %u", + inode->i_sb->s_id, inode->i_ino, + (unsigned long long) pos, len, copied); new_i_size = pos + copied; if (new_i_size > EXT4_I(inode)->i_disksize) { ext4_update_i_disksize(inode, new_i_size); @@ -1479,6 +1507,10 @@ static int ext4_journalled_write_end(struct file *file, unsigned from, to; loff_t new_i_size; + trace_mark(ext4_journalled_write_end, + "dev %s ino %lu pos %llu len %u copied %u", + inode->i_sb->s_id, inode->i_ino, + (unsigned long long) pos, len, copied); from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1625,7 +1657,7 @@ struct mpage_da_data { get_block_t *get_block; struct writeback_control *wbc; int io_done; - long pages_written; + int pages_written; int retval; }; @@ -1645,35 +1677,39 @@ struct mpage_da_data { */ static int mpage_da_submit_io(struct mpage_da_data *mpd) { - struct address_space *mapping = mpd->inode->i_mapping; - int ret = 0, err, nr_pages, i; - unsigned long index, end; - struct pagevec pvec; long pages_skipped; + struct pagevec pvec; + unsigned long index, end; + int ret = 0, err, nr_pages, i; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; BUG_ON(mpd->next_page <= mpd->first_page); - pagevec_init(&pvec, 0); + /* + * We need to start from the first_page to the next_page - 1 + * to make sure we also write the mapped dirty buffer_heads. + * If we look at mpd->lbh.b_blocknr we would only be looking + * at the currently mapped buffer_heads. + */ index = mpd->first_page; end = mpd->next_page - 1; + pagevec_init(&pvec, 0); while (index <= end) { - /* - * We can use PAGECACHE_TAG_DIRTY lookup here because - * even though we have cleared the dirty flag on the page - * We still keep the page in the radix tree with tag - * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io. - * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback - * which is called via the below writepage callback. - */ - nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, - PAGECACHE_TAG_DIRTY, - min(end - index, - (pgoff_t)PAGEVEC_SIZE-1) + 1); + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + index = page->index; + if (index > end) + break; + index++; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + pages_skipped = mpd->wbc->pages_skipped; err = mapping->a_ops->writepage(page, mpd->wbc); if (!err && (pages_skipped == mpd->wbc->pages_skipped)) @@ -1831,13 +1867,13 @@ static void ext4_print_free_blocks(struct inode *inode) ext4_count_free_blocks(inode->i_sb)); printk(KERN_EMERG "Free/Dirty block details\n"); printk(KERN_EMERG "free_blocks=%lld\n", - percpu_counter_sum(&sbi->s_freeblocks_counter)); + (long long)percpu_counter_sum(&sbi->s_freeblocks_counter)); printk(KERN_EMERG "dirty_blocks=%lld\n", - percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + (long long)percpu_counter_sum(&sbi->s_dirtyblocks_counter)); printk(KERN_EMERG "Block reservation details\n"); - printk(KERN_EMERG "i_reserved_data_blocks=%lu\n", + printk(KERN_EMERG "i_reserved_data_blocks=%u\n", EXT4_I(inode)->i_reserved_data_blocks); - printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n", + printk(KERN_EMERG "i_reserved_meta_blocks=%u\n", EXT4_I(inode)->i_reserved_meta_blocks); return; } @@ -2087,11 +2123,29 @@ static int __mpage_da_writepage(struct page *page, bh = head; do { BUG_ON(buffer_locked(bh)); + /* + * We need to try to allocate + * unmapped blocks in the same page. + * Otherwise we won't make progress + * with the page in ext4_da_writepage + */ if (buffer_dirty(bh) && (!buffer_mapped(bh) || buffer_delay(bh))) { mpage_add_bh_to_extent(mpd, logical, bh); if (mpd->io_done) return MPAGE_DA_EXTENT_TAIL; + } else if (buffer_dirty(bh) && (buffer_mapped(bh))) { + /* + * mapped dirty buffer. We need to update + * the b_state because we look at + * b_state in mpage_da_map_blocks. We don't + * update b_size because if we find an + * unmapped buffer_head later we need to + * use the b_state flag of that buffer_head. + */ + if (mpd->lbh.b_size == 0) + mpd->lbh.b_state = + bh->b_state & BH_FLAGS; } logical++; } while ((bh = bh->b_this_page) != head); @@ -2269,10 +2323,13 @@ static int ext4_da_writepage(struct page *page, { int ret = 0; loff_t size; - unsigned long len; + unsigned int len; struct buffer_head *page_bufs; struct inode *inode = page->mapping->host; + trace_mark(ext4_da_writepage, + "dev %s ino %lu page_index %lu", + inode->i_sb->s_id, inode->i_ino, page->index); size = i_size_read(inode); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -2378,10 +2435,25 @@ static int ext4_da_writepages(struct address_space *mapping, struct mpage_da_data mpd; struct inode *inode = mapping->host; int no_nrwrite_index_update; - long pages_written = 0, pages_skipped; + int pages_written = 0; + long pages_skipped; int needed_blocks, ret = 0, nr_to_writebump = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); + trace_mark(ext4_da_writepages, + "dev %s ino %lu nr_t_write %ld " + "pages_skipped %ld range_start %llu " + "range_end %llu nonblocking %d " + "for_kupdate %d for_reclaim %d " + "for_writepages %d range_cyclic %d", + inode->i_sb->s_id, inode->i_ino, + wbc->nr_to_write, wbc->pages_skipped, + (unsigned long long) wbc->range_start, + (unsigned long long) wbc->range_end, + wbc->nonblocking, wbc->for_kupdate, + wbc->for_reclaim, wbc->for_writepages, + wbc->range_cyclic); + /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() @@ -2389,6 +2461,20 @@ static int ext4_da_writepages(struct address_space *mapping, */ if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; + + /* + * If the filesystem has aborted, it is read-only, so return + * right away instead of dumping stack traces later on that + * will obscure the real source of the problem. We test + * EXT4_MOUNT_ABORT instead of sb->s_flag's MS_RDONLY because + * the latter could be true if the filesystem is mounted + * read-only, and in that case, ext4_da_writepages should + * *never* be called, so if that ever happens, we would want + * the stack trace. + */ + if (unlikely(sbi->s_mount_opt & EXT4_MOUNT_ABORT)) + return -EROFS; + /* * Make sure nr_to_write is >= sbi->s_mb_stream_request * This make sure small files blocks are allocated in @@ -2433,7 +2519,7 @@ static int ext4_da_writepages(struct address_space *mapping, handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - printk(KERN_EMERG "%s: jbd2_start: " + printk(KERN_CRIT "%s: jbd2_start: " "%ld pages, ino %lu; err %d\n", __func__, wbc->nr_to_write, inode->i_ino, ret); dump_stack(); @@ -2486,6 +2572,14 @@ out_writepages: if (!no_nrwrite_index_update) wbc->no_nrwrite_index_update = 0; wbc->nr_to_write -= nr_to_writebump; + trace_mark(ext4_da_writepage_result, + "dev %s ino %lu ret %d pages_written %d " + "pages_skipped %ld congestion %d " + "more_io %d no_nrwrite_index_update %d", + inode->i_sb->s_id, inode->i_ino, ret, + pages_written, wbc->pages_skipped, + wbc->encountered_congestion, wbc->more_io, + wbc->no_nrwrite_index_update); return ret; } @@ -2498,7 +2592,7 @@ static int ext4_nonda_switch(struct super_block *sb) /* * switch to non delalloc mode if we are running low * on free block. The free block accounting via percpu - * counters can get slightly wrong with FBC_BATCH getting + * counters can get slightly wrong with percpu_counter_batch getting * accumulated on each CPU without updating global counters * Delalloc need an accurate free block accounting. So switch * to non delalloc when we are near to error range. @@ -2537,6 +2631,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, len, flags, pagep, fsdata); } *fsdata = (void *)0; + + trace_mark(ext4_da_write_begin, + "dev %s ino %lu pos %llu len %u flags %u", + inode->i_sb->s_id, inode->i_ino, + (unsigned long long) pos, len, flags); retry: /* * With delayed allocation, we don't log the i_disksize update @@ -2550,7 +2649,7 @@ retry: goto out; } - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { ext4_journal_stop(handle); ret = -ENOMEM; @@ -2626,6 +2725,10 @@ static int ext4_da_write_end(struct file *file, } } + trace_mark(ext4_da_write_end, + "dev %s ino %lu pos %llu len %u copied %u", + inode->i_sb->s_id, inode->i_ino, + (unsigned long long) pos, len, copied); start = pos & (PAGE_CACHE_SIZE - 1); end = start + copied - 1; @@ -2718,7 +2821,10 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) filemap_write_and_wait(mapping); } - if (EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { + BUG_ON(!EXT4_JOURNAL(inode) && + EXT4_I(inode)->i_state & EXT4_STATE_JDATA); + + if (EXT4_JOURNAL(inode) && EXT4_I(inode)->i_state & EXT4_STATE_JDATA) { /* * This is a REALLY heavyweight approach, but the use of * bmap on dirty files is expected to be extremely rare: @@ -2836,6 +2942,9 @@ static int ext4_normal_writepage(struct page *page, loff_t size = i_size_read(inode); loff_t len; + trace_mark(ext4_normal_writepage, + "dev %s ino %lu page_index %lu", + inode->i_sb->s_id, inode->i_ino, page->index); J_ASSERT(PageLocked(page)); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -2921,6 +3030,9 @@ static int ext4_journalled_writepage(struct page *page, loff_t size = i_size_read(inode); loff_t len; + trace_mark(ext4_journalled_writepage, + "dev %s ino %lu page_index %lu", + inode->i_sb->s_id, inode->i_ino, page->index); J_ASSERT(PageLocked(page)); if (page->index == size >> PAGE_CACHE_SHIFT) len = size & ~PAGE_CACHE_MASK; @@ -2989,7 +3101,10 @@ static void ext4_invalidatepage(struct page *page, unsigned long offset) if (offset == 0) ClearPageChecked(page); - jbd2_journal_invalidatepage(journal, page, offset); + if (journal) + jbd2_journal_invalidatepage(journal, page, offset); + else + block_invalidatepage(page, offset); } static int ext4_releasepage(struct page *page, gfp_t wait) @@ -2999,7 +3114,10 @@ static int ext4_releasepage(struct page *page, gfp_t wait) WARN_ON(PageChecked(page)); if (!page_has_buffers(page)) return 0; - return jbd2_journal_try_to_free_buffers(journal, page, wait); + if (journal) + return jbd2_journal_try_to_free_buffers(journal, page, wait); + else + return try_to_free_buffers(page); } /* @@ -3271,7 +3389,7 @@ int ext4_block_truncate_page(handle_t *handle, err = 0; if (ext4_should_journal_data(inode)) { - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, inode, bh); } else { if (ext4_should_order_data(inode)) err = ext4_jbd2_file_inode(handle, inode); @@ -3395,8 +3513,8 @@ static void ext4_clear_blocks(handle_t *handle, struct inode *inode, __le32 *p; if (try_to_extend_transaction(handle, inode)) { if (bh) { - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, inode, bh); } ext4_mark_inode_dirty(handle, inode); ext4_journal_test_restart(handle, inode); @@ -3496,7 +3614,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, count, block_to_free_p, p); if (this_bh) { - BUFFER_TRACE(this_bh, "call ext4_journal_dirty_metadata"); + BUFFER_TRACE(this_bh, "call ext4_handle_dirty_metadata"); /* * The buffer head should have an attached journal head at this @@ -3505,7 +3623,7 @@ static void ext4_free_data(handle_t *handle, struct inode *inode, * the block was cleared. Check for this instead of OOPSing. */ if (bh2jh(this_bh)) - ext4_journal_dirty_metadata(handle, this_bh); + ext4_handle_dirty_metadata(handle, inode, this_bh); else ext4_error(inode->i_sb, __func__, "circular indirect block detected, " @@ -3535,7 +3653,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, ext4_fsblk_t nr; __le32 *p; - if (is_handle_aborted(handle)) + if (ext4_handle_is_aborted(handle)) return; if (depth--) { @@ -3605,7 +3723,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, * will merely complain about releasing a free block, * rather than leaking blocks. */ - if (is_handle_aborted(handle)) + if (ext4_handle_is_aborted(handle)) return; if (try_to_extend_transaction(handle, inode)) { ext4_mark_inode_dirty(handle, inode); @@ -3624,9 +3742,10 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, parent_bh)){ *p = 0; BUFFER_TRACE(parent_bh, - "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, - parent_bh); + "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, + inode, + parent_bh); } } } @@ -3814,7 +3933,7 @@ do_indirects: * synchronous */ if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); out_stop: /* * If this was a simple ftruncate(), and the file will remain alive @@ -3844,7 +3963,7 @@ static int __ext4_get_inode_loc(struct inode *inode, ext4_fsblk_t block; int inodes_per_block, inode_offset; - iloc->bh = 0; + iloc->bh = NULL; if (!ext4_valid_inum(sb, inode->i_ino)) return -EIO; @@ -3951,7 +4070,7 @@ make_io: num = EXT4_INODES_PER_GROUP(sb); if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) - num -= le16_to_cpu(gdp->bg_itable_unused); + num -= ext4_itable_unused_count(sb, gdp); table += num / inodes_per_block; if (end > table) end = table; @@ -4313,8 +4432,8 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_SET_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_LARGE_FILE); sb->s_dirt = 1; - handle->h_sync = 1; - err = ext4_journal_dirty_metadata(handle, + ext4_handle_sync(handle); + err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh); } } @@ -4341,9 +4460,8 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize); } - - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - rc = ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + rc = ext4_handle_dirty_metadata(handle, inode, bh); if (!err) err = rc; ei->i_state &= ~EXT4_STATE_NEW; @@ -4406,6 +4524,25 @@ int ext4_write_inode(struct inode *inode, int wait) return ext4_force_commit(inode->i_sb); } +int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh) +{ + int err = 0; + + mark_buffer_dirty(bh); + if (inode && inode_needs_sync(inode)) { + sync_dirty_buffer(bh); + if (buffer_req(bh) && !buffer_uptodate(bh)) { + ext4_error(inode->i_sb, __func__, + "IO error syncing inode, " + "inode=%lu, block=%llu", + inode->i_ino, + (unsigned long long)bh->b_blocknr); + err = -EIO; + } + } + return err; +} + /* * ext4_setattr() * @@ -4710,16 +4847,15 @@ int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc) { - int err = 0; - if (handle) { - err = ext4_get_inode_loc(inode, iloc); - if (!err) { - BUFFER_TRACE(iloc->bh, "get_write_access"); - err = ext4_journal_get_write_access(handle, iloc->bh); - if (err) { - brelse(iloc->bh); - iloc->bh = NULL; - } + int err; + + err = ext4_get_inode_loc(inode, iloc); + if (!err) { + BUFFER_TRACE(iloc->bh, "get_write_access"); + err = ext4_journal_get_write_access(handle, iloc->bh); + if (err) { + brelse(iloc->bh); + iloc->bh = NULL; } } ext4_std_error(inode->i_sb, err); @@ -4791,7 +4927,8 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) might_sleep(); err = ext4_reserve_inode_write(handle, inode, &iloc); - if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && + if (ext4_handle_valid(handle) && + EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && !(EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND)) { /* * We need extra buffer credits since we may write into EA block @@ -4843,6 +4980,11 @@ void ext4_dirty_inode(struct inode *inode) handle_t *current_handle = ext4_journal_current_handle(); handle_t *handle; + if (!ext4_handle_valid(current_handle)) { + ext4_mark_inode_dirty(current_handle, inode); + return; + } + handle = ext4_journal_start(inode, 2); if (IS_ERR(handle)) goto out; @@ -4880,8 +5022,9 @@ static int ext4_pin_inode(handle_t *handle, struct inode *inode) BUFFER_TRACE(iloc.bh, "get_write_access"); err = jbd2_journal_get_write_access(handle, iloc.bh); if (!err) - err = ext4_journal_dirty_metadata(handle, - iloc.bh); + err = ext4_handle_dirty_metadata(handle, + inode, + iloc.bh); brelse(iloc.bh); } } @@ -4907,6 +5050,8 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) */ journal = EXT4_JOURNAL(inode); + if (!journal) + return 0; if (is_journal_aborted(journal)) return -EROFS; @@ -4936,7 +5081,7 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) return PTR_ERR(handle); err = ext4_mark_inode_dirty(handle, inode); - handle->h_sync = 1; + ext4_handle_sync(handle); ext4_journal_stop(handle); ext4_std_error(inode->i_sb, err); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index dc99b4776d5..42dc83fb247 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -99,7 +99,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) goto flags_out; } if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) goto flags_err; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 444ad998f72..918aec0c8a1 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -100,7 +100,7 @@ * inode as: * * { page } - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. So for each group we @@ -330,6 +330,18 @@ * object * */ +static struct kmem_cache *ext4_pspace_cachep; +static struct kmem_cache *ext4_ac_cachep; +static struct kmem_cache *ext4_free_ext_cachep; +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group); +static int ext4_mb_init_per_dev_proc(struct super_block *sb); +static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); + + static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { @@ -445,9 +457,9 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, blocknr += first + i; blocknr += le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - - ext4_error(sb, __func__, "double-free of inode" - " %lu's block %llu(bit %u in group %lu)\n", + ext4_grp_locked_error(sb, e4b->bd_group, + __func__, "double-free of inode" + " %lu's block %llu(bit %u in group %u)", inode ? inode->i_ino : 0, blocknr, first + i, e4b->bd_group); } @@ -477,7 +489,7 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { - printk(KERN_ERR "corruption in group %lu " + printk(KERN_ERR "corruption in group %u " "at byte %u(%u): %x in copy != %x " "on disk/prealloc\n", e4b->bd_group, i, i * 8, b1[i], b2[i]); @@ -690,8 +702,8 @@ static void ext4_mb_generate_buddy(struct super_block *sb, grp->bb_fragments = fragments; if (free != grp->bb_free) { - ext4_error(sb, __func__, - "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", + ext4_grp_locked_error(sb, group, __func__, + "EXT4-fs: group %u: %u blocks in bitmap, %u in gd", group, free, grp->bb_free); /* * If we intent to continue, we consider group descritor @@ -716,7 +728,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, * stored in the inode as * * { page } - * [ group 0 buddy][ group 0 bitmap] [group 1][ group 1]... + * [ group 0 bitmap][ group 0 buddy] [group 1][ group 1]... * * * one block each for bitmap and buddy information. @@ -782,25 +794,45 @@ static int ext4_mb_init_cache(struct page *page, char *incore) if (bh[i] == NULL) goto out; - if (buffer_uptodate(bh[i]) && - !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) + if (bitmap_uptodate(bh[i])) continue; lock_buffer(bh[i]); + if (bitmap_uptodate(bh[i])) { + unlock_buffer(bh[i]); + continue; + } spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh[i], first_group + i, desc); + set_bitmap_uptodate(bh[i]); set_buffer_uptodate(bh[i]); - unlock_buffer(bh[i]); spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + unlock_buffer(bh[i]); continue; } spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + if (buffer_uptodate(bh[i])) { + /* + * if not uninit if bh is uptodate, + * bitmap is also uptodate + */ + set_bitmap_uptodate(bh[i]); + unlock_buffer(bh[i]); + continue; + } get_bh(bh[i]); + /* + * submit the buffer_head for read. We can + * safely mark the bitmap as uptodate now. + * We do it here so the bitmap uptodate bit + * get set with buffer lock held. + */ + set_bitmap_uptodate(bh[i]); bh[i]->b_end_io = end_buffer_read_sync; submit_bh(READ, bh[i]); - mb_debug("read bitmap for group %lu\n", first_group + i); + mb_debug("read bitmap for group %u\n", first_group + i); } /* wait for I/O completion */ @@ -814,6 +846,8 @@ static int ext4_mb_init_cache(struct page *page, char *incore) err = 0; first_block = page->index * blocks_per_page; + /* init the page */ + memset(page_address(page), 0xff, PAGE_CACHE_SIZE); for (i = 0; i < blocks_per_page; i++) { int group; struct ext4_group_info *grinfo; @@ -840,7 +874,6 @@ static int ext4_mb_init_cache(struct page *page, char *incore) BUG_ON(incore == NULL); mb_debug("put buddy for group %u in page %lu/%x\n", group, page->index, i * blocksize); - memset(data, 0xff, blocksize); grinfo = ext4_get_group_info(sb, group); grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, @@ -848,7 +881,9 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* * incore got set to the group block bitmap below */ + ext4_lock_group(sb, group); ext4_mb_generate_buddy(sb, data, incore, group); + ext4_unlock_group(sb, group); incore = NULL; } else { /* this is block of bitmap */ @@ -862,6 +897,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) /* mark all preallocated blks used in in-core bitmap */ ext4_mb_generate_from_pa(sb, data, group); + ext4_mb_generate_from_freelist(sb, data, group); ext4_unlock_group(sb, group); /* set incore so that the buddy information can be @@ -886,18 +922,20 @@ static noinline_for_stack int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, struct ext4_buddy *e4b) { - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct inode *inode = sbi->s_buddy_cache; int blocks_per_page; int block; int pnum; int poff; struct page *page; int ret; + struct ext4_group_info *grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; - mb_debug("load group %lu\n", group); + mb_debug("load group %u\n", group); blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + grp = ext4_get_group_info(sb, group); e4b->bd_blkbits = sb->s_blocksize_bits; e4b->bd_info = ext4_get_group_info(sb, group); @@ -905,6 +943,15 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, e4b->bd_group = group; e4b->bd_buddy_page = NULL; e4b->bd_bitmap_page = NULL; + e4b->alloc_semp = &grp->alloc_sem; + + /* Take the read lock on the group alloc + * sem. This would make sure a parallel + * ext4_mb_init_group happening on other + * groups mapped by the page is blocked + * till we are done with allocation + */ + down_read(e4b->alloc_semp); /* * the buddy cache inode stores the block bitmap @@ -920,6 +967,14 @@ ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, page = find_get_page(inode->i_mapping, pnum); if (page == NULL || !PageUptodate(page)) { if (page) + /* + * drop the page reference and try + * to get the page with lock. If we + * are not uptodate that implies + * somebody just created the page but + * is yet to initialize the same. So + * wait for it to initialize. + */ page_cache_release(page); page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); if (page) { @@ -985,6 +1040,9 @@ err: page_cache_release(e4b->bd_buddy_page); e4b->bd_buddy = NULL; e4b->bd_bitmap = NULL; + + /* Done with the buddy cache */ + up_read(e4b->alloc_semp); return ret; } @@ -994,6 +1052,9 @@ static void ext4_mb_release_desc(struct ext4_buddy *e4b) page_cache_release(e4b->bd_bitmap_page); if (e4b->bd_buddy_page) page_cache_release(e4b->bd_buddy_page); + /* Done with the buddy cache */ + if (e4b->alloc_semp) + up_read(e4b->alloc_semp); } @@ -1031,7 +1092,10 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - mb_clear_bit_atomic(lock, cur, bm); + if (lock) + mb_clear_bit_atomic(lock, cur, bm); + else + mb_clear_bit(cur, bm); cur++; } } @@ -1049,7 +1113,10 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - mb_set_bit_atomic(lock, cur, bm); + if (lock) + mb_set_bit_atomic(lock, cur, bm); + else + mb_set_bit(cur, bm); cur++; } } @@ -1094,12 +1161,11 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, blocknr += block; blocknr += le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - ext4_unlock_group(sb, e4b->bd_group); - ext4_error(sb, __func__, "double-free of inode" - " %lu's block %llu(bit %u in group %lu)\n", + ext4_grp_locked_error(sb, e4b->bd_group, + __func__, "double-free of inode" + " %lu's block %llu(bit %u in group %u)", inode ? inode->i_ino : 0, blocknr, block, e4b->bd_group); - ext4_lock_group(sb, e4b->bd_group); } mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); e4b->bd_info->bb_counters[order]++; @@ -1296,13 +1362,20 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, ac->ac_tail = ret & 0xffff; ac->ac_buddy = ret >> 16; - /* XXXXXXX: SUCH A HORRIBLE **CK */ - /*FIXME!! Why ? */ + /* + * take the page reference. We want the page to be pinned + * so that we don't get a ext4_mb_init_cache_call for this + * group until we update the bitmap. That would mean we + * double allocate blocks. The reference is dropped + * in ext4_mb_release_context + */ ac->ac_bitmap_page = e4b->bd_bitmap_page; get_page(ac->ac_bitmap_page); ac->ac_buddy_page = e4b->bd_buddy_page; get_page(ac->ac_buddy_page); - + /* on allocation we use ac to track the held semaphore */ + ac->alloc_semp = e4b->alloc_semp; + e4b->alloc_semp = NULL; /* store last allocated for subsequent stream allocation */ if ((ac->ac_flags & EXT4_MB_HINT_DATA)) { spin_lock(&sbi->s_md_lock); @@ -1326,6 +1399,8 @@ static void ext4_mb_check_limits(struct ext4_allocation_context *ac, struct ext4_free_extent ex; int max; + if (ac->ac_status == AC_STATUS_FOUND) + return; /* * We don't want to scan for a whole year */ @@ -1575,8 +1650,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * free blocks even though group info says we * we have free blocks */ - ext4_error(sb, __func__, "%d free blocks as per " - "group info. But bitmap says 0\n", + ext4_grp_locked_error(sb, e4b->bd_group, + __func__, "%d free blocks as per " + "group info. But bitmap says 0", free); break; } @@ -1584,8 +1660,9 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { - ext4_error(sb, __func__, "%d free blocks as per " - "group info. But got %d blocks\n", + ext4_grp_locked_error(sb, e4b->bd_group, + __func__, "%d free blocks as per " + "group info. But got %d blocks", free, ex.fe_len); /* * The number of free blocks differs. This mostly @@ -1692,6 +1769,173 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, return 0; } +/* + * lock the group_info alloc_sem of all the groups + * belonging to the same buddy cache page. This + * make sure other parallel operation on the buddy + * cache doesn't happen whild holding the buddy cache + * lock + */ +int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) +{ + int i; + int block, pnum; + int blocks_per_page; + int groups_per_page; + ext4_group_t first_group; + struct ext4_group_info *grp; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + first_group = pnum * blocks_per_page / 2; + + groups_per_page = blocks_per_page >> 1; + if (groups_per_page == 0) + groups_per_page = 1; + /* read all groups the page covers into the cache */ + for (i = 0; i < groups_per_page; i++) { + + if ((first_group + i) >= EXT4_SB(sb)->s_groups_count) + break; + grp = ext4_get_group_info(sb, first_group + i); + /* take all groups write allocation + * semaphore. This make sure there is + * no block allocation going on in any + * of that groups + */ + down_write_nested(&grp->alloc_sem, i); + } + return i; +} + +void ext4_mb_put_buddy_cache_lock(struct super_block *sb, + ext4_group_t group, int locked_group) +{ + int i; + int block, pnum; + int blocks_per_page; + ext4_group_t first_group; + struct ext4_group_info *grp; + + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + first_group = pnum * blocks_per_page / 2; + /* release locks on all the groups */ + for (i = 0; i < locked_group; i++) { + + grp = ext4_get_group_info(sb, first_group + i); + /* take all groups write allocation + * semaphore. This make sure there is + * no block allocation going on in any + * of that groups + */ + up_write(&grp->alloc_sem); + } + +} + +static int ext4_mb_init_group(struct super_block *sb, ext4_group_t group) +{ + + int ret; + void *bitmap; + int blocks_per_page; + int block, pnum, poff; + int num_grp_locked = 0; + struct ext4_group_info *this_grp; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct inode *inode = sbi->s_buddy_cache; + struct page *page = NULL, *bitmap_page = NULL; + + mb_debug("init group %lu\n", group); + blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; + this_grp = ext4_get_group_info(sb, group); + /* + * This ensures we don't add group + * to this buddy cache via resize + */ + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, group); + if (!EXT4_MB_GRP_NEED_INIT(this_grp)) { + /* + * somebody initialized the group + * return without doing anything + */ + ret = 0; + goto err; + } + /* + * the buddy cache inode stores the block bitmap + * and buddy information in consecutive blocks. + * So for each group we need two blocks. + */ + block = group * 2; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page) { + BUG_ON(page->mapping != inode->i_mapping); + ret = ext4_mb_init_cache(page, NULL); + if (ret) { + unlock_page(page); + goto err; + } + unlock_page(page); + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); + bitmap_page = page; + bitmap = page_address(page) + (poff * sb->s_blocksize); + + /* init buddy cache */ + block++; + pnum = block / blocks_per_page; + poff = block % blocks_per_page; + page = find_or_create_page(inode->i_mapping, pnum, GFP_NOFS); + if (page == bitmap_page) { + /* + * If both the bitmap and buddy are in + * the same page we don't need to force + * init the buddy + */ + unlock_page(page); + } else if (page) { + BUG_ON(page->mapping != inode->i_mapping); + ret = ext4_mb_init_cache(page, bitmap); + if (ret) { + unlock_page(page); + goto err; + } + unlock_page(page); + } + if (page == NULL || !PageUptodate(page)) { + ret = -EIO; + goto err; + } + mark_page_accessed(page); +err: + ext4_mb_put_buddy_cache_lock(sb, group, num_grp_locked); + if (bitmap_page) + page_cache_release(bitmap_page); + if (page) + page_cache_release(page); + return ret; +} + static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { @@ -1775,7 +2019,7 @@ repeat: group = 0; /* quick check to skip empty groups */ - grp = ext4_get_group_info(ac->ac_sb, group); + grp = ext4_get_group_info(sb, group); if (grp->bb_free == 0) continue; @@ -1788,10 +2032,9 @@ repeat: * we need full data about the group * to make a good selection */ - err = ext4_mb_load_buddy(sb, group, &e4b); + err = ext4_mb_init_group(sb, group); if (err) goto out; - ext4_mb_release_desc(&e4b); } /* @@ -1932,13 +2175,13 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) if (hs->op == EXT4_MB_HISTORY_ALLOC) { fmt = "%-5u %-8u %-23s %-23s %-23s %-5u %-5u %-2u " "%-5u %-5s %-5u %-6u\n"; - sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, + sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, hs->result.fe_start, hs->result.fe_len, hs->result.fe_logical); - sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, + sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group, hs->orig.fe_start, hs->orig.fe_len, hs->orig.fe_logical); - sprintf(buf3, "%lu/%d/%u@%u", hs->goal.fe_group, + sprintf(buf3, "%u/%d/%u@%u", hs->goal.fe_group, hs->goal.fe_start, hs->goal.fe_len, hs->goal.fe_logical); seq_printf(seq, fmt, hs->pid, hs->ino, buf, buf3, buf2, @@ -1947,20 +2190,20 @@ static int ext4_mb_seq_history_show(struct seq_file *seq, void *v) hs->buddy ? 1 << hs->buddy : 0); } else if (hs->op == EXT4_MB_HISTORY_PREALLOC) { fmt = "%-5u %-8u %-23s %-23s %-23s\n"; - sprintf(buf2, "%lu/%d/%u@%u", hs->result.fe_group, + sprintf(buf2, "%u/%d/%u@%u", hs->result.fe_group, hs->result.fe_start, hs->result.fe_len, hs->result.fe_logical); - sprintf(buf, "%lu/%d/%u@%u", hs->orig.fe_group, + sprintf(buf, "%u/%d/%u@%u", hs->orig.fe_group, hs->orig.fe_start, hs->orig.fe_len, hs->orig.fe_logical); seq_printf(seq, fmt, hs->pid, hs->ino, buf, "", buf2); } else if (hs->op == EXT4_MB_HISTORY_DISCARD) { - sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, + sprintf(buf2, "%u/%d/%u", hs->result.fe_group, hs->result.fe_start, hs->result.fe_len); seq_printf(seq, "%-5u %-8u %-23s discard\n", hs->pid, hs->ino, buf2); } else if (hs->op == EXT4_MB_HISTORY_FREE) { - sprintf(buf2, "%lu/%d/%u", hs->result.fe_group, + sprintf(buf2, "%u/%d/%u", hs->result.fe_group, hs->result.fe_start, hs->result.fe_len); seq_printf(seq, "%-5u %-8u %-23s free\n", hs->pid, hs->ino, buf2); @@ -2073,7 +2316,7 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) return NULL; group = *pos + 1; - return (void *) group; + return (void *) ((unsigned long) group); } static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) @@ -2086,13 +2329,13 @@ static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) if (*pos < 0 || *pos >= sbi->s_groups_count) return NULL; group = *pos + 1; - return (void *) group;; + return (void *) ((unsigned long) group); } static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) { struct super_block *sb = seq->private; - long group = (long) v; + ext4_group_t group = (ext4_group_t) ((unsigned long) v); int i; int err; struct ext4_buddy e4b; @@ -2114,7 +2357,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) sizeof(struct ext4_group_info); err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { - seq_printf(seq, "#%-5lu: I/O error\n", group); + seq_printf(seq, "#%-5u: I/O error\n", group); return 0; } ext4_lock_group(sb, group); @@ -2122,7 +2365,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) ext4_unlock_group(sb, group); ext4_mb_release_desc(&e4b); - seq_printf(seq, "#%-5lu: %-5u %-5u %-5u [", group, sg.info.bb_free, + seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, sg.info.bb_fragments, sg.info.bb_first_free); for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= sb->s_blocksize_bits + 1 ? @@ -2296,10 +2539,11 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, ext4_free_blocks_after_init(sb, group, desc); } else { meta_group_info[i]->bb_free = - le16_to_cpu(desc->bg_free_blocks_count); + ext4_free_blks_count(sb, desc); } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root.rb_node = NULL;; #ifdef DOUBLE_CHECK @@ -2327,54 +2571,6 @@ exit_meta_group_info: } /* ext4_mb_add_groupinfo */ /* - * Add a group to the existing groups. - * This function is used for online resize - */ -int ext4_mb_add_more_groupinfo(struct super_block *sb, ext4_group_t group, - struct ext4_group_desc *desc) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct inode *inode = sbi->s_buddy_cache; - int blocks_per_page; - int block; - int pnum; - struct page *page; - int err; - - /* Add group based on group descriptor*/ - err = ext4_mb_add_groupinfo(sb, group, desc); - if (err) - return err; - - /* - * Cache pages containing dynamic mb_alloc datas (buddy and bitmap - * datas) are set not up to date so that they will be re-initilaized - * during the next call to ext4_mb_load_buddy - */ - - /* Set buddy page as not up to date */ - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; - block = group * 2; - pnum = block / blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); - if (page != NULL) { - ClearPageUptodate(page); - page_cache_release(page); - } - - /* Set bitmap page as not up to date */ - block++; - pnum = block / blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); - if (page != NULL) { - ClearPageUptodate(page); - page_cache_release(page); - } - - return 0; -} - -/* * Update an existing group. * This function is used for online resize */ @@ -2457,7 +2653,7 @@ static int ext4_mb_init_backend(struct super_block *sb) desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { printk(KERN_ERR - "EXT4-fs: can't read descriptor %lu\n", i); + "EXT4-fs: can't read descriptor %u\n", i); goto err_freebuddy; } if (ext4_mb_add_groupinfo(sb, i, desc) != 0) @@ -2493,6 +2689,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) if (sbi->s_mb_offsets == NULL) { return -ENOMEM; } + + i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { kfree(sbi->s_mb_maxs); @@ -2551,7 +2749,8 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) ext4_mb_init_per_dev_proc(sb); ext4_mb_history_init(sb); - sbi->s_journal->j_commit_callback = release_blocks_on_commit; + if (sbi->s_journal) + sbi->s_journal->j_commit_callback = release_blocks_on_commit; printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); return 0; @@ -2652,7 +2851,7 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) list_for_each_safe(l, ltmp, &txn->t_private_list) { entry = list_entry(l, struct ext4_free_data, list); - mb_debug("gonna free %u blocks in group %lu (0x%p):", + mb_debug("gonna free %u blocks in group %u (0x%p):", entry->count, entry->group, entry); err = ext4_mb_load_buddy(sb, entry->group, &e4b); @@ -2679,8 +2878,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) + entry->start_blk + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id, - (unsigned long long) discard_block, entry->count); + trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", + sb->s_id, (unsigned long long) discard_block, + entry->count); sb_issue_discard(sb, discard_block, entry->count); kmem_cache_free(ext4_free_ext_cachep, entry); @@ -2791,7 +2991,7 @@ void exit_ext4_mballoc(void) */ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, - handle_t *handle, unsigned long reserv_blks) + handle_t *handle, unsigned int reserv_blks) { struct buffer_head *bitmap_bh = NULL; struct ext4_super_block *es; @@ -2824,7 +3024,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, if (!gdp) goto out_err; - ext4_debug("using block group %lu(%d)\n", ac->ac_b_ex.fe_group, + ext4_debug("using block group %u(%d)\n", ac->ac_b_ex.fe_group, gdp->bg_free_blocks_count); err = ext4_journal_get_write_access(handle, gdp_bh); @@ -2843,8 +3043,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, in_range(block + len - 1, ext4_inode_table(sb, gdp), EXT4_SB(sb)->s_itb_per_group)) { ext4_error(sb, __func__, - "Allocating block in system zone - block = %llu", - block); + "Allocating block %llu in system zone of %d group\n", + block, ac->ac_b_ex.fe_group); /* File system mounted not to panic on error * Fix the bitmap and repeat the block allocation * We leak some of the blocks here. @@ -2852,7 +3052,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data, ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); - err = ext4_journal_dirty_metadata(handle, bitmap_bh); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) err = -EAGAIN; goto out_err; @@ -2866,18 +3066,17 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } } #endif - mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), bitmap_bh->b_data, - ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); - spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + mb_set_bits(NULL, bitmap_bh->b_data, + ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - gdp->bg_free_blocks_count = - cpu_to_le16(ext4_free_blocks_after_init(sb, - ac->ac_b_ex.fe_group, - gdp)); + ext4_free_blks_set(sb, gdp, + ext4_free_blocks_after_init(sb, + ac->ac_b_ex.fe_group, gdp)); } - le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); + len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; + ext4_free_blks_set(sb, gdp, len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); @@ -2899,10 +3098,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, spin_unlock(sb_bgl_lock(sbi, flex_group)); } - err = ext4_journal_dirty_metadata(handle, bitmap_bh); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (err) goto out_err; - err = ext4_journal_dirty_metadata(handle, gdp_bh); + err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); out_err: sb->s_dirt = 1; @@ -3031,7 +3230,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* check we don't cross already preallocated blocks */ rcu_read_lock(); list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { - unsigned long pa_end; + ext4_lblk_t pa_end; if (pa->pa_deleted) continue; @@ -3075,7 +3274,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* XXX: extra loop to check we really don't overlap preallocations */ rcu_read_lock(); list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { - unsigned long pa_end; + ext4_lblk_t pa_end; spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0) { pa_end = pa->pa_lstart + pa->pa_len; @@ -3307,6 +3506,32 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) } /* + * the function goes through all block freed in the group + * but not yet committed and marks them used in in-core bitmap. + * buddy must be generated from this bitmap + * Need to be called with ext4 group lock (ext4_lock_group) + */ +static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, + ext4_group_t group) +{ + struct rb_node *n; + struct ext4_group_info *grp; + struct ext4_free_data *entry; + + grp = ext4_get_group_info(sb, group); + n = rb_first(&(grp->bb_free_root)); + + while (n) { + entry = rb_entry(n, struct ext4_free_data, node); + mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), + bitmap, entry->start_blk, + entry->count); + n = rb_next(n); + } + return; +} + +/* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. buddy must be generated from this bitmap * Need to be called with ext4 group lock (ext4_lock_group) @@ -3346,7 +3571,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, preallocated += len; count++; } - mb_debug("prellocated %u for group %lu\n", preallocated, group); + mb_debug("prellocated %u for group %u\n", preallocated, group); } static void ext4_mb_pa_callback(struct rcu_head *head) @@ -3363,7 +3588,7 @@ static void ext4_mb_pa_callback(struct rcu_head *head) static void ext4_mb_put_pa(struct ext4_allocation_context *ac, struct super_block *sb, struct ext4_prealloc_space *pa) { - unsigned long grp; + ext4_group_t grp; if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) return; @@ -3473,6 +3698,10 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) mb_debug("new inode pa %p: %llu/%u for %u\n", pa, pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_mark(ext4_mb_new_inode_pa, + "dev %s ino %lu pstart %llu len %u lstart %u", + sb->s_id, ac->ac_inode->i_ino, + pa->pa_pstart, pa->pa_len, pa->pa_lstart); ext4_mb_use_inode_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); @@ -3530,7 +3759,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_linear = 1; mb_debug("new group pa %p: %llu/%u for %u\n", pa, - pa->pa_pstart, pa->pa_len, pa->pa_lstart); + pa->pa_pstart, pa->pa_len, pa->pa_lstart); + trace_mark(ext4_mb_new_group_pa, "dev %s pstart %llu len %u lstart %u", + sb->s_id, pa->pa_pstart, pa->pa_len, pa->pa_lstart); ext4_mb_use_group_pa(ac, pa); atomic_add(pa->pa_free, &EXT4_SB(sb)->s_mb_preallocated); @@ -3579,16 +3810,18 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, { struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - unsigned long end; - unsigned long next; + unsigned int end; + unsigned int next; ext4_group_t group; ext4_grpblk_t bit; + unsigned long long grp_blk_start; sector_t start; int err = 0; int free = 0; BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); + grp_blk_start = pa->pa_pstart - bit; BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; @@ -3618,6 +3851,10 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, ext4_mb_store_history(ac); } + trace_mark(ext4_mb_release_inode_pa, + "dev %s ino %lu block %llu count %u", + sb->s_id, pa->pa_inode->i_ino, grp_blk_start + bit, + next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; } @@ -3626,8 +3863,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, (unsigned long) pa->pa_len); - ext4_error(sb, __func__, "free %u, pa_free %u\n", - free, pa->pa_free); + ext4_grp_locked_error(sb, group, + __func__, "free %u, pa_free %u", + free, pa->pa_free); /* * pa is already deleted so we use the value obtained * from the bitmap and continue. @@ -3650,6 +3888,8 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, if (ac) ac->ac_op = EXT4_MB_HISTORY_DISCARD; + trace_mark(ext4_mb_release_group_pa, "dev %s pstart %llu len %d", + sb->s_id, pa->pa_pstart, pa->pa_len); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); @@ -3692,7 +3932,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, int busy = 0; int free = 0; - mb_debug("discard preallocation for group %lu\n", group); + mb_debug("discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) return 0; @@ -3700,14 +3940,14 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { ext4_error(sb, __func__, "Error in reading block " - "bitmap for %lu\n", group); + "bitmap for %u", group); return 0; } err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { ext4_error(sb, __func__, "Error in loading buddy " - "information for %lu\n", group); + "information for %u", group); put_bh(bitmap_bh); return 0; } @@ -3815,6 +4055,8 @@ void ext4_discard_preallocations(struct inode *inode) } mb_debug("discard preallocation for inode %lu\n", inode->i_ino); + trace_mark(ext4_discard_preallocations, "dev %s ino %lu", sb->s_id, + inode->i_ino); INIT_LIST_HEAD(&list); @@ -3874,14 +4116,14 @@ repeat: err = ext4_mb_load_buddy(sb, group, &e4b); if (err) { ext4_error(sb, __func__, "Error in loading buddy " - "information for %lu\n", group); + "information for %u", group); continue; } bitmap_bh = ext4_read_block_bitmap(sb, group); if (bitmap_bh == NULL) { ext4_error(sb, __func__, "Error in reading block " - "bitmap for %lu\n", group); + "bitmap for %u", group); ext4_mb_release_desc(&e4b); continue; } @@ -4024,8 +4266,8 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_super_block *es = sbi->s_es; ext4_group_t group; - unsigned long len; - unsigned long goal; + unsigned int len; + ext4_fsblk_t goal; ext4_grpblk_t block; /* we can't allocate > group size */ @@ -4068,6 +4310,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, ac->ac_pa = NULL; ac->ac_bitmap_page = NULL; ac->ac_buddy_page = NULL; + ac->alloc_semp = NULL; ac->ac_lg = NULL; /* we have to define context: we'll we work with a file or @@ -4146,7 +4389,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL); if (ext4_mb_load_buddy(sb, group, &e4b)) { ext4_error(sb, __func__, "Error in loading buddy " - "information for %lu\n", group); + "information for %u", group); continue; } ext4_lock_group(sb, group); @@ -4248,6 +4491,8 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) } ext4_mb_put_pa(ac, ac->ac_sb, pa); } + if (ac->alloc_semp) + up_read(ac->alloc_semp); if (ac->ac_bitmap_page) page_cache_release(ac->ac_bitmap_page); if (ac->ac_buddy_page) @@ -4264,6 +4509,8 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) int ret; int freed = 0; + trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", + sb->s_id, needed); for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { ret = ext4_mb_discard_group_preallocations(sb, i, needed); freed += ret; @@ -4286,12 +4533,24 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block = 0; - unsigned long inquota; - unsigned long reserv_blks = 0; + unsigned int inquota; + unsigned int reserv_blks = 0; sb = ar->inode->i_sb; sbi = EXT4_SB(sb); + trace_mark(ext4_request_blocks, "dev %s flags %u len %u ino %lu " + "lblk %llu goal %llu lleft %llu lright %llu " + "pleft %llu pright %llu ", + sb->s_id, ar->flags, ar->len, + ar->inode ? ar->inode->i_ino : 0, + (unsigned long long) ar->logical, + (unsigned long long) ar->goal, + (unsigned long long) ar->lleft, + (unsigned long long) ar->lright, + (unsigned long long) ar->pleft, + (unsigned long long) ar->pright); + if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { /* * With delalloc we already reserved the blocks @@ -4313,7 +4572,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } if (ar->len == 0) { *errp = -EDQUOT; - return 0; + goto out3; } inquota = ar->len; @@ -4348,10 +4607,14 @@ repeat: ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) ext4_mb_new_preallocation(ac); } - if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); if (*errp == -EAGAIN) { + /* + * drop the reference that we took + * in ext4_mb_use_best_found + */ + ext4_mb_release_context(ac); ac->ac_b_ex.fe_group = 0; ac->ac_b_ex.fe_start = 0; ac->ac_b_ex.fe_len = 0; @@ -4382,6 +4645,26 @@ out2: out1: if (ar->len < inquota) DQUOT_FREE_BLOCK(ar->inode, inquota - ar->len); +out3: + if (!ar->len) { + if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + reserv_blks); + } + + trace_mark(ext4_allocate_blocks, + "dev %s block %llu flags %u len %u ino %lu " + "logical %llu goal %llu lleft %llu lright %llu " + "pleft %llu pright %llu ", + sb->s_id, (unsigned long long) block, + ar->flags, ar->len, ar->inode ? ar->inode->i_ino : 0, + (unsigned long long) ar->logical, + (unsigned long long) ar->goal, + (unsigned long long) ar->lleft, + (unsigned long long) ar->lright, + (unsigned long long) ar->pleft, + (unsigned long long) ar->pright); return block; } @@ -4403,27 +4686,23 @@ static int can_merge(struct ext4_free_data *entry1, static noinline_for_stack int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, - ext4_group_t group, ext4_grpblk_t block, int count) + struct ext4_free_data *new_entry) { + ext4_grpblk_t block; + struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_free_data *entry, *new_entry; struct rb_node **n = &db->bb_free_root.rb_node, *node; struct rb_node *parent = NULL, *new_node; - + BUG_ON(!ext4_handle_valid(handle)); BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); - new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); - new_entry->start_blk = block; - new_entry->group = group; - new_entry->count = count; - new_entry->t_tid = handle->h_transaction->t_tid; new_node = &new_entry->node; + block = new_entry->start_blk; - ext4_lock_group(sb, group); if (!*n) { /* first free block exent. We need to protect buddy cache from being freed, @@ -4441,10 +4720,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, else if (block >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; else { - ext4_unlock_group(sb, group); - ext4_error(sb, __func__, - "Double free of blocks %d (%d %d)\n", - block, entry->start_blk, entry->count); + ext4_grp_locked_error(sb, e4b->bd_group, __func__, + "Double free of blocks %d (%d %d)", + block, entry->start_blk, entry->count); return 0; } } @@ -4483,7 +4761,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, spin_lock(&sbi->s_md_lock); list_add(&new_entry->list, &handle->h_transaction->t_private_list); spin_unlock(&sbi->s_md_lock); - ext4_unlock_group(sb, group); return 0; } @@ -4499,7 +4776,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, struct ext4_allocation_context *ac = NULL; struct ext4_group_desc *gdp; struct ext4_super_block *es; - unsigned long overflow; + unsigned int overflow; ext4_grpblk_t bit; struct buffer_head *gd_bh; ext4_group_t block_group; @@ -4522,6 +4799,10 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, } ext4_debug("freeing block %lu\n", block); + trace_mark(ext4_free_blocks, + "dev %s block %llu count %lu metadata %d ino %lu", + sb->s_id, (unsigned long long) block, count, metadata, + inode ? inode->i_ino : 0); ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); if (ac) { @@ -4581,11 +4862,6 @@ do_more: err = ext4_journal_get_write_access(handle, gd_bh); if (err) goto error_return; - - err = ext4_mb_load_buddy(sb, block_group, &e4b); - if (err) - goto error_return; - #ifdef AGGRESSIVE_CHECK { int i; @@ -4593,13 +4869,6 @@ do_more: BUG_ON(!mb_test_bit(bit + i, bitmap_bh->b_data)); } #endif - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, - bit, count); - - /* We dirtied the bitmap block */ - BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); - err = ext4_journal_dirty_metadata(handle, bitmap_bh); - if (ac) { ac->ac_b_ex.fe_group = block_group; ac->ac_b_ex.fe_start = bit; @@ -4607,19 +4876,41 @@ do_more: ext4_mb_store_history(ac); } - if (metadata) { - /* blocks being freed are metadata. these blocks shouldn't - * be used until this transaction is committed */ - ext4_mb_free_metadata(handle, &e4b, block_group, bit, count); + err = ext4_mb_load_buddy(sb, block_group, &e4b); + if (err) + goto error_return; + if (metadata && ext4_handle_valid(handle)) { + struct ext4_free_data *new_entry; + /* + * blocks being freed are metadata. these blocks shouldn't + * be used until this transaction is committed + */ + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry->start_blk = bit; + new_entry->group = block_group; + new_entry->count = count; + new_entry->t_tid = handle->h_transaction->t_tid; + ext4_lock_group(sb, block_group); + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, + bit, count); + ext4_mb_free_metadata(handle, &e4b, new_entry); + ext4_unlock_group(sb, block_group); } else { ext4_lock_group(sb, block_group); + /* need to update group_info->bb_free and bitmap + * with group lock held. generate_buddy look at + * them with group lock_held + */ + mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, + bit, count); mb_free_blocks(inode, &e4b, bit, count); ext4_mb_return_to_preallocation(inode, &e4b, block, count); ext4_unlock_group(sb, block_group); } spin_lock(sb_bgl_lock(sbi, block_group)); - le16_add_cpu(&gdp->bg_free_blocks_count, count); + ret = ext4_free_blks_count(sb, gdp) + count; + ext4_free_blks_set(sb, gdp, ret); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_add(&sbi->s_freeblocks_counter, count); @@ -4635,9 +4926,13 @@ do_more: *freed += count; + /* We dirtied the bitmap block */ + BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); + err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); + /* And the group descriptor block */ BUFFER_TRACE(gd_bh, "dirtied group descriptor block"); - ret = ext4_journal_dirty_metadata(handle, gd_bh); + ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); if (!err) err = ret; diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index b5dff1fff1e..10a2921baf1 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -20,6 +20,7 @@ #include <linux/version.h> #include <linux/blkdev.h> #include <linux/marker.h> +#include <linux/mutex.h> #include "ext4_jbd2.h" #include "ext4.h" #include "group.h" @@ -98,9 +99,6 @@ */ #define MB_DEFAULT_GROUP_PREALLOC 512 -static struct kmem_cache *ext4_pspace_cachep; -static struct kmem_cache *ext4_ac_cachep; -static struct kmem_cache *ext4_free_ext_cachep; struct ext4_free_data { /* this links the free block information from group_info */ @@ -120,26 +118,6 @@ struct ext4_free_data { tid_t t_tid; }; -struct ext4_group_info { - unsigned long bb_state; - struct rb_root bb_free_root; - unsigned short bb_first_free; - unsigned short bb_free; - unsigned short bb_fragments; - struct list_head bb_prealloc_list; -#ifdef DOUBLE_CHECK - void *bb_bitmap; -#endif - unsigned short bb_counters[]; -}; - -#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 -#define EXT4_GROUP_INFO_LOCKED_BIT 1 - -#define EXT4_MB_GRP_NEED_INIT(grp) \ - (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) - - struct ext4_prealloc_space { struct list_head pa_inode_list; struct list_head pa_group_list; @@ -217,6 +195,11 @@ struct ext4_allocation_context { __u8 ac_op; /* operation, for history only */ struct page *ac_bitmap_page; struct page *ac_buddy_page; + /* + * pointer to the held semaphore upon successful + * block allocation + */ + struct rw_semaphore *alloc_semp; struct ext4_prealloc_space *ac_pa; struct ext4_locality_group *ac_lg; }; @@ -250,6 +233,7 @@ struct ext4_buddy { struct super_block *bd_sb; __u16 bd_blkbits; ext4_group_t bd_group; + struct rw_semaphore *alloc_semp; }; #define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) #define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) @@ -259,51 +243,12 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) { return; } -#else -static void ext4_mb_store_history(struct ext4_allocation_context *ac); #endif #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); - -static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, - ext4_group_t group); -static void ext4_mb_return_to_preallocation(struct inode *inode, - struct ext4_buddy *e4b, sector_t block, - int count); -static void ext4_mb_put_pa(struct ext4_allocation_context *, - struct super_block *, struct ext4_prealloc_space *pa); -static int ext4_mb_init_per_dev_proc(struct super_block *sb); -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); -static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); - - -static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); -} - -static inline void ext4_unlock_group(struct super_block *sb, - ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); -} - -static inline int ext4_is_group_locked(struct super_block *sb, - ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, - &(grinfo->bb_state)); -} - -static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, +static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, struct ext4_free_extent *fex) { ext4_fsblk_t block; diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index f2a9cf498ec..734abca25e3 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -59,7 +59,8 @@ static int finish_range(handle_t *handle, struct inode *inode, /* * Make sure the credit we accumalated is not really high */ - if (needed && handle->h_buffer_credits >= EXT4_RESERVE_TRANS_BLOCKS) { + if (needed && ext4_handle_has_enough_credits(handle, + EXT4_RESERVE_TRANS_BLOCKS)) { retval = ext4_journal_restart(handle, needed); if (retval) goto err_out; @@ -229,7 +230,7 @@ static int extend_credit_for_blkdel(handle_t *handle, struct inode *inode) { int retval = 0, needed; - if (handle->h_buffer_credits > EXT4_RESERVE_TRANS_BLOCKS) + if (ext4_handle_has_enough_credits(handle, EXT4_RESERVE_TRANS_BLOCKS+1)) return 0; /* * We are freeing a blocks. During this we touch @@ -458,13 +459,13 @@ int ext4_ext_migrate(struct inode *inode) struct list_blocks_struct lb; unsigned long max_entries; - if (!test_opt(inode->i_sb, EXTENTS)) - /* - * if mounted with noextents we don't allow the migrate - */ - return -EINVAL; - - if ((EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + /* + * If the filesystem does not support extents, or the inode + * already is extent-based, error out. + */ + if (!EXT4_HAS_INCOMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_INCOMPAT_EXTENTS) || + (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) return -EINVAL; if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index da98a9012fa..fec0b4c2f5f 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -74,10 +74,6 @@ static struct buffer_head *ext4_append(handle_t *handle, #define assert(test) J_ASSERT(test) #endif -#ifndef swap -#define swap(x, y) do { typeof(x) z = x; x = y; y = z; } while (0) -#endif - #ifdef DX_DEBUG #define dxtrace(command) command #else @@ -372,6 +368,8 @@ dx_probe(const struct qstr *d_name, struct inode *dir, goto fail; } hinfo->hash_version = root->info.hash_version; + if (hinfo->hash_version <= DX_HASH_TEA) + hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; if (d_name) ext4fs_dirhash(d_name->name, d_name->len, hinfo); @@ -641,6 +639,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, dir = dir_file->f_path.dentry->d_inode; if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += + EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; count = htree_dirblock_to_tree(dir_file, dir, 0, &hinfo, start_hash, start_minor_hash); @@ -806,7 +807,7 @@ static inline int ext4_match (int len, const char * const name, static inline int search_dirblock(struct buffer_head *bh, struct inode *dir, const struct qstr *d_name, - unsigned long offset, + unsigned int offset, struct ext4_dir_entry_2 ** res_dir) { struct ext4_dir_entry_2 * de; @@ -1043,11 +1044,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru bh = ext4_find_entry(dir, &dentry->d_name, &de); inode = NULL; if (bh) { - unsigned long ino = le32_to_cpu(de->inode); + __u32 ino = le32_to_cpu(de->inode); brelse(bh); if (!ext4_valid_inum(dir->i_sb, ino)) { ext4_error(dir->i_sb, "ext4_lookup", - "bad inode number: %lu", ino); + "bad inode number: %u", ino); return ERR_PTR(-EIO); } inode = ext4_iget(dir->i_sb, ino); @@ -1060,7 +1061,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru struct dentry *ext4_get_parent(struct dentry *child) { - unsigned long ino; + __u32 ino; struct inode *inode; static const struct qstr dotdot = { .name = "..", @@ -1078,7 +1079,7 @@ struct dentry *ext4_get_parent(struct dentry *child) if (!ext4_valid_inum(child->d_inode->i_sb, ino)) { ext4_error(child->d_inode->i_sb, "ext4_get_parent", - "bad inode number: %lu", ino); + "bad inode number: %u", ino); return ERR_PTR(-EIO); } @@ -1166,9 +1167,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, u32 hash2; struct dx_map_entry *map; char *data1 = (*bh)->b_data, *data2; - unsigned split, move, size, i; + unsigned split, move, size; struct ext4_dir_entry_2 *de = NULL, *de2; - int err = 0; + int err = 0, i; bh2 = ext4_append (handle, dir, &newblock, &err); if (!(bh2)) { @@ -1228,10 +1229,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, de = de2; } dx_insert_block(frame, hash2 + continued, newblock); - err = ext4_journal_dirty_metadata(handle, bh2); + err = ext4_handle_dirty_metadata(handle, dir, bh2); if (err) goto journal_error; - err = ext4_journal_dirty_metadata(handle, frame->bh); + err = ext4_handle_dirty_metadata(handle, dir, frame->bh); if (err) goto journal_error; brelse(bh2); @@ -1266,7 +1267,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *dir = dentry->d_parent->d_inode; const char *name = dentry->d_name.name; int namelen = dentry->d_name.len; - unsigned long offset = 0; + unsigned int offset = 0; unsigned short reclen; int nlen, rlen, err; char *top; @@ -1335,8 +1336,8 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, ext4_update_dx_flag(dir); dir->i_version++; ext4_mark_inode_dirty(handle, dir); - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - err = ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + err = ext4_handle_dirty_metadata(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); brelse(bh); @@ -1408,6 +1409,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, /* Initialize as for dx_probe */ hinfo.hash_version = root->info.hash_version; + if (hinfo.hash_version <= DX_HASH_TEA) + hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; ext4fs_dirhash(name, namelen, &hinfo); frame = frames; @@ -1437,7 +1440,6 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode) { struct inode *dir = dentry->d_parent->d_inode; - unsigned long offset; struct buffer_head *bh; struct ext4_dir_entry_2 *de; struct super_block *sb; @@ -1459,7 +1461,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, ext4_mark_inode_dirty(handle, dir); } blocks = dir->i_size >> sb->s_blocksize_bits; - for (block = 0, offset = 0; block < blocks; block++) { + for (block = 0; block < blocks; block++) { bh = ext4_bread(handle, dir, block, 0, &retval); if(!bh) return retval; @@ -1574,7 +1576,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, dxtrace(dx_show_index("node", frames[1].entries)); dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); - err = ext4_journal_dirty_metadata(handle, bh2); + err = ext4_handle_dirty_metadata(handle, inode, bh2); if (err) goto journal_error; brelse (bh2); @@ -1600,7 +1602,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; } - ext4_journal_dirty_metadata(handle, frames[0].bh); + ext4_handle_dirty_metadata(handle, inode, frames[0].bh); } de = do_split(handle, dir, &bh, frame, &hinfo, &err); if (!de) @@ -1646,8 +1648,8 @@ static int ext4_delete_entry(handle_t *handle, else de->inode = 0; dir->i_version++; - BUFFER_TRACE(bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, bh); + BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, dir, bh); return 0; } i += ext4_rec_len_from_disk(de->rec_len); @@ -1725,7 +1727,7 @@ retry: return PTR_ERR(handle); if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode = ext4_new_inode (handle, dir, mode); err = PTR_ERR(inode); @@ -1759,7 +1761,7 @@ retry: return PTR_ERR(handle); if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, mode); err = PTR_ERR(inode); @@ -1795,7 +1797,7 @@ retry: return PTR_ERR(handle); if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, S_IFDIR | mode); err = PTR_ERR(inode); @@ -1824,8 +1826,8 @@ retry: strcpy(de->name, ".."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); inode->i_nlink = 2; - BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, dir_block); + BUFFER_TRACE(dir_block, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, dir, dir_block); brelse(dir_block); ext4_mark_inode_dirty(handle, inode); err = ext4_add_entry(handle, dentry, inode); @@ -1854,7 +1856,7 @@ out_stop: */ static int empty_dir(struct inode *inode) { - unsigned long offset; + unsigned int offset; struct buffer_head *bh; struct ext4_dir_entry_2 *de, *de1; struct super_block *sb; @@ -1899,7 +1901,7 @@ static int empty_dir(struct inode *inode) if (err) ext4_error(sb, __func__, "error %d reading directory" - " #%lu offset %lu", + " #%lu offset %u", err, inode->i_ino, offset); offset += sb->s_blocksize; continue; @@ -1937,6 +1939,9 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) struct ext4_iloc iloc; int err = 0, rc; + if (!ext4_handle_valid(handle)) + return 0; + lock_super(sb); if (!list_empty(&EXT4_I(inode)->i_orphan)) goto out_unlock; @@ -1965,7 +1970,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) /* Insert this inode at the head of the on-disk orphan list... */ NEXT_ORPHAN(inode) = le32_to_cpu(EXT4_SB(sb)->s_es->s_last_orphan); EXT4_SB(sb)->s_es->s_last_orphan = cpu_to_le32(inode->i_ino); - err = ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); + err = ext4_handle_dirty_metadata(handle, inode, EXT4_SB(sb)->s_sbh); rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) err = rc; @@ -1999,10 +2004,13 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) struct list_head *prev; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi; - unsigned long ino_next; + __u32 ino_next; struct ext4_iloc iloc; int err = 0; + if (!ext4_handle_valid(handle)) + return 0; + lock_super(inode->i_sb); if (list_empty(&ei->i_orphan)) { unlock_super(inode->i_sb); @@ -2021,7 +2029,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) * transaction handle with which to update the orphan list on * disk, but we still need to remove the inode from the linked * list in memory. */ - if (!handle) + if (sbi->s_journal && !handle) goto out; err = ext4_reserve_inode_write(handle, inode, &iloc); @@ -2029,19 +2037,19 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) goto out_err; if (prev == &sbi->s_orphan) { - jbd_debug(4, "superblock will point to %lu\n", ino_next); + jbd_debug(4, "superblock will point to %u\n", ino_next); BUFFER_TRACE(sbi->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, sbi->s_sbh); if (err) goto out_brelse; sbi->s_es->s_last_orphan = cpu_to_le32(ino_next); - err = ext4_journal_dirty_metadata(handle, sbi->s_sbh); + err = ext4_handle_dirty_metadata(handle, inode, sbi->s_sbh); } else { struct ext4_iloc iloc2; struct inode *i_prev = &list_entry(prev, struct ext4_inode_info, i_orphan)->vfs_inode; - jbd_debug(4, "orphan inode %lu will point to %lu\n", + jbd_debug(4, "orphan inode %lu will point to %u\n", i_prev->i_ino, ino_next); err = ext4_reserve_inode_write(handle, i_prev, &iloc2); if (err) @@ -2086,7 +2094,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) goto end_rmdir; if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode = dentry->d_inode; @@ -2140,7 +2148,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) return PTR_ERR(handle); if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); retval = -ENOENT; bh = ext4_find_entry(dir, &dentry->d_name, &de); @@ -2197,7 +2205,7 @@ retry: return PTR_ERR(handle); if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO); err = PTR_ERR(inode); @@ -2212,8 +2220,7 @@ retry: * We have a transaction open. All is sweetness. It also sets * i_size in generic_commit_write(). */ - err = __page_symlink(inode, symname, l, - mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS); + err = __page_symlink(inode, symname, l, 1); if (err) { clear_nlink(inode); unlock_new_inode(inode); @@ -2261,7 +2268,7 @@ retry: return PTR_ERR(handle); if (IS_DIRSYNC(dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); inode->i_ctime = ext4_current_time(inode); ext4_inc_count(handle, inode); @@ -2310,7 +2317,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, return PTR_ERR(handle); if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) - handle->h_sync = 1; + ext4_handle_sync(handle); old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); /* @@ -2364,8 +2371,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, new_dir->i_ctime = new_dir->i_mtime = ext4_current_time(new_dir); ext4_mark_inode_dirty(handle, new_dir); - BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, new_bh); + BUFFER_TRACE(new_bh, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, new_dir, new_bh); brelse(new_bh); new_bh = NULL; } @@ -2415,8 +2422,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, BUFFER_TRACE(dir_bh, "get_write_access"); ext4_journal_get_write_access(handle, dir_bh); PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino); - BUFFER_TRACE(dir_bh, "call ext4_journal_dirty_metadata"); - ext4_journal_dirty_metadata(handle, dir_bh); + BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata"); + ext4_handle_dirty_metadata(handle, old_dir, dir_bh); ext4_dec_count(handle, old_dir); if (new_inode) { /* checked empty_dir above, can't have another parent, diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b6ec1843a01..c328be5d688 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -50,7 +50,7 @@ static int verify_group_input(struct super_block *sb, ext4_get_group_no_and_offset(sb, start, NULL, &offset); if (group != sbi->s_groups_count) ext4_warning(sb, __func__, - "Cannot add at group %u (only %lu groups)", + "Cannot add at group %u (only %u groups)", input->group, sbi->s_groups_count); else if (offset != 0) ext4_warning(sb, __func__, "Last group not full"); @@ -149,7 +149,7 @@ static int extend_or_restart_transaction(handle_t *handle, int thresh, { int err; - if (handle->h_buffer_credits >= thresh) + if (ext4_handle_has_enough_credits(handle, thresh)) return 0; err = ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA); @@ -232,7 +232,7 @@ static int setup_new_group_blocks(struct super_block *sb, memcpy(gdb->b_data, sbi->s_group_desc[i]->b_data, gdb->b_size); set_buffer_uptodate(gdb); unlock_buffer(gdb); - ext4_journal_dirty_metadata(handle, gdb); + ext4_handle_dirty_metadata(handle, NULL, gdb); ext4_set_bit(bit, bh->b_data); brelse(gdb); } @@ -251,7 +251,7 @@ static int setup_new_group_blocks(struct super_block *sb, err = PTR_ERR(bh); goto exit_bh; } - ext4_journal_dirty_metadata(handle, gdb); + ext4_handle_dirty_metadata(handle, NULL, gdb); ext4_set_bit(bit, bh->b_data); brelse(gdb); } @@ -276,7 +276,7 @@ static int setup_new_group_blocks(struct super_block *sb, err = PTR_ERR(it); goto exit_bh; } - ext4_journal_dirty_metadata(handle, it); + ext4_handle_dirty_metadata(handle, NULL, it); brelse(it); ext4_set_bit(bit, bh->b_data); } @@ -284,11 +284,9 @@ static int setup_new_group_blocks(struct super_block *sb, if ((err = extend_or_restart_transaction(handle, 2, bh))) goto exit_bh; - mark_bitmap_end(input->blocks_count, EXT4_BLOCKS_PER_GROUP(sb), - bh->b_data); - ext4_journal_dirty_metadata(handle, bh); + mark_bitmap_end(input->blocks_count, sb->s_blocksize * 8, bh->b_data); + ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); - /* Mark unused entries in inode bitmap used */ ext4_debug("clear inode bitmap %#04llx (+%llu)\n", input->inode_bitmap, input->inode_bitmap - start); @@ -297,9 +295,9 @@ static int setup_new_group_blocks(struct super_block *sb, goto exit_journal; } - mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), EXT4_BLOCKS_PER_GROUP(sb), + mark_bitmap_end(EXT4_INODES_PER_GROUP(sb), sb->s_blocksize * 8, bh->b_data); - ext4_journal_dirty_metadata(handle, bh); + ext4_handle_dirty_metadata(handle, NULL, bh); exit_bh: brelse(bh); @@ -486,12 +484,12 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, * reserved inode, and will become GDT blocks (primary and backup). */ data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)] = 0; - ext4_journal_dirty_metadata(handle, dind); + ext4_handle_dirty_metadata(handle, NULL, dind); brelse(dind); inode->i_blocks -= (gdbackups + 1) * sb->s_blocksize >> 9; ext4_mark_iloc_dirty(handle, inode, &iloc); memset((*primary)->b_data, 0, sb->s_blocksize); - ext4_journal_dirty_metadata(handle, *primary); + ext4_handle_dirty_metadata(handle, NULL, *primary); o_group_desc = EXT4_SB(sb)->s_group_desc; memcpy(n_group_desc, o_group_desc, @@ -502,7 +500,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, kfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); - ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); + ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); return 0; @@ -618,7 +616,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, primary[i]->b_blocknr, gdbackups, blk + primary[i]->b_blocknr); */ data[gdbackups] = cpu_to_le32(blk + primary[i]->b_blocknr); - err2 = ext4_journal_dirty_metadata(handle, primary[i]); + err2 = ext4_handle_dirty_metadata(handle, NULL, primary[i]); if (!err) err = err2; } @@ -676,7 +674,8 @@ static void update_backups(struct super_block *sb, struct buffer_head *bh; /* Out of journal space, and can't get more - abort - so sad */ - if (handle->h_buffer_credits == 0 && + if (ext4_handle_valid(handle) && + handle->h_buffer_credits == 0 && ext4_journal_extend(handle, EXT4_MAX_TRANS_DATA) && (err = ext4_journal_restart(handle, EXT4_MAX_TRANS_DATA))) break; @@ -696,7 +695,7 @@ static void update_backups(struct super_block *sb, memset(bh->b_data + size, 0, rest); set_buffer_uptodate(bh); unlock_buffer(bh); - ext4_journal_dirty_metadata(handle, bh); + ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); } if ((err2 = ext4_journal_stop(handle)) && !err) @@ -715,7 +714,7 @@ static void update_backups(struct super_block *sb, exit_err: if (err) { ext4_warning(sb, __func__, - "can't update backup for group %lu (err %d), " + "can't update backup for group %u (err %d), " "forcing fsck on next reboot", group, err); sbi->s_mount_state &= ~EXT4_VALID_FS; sbi->s_es->s_state &= cpu_to_le16(~EXT4_VALID_FS); @@ -747,6 +746,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) struct inode *inode = NULL; handle_t *handle; int gdb_off, gdb_num; + int num_grp_locked = 0; int err, err2; gdb_num = input->group / EXT4_DESC_PER_BLOCK(sb); @@ -761,13 +761,13 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (ext4_blocks_count(es) + input->blocks_count < ext4_blocks_count(es)) { - ext4_warning(sb, __func__, "blocks_count overflow\n"); + ext4_warning(sb, __func__, "blocks_count overflow"); return -EINVAL; } if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < le32_to_cpu(es->s_inodes_count)) { - ext4_warning(sb, __func__, "inodes_count overflow\n"); + ext4_warning(sb, __func__, "inodes_count overflow"); return -EINVAL; } @@ -787,6 +787,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) } } + if ((err = verify_group_input(sb, input))) goto exit_put; @@ -855,6 +856,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) * using the new disk blocks. */ + num_grp_locked = ext4_mb_get_buddy_cache_lock(sb, input->group); /* Update group descriptor block for new group */ gdp = (struct ext4_group_desc *)((char *)primary->b_data + gdb_off * EXT4_DESC_SIZE(sb)); @@ -862,17 +864,20 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) ext4_block_bitmap_set(sb, gdp, input->block_bitmap); /* LV FIXME */ ext4_inode_bitmap_set(sb, gdp, input->inode_bitmap); /* LV FIXME */ ext4_inode_table_set(sb, gdp, input->inode_table); /* LV FIXME */ - gdp->bg_free_blocks_count = cpu_to_le16(input->free_blocks_count); - gdp->bg_free_inodes_count = cpu_to_le16(EXT4_INODES_PER_GROUP(sb)); + ext4_free_blks_set(sb, gdp, input->free_blocks_count); + ext4_free_inodes_set(sb, gdp, EXT4_INODES_PER_GROUP(sb)); + gdp->bg_flags |= cpu_to_le16(EXT4_BG_INODE_ZEROED); gdp->bg_checksum = ext4_group_desc_csum(sbi, input->group, gdp); /* * We can allocate memory for mb_alloc based on the new group * descriptor */ - err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); - if (err) + err = ext4_mb_add_groupinfo(sb, input->group, gdp); + if (err) { + ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked); goto exit_journal; + } /* * Make the new blocks and inodes valid next. We do this before @@ -914,8 +919,9 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) /* Update the global fs size fields */ sbi->s_groups_count++; + ext4_mb_put_buddy_cache_lock(sb, input->group, num_grp_locked); - ext4_journal_dirty_metadata(handle, primary); + ext4_handle_dirty_metadata(handle, NULL, primary); /* Update the reserved block counts only once the new group is * active. */ @@ -937,7 +943,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) EXT4_INODES_PER_GROUP(sb); } - ext4_journal_dirty_metadata(handle, sbi->s_sbh); + ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); sb->s_dirt = 1; exit_journal: @@ -975,9 +981,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, struct buffer_head *bh; handle_t *handle; int err; - unsigned long freed_blocks; ext4_group_t group; - struct ext4_group_info *grp; /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after @@ -997,8 +1001,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, " too large to resize to %llu blocks safely\n", sb->s_id, n_blocks_count); if (sizeof(sector_t) < 8) - ext4_warning(sb, __func__, - "CONFIG_LBD not enabled\n"); + ext4_warning(sb, __func__, "CONFIG_LBD not enabled"); return -EINVAL; } @@ -1071,62 +1074,18 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, goto exit_put; } ext4_blocks_count_set(es, o_blocks_count + add); - ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); + ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); sb->s_dirt = 1; unlock_super(sb); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); - ext4_free_blocks_sb(handle, sb, o_blocks_count, add, &freed_blocks); + /* We add the blocks to the bitmap and set the group need init bit */ + ext4_add_groupblocks(handle, sb, o_blocks_count, add); ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); if ((err = ext4_journal_stop(handle))) goto exit_put; - /* - * Mark mballoc pages as not up to date so that they will be updated - * next time they are loaded by ext4_mb_load_buddy. - * - * XXX Bad, Bad, BAD!!! We should not be overloading the - * Uptodate flag, particularly on thte bitmap bh, as way of - * hinting to ext4_mb_load_buddy() that it needs to be - * overloaded. A user could take a LVM snapshot, then do an - * on-line fsck, and clear the uptodate flag, and this would - * not be a bug in userspace, but a bug in the kernel. FIXME!!! - */ - { - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct inode *inode = sbi->s_buddy_cache; - int blocks_per_page; - int block; - int pnum; - struct page *page; - - /* Set buddy page as not up to date */ - blocks_per_page = PAGE_CACHE_SIZE / sb->s_blocksize; - block = group * 2; - pnum = block / blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); - if (page != NULL) { - ClearPageUptodate(page); - page_cache_release(page); - } - - /* Set bitmap page as not up to date */ - block++; - pnum = block / blocks_per_page; - page = find_get_page(inode->i_mapping, pnum); - if (page != NULL) { - ClearPageUptodate(page); - page_cache_release(page); - } - - /* Get the info on the last group */ - grp = ext4_get_group_info(sb, group); - - /* Update free blocks in group info */ - ext4_mb_update_group_info(grp, add); - } - if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extended group to %llu blocks\n", ext4_blocks_count(es)); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 04158ad74db..8f7e0be8ab1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -51,8 +51,6 @@ struct proc_dir_entry *ext4_proc_root; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); -static int ext4_create_journal(struct super_block *, struct ext4_super_block *, - unsigned int); static void ext4_commit_super(struct super_block *sb, struct ext4_super_block *es, int sync); static void ext4_mark_recovery_complete(struct super_block *sb, @@ -93,6 +91,38 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb, (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0); } +__u32 ext4_free_blks_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_free_blocks_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0); +} + +__u32 ext4_free_inodes_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_free_inodes_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0); +} + +__u32 ext4_used_dirs_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_used_dirs_count_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0); +} + +__u32 ext4_itable_unused_count(struct super_block *sb, + struct ext4_group_desc *bg) +{ + return le16_to_cpu(bg->bg_itable_unused_lo) | + (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ? + (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0); +} + void ext4_block_bitmap_set(struct super_block *sb, struct ext4_group_desc *bg, ext4_fsblk_t blk) { @@ -117,6 +147,38 @@ void ext4_inode_table_set(struct super_block *sb, bg->bg_inode_table_hi = cpu_to_le32(blk >> 32); } +void ext4_free_blks_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_free_blocks_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_free_blocks_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_free_inodes_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_free_inodes_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_free_inodes_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_used_dirs_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_used_dirs_count_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_used_dirs_count_hi = cpu_to_le16(count >> 16); +} + +void ext4_itable_unused_set(struct super_block *sb, + struct ext4_group_desc *bg, __u32 count) +{ + bg->bg_itable_unused_lo = cpu_to_le16((__u16)count); + if (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT) + bg->bg_itable_unused_hi = cpu_to_le16(count >> 16); +} + /* * Wrappers for jbd2_journal_start/end. * @@ -136,13 +198,19 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) * backs (eg. EIO in the commit thread), then we still need to * take the FS itself readonly cleanly. */ journal = EXT4_SB(sb)->s_journal; - if (is_journal_aborted(journal)) { - ext4_abort(sb, __func__, - "Detected aborted journal"); - return ERR_PTR(-EROFS); + if (journal) { + if (is_journal_aborted(journal)) { + ext4_abort(sb, __func__, + "Detected aborted journal"); + return ERR_PTR(-EROFS); + } + return jbd2_journal_start(journal, nblocks); } - - return jbd2_journal_start(journal, nblocks); + /* + * We're not journaling, return the appropriate indication. + */ + current->journal_info = EXT4_NOJOURNAL_HANDLE; + return current->journal_info; } /* @@ -157,6 +225,14 @@ int __ext4_journal_stop(const char *where, handle_t *handle) int err; int rc; + if (!ext4_handle_valid(handle)) { + /* + * Do this here since we don't call jbd2_journal_stop() in + * no-journal mode. + */ + current->journal_info = NULL; + return 0; + } sb = handle->h_transaction->t_journal->j_private; err = handle->h_err; rc = jbd2_journal_stop(handle); @@ -174,6 +250,8 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn, char nbuf[16]; const char *errstr = ext4_decode_error(NULL, err, nbuf); + BUG_ON(!ext4_handle_valid(handle)); + if (bh) BUFFER_TRACE(bh, "abort"); @@ -350,6 +428,44 @@ void ext4_warning(struct super_block *sb, const char *function, va_end(args); } +void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp, + const char *function, const char *fmt, ...) +__releases(bitlock) +__acquires(bitlock) +{ + va_list args; + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + va_start(args, fmt); + printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); + vprintk(fmt, args); + printk("\n"); + va_end(args); + + if (test_opt(sb, ERRORS_CONT)) { + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + ext4_commit_super(sb, es, 0); + return; + } + ext4_unlock_group(sb, grp); + ext4_handle_error(sb); + /* + * We only get here in the ERRORS_RO case; relocking the group + * may be dangerous, but nothing bad will happen since the + * filesystem will have already been marked read/only and the + * journal has been aborted. We return 1 as a hint to callers + * who might what to use the return value from + * ext4_grp_locked_error() to distinguish beween the + * ERRORS_CONT and ERRORS_RO case, and perhaps return more + * aggressively from the ext4 function in question, with a + * more appropriate error code. + */ + ext4_lock_group(sb, grp); + return; +} + + void ext4_update_dynamic_rev(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; @@ -389,7 +505,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev) return bdev; fail: - printk(KERN_ERR "EXT4: failed to open journal device %s: %ld\n", + printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n", __bdevname(dev, b), PTR_ERR(bdev)); return NULL; } @@ -448,11 +564,13 @@ static void ext4_put_super(struct super_block *sb) ext4_mb_release(sb); ext4_ext_release(sb); ext4_xattr_put_super(sb); - err = jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; - if (err < 0) - ext4_abort(sb, __func__, "Couldn't clean up the journal"); - + if (sbi->s_journal) { + err = jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + if (err < 0) + ext4_abort(sb, __func__, + "Couldn't clean up the journal"); + } if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); @@ -522,6 +640,11 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); + /* + * Note: We can be called before EXT4_SB(sb)->s_journal is set, + * therefore it can be null here. Don't check it, just initialize + * jinode. + */ jbd2_journal_init_jbd_inode(&ei->jinode, &ei->vfs_inode); ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; @@ -588,7 +711,8 @@ static void ext4_clear_inode(struct inode *inode) } #endif ext4_discard_preallocations(inode); - jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, + if (EXT4_JOURNAL(inode)) + jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, &EXT4_I(inode)->jinode); } @@ -681,10 +805,19 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) #endif if (!test_opt(sb, RESERVATION)) seq_puts(seq, ",noreservation"); - if (sbi->s_commit_interval) { + if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { seq_printf(seq, ",commit=%u", (unsigned) (sbi->s_commit_interval / HZ)); } + if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) { + seq_printf(seq, ",min_batch_time=%u", + (unsigned) sbi->s_min_batch_time); + } + if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) { + seq_printf(seq, ",max_batch_time=%u", + (unsigned) sbi->s_min_batch_time); + } + /* * We're changing the default of barrier mount option, so * let's always display its mount state so it's clear what its @@ -696,8 +829,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",journal_async_commit"); if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); - if (!test_opt(sb, EXTENTS)) - seq_puts(seq, ",noextents"); if (test_opt(sb, I_VERSION)) seq_puts(seq, ",i_version"); if (!test_opt(sb, DELALLOC)) @@ -772,6 +903,25 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, ext4_nfs_get_inode); } +/* + * Try to release metadata pages (indirect blocks, directories) which are + * mapped via the block device. Since these pages could have journal heads + * which would prevent try_to_free_buffers() from freeing them, we must use + * jbd2 layer's try_to_free_buffers() function to release them. + */ +static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait) +{ + journal_t *journal = EXT4_SB(sb)->s_journal; + + WARN_ON(PageChecked(page)); + if (!page_has_buffers(page)) + return 0; + if (journal) + return jbd2_journal_try_to_free_buffers(journal, page, + wait & ~__GFP_WAIT); + return try_to_free_buffers(page); +} + #ifdef CONFIG_QUOTA #define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) @@ -803,7 +953,9 @@ static struct dquot_operations ext4_quota_operations = { .acquire_dquot = ext4_acquire_dquot, .release_dquot = ext4_release_dquot, .mark_dirty = ext4_mark_dquot_dirty, - .write_info = ext4_write_info + .write_info = ext4_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, }; static struct quotactl_ops ext4_qctl_operations = { @@ -836,6 +988,7 @@ static const struct super_operations ext4_sops = { .quota_read = ext4_quota_read, .quota_write = ext4_quota_write, #endif + .bdev_try_to_free_page = bdev_try_to_free_page, }; static const struct export_operations ext4_export_ops = { @@ -850,16 +1003,17 @@ enum { Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, - Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, + Opt_commit, Opt_min_batch_time, Opt_max_batch_time, + Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, - Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, + Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, - Opt_inode_readahead_blks + Opt_inode_readahead_blks, Opt_journal_ioprio }; static const match_table_t tokens = { @@ -889,8 +1043,9 @@ static const match_table_t tokens = { {Opt_nobh, "nobh"}, {Opt_bh, "bh"}, {Opt_commit, "commit=%u"}, + {Opt_min_batch_time, "min_batch_time=%u"}, + {Opt_max_batch_time, "max_batch_time=%u"}, {Opt_journal_update, "journal=update"}, - {Opt_journal_inum, "journal=%u"}, {Opt_journal_dev, "journal_dev=%u"}, {Opt_journal_checksum, "journal_checksum"}, {Opt_journal_async_commit, "journal_async_commit"}, @@ -911,14 +1066,13 @@ static const match_table_t tokens = { {Opt_quota, "quota"}, {Opt_usrquota, "usrquota"}, {Opt_barrier, "barrier=%u"}, - {Opt_extents, "extents"}, - {Opt_noextents, "noextents"}, {Opt_i_version, "i_version"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, + {Opt_journal_ioprio, "journal_ioprio=%u"}, {Opt_err, NULL}, }; @@ -943,8 +1097,11 @@ static ext4_fsblk_t get_sb_block(void **data) return sb_block; } +#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + static int parse_options(char *options, struct super_block *sb, - unsigned int *inum, unsigned long *journal_devnum, + unsigned long *journal_devnum, + unsigned int *journal_ioprio, ext4_fsblk_t *n_blocks_count, int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -956,7 +1113,6 @@ static int parse_options(char *options, struct super_block *sb, int qtype, qfmt; char *qname; #endif - ext4_fsblk_t last_block; if (!options) return 1; @@ -1068,16 +1224,6 @@ static int parse_options(char *options, struct super_block *sb, } set_opt(sbi->s_mount_opt, UPDATE_JOURNAL); break; - case Opt_journal_inum: - if (is_remount) { - printk(KERN_ERR "EXT4-fs: cannot specify " - "journal on remount\n"); - return 0; - } - if (match_int(&args[0], &option)) - return 0; - *inum = option; - break; case Opt_journal_dev: if (is_remount) { printk(KERN_ERR "EXT4-fs: cannot specify " @@ -1107,6 +1253,22 @@ static int parse_options(char *options, struct super_block *sb, option = JBD2_DEFAULT_MAX_COMMIT_AGE; sbi->s_commit_interval = HZ * option; break; + case Opt_max_batch_time: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + if (option == 0) + option = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_max_batch_time = option; + break; + case Opt_min_batch_time: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + sbi->s_min_batch_time = option; + break; case Opt_data_journal: data_opt = EXT4_MOUNT_JOURNAL_DATA; goto datacheck; @@ -1142,8 +1304,7 @@ static int parse_options(char *options, struct super_block *sb, case Opt_grpjquota: qtype = GRPQUOTA; set_qf_name: - if ((sb_any_quota_enabled(sb) || - sb_any_quota_suspended(sb)) && + if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { printk(KERN_ERR "EXT4-fs: Cannot change journaled " @@ -1182,8 +1343,7 @@ set_qf_name: case Opt_offgrpjquota: qtype = GRPQUOTA; clear_qf_name: - if ((sb_any_quota_enabled(sb) || - sb_any_quota_suspended(sb)) && + if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { printk(KERN_ERR "EXT4-fs: Cannot change " "journaled quota options when " @@ -1202,8 +1362,7 @@ clear_qf_name: case Opt_jqfmt_vfsv0: qfmt = QFMT_VFS_V0; set_qf_format: - if ((sb_any_quota_enabled(sb) || - sb_any_quota_suspended(sb)) && + if (sb_any_quota_loaded(sb) && sbi->s_jquota_fmt != qfmt) { printk(KERN_ERR "EXT4-fs: Cannot change " "journaled quota options when " @@ -1222,7 +1381,7 @@ set_qf_format: set_opt(sbi->s_mount_opt, GRPQUOTA); break; case Opt_noquota: - if (sb_any_quota_enabled(sb)) { + if (sb_any_quota_loaded(sb)) { printk(KERN_ERR "EXT4-fs: Cannot change quota " "options when quota turned on.\n"); return 0; @@ -1280,33 +1439,6 @@ set_qf_format: case Opt_bh: clear_opt(sbi->s_mount_opt, NOBH); break; - case Opt_extents: - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, - EXT4_FEATURE_INCOMPAT_EXTENTS)) { - ext4_warning(sb, __func__, - "extents feature not enabled " - "on this filesystem, use tune2fs\n"); - return 0; - } - set_opt(sbi->s_mount_opt, EXTENTS); - break; - case Opt_noextents: - /* - * When e2fsprogs support resizing an already existing - * ext3 file system to greater than 2**32 we need to - * add support to block allocator to handle growing - * already existing block mapped inode so that blocks - * allocated for them fall within 2**32 - */ - last_block = ext4_blocks_count(sbi->s_es) - 1; - if (last_block > 0xffffffffULL) { - printk(KERN_ERR "EXT4-fs: Filesystem too " - "large to mount with " - "-o noextents options\n"); - return 0; - } - clear_opt(sbi->s_mount_opt, EXTENTS); - break; case Opt_i_version: set_opt(sbi->s_mount_opt, I_VERSION); sb->s_flags |= MS_I_VERSION; @@ -1331,6 +1463,14 @@ set_qf_format: return 0; sbi->s_inode_readahead_blks = option; break; + case Opt_journal_ioprio: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > 7) + break; + *journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, + option); + break; default: printk(KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " @@ -1406,24 +1546,19 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, printk(KERN_WARNING "EXT4-fs warning: checktime reached, " "running e2fsck is recommended\n"); -#if 0 - /* @@@ We _will_ want to clear the valid bit if we find - * inconsistencies, to force a fsck at reboot. But for - * a plain journaled filesystem we can keep it set as - * valid forever! :) - */ - es->s_state &= cpu_to_le16(~EXT4_VALID_FS); -#endif + if (!sbi->s_journal) + es->s_state &= cpu_to_le16(~EXT4_VALID_FS); if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); es->s_mtime = cpu_to_le32(get_seconds()); ext4_update_dynamic_rev(sb); - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + if (sbi->s_journal) + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, es, 1); if (test_opt(sb, DEBUG)) - printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%lu, " + printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04lx]\n", sb->s_blocksize, sbi->s_groups_count, @@ -1431,9 +1566,13 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt); - printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", - sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : - "external", EXT4_SB(sb)->s_journal->j_devname); + if (EXT4_SB(sb)->s_journal) { + printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", + sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : + "external", EXT4_SB(sb)->s_journal->j_devname); + } else { + printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id); + } return res; } @@ -1445,7 +1584,6 @@ static int ext4_fill_flex_info(struct super_block *sb) ext4_group_t flex_group_count; ext4_group_t flex_group; int groups_per_flex = 0; - __u64 block_bitmap = 0; int i; if (!sbi->s_es->s_log_groups_per_flex) { @@ -1464,21 +1602,18 @@ static int ext4_fill_flex_info(struct super_block *sb) sizeof(struct flex_groups), GFP_KERNEL); if (sbi->s_flex_groups == NULL) { printk(KERN_ERR "EXT4-fs: not enough memory for " - "%lu flex groups\n", flex_group_count); + "%u flex groups\n", flex_group_count); goto failed; } - gdp = ext4_get_group_desc(sb, 1, &bh); - block_bitmap = ext4_block_bitmap(sb, gdp) - 1; - for (i = 0; i < sbi->s_groups_count; i++) { gdp = ext4_get_group_desc(sb, i, &bh); flex_group = ext4_flex_group(sbi, i); sbi->s_flex_groups[flex_group].free_inodes += - le16_to_cpu(gdp->bg_free_inodes_count); + ext4_free_inodes_count(sb, gdp); sbi->s_flex_groups[flex_group].free_blocks += - le16_to_cpu(gdp->bg_free_blocks_count); + ext4_free_blks_count(sb, gdp); } return 1; @@ -1552,14 +1687,14 @@ static int ext4_check_descriptors(struct super_block *sb) block_bitmap = ext4_block_bitmap(sb, gdp); if (block_bitmap < first_block || block_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " - "Block bitmap for group %lu not in group " + "Block bitmap for group %u not in group " "(block %llu)!\n", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap < first_block || inode_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " - "Inode bitmap for group %lu not in group " + "Inode bitmap for group %u not in group " "(block %llu)!\n", i, inode_bitmap); return 0; } @@ -1567,14 +1702,14 @@ static int ext4_check_descriptors(struct super_block *sb) if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " - "Inode table for group %lu not in group " + "Inode table for group %u not in group " "(block %llu)!\n", i, inode_table); return 0; } spin_lock(sb_bgl_lock(sbi, i)); if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " - "Checksum for group %lu failed (%u!=%u)\n", + "Checksum for group %u failed (%u!=%u)\n", i, le16_to_cpu(ext4_group_desc_csum(sbi, i, gdp)), le16_to_cpu(gdp->bg_checksum)); if (!(sb->s_flags & MS_RDONLY)) { @@ -1866,19 +2001,20 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_fsblk_t sb_block = get_sb_block(&data); ext4_fsblk_t logical_sb_block; unsigned long offset = 0; - unsigned int journal_inum = 0; unsigned long journal_devnum = 0; unsigned long def_mount_opts; struct inode *root; char *cp; + const char *descr; int ret = -EINVAL; int blocksize; - int db_count; - int i; + unsigned int db_count; + unsigned int i; int needs_recovery, has_huge_files; - __le32 features; + int features; __u64 blocks_count; int err; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -1959,31 +2095,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_resuid = le16_to_cpu(es->s_def_resuid); sbi->s_resgid = le16_to_cpu(es->s_def_resgid); + sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; + sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; + sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; set_opt(sbi->s_mount_opt, RESERVATION); set_opt(sbi->s_mount_opt, BARRIER); /* - * turn on extents feature by default in ext4 filesystem - * only if feature flag already set by mkfs or tune2fs. - * Use -o noextents to turn it off - */ - if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) - set_opt(sbi->s_mount_opt, EXTENTS); - else - ext4_warning(sb, __func__, - "extents feature not enabled on this filesystem, " - "use tune2fs.\n"); - - /* * enable delayed allocation by default * Use -o nodelalloc to turn it off */ set_opt(sbi->s_mount_opt, DELALLOC); - if (!parse_options((char *) data, sb, &journal_inum, &journal_devnum, - NULL, 0)) + if (!parse_options((char *) data, sb, &journal_devnum, + &journal_ioprio, NULL, 0)) goto failed_mount; sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | @@ -2005,15 +2132,17 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP); if (features) { printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(features)); + "unsupported optional features (%x).\n", sb->s_id, + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) & + ~EXT4_FEATURE_INCOMPAT_SUPP)); goto failed_mount; } features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP); if (!(sb->s_flags & MS_RDONLY) && features) { printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of " - "unsupported optional features (%x).\n", - sb->s_id, le32_to_cpu(features)); + "unsupported optional features (%x).\n", sb->s_id, + (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) & + ~EXT4_FEATURE_RO_COMPAT_SUPP)); goto failed_mount; } has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, @@ -2118,6 +2247,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (i = 0; i < 4; i++) sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]); sbi->s_def_hash_version = es->s_def_hash_version; + i = le32_to_cpu(es->s_flags); + if (i & EXT2_FLAGS_UNSIGNED_HASH) + sbi->s_hash_unsigned = 3; + else if ((i & EXT2_FLAGS_SIGNED_HASH) == 0) { +#ifdef __CHAR_UNSIGNED__ + es->s_flags |= cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH); + sbi->s_hash_unsigned = 3; +#else + es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH); +#endif + sb->s_dirt = 1; + } if (sbi->s_blocks_per_group > blocksize * 8) { printk(KERN_ERR @@ -2145,20 +2286,30 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (EXT4_BLOCKS_PER_GROUP(sb) == 0) goto cantfind_ext4; - /* ensure blocks_count calculation below doesn't sign-extend */ - if (ext4_blocks_count(es) + EXT4_BLOCKS_PER_GROUP(sb) < - le32_to_cpu(es->s_first_data_block) + 1) { - printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu, " - "first data block %u, blocks per group %lu\n", - ext4_blocks_count(es), - le32_to_cpu(es->s_first_data_block), - EXT4_BLOCKS_PER_GROUP(sb)); + /* + * It makes no sense for the first data block to be beyond the end + * of the filesystem. + */ + if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) { + printk(KERN_WARNING "EXT4-fs: bad geometry: first data" + "block %u is beyond end of filesystem (%llu)\n", + le32_to_cpu(es->s_first_data_block), + ext4_blocks_count(es)); goto failed_mount; } blocks_count = (ext4_blocks_count(es) - le32_to_cpu(es->s_first_data_block) + EXT4_BLOCKS_PER_GROUP(sb) - 1); do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb)); + if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) { + printk(KERN_WARNING "EXT4-fs: groups count too large: %u " + "(block count %llu, first data block %u, " + "blocks per group %lu)\n", sbi->s_groups_count, + ext4_blocks_count(es), + le32_to_cpu(es->s_first_data_block), + EXT4_BLOCKS_PER_GROUP(sb)); + goto failed_mount; + } sbi->s_groups_count = blocks_count; db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); @@ -2270,27 +2421,26 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); ext4_commit_super(sb, es, 1); - printk(KERN_CRIT - "EXT4-fs (device %s): mount failed\n", - sb->s_id); goto failed_mount4; } } - } else if (journal_inum) { - if (ext4_create_journal(sb, es, journal_inum)) - goto failed_mount3; + } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) && + EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) { + printk(KERN_ERR "EXT4-fs: required journal recovery " + "suppressed and not mounted read-only\n"); + goto failed_mount4; } else { - if (!silent) - printk(KERN_ERR - "ext4: No journal on filesystem on %s\n", - sb->s_id); - goto failed_mount3; + clear_opt(sbi->s_mount_opt, DATA_FLAGS); + set_opt(sbi->s_mount_opt, WRITEBACK_DATA); + sbi->s_journal = NULL; + needs_recovery = 0; + goto no_journal; } if (ext4_blocks_count(es) > 0xffffffffULL && !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_64BIT)) { - printk(KERN_ERR "ext4: Failed to set 64-bit journal feature\n"); + printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n"); goto failed_mount4; } @@ -2335,6 +2485,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) default: break; } + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + +no_journal: if (test_opt(sb, NOBH)) { if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { @@ -2420,13 +2573,22 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; - if (needs_recovery) + if (needs_recovery) { printk(KERN_INFO "EXT4-fs: recovery complete.\n"); - ext4_mark_recovery_complete(sb, es); - printk(KERN_INFO "EXT4-fs: mounted filesystem with %s data mode.\n", - test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ? "journal": - test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": - "writeback"); + ext4_mark_recovery_complete(sb, es); + } + if (EXT4_SB(sb)->s_journal) { + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + descr = " journalled data mode"; + else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + descr = " ordered data mode"; + else + descr = " writeback data mode"; + } else + descr = "out journal"; + + printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n", + sb->s_id, descr); lock_kernel(); return 0; @@ -2438,8 +2600,11 @@ cantfind_ext4: goto failed_mount; failed_mount4: - jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id); + if (sbi->s_journal) { + jbd2_journal_destroy(sbi->s_journal); + sbi->s_journal = NULL; + } failed_mount3: percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); @@ -2476,11 +2641,9 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) { struct ext4_sb_info *sbi = EXT4_SB(sb); - if (sbi->s_commit_interval) - journal->j_commit_interval = sbi->s_commit_interval; - /* We could also set up an ext4-specific default for the commit - * interval here, but for now we'll just fall back to the jbd - * default. */ + journal->j_commit_interval = sbi->s_commit_interval; + journal->j_min_batch_time = sbi->s_min_batch_time; + journal->j_max_batch_time = sbi->s_max_batch_time; spin_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) @@ -2500,6 +2663,8 @@ static journal_t *ext4_get_journal(struct super_block *sb, struct inode *journal_inode; journal_t *journal; + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + /* First, test for the existence of a valid inode on disk. Bad * things happen if we iget() an unused inode, as the subsequent * iput() will try to delete it. */ @@ -2548,13 +2713,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, struct ext4_super_block *es; struct block_device *bdev; + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + bdev = ext4_blkdev_get(j_dev); if (bdev == NULL) return NULL; if (bd_claim(bdev, sb)) { printk(KERN_ERR - "EXT4: failed to claim external journal device.\n"); + "EXT4-fs: failed to claim external journal device.\n"); blkdev_put(bdev, FMODE_READ|FMODE_WRITE); return NULL; } @@ -2635,6 +2802,8 @@ static int ext4_load_journal(struct super_block *sb, int err = 0; int really_read_only; + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { printk(KERN_INFO "EXT4-fs: external journal device major/minor " @@ -2719,48 +2888,6 @@ static int ext4_load_journal(struct super_block *sb, return 0; } -static int ext4_create_journal(struct super_block *sb, - struct ext4_super_block *es, - unsigned int journal_inum) -{ - journal_t *journal; - int err; - - if (sb->s_flags & MS_RDONLY) { - printk(KERN_ERR "EXT4-fs: readonly filesystem when trying to " - "create journal.\n"); - return -EROFS; - } - - journal = ext4_get_journal(sb, journal_inum); - if (!journal) - return -EINVAL; - - printk(KERN_INFO "EXT4-fs: creating new journal on inode %u\n", - journal_inum); - - err = jbd2_journal_create(journal); - if (err) { - printk(KERN_ERR "EXT4-fs: error creating journal.\n"); - jbd2_journal_destroy(journal); - return -EIO; - } - - EXT4_SB(sb)->s_journal = journal; - - ext4_update_dynamic_rev(sb); - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL); - - es->s_journal_inum = cpu_to_le32(journal_inum); - sb->s_dirt = 1; - - /* Make sure we flush the recovery flag to disk. */ - ext4_commit_super(sb, es, 1); - - return 0; -} - static void ext4_commit_super(struct super_block *sb, struct ext4_super_block *es, int sync) { @@ -2777,20 +2904,23 @@ static void ext4_commit_super(struct super_block *sb, * be remapped. Nothing we can do but to retry the * write and hope for the best. */ - printk(KERN_ERR "ext4: previous I/O error to " + printk(KERN_ERR "EXT4-fs: previous I/O error to " "superblock detected for %s.\n", sb->s_id); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); } es->s_wtime = cpu_to_le32(get_seconds()); - ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb)); - es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); + ext4_free_blocks_count_set(es, percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeblocks_counter)); + es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( + &EXT4_SB(sb)->s_freeinodes_counter)); + BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); if (sync) { sync_dirty_buffer(sbh); if (buffer_write_io_error(sbh)) { - printk(KERN_ERR "ext4: I/O error while writing " + printk(KERN_ERR "EXT4-fs: I/O error while writing " "superblock for %s.\n", sb->s_id); clear_buffer_write_io_error(sbh); set_buffer_uptodate(sbh); @@ -2809,6 +2939,10 @@ static void ext4_mark_recovery_complete(struct super_block *sb, { journal_t *journal = EXT4_SB(sb)->s_journal; + if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) { + BUG_ON(journal != NULL); + return; + } jbd2_journal_lock_updates(journal); if (jbd2_journal_flush(journal) < 0) goto out; @@ -2838,6 +2972,8 @@ static void ext4_clear_journal_err(struct super_block *sb, int j_errno; const char *errstr; + BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)); + journal = EXT4_SB(sb)->s_journal; /* @@ -2870,14 +3006,17 @@ static void ext4_clear_journal_err(struct super_block *sb, int ext4_force_commit(struct super_block *sb) { journal_t *journal; - int ret; + int ret = 0; if (sb->s_flags & MS_RDONLY) return 0; journal = EXT4_SB(sb)->s_journal; - sb->s_dirt = 0; - ret = ext4_journal_force_commit(journal); + if (journal) { + sb->s_dirt = 0; + ret = ext4_journal_force_commit(journal); + } + return ret; } @@ -2889,9 +3028,13 @@ int ext4_force_commit(struct super_block *sb) */ static void ext4_write_super(struct super_block *sb) { - if (mutex_trylock(&sb->s_lock) != 0) - BUG(); - sb->s_dirt = 0; + if (EXT4_SB(sb)->s_journal) { + if (mutex_trylock(&sb->s_lock) != 0) + BUG(); + sb->s_dirt = 0; + } else { + ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + } } static int ext4_sync_fs(struct super_block *sb, int wait) @@ -2900,10 +3043,14 @@ static int ext4_sync_fs(struct super_block *sb, int wait) trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); sb->s_dirt = 0; - if (wait) - ret = ext4_force_commit(sb); - else - jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); + if (EXT4_SB(sb)->s_journal) { + if (wait) + ret = ext4_force_commit(sb); + else + jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); + } else { + ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait); + } return ret; } @@ -2918,15 +3065,17 @@ static void ext4_write_super_lockfs(struct super_block *sb) if (!(sb->s_flags & MS_RDONLY)) { journal_t *journal = EXT4_SB(sb)->s_journal; - /* Now we set up the journal barrier. */ - jbd2_journal_lock_updates(journal); + if (journal) { + /* Now we set up the journal barrier. */ + jbd2_journal_lock_updates(journal); - /* - * We don't want to clear needs_recovery flag when we failed - * to flush the journal. - */ - if (jbd2_journal_flush(journal) < 0) - return; + /* + * We don't want to clear needs_recovery flag when we + * failed to flush the journal. + */ + if (jbd2_journal_flush(journal) < 0) + return; + } /* Journal blocked and flushed, clear needs_recovery flag. */ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); @@ -2940,7 +3089,7 @@ static void ext4_write_super_lockfs(struct super_block *sb) */ static void ext4_unlockfs(struct super_block *sb) { - if (!(sb->s_flags & MS_RDONLY)) { + if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) { lock_super(sb); /* Reser the needs_recovery flag before the fs is unlocked. */ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); @@ -2958,6 +3107,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) unsigned long old_sb_flags; struct ext4_mount_options old_opts; ext4_group_t g; + unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; int err; #ifdef CONFIG_QUOTA int i; @@ -2969,16 +3119,21 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) old_opts.s_resuid = sbi->s_resuid; old_opts.s_resgid = sbi->s_resgid; old_opts.s_commit_interval = sbi->s_commit_interval; + old_opts.s_min_batch_time = sbi->s_min_batch_time; + old_opts.s_max_batch_time = sbi->s_max_batch_time; #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) old_opts.s_qf_names[i] = sbi->s_qf_names[i]; #endif + if (sbi->s_journal && sbi->s_journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; /* * Allow the "check" option to be passed as a remount option. */ - if (!parse_options(data, sb, NULL, NULL, &n_blocks_count, 1)) { + if (!parse_options(data, sb, NULL, &journal_ioprio, + &n_blocks_count, 1)) { err = -EINVAL; goto restore_opts; } @@ -2991,7 +3146,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) es = sbi->s_es; - ext4_init_journal_params(sb, sbi->s_journal); + if (sbi->s_journal) { + ext4_init_journal_params(sb, sbi->s_journal); + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + } if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) || n_blocks_count > ext4_blocks_count(es)) { @@ -3020,17 +3178,20 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * We have to unlock super so that we can wait for * transactions. */ - unlock_super(sb); - ext4_mark_recovery_complete(sb, es); - lock_super(sb); + if (sbi->s_journal) { + unlock_super(sb); + ext4_mark_recovery_complete(sb, es); + lock_super(sb); + } } else { - __le32 ret; + int ret; if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP))) { printk(KERN_WARNING "EXT4-fs: %s: couldn't " "remount RDWR because of unsupported " - "optional features (%x).\n", - sb->s_id, le32_to_cpu(ret)); + "optional features (%x).\n", sb->s_id, + (le32_to_cpu(sbi->s_es->s_feature_ro_compat) & + ~EXT4_FEATURE_RO_COMPAT_SUPP)); err = -EROFS; goto restore_opts; } @@ -3047,7 +3208,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (!ext4_group_desc_csum_verify(sbi, g, gdp)) { printk(KERN_ERR "EXT4-fs: ext4_remount: " - "Checksum for group %lu failed (%u!=%u)\n", + "Checksum for group %u failed (%u!=%u)\n", g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)), le16_to_cpu(gdp->bg_checksum)); err = -EINVAL; @@ -3076,7 +3237,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * been changed by e2fsck since we originally mounted * the partition.) */ - ext4_clear_journal_err(sb, es); + if (sbi->s_journal) + ext4_clear_journal_err(sb, es); sbi->s_mount_state = le16_to_cpu(es->s_state); if ((err = ext4_group_extend(sb, es, n_blocks_count))) goto restore_opts; @@ -3084,6 +3246,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~MS_RDONLY; } } + if (sbi->s_journal == NULL) + ext4_commit_super(sb, es, 1); + #ifdef CONFIG_QUOTA /* Release old quota file names */ for (i = 0; i < MAXQUOTAS; i++) @@ -3098,6 +3263,8 @@ restore_opts: sbi->s_resuid = old_opts.s_resuid; sbi->s_resgid = old_opts.s_resgid; sbi->s_commit_interval = old_opts.s_commit_interval; + sbi->s_min_batch_time = old_opts.s_min_batch_time; + sbi->s_max_batch_time = old_opts.s_max_batch_time; #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { @@ -3360,7 +3527,8 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, * When we journal data on quota file, we have to flush journal to see * all updates to the file when we bypass pagecache... */ - if (ext4_should_journal_data(path.dentry->d_inode)) { + if (EXT4_SB(sb)->s_journal && + ext4_should_journal_data(path.dentry->d_inode)) { /* * We don't need to lock updates but journal_flush() could * otherwise be livelocked... @@ -3434,7 +3602,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, struct buffer_head *bh; handle_t *handle = journal_current_handle(); - if (!handle) { + if (EXT4_SB(sb)->s_journal && !handle) { printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" " cancelled because transaction is not started.\n", (unsigned long long)off, (unsigned long long)len); @@ -3459,7 +3627,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, flush_dcache_page(bh->b_page); unlock_buffer(bh); if (journal_quota) - err = ext4_journal_dirty_metadata(handle, bh); + err = ext4_handle_dirty_metadata(handle, NULL, bh); else { /* Always do at least ordered writes for quotas */ err = ext4_jbd2_file_inode(handle, inode); @@ -3513,18 +3681,15 @@ static int ext4_ui_proc_open(struct inode *inode, struct file *file) static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf, size_t cnt, loff_t *ppos) { - unsigned int *p = PDE(file->f_path.dentry->d_inode)->data; + unsigned long *p = PDE(file->f_path.dentry->d_inode)->data; char str[32]; - unsigned long value; if (cnt >= sizeof(str)) return -EINVAL; if (copy_from_user(str, buf, cnt)) return -EFAULT; - value = simple_strtol(str, NULL, 0); - if (value < 0) - return -ERANGE; - *p = value; + + *p = simple_strtoul(str, NULL, 0); return cnt; } @@ -3615,7 +3780,7 @@ static void __exit exit_ext4_fs(void) } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); -MODULE_DESCRIPTION("Fourth Extended Filesystem with extents"); +MODULE_DESCRIPTION("Fourth Extended Filesystem"); MODULE_LICENSE("GPL"); module_init(init_ext4_fs) module_exit(exit_ext4_fs) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 80626d516fe..157ce6589c5 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -457,7 +457,7 @@ static void ext4_xattr_update_super_block(handle_t *handle, if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); sb->s_dirt = 1; - ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); + ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); } } @@ -487,9 +487,9 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, ext4_forget(handle, 1, inode, bh, bh->b_blocknr); } else { le32_add_cpu(&BHDR(bh)->h_refcount, -1); - error = ext4_journal_dirty_metadata(handle, bh); + error = ext4_handle_dirty_metadata(handle, inode, bh); if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); DQUOT_FREE_BLOCK(inode, 1); ea_bdebug(bh, "refcount now=%d; releasing", le32_to_cpu(BHDR(bh)->h_refcount)); @@ -724,8 +724,9 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, if (error == -EIO) goto bad_block; if (!error) - error = ext4_journal_dirty_metadata(handle, - bs->bh); + error = ext4_handle_dirty_metadata(handle, + inode, + bs->bh); if (error) goto cleanup; goto inserted; @@ -794,8 +795,9 @@ inserted: ea_bdebug(new_bh, "reusing; refcount now=%d", le32_to_cpu(BHDR(new_bh)->h_refcount)); unlock_buffer(new_bh); - error = ext4_journal_dirty_metadata(handle, - new_bh); + error = ext4_handle_dirty_metadata(handle, + inode, + new_bh); if (error) goto cleanup_dquot; } @@ -810,8 +812,8 @@ inserted: /* We need to allocate a new block */ ext4_fsblk_t goal = ext4_group_first_block_no(sb, EXT4_I(inode)->i_block_group); - ext4_fsblk_t block = ext4_new_meta_block(handle, inode, - goal, &error); + ext4_fsblk_t block = ext4_new_meta_blocks(handle, inode, + goal, NULL, &error); if (error) goto cleanup; ea_idebug(inode, "creating block %d", block); @@ -833,7 +835,8 @@ getblk_failed: set_buffer_uptodate(new_bh); unlock_buffer(new_bh); ext4_xattr_cache_insert(new_bh); - error = ext4_journal_dirty_metadata(handle, new_bh); + error = ext4_handle_dirty_metadata(handle, + inode, new_bh); if (error) goto cleanup; } @@ -1040,7 +1043,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, */ is.iloc.bh = NULL; if (IS_SYNC(inode)) - handle->h_sync = 1; + ext4_handle_sync(handle); } cleanup: diff --git a/fs/filesystems.c b/fs/filesystems.c index d0e20ced62d..d488dcd7f2b 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -253,24 +253,27 @@ static int __init proc_filesystems_init(void) module_init(proc_filesystems_init); #endif -struct file_system_type *get_fs_type(const char *name) +static struct file_system_type *__get_fs_type(const char *name, int len) { struct file_system_type *fs; - const char *dot = strchr(name, '.'); - unsigned len = dot ? dot - name : strlen(name); read_lock(&file_systems_lock); fs = *(find_filesystem(name, len)); if (fs && !try_module_get(fs->owner)) fs = NULL; read_unlock(&file_systems_lock); - if (!fs && (request_module("%.*s", len, name) == 0)) { - read_lock(&file_systems_lock); - fs = *(find_filesystem(name, len)); - if (fs && !try_module_get(fs->owner)) - fs = NULL; - read_unlock(&file_systems_lock); - } + return fs; +} + +struct file_system_type *get_fs_type(const char *name) +{ + struct file_system_type *fs; + const char *dot = strchr(name, '.'); + int len = dot ? dot - name : strlen(name); + + fs = __get_fs_type(name, len); + if (!fs && (request_module("%.*s", len, name) == 0)) + fs = __get_fs_type(name, len); if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) { put_filesystem(fs); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d0ff0b8cf30..e5eaa62fd17 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -421,9 +421,6 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc) * If we're a pdlfush thread, then implement pdflush collision avoidance * against the entire list. * - * WB_SYNC_HOLD is a hack for sys_sync(): reattach the inode to sb->s_dirty so - * that it can be located for waiting on in __writeback_single_inode(). - * * If `bdi' is non-zero then we're being asked to writeback a specific queue. * This function assumes that the blockdev superblock's inodes are backed by * a variety of queues, so all inodes are searched. For other superblocks, @@ -443,6 +440,7 @@ void generic_sync_sb_inodes(struct super_block *sb, struct writeback_control *wbc) { const unsigned long start = jiffies; /* livelock avoidance */ + int sync = wbc->sync_mode == WB_SYNC_ALL; spin_lock(&inode_lock); if (!wbc->for_kupdate || list_empty(&sb->s_io)) @@ -499,10 +497,6 @@ void generic_sync_sb_inodes(struct super_block *sb, __iget(inode); pages_skipped = wbc->pages_skipped; __writeback_single_inode(inode, wbc); - if (wbc->sync_mode == WB_SYNC_HOLD) { - inode->dirtied_when = jiffies; - list_move(&inode->i_list, &sb->s_dirty); - } if (current_is_pdflush()) writeback_release(bdi); if (wbc->pages_skipped != pages_skipped) { @@ -523,7 +517,49 @@ void generic_sync_sb_inodes(struct super_block *sb, if (!list_empty(&sb->s_more_io)) wbc->more_io = 1; } - spin_unlock(&inode_lock); + + if (sync) { + struct inode *inode, *old_inode = NULL; + + /* + * Data integrity sync. Must wait for all pages under writeback, + * because there may have been pages dirtied before our sync + * call, but which had writeout started before we write it out. + * In which case, the inode may not be on the dirty list, but + * we still have to wait for that writeout. + */ + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + struct address_space *mapping; + + if (inode->i_state & (I_FREEING|I_WILL_FREE)) + continue; + mapping = inode->i_mapping; + if (mapping->nrpages == 0) + continue; + __iget(inode); + spin_unlock(&inode_lock); + /* + * We hold a reference to 'inode' so it couldn't have + * been removed from s_inodes list while we dropped the + * inode_lock. We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it + * under inode_lock. So we keep the reference and iput + * it later. + */ + iput(old_inode); + old_inode = inode; + + filemap_fdatawait(mapping); + + cond_resched(); + + spin_lock(&inode_lock); + } + spin_unlock(&inode_lock); + iput(old_inode); + } else + spin_unlock(&inode_lock); + return; /* Leave any unwritten inodes on s_io */ } EXPORT_SYMBOL_GPL(generic_sync_sb_inodes); @@ -588,8 +624,7 @@ restart: /* * writeback and wait upon the filesystem's dirty inodes. The caller will - * do this in two passes - one to write, and one to wait. WB_SYNC_HOLD is - * used to park the written inodes on sb->s_dirty for the wait pass. + * do this in two passes - one to write, and one to wait. * * A finite limit is set on the number of pages which will be written. * To prevent infinite livelock of sys_sync(). @@ -600,30 +635,21 @@ restart: void sync_inodes_sb(struct super_block *sb, int wait) { struct writeback_control wbc = { - .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, + .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, }; - unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); - unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); - wbc.nr_to_write = nr_dirty + nr_unstable + - (inodes_stat.nr_inodes - inodes_stat.nr_unused) + - nr_dirty + nr_unstable; - wbc.nr_to_write += wbc.nr_to_write / 2; /* Bit more for luck */ - sync_sb_inodes(sb, &wbc); -} + if (!wait) { + unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY); + unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS); -/* - * Rather lame livelock avoidance. - */ -static void set_sb_syncing(int val) -{ - struct super_block *sb; - spin_lock(&sb_lock); - list_for_each_entry_reverse(sb, &super_blocks, s_list) - sb->s_syncing = val; - spin_unlock(&sb_lock); + wbc.nr_to_write = nr_dirty + nr_unstable + + (inodes_stat.nr_inodes - inodes_stat.nr_unused); + } else + wbc.nr_to_write = LONG_MAX; /* doesn't actually matter */ + + sync_sb_inodes(sb, &wbc); } /** @@ -652,9 +678,6 @@ static void __sync_inodes(int wait) spin_lock(&sb_lock); restart: list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_syncing) - continue; - sb->s_syncing = 1; sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); @@ -672,13 +695,10 @@ restart: void sync_inodes(int wait) { - set_sb_syncing(0); __sync_inodes(0); - if (wait) { - set_sb_syncing(0); + if (wait) __sync_inodes(1); - } } /** diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 4f3cab32141..99c99dfb037 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -48,11 +48,13 @@ static ssize_t fuse_conn_waiting_read(struct file *file, char __user *buf, size_t size; if (!*ppos) { + long value; struct fuse_conn *fc = fuse_ctl_file_conn_get(file); if (!fc) return 0; - file->private_data=(void *)(long)atomic_read(&fc->num_waiting); + value = atomic_read(&fc->num_waiting); + file->private_data = (void *)value; fuse_conn_put(fc); } size = sprintf(tmp, "%ld\n", (long)file->private_data); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index fba571648a8..e0c7ada08a1 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -269,7 +269,7 @@ static void flush_bg_queue(struct fuse_conn *fc) * Called with fc->lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) - __releases(fc->lock) +__releases(&fc->lock) { void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; @@ -293,13 +293,13 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) wake_up(&req->waitq); if (end) end(fc, req); - else - fuse_put_request(fc, req); + fuse_put_request(fc, req); } static void wait_answer_interruptible(struct fuse_conn *fc, struct fuse_req *req) - __releases(fc->lock) __acquires(fc->lock) +__releases(&fc->lock) +__acquires(&fc->lock) { if (signal_pending(current)) return; @@ -317,7 +317,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) } static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) - __releases(fc->lock) __acquires(fc->lock) +__releases(&fc->lock) +__acquires(&fc->lock) { if (!fc->no_interrupt) { /* Any signal may interrupt this */ @@ -380,7 +381,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) } } -void request_send(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; spin_lock(&fc->lock); @@ -399,8 +400,8 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req) spin_unlock(&fc->lock); } -static void request_send_nowait_locked(struct fuse_conn *fc, - struct fuse_req *req) +static void fuse_request_send_nowait_locked(struct fuse_conn *fc, + struct fuse_req *req) { req->background = 1; fc->num_background++; @@ -414,11 +415,11 @@ static void request_send_nowait_locked(struct fuse_conn *fc, flush_bg_queue(fc); } -static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) +static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) { spin_lock(&fc->lock); if (fc->connected) { - request_send_nowait_locked(fc, req); + fuse_request_send_nowait_locked(fc, req); spin_unlock(&fc->lock); } else { req->out.h.error = -ENOTCONN; @@ -426,16 +427,16 @@ static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) } } -void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 0; - request_send_nowait(fc, req); + fuse_request_send_nowait(fc, req); } -void request_send_background(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; - request_send_nowait(fc, req); + fuse_request_send_nowait(fc, req); } /* @@ -443,10 +444,11 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req) * * fc->connected must have been checked previously */ -void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req) +void fuse_request_send_background_locked(struct fuse_conn *fc, + struct fuse_req *req) { req->isreply = 1; - request_send_nowait_locked(fc, req); + fuse_request_send_nowait_locked(fc, req); } /* @@ -539,8 +541,8 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) BUG_ON(!cs->nr_segs); cs->seglen = cs->iov[0].iov_len; cs->addr = (unsigned long) cs->iov[0].iov_base; - cs->iov ++; - cs->nr_segs --; + cs->iov++; + cs->nr_segs--; } down_read(¤t->mm->mmap_sem); err = get_user_pages(current, current->mm, cs->addr, 1, cs->write, 0, @@ -589,9 +591,11 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page *page, kunmap_atomic(mapaddr, KM_USER1); } while (count) { - int err; - if (!cs->len && (err = fuse_copy_fill(cs))) - return err; + if (!cs->len) { + int err = fuse_copy_fill(cs); + if (err) + return err; + } if (page) { void *mapaddr = kmap_atomic(page, KM_USER1); void *buf = mapaddr + offset; @@ -631,9 +635,11 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size) { while (size) { - int err; - if (!cs->len && (err = fuse_copy_fill(cs))) - return err; + if (!cs->len) { + int err = fuse_copy_fill(cs); + if (err) + return err; + } fuse_copy_do(cs, &val, &size); } return 0; @@ -664,6 +670,8 @@ static int request_pending(struct fuse_conn *fc) /* Wait until a request is available on the pending list */ static void request_wait(struct fuse_conn *fc) +__releases(&fc->lock) +__acquires(&fc->lock) { DECLARE_WAITQUEUE(wait, current); @@ -691,7 +699,7 @@ static void request_wait(struct fuse_conn *fc) */ static int fuse_read_interrupt(struct fuse_conn *fc, struct fuse_req *req, const struct iovec *iov, unsigned long nr_segs) - __releases(fc->lock) +__releases(&fc->lock) { struct fuse_copy_state cs; struct fuse_in_header ih; @@ -813,6 +821,34 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, const struct iovec *iov, return err; } +static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_poll_wakeup_out outarg; + int err; + + if (size != sizeof(outarg)) + return -EINVAL; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + return err; + + return fuse_notify_poll_wakeup(fc, &outarg); +} + +static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, + unsigned int size, struct fuse_copy_state *cs) +{ + switch (code) { + case FUSE_NOTIFY_POLL: + return fuse_notify_poll(fc, size, cs); + + default: + return -EINVAL; + } +} + /* Look up request on processing list by unique ID */ static struct fuse_req *request_find(struct fuse_conn *fc, u64 unique) { @@ -876,9 +912,23 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, const struct iovec *iov, err = fuse_copy_one(&cs, &oh, sizeof(oh)); if (err) goto err_finish; + + err = -EINVAL; + if (oh.len != nbytes) + goto err_finish; + + /* + * Zero oh.unique indicates unsolicited notification message + * and error contains notification code. + */ + if (!oh.unique) { + err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), &cs); + fuse_copy_finish(&cs); + return err ? err : nbytes; + } + err = -EINVAL; - if (!oh.unique || oh.error <= -1000 || oh.error > 0 || - oh.len != nbytes) + if (oh.error <= -1000 || oh.error > 0) goto err_finish; spin_lock(&fc->lock); @@ -966,6 +1016,8 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) * This function releases and reacquires fc->lock */ static void end_requests(struct fuse_conn *fc, struct list_head *head) +__releases(&fc->lock) +__acquires(&fc->lock) { while (!list_empty(head)) { struct fuse_req *req; @@ -988,7 +1040,8 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) * locked). */ static void end_io_requests(struct fuse_conn *fc) - __releases(fc->lock) __acquires(fc->lock) +__releases(&fc->lock) +__acquires(&fc->lock) { while (!list_empty(&fc->io)) { struct fuse_req *req = @@ -1002,11 +1055,11 @@ static void end_io_requests(struct fuse_conn *fc) wake_up(&req->waitq); if (end) { req->end = NULL; - /* The end function will consume this reference */ __fuse_get_request(req); spin_unlock(&fc->lock); wait_event(req->waitq, !req->locked); end(fc, req); + fuse_put_request(fc, req); spin_lock(&fc->lock); } } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 95bc22bdd06..fdff346e96f 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -189,7 +189,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) parent = dget_parent(entry); fuse_lookup_init(fc, req, get_node_id(parent->d_inode), &entry->d_name, &outarg); - request_send(fc, req); + fuse_request_send(fc, req); dput(parent); err = req->out.h.error; fuse_put_request(fc, req); @@ -204,7 +204,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) return 0; } spin_lock(&fc->lock); - fi->nlookup ++; + fi->nlookup++; spin_unlock(&fc->lock); } fuse_put_request(fc, forget_req); @@ -283,7 +283,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name, attr_version = fuse_get_attr_version(fc); fuse_lookup_init(fc, req, nodeid, name, outarg); - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); /* Zero nodeid is same as -ENOENT, but with valid timeout */ @@ -369,7 +369,7 @@ static void fuse_sync_release(struct fuse_conn *fc, struct fuse_file *ff, { fuse_release_fill(ff, nodeid, flags, FUSE_RELEASE); ff->reserved_req->force = 1; - request_send(fc, ff->reserved_req); + fuse_request_send(fc, ff->reserved_req); fuse_put_request(fc, ff->reserved_req); kfree(ff); } @@ -408,7 +408,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, goto out_put_forget_req; err = -ENOMEM; - ff = fuse_file_alloc(); + ff = fuse_file_alloc(fc); if (!ff) goto out_put_request; @@ -432,7 +432,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, req->out.args[0].value = &outentry; req->out.args[1].size = sizeof(outopen); req->out.args[1].value = &outopen; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; if (err) { if (err == -ENOSYS) @@ -502,7 +502,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req, else req->out.args[0].size = sizeof(outarg); req->out.args[0].value = &outarg; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (err) @@ -631,15 +631,17 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) req->in.numargs = 1; req->in.args[0].size = entry->d_name.len + 1; req->in.args[0].value = entry->d_name.name; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (!err) { struct inode *inode = entry->d_inode; - /* Set nlink to zero so the inode can be cleared, if - the inode does have more links this will be - discovered at the next lookup/getattr */ + /* + * Set nlink to zero so the inode can be cleared, if the inode + * does have more links this will be discovered at the next + * lookup/getattr. + */ clear_nlink(inode); fuse_invalidate_attr(inode); fuse_invalidate_attr(dir); @@ -662,7 +664,7 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) req->in.numargs = 1; req->in.args[0].size = entry->d_name.len + 1; req->in.args[0].value = entry->d_name.name; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (!err) { @@ -695,7 +697,7 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, req->in.args[1].value = oldent->d_name.name; req->in.args[2].size = newent->d_name.len + 1; req->in.args[2].value = newent->d_name.name; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (!err) { @@ -811,7 +813,7 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat, else req->out.args[0].size = sizeof(outarg); req->out.args[0].value = &outarg; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (!err) { @@ -911,7 +913,7 @@ static int fuse_access(struct inode *inode, int mask) req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (err == -ENOSYS) { @@ -1033,7 +1035,7 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) req->num_pages = 1; req->pages[0] = page; fuse_read_fill(req, file, inode, file->f_pos, PAGE_SIZE, FUSE_READDIR); - request_send(fc, req); + fuse_request_send(fc, req); nbytes = req->out.args[0].size; err = req->out.h.error; fuse_put_request(fc, req); @@ -1067,7 +1069,7 @@ static char *read_link(struct dentry *dentry) req->out.numargs = 1; req->out.args[0].size = PAGE_SIZE - 1; req->out.args[0].value = link; - request_send(fc, req); + fuse_request_send(fc, req); if (req->out.h.error) { free_page((unsigned long) link); link = ERR_PTR(req->out.h.error); @@ -1273,7 +1275,7 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, else req->out.args[0].size = sizeof(outarg); req->out.args[0].value = &outarg; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (err) { @@ -1367,7 +1369,7 @@ static int fuse_setxattr(struct dentry *entry, const char *name, req->in.args[1].value = name; req->in.args[2].size = size; req->in.args[2].value = value; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (err == -ENOSYS) { @@ -1413,7 +1415,7 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, req->out.args[0].size = sizeof(outarg); req->out.args[0].value = &outarg; } - request_send(fc, req); + fuse_request_send(fc, req); ret = req->out.h.error; if (!ret) ret = size ? req->out.args[0].size : outarg.size; @@ -1463,7 +1465,7 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) req->out.args[0].size = sizeof(outarg); req->out.args[0].value = &outarg; } - request_send(fc, req); + fuse_request_send(fc, req); ret = req->out.h.error; if (!ret) ret = size ? req->out.args[0].size : outarg.size; @@ -1496,7 +1498,7 @@ static int fuse_removexattr(struct dentry *entry, const char *name) req->in.numargs = 1; req->in.args[0].size = strlen(name) + 1; req->in.args[0].value = name; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (err == -ENOSYS) { diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 34930a964b8..e8162646a9b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -39,14 +39,14 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir, req->out.numargs = 1; req->out.args[0].size = sizeof(*outargp); req->out.args[0].value = outargp; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); return err; } -struct fuse_file *fuse_file_alloc(void) +struct fuse_file *fuse_file_alloc(struct fuse_conn *fc) { struct fuse_file *ff; ff = kmalloc(sizeof(struct fuse_file), GFP_KERNEL); @@ -58,7 +58,12 @@ struct fuse_file *fuse_file_alloc(void) } else { INIT_LIST_HEAD(&ff->write_entry); atomic_set(&ff->count, 0); + spin_lock(&fc->lock); + ff->kh = ++fc->khctr; + spin_unlock(&fc->lock); } + RB_CLEAR_NODE(&ff->polled_node); + init_waitqueue_head(&ff->poll_wait); } return ff; } @@ -79,7 +84,6 @@ static void fuse_release_end(struct fuse_conn *fc, struct fuse_req *req) { dput(req->misc.release.dentry); mntput(req->misc.release.vfsmount); - fuse_put_request(fc, req); } static void fuse_file_put(struct fuse_file *ff) @@ -89,7 +93,7 @@ static void fuse_file_put(struct fuse_file *ff) struct inode *inode = req->misc.release.dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); req->end = fuse_release_end; - request_send_background(fc, req); + fuse_request_send_background(fc, req); kfree(ff); } } @@ -109,6 +113,7 @@ void fuse_finish_open(struct inode *inode, struct file *file, int fuse_open_common(struct inode *inode, struct file *file, int isdir) { + struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_open_out outarg; struct fuse_file *ff; int err; @@ -121,7 +126,7 @@ int fuse_open_common(struct inode *inode, struct file *file, int isdir) if (err) return err; - ff = fuse_file_alloc(); + ff = fuse_file_alloc(fc); if (!ff) return -ENOMEM; @@ -167,7 +172,11 @@ int fuse_release_common(struct inode *inode, struct file *file, int isdir) spin_lock(&fc->lock); list_del(&ff->write_entry); + if (!RB_EMPTY_NODE(&ff->polled_node)) + rb_erase(&ff->polled_node, &fc->polled_files); spin_unlock(&fc->lock); + + wake_up_interruptible_sync(&ff->poll_wait); /* * Normally this will send the RELEASE request, * however if some asynchronous READ or WRITE requests @@ -280,7 +289,7 @@ static int fuse_flush(struct file *file, fl_owner_t id) req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; req->force = 1; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (err == -ENOSYS) { @@ -344,7 +353,7 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, req->in.numargs = 1; req->in.args[0].size = sizeof(inarg); req->in.args[0].value = &inarg; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (err == -ENOSYS) { @@ -396,7 +405,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file, inarg->read_flags |= FUSE_READ_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); } - request_send(fc, req); + fuse_request_send(fc, req); return req->out.args[0].size; } @@ -493,7 +502,6 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) } if (req->ff) fuse_file_put(req->ff); - fuse_put_request(fc, req); } static void fuse_send_readpages(struct fuse_req *req, struct file *file, @@ -509,10 +517,11 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file, struct fuse_file *ff = file->private_data; req->ff = fuse_file_get(ff); req->end = fuse_readpages_end; - request_send_background(fc, req); + fuse_request_send_background(fc, req); } else { - request_send(fc, req); + fuse_request_send(fc, req); fuse_readpages_end(fc, req); + fuse_put_request(fc, req); } } @@ -543,7 +552,7 @@ static int fuse_readpages_fill(void *_data, struct page *page) } } req->pages[req->num_pages] = page; - req->num_pages ++; + req->num_pages++; return 0; } @@ -636,7 +645,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file, inarg->write_flags |= FUSE_WRITE_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); } - request_send(fc, req); + fuse_request_send(fc, req); return req->misc.write.out.size; } @@ -646,7 +655,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, { pgoff_t index = pos >> PAGE_CACHE_SHIFT; - *pagep = __grab_cache_page(mapping, index); + *pagep = grab_cache_page_write_begin(mapping, index, flags); if (!*pagep) return -ENOMEM; return 0; @@ -779,7 +788,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req, break; err = -ENOMEM; - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, 0); if (!page) break; @@ -1042,7 +1051,6 @@ static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) { __free_page(req->pages[0]); fuse_file_put(req->ff); - fuse_put_request(fc, req); } static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) @@ -1060,6 +1068,8 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) /* Called under fc->lock, may release and reacquire it */ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) +__releases(&fc->lock) +__acquires(&fc->lock) { struct fuse_inode *fi = get_fuse_inode(req->inode); loff_t size = i_size_read(req->inode); @@ -1079,13 +1089,14 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) req->in.args[1].size = inarg->size; fi->writectr++; - request_send_background_locked(fc, req); + fuse_request_send_background_locked(fc, req); return; out_free: fuse_writepage_finish(fc, req); spin_unlock(&fc->lock); fuse_writepage_free(fc, req); + fuse_put_request(fc, req); spin_lock(&fc->lock); } @@ -1096,6 +1107,8 @@ static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) * Called with fc->lock */ void fuse_flush_writepages(struct inode *inode) +__releases(&fc->lock) +__acquires(&fc->lock) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); @@ -1325,7 +1338,7 @@ static int fuse_getlk(struct file *file, struct file_lock *fl) req->out.numargs = 1; req->out.args[0].size = sizeof(outarg); req->out.args[0].value = &outarg; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (!err) @@ -1357,7 +1370,7 @@ static int fuse_setlk(struct file *file, struct file_lock *fl, int flock) return PTR_ERR(req); fuse_lk_fill(req, file, fl, opcode, pid, flock); - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; /* locking is restartable */ if (err == -EINTR) @@ -1433,7 +1446,7 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) req->out.numargs = 1; req->out.args[0].size = sizeof(outarg); req->out.args[0].value = &outarg; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; fuse_put_request(fc, req); if (err == -ENOSYS) @@ -1470,6 +1483,406 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) return retval; } +static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov, + unsigned int nr_segs, size_t bytes, bool to_user) +{ + struct iov_iter ii; + int page_idx = 0; + + if (!bytes) + return 0; + + iov_iter_init(&ii, iov, nr_segs, bytes, 0); + + while (iov_iter_count(&ii)) { + struct page *page = pages[page_idx++]; + size_t todo = min_t(size_t, PAGE_SIZE, iov_iter_count(&ii)); + void *kaddr, *map; + + kaddr = map = kmap(page); + + while (todo) { + char __user *uaddr = ii.iov->iov_base + ii.iov_offset; + size_t iov_len = ii.iov->iov_len - ii.iov_offset; + size_t copy = min(todo, iov_len); + size_t left; + + if (!to_user) + left = copy_from_user(kaddr, uaddr, copy); + else + left = copy_to_user(uaddr, kaddr, copy); + + if (unlikely(left)) + return -EFAULT; + + iov_iter_advance(&ii, copy); + todo -= copy; + kaddr += copy; + } + + kunmap(map); + } + + return 0; +} + +/* + * For ioctls, there is no generic way to determine how much memory + * needs to be read and/or written. Furthermore, ioctls are allowed + * to dereference the passed pointer, so the parameter requires deep + * copying but FUSE has no idea whatsoever about what to copy in or + * out. + * + * This is solved by allowing FUSE server to retry ioctl with + * necessary in/out iovecs. Let's assume the ioctl implementation + * needs to read in the following structure. + * + * struct a { + * char *buf; + * size_t buflen; + * } + * + * On the first callout to FUSE server, inarg->in_size and + * inarg->out_size will be NULL; then, the server completes the ioctl + * with FUSE_IOCTL_RETRY set in out->flags, out->in_iovs set to 1 and + * the actual iov array to + * + * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) } } + * + * which tells FUSE to copy in the requested area and retry the ioctl. + * On the second round, the server has access to the structure and + * from that it can tell what to look for next, so on the invocation, + * it sets FUSE_IOCTL_RETRY, out->in_iovs to 2 and iov array to + * + * { { .iov_base = inarg.arg, .iov_len = sizeof(struct a) }, + * { .iov_base = a.buf, .iov_len = a.buflen } } + * + * FUSE will copy both struct a and the pointed buffer from the + * process doing the ioctl and retry ioctl with both struct a and the + * buffer. + * + * This time, FUSE server has everything it needs and completes ioctl + * without FUSE_IOCTL_RETRY which finishes the ioctl call. + * + * Copying data out works the same way. + * + * Note that if FUSE_IOCTL_UNRESTRICTED is clear, the kernel + * automatically initializes in and out iovs by decoding @cmd with + * _IOC_* macros and the server is not allowed to request RETRY. This + * limits ioctl data transfers to well-formed ioctls and is the forced + * behavior for all FUSE servers. + */ +static long fuse_file_do_ioctl(struct file *file, unsigned int cmd, + unsigned long arg, unsigned int flags) +{ + struct inode *inode = file->f_dentry->d_inode; + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_ioctl_in inarg = { + .fh = ff->fh, + .cmd = cmd, + .arg = arg, + .flags = flags + }; + struct fuse_ioctl_out outarg; + struct fuse_req *req = NULL; + struct page **pages = NULL; + struct page *iov_page = NULL; + struct iovec *in_iov = NULL, *out_iov = NULL; + unsigned int in_iovs = 0, out_iovs = 0, num_pages = 0, max_pages; + size_t in_size, out_size, transferred; + int err; + + /* assume all the iovs returned by client always fits in a page */ + BUILD_BUG_ON(sizeof(struct iovec) * FUSE_IOCTL_MAX_IOV > PAGE_SIZE); + + if (!fuse_allow_task(fc, current)) + return -EACCES; + + err = -EIO; + if (is_bad_inode(inode)) + goto out; + + err = -ENOMEM; + pages = kzalloc(sizeof(pages[0]) * FUSE_MAX_PAGES_PER_REQ, GFP_KERNEL); + iov_page = alloc_page(GFP_KERNEL); + if (!pages || !iov_page) + goto out; + + /* + * If restricted, initialize IO parameters as encoded in @cmd. + * RETRY from server is not allowed. + */ + if (!(flags & FUSE_IOCTL_UNRESTRICTED)) { + struct iovec *iov = page_address(iov_page); + + iov->iov_base = (void __user *)arg; + iov->iov_len = _IOC_SIZE(cmd); + + if (_IOC_DIR(cmd) & _IOC_WRITE) { + in_iov = iov; + in_iovs = 1; + } + + if (_IOC_DIR(cmd) & _IOC_READ) { + out_iov = iov; + out_iovs = 1; + } + } + + retry: + inarg.in_size = in_size = iov_length(in_iov, in_iovs); + inarg.out_size = out_size = iov_length(out_iov, out_iovs); + + /* + * Out data can be used either for actual out data or iovs, + * make sure there always is at least one page. + */ + out_size = max_t(size_t, out_size, PAGE_SIZE); + max_pages = DIV_ROUND_UP(max(in_size, out_size), PAGE_SIZE); + + /* make sure there are enough buffer pages and init request with them */ + err = -ENOMEM; + if (max_pages > FUSE_MAX_PAGES_PER_REQ) + goto out; + while (num_pages < max_pages) { + pages[num_pages] = alloc_page(GFP_KERNEL | __GFP_HIGHMEM); + if (!pages[num_pages]) + goto out; + num_pages++; + } + + req = fuse_get_req(fc); + if (IS_ERR(req)) { + err = PTR_ERR(req); + req = NULL; + goto out; + } + memcpy(req->pages, pages, sizeof(req->pages[0]) * num_pages); + req->num_pages = num_pages; + + /* okay, let's send it to the client */ + req->in.h.opcode = FUSE_IOCTL; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + if (in_size) { + req->in.numargs++; + req->in.args[1].size = in_size; + req->in.argpages = 1; + + err = fuse_ioctl_copy_user(pages, in_iov, in_iovs, in_size, + false); + if (err) + goto out; + } + + req->out.numargs = 2; + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + req->out.args[1].size = out_size; + req->out.argpages = 1; + req->out.argvar = 1; + + fuse_request_send(fc, req); + err = req->out.h.error; + transferred = req->out.args[1].size; + fuse_put_request(fc, req); + req = NULL; + if (err) + goto out; + + /* did it ask for retry? */ + if (outarg.flags & FUSE_IOCTL_RETRY) { + char *vaddr; + + /* no retry if in restricted mode */ + err = -EIO; + if (!(flags & FUSE_IOCTL_UNRESTRICTED)) + goto out; + + in_iovs = outarg.in_iovs; + out_iovs = outarg.out_iovs; + + /* + * Make sure things are in boundary, separate checks + * are to protect against overflow. + */ + err = -ENOMEM; + if (in_iovs > FUSE_IOCTL_MAX_IOV || + out_iovs > FUSE_IOCTL_MAX_IOV || + in_iovs + out_iovs > FUSE_IOCTL_MAX_IOV) + goto out; + + err = -EIO; + if ((in_iovs + out_iovs) * sizeof(struct iovec) != transferred) + goto out; + + /* okay, copy in iovs and retry */ + vaddr = kmap_atomic(pages[0], KM_USER0); + memcpy(page_address(iov_page), vaddr, transferred); + kunmap_atomic(vaddr, KM_USER0); + + in_iov = page_address(iov_page); + out_iov = in_iov + in_iovs; + + goto retry; + } + + err = -EIO; + if (transferred > inarg.out_size) + goto out; + + err = fuse_ioctl_copy_user(pages, out_iov, out_iovs, transferred, true); + out: + if (req) + fuse_put_request(fc, req); + if (iov_page) + __free_page(iov_page); + while (num_pages) + __free_page(pages[--num_pages]); + kfree(pages); + + return err ? err : outarg.result; +} + +static long fuse_file_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return fuse_file_do_ioctl(file, cmd, arg, 0); +} + +static long fuse_file_compat_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + return fuse_file_do_ioctl(file, cmd, arg, FUSE_IOCTL_COMPAT); +} + +/* + * All files which have been polled are linked to RB tree + * fuse_conn->polled_files which is indexed by kh. Walk the tree and + * find the matching one. + */ +static struct rb_node **fuse_find_polled_node(struct fuse_conn *fc, u64 kh, + struct rb_node **parent_out) +{ + struct rb_node **link = &fc->polled_files.rb_node; + struct rb_node *last = NULL; + + while (*link) { + struct fuse_file *ff; + + last = *link; + ff = rb_entry(last, struct fuse_file, polled_node); + + if (kh < ff->kh) + link = &last->rb_left; + else if (kh > ff->kh) + link = &last->rb_right; + else + return link; + } + + if (parent_out) + *parent_out = last; + return link; +} + +/* + * The file is about to be polled. Make sure it's on the polled_files + * RB tree. Note that files once added to the polled_files tree are + * not removed before the file is released. This is because a file + * polled once is likely to be polled again. + */ +static void fuse_register_polled_file(struct fuse_conn *fc, + struct fuse_file *ff) +{ + spin_lock(&fc->lock); + if (RB_EMPTY_NODE(&ff->polled_node)) { + struct rb_node **link, *parent; + + link = fuse_find_polled_node(fc, ff->kh, &parent); + BUG_ON(*link); + rb_link_node(&ff->polled_node, parent, link); + rb_insert_color(&ff->polled_node, &fc->polled_files); + } + spin_unlock(&fc->lock); +} + +static unsigned fuse_file_poll(struct file *file, poll_table *wait) +{ + struct inode *inode = file->f_dentry->d_inode; + struct fuse_file *ff = file->private_data; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_poll_in inarg = { .fh = ff->fh, .kh = ff->kh }; + struct fuse_poll_out outarg; + struct fuse_req *req; + int err; + + if (fc->no_poll) + return DEFAULT_POLLMASK; + + poll_wait(file, &ff->poll_wait, wait); + + /* + * Ask for notification iff there's someone waiting for it. + * The client may ignore the flag and always notify. + */ + if (waitqueue_active(&ff->poll_wait)) { + inarg.flags |= FUSE_POLL_SCHEDULE_NOTIFY; + fuse_register_polled_file(fc, ff); + } + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->in.h.opcode = FUSE_POLL; + req->in.h.nodeid = get_node_id(inode); + req->in.numargs = 1; + req->in.args[0].size = sizeof(inarg); + req->in.args[0].value = &inarg; + req->out.numargs = 1; + req->out.args[0].size = sizeof(outarg); + req->out.args[0].value = &outarg; + fuse_request_send(fc, req); + err = req->out.h.error; + fuse_put_request(fc, req); + + if (!err) + return outarg.revents; + if (err == -ENOSYS) { + fc->no_poll = 1; + return DEFAULT_POLLMASK; + } + return POLLERR; +} + +/* + * This is called from fuse_handle_notify() on FUSE_NOTIFY_POLL and + * wakes up the poll waiters. + */ +int fuse_notify_poll_wakeup(struct fuse_conn *fc, + struct fuse_notify_poll_wakeup_out *outarg) +{ + u64 kh = outarg->kh; + struct rb_node **link; + + spin_lock(&fc->lock); + + link = fuse_find_polled_node(fc, kh, NULL); + if (*link) { + struct fuse_file *ff; + + ff = rb_entry(*link, struct fuse_file, polled_node); + wake_up_interruptible_sync(&ff->poll_wait); + } + + spin_unlock(&fc->lock); + return 0; +} + static const struct file_operations fuse_file_operations = { .llseek = fuse_file_llseek, .read = do_sync_read, @@ -1484,6 +1897,9 @@ static const struct file_operations fuse_file_operations = { .lock = fuse_file_lock, .flock = fuse_file_flock, .splice_read = generic_file_splice_read, + .unlocked_ioctl = fuse_file_ioctl, + .compat_ioctl = fuse_file_compat_ioctl, + .poll = fuse_file_poll, }; static const struct file_operations fuse_direct_io_file_operations = { @@ -1496,6 +1912,9 @@ static const struct file_operations fuse_direct_io_file_operations = { .fsync = fuse_fsync, .lock = fuse_file_lock, .flock = fuse_file_flock, + .unlocked_ioctl = fuse_file_ioctl, + .compat_ioctl = fuse_file_compat_ioctl, + .poll = fuse_file_poll, /* no mmap and splice_read */ }; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 35accfdd747..5e64b815a5a 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -19,6 +19,8 @@ #include <linux/backing-dev.h> #include <linux/mutex.h> #include <linux/rwsem.h> +#include <linux/rbtree.h> +#include <linux/poll.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -100,6 +102,9 @@ struct fuse_file { /** Request reserved for flush and release */ struct fuse_req *reserved_req; + /** Kernel file handle guaranteed to be unique */ + u64 kh; + /** File handle used by userspace */ u64 fh; @@ -108,6 +113,12 @@ struct fuse_file { /** Entry on inode's write_files list */ struct list_head write_entry; + + /** RB node to be linked on fuse_conn->polled_files */ + struct rb_node polled_node; + + /** Wait queue head for poll */ + wait_queue_head_t poll_wait; }; /** One input argument of a request */ @@ -322,6 +333,12 @@ struct fuse_conn { /** The list of requests under I/O */ struct list_head io; + /** The next unique kernel file handle */ + u64 khctr; + + /** rbtree of fuse_files waiting for poll events indexed by ph */ + struct rb_root polled_files; + /** Number of requests currently in the background */ unsigned num_background; @@ -355,19 +372,19 @@ struct fuse_conn { /** Connection failed (version mismatch). Cannot race with setting other bitfields since it is only set once in INIT reply, before any other request, and never cleared */ - unsigned conn_error : 1; + unsigned conn_error:1; /** Connection successful. Only set in INIT */ - unsigned conn_init : 1; + unsigned conn_init:1; /** Do readpages asynchronously? Only set in INIT */ - unsigned async_read : 1; + unsigned async_read:1; /** Do not send separate SETATTR request before open(O_TRUNC) */ - unsigned atomic_o_trunc : 1; + unsigned atomic_o_trunc:1; /** Filesystem supports NFS exporting. Only set in INIT */ - unsigned export_support : 1; + unsigned export_support:1; /* * The following bitfields are only for optimization purposes @@ -375,43 +392,46 @@ struct fuse_conn { */ /** Is fsync not implemented by fs? */ - unsigned no_fsync : 1; + unsigned no_fsync:1; /** Is fsyncdir not implemented by fs? */ - unsigned no_fsyncdir : 1; + unsigned no_fsyncdir:1; /** Is flush not implemented by fs? */ - unsigned no_flush : 1; + unsigned no_flush:1; /** Is setxattr not implemented by fs? */ - unsigned no_setxattr : 1; + unsigned no_setxattr:1; /** Is getxattr not implemented by fs? */ - unsigned no_getxattr : 1; + unsigned no_getxattr:1; /** Is listxattr not implemented by fs? */ - unsigned no_listxattr : 1; + unsigned no_listxattr:1; /** Is removexattr not implemented by fs? */ - unsigned no_removexattr : 1; + unsigned no_removexattr:1; /** Are file locking primitives not implemented by fs? */ - unsigned no_lock : 1; + unsigned no_lock:1; /** Is access not implemented by fs? */ - unsigned no_access : 1; + unsigned no_access:1; /** Is create not implemented by fs? */ - unsigned no_create : 1; + unsigned no_create:1; /** Is interrupt not implemented by fs? */ - unsigned no_interrupt : 1; + unsigned no_interrupt:1; /** Is bmap not implemented by fs? */ - unsigned no_bmap : 1; + unsigned no_bmap:1; + + /** Is poll not implemented by fs? */ + unsigned no_poll:1; /** Do multi-page cached writes */ - unsigned big_writes : 1; + unsigned big_writes:1; /** The number of requests waiting for completion */ atomic_t num_waiting; @@ -445,6 +465,9 @@ struct fuse_conn { /** Version counter for attribute changes */ u64 attr_version; + + /** Called on final put */ + void (*release)(struct fuse_conn *); }; static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) @@ -499,7 +522,7 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, */ int fuse_open_common(struct inode *inode, struct file *file, int isdir); -struct fuse_file *fuse_file_alloc(void); +struct fuse_file *fuse_file_alloc(struct fuse_conn *fc); void fuse_file_free(struct fuse_file *ff); void fuse_finish_open(struct inode *inode, struct file *file, struct fuse_file *ff, struct fuse_open_out *outarg); @@ -519,6 +542,12 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, int isdir); /** + * Notify poll wakeup + */ +int fuse_notify_poll_wakeup(struct fuse_conn *fc, + struct fuse_notify_poll_wakeup_out *outarg); + +/** * Initialize file operations on a regular file */ void fuse_init_file_inode(struct inode *inode); @@ -593,19 +622,20 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); /** * Send a request (synchronous) */ -void request_send(struct fuse_conn *fc, struct fuse_req *req); +void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req); /** * Send a request with no reply */ -void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); +void fuse_request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); /** * Send a request in the background */ -void request_send_background(struct fuse_conn *fc, struct fuse_req *req); +void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req); -void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req); +void fuse_request_send_background_locked(struct fuse_conn *fc, + struct fuse_req *req); /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); @@ -623,6 +653,11 @@ void fuse_invalidate_entry_cache(struct dentry *entry); struct fuse_conn *fuse_conn_get(struct fuse_conn *fc); /** + * Initialize fuse_conn + */ +int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb); + +/** * Release reference to fuse_conn */ void fuse_conn_put(struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 2e99f34b443..47c96fdca1a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -37,10 +37,10 @@ struct fuse_mount_data { unsigned rootmode; unsigned user_id; unsigned group_id; - unsigned fd_present : 1; - unsigned rootmode_present : 1; - unsigned user_id_present : 1; - unsigned group_id_present : 1; + unsigned fd_present:1; + unsigned rootmode_present:1; + unsigned user_id_present:1; + unsigned group_id_present:1; unsigned flags; unsigned max_read; unsigned blksize; @@ -94,7 +94,7 @@ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, req->in.numargs = 1; req->in.args[0].size = sizeof(struct fuse_forget_in); req->in.args[0].value = inarg; - request_send_noreply(fc, req); + fuse_request_send_noreply(fc, req); } static void fuse_clear_inode(struct inode *inode) @@ -250,7 +250,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid, fi = get_fuse_inode(inode); spin_lock(&fc->lock); - fi->nlookup ++; + fi->nlookup++; spin_unlock(&fc->lock); fuse_change_attributes(inode, attr, attr_valid, attr_version); @@ -269,7 +269,7 @@ static void fuse_send_destroy(struct fuse_conn *fc) fc->destroy_req = NULL; req->in.h.opcode = FUSE_DESTROY; req->force = 1; - request_send(fc, req); + fuse_request_send(fc, req); fuse_put_request(fc, req); } } @@ -334,7 +334,7 @@ static int fuse_statfs(struct dentry *dentry, struct kstatfs *buf) req->out.args[0].size = fc->minor < 4 ? FUSE_COMPAT_STATFS_SIZE : sizeof(outarg); req->out.args[0].value = &outarg; - request_send(fc, req); + fuse_request_send(fc, req); err = req->out.h.error; if (!err) convert_fuse_statfs(buf, &outarg.st); @@ -462,68 +462,69 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) return 0; } -static struct fuse_conn *new_conn(struct super_block *sb) +int fuse_conn_init(struct fuse_conn *fc, struct super_block *sb) { - struct fuse_conn *fc; int err; - fc = kzalloc(sizeof(*fc), GFP_KERNEL); - if (fc) { - spin_lock_init(&fc->lock); - mutex_init(&fc->inst_mutex); - atomic_set(&fc->count, 1); - init_waitqueue_head(&fc->waitq); - init_waitqueue_head(&fc->blocked_waitq); - init_waitqueue_head(&fc->reserved_req_waitq); - INIT_LIST_HEAD(&fc->pending); - INIT_LIST_HEAD(&fc->processing); - INIT_LIST_HEAD(&fc->io); - INIT_LIST_HEAD(&fc->interrupts); - INIT_LIST_HEAD(&fc->bg_queue); - atomic_set(&fc->num_waiting, 0); - fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; - fc->bdi.unplug_io_fn = default_unplug_io_fn; - /* fuse does it's own writeback accounting */ - fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; - fc->dev = sb->s_dev; - err = bdi_init(&fc->bdi); - if (err) - goto error_kfree; - if (sb->s_bdev) { - err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", - MAJOR(fc->dev), MINOR(fc->dev)); - } else { - err = bdi_register_dev(&fc->bdi, fc->dev); - } - if (err) - goto error_bdi_destroy; - /* - * For a single fuse filesystem use max 1% of dirty + - * writeback threshold. - * - * This gives about 1M of write buffer for memory maps on a - * machine with 1G and 10% dirty_ratio, which should be more - * than enough. - * - * Privileged users can raise it by writing to - * - * /sys/class/bdi/<bdi>/max_ratio - */ - bdi_set_max_ratio(&fc->bdi, 1); - fc->reqctr = 0; - fc->blocked = 1; - fc->attr_version = 1; - get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); + memset(fc, 0, sizeof(*fc)); + spin_lock_init(&fc->lock); + mutex_init(&fc->inst_mutex); + atomic_set(&fc->count, 1); + init_waitqueue_head(&fc->waitq); + init_waitqueue_head(&fc->blocked_waitq); + init_waitqueue_head(&fc->reserved_req_waitq); + INIT_LIST_HEAD(&fc->pending); + INIT_LIST_HEAD(&fc->processing); + INIT_LIST_HEAD(&fc->io); + INIT_LIST_HEAD(&fc->interrupts); + INIT_LIST_HEAD(&fc->bg_queue); + INIT_LIST_HEAD(&fc->entry); + atomic_set(&fc->num_waiting, 0); + fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; + fc->bdi.unplug_io_fn = default_unplug_io_fn; + /* fuse does it's own writeback accounting */ + fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; + fc->khctr = 0; + fc->polled_files = RB_ROOT; + fc->dev = sb->s_dev; + err = bdi_init(&fc->bdi); + if (err) + goto error_mutex_destroy; + if (sb->s_bdev) { + err = bdi_register(&fc->bdi, NULL, "%u:%u-fuseblk", + MAJOR(fc->dev), MINOR(fc->dev)); + } else { + err = bdi_register_dev(&fc->bdi, fc->dev); } - return fc; + if (err) + goto error_bdi_destroy; + /* + * For a single fuse filesystem use max 1% of dirty + + * writeback threshold. + * + * This gives about 1M of write buffer for memory maps on a + * machine with 1G and 10% dirty_ratio, which should be more + * than enough. + * + * Privileged users can raise it by writing to + * + * /sys/class/bdi/<bdi>/max_ratio + */ + bdi_set_max_ratio(&fc->bdi, 1); + fc->reqctr = 0; + fc->blocked = 1; + fc->attr_version = 1; + get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); -error_bdi_destroy: + return 0; + + error_bdi_destroy: bdi_destroy(&fc->bdi); -error_kfree: + error_mutex_destroy: mutex_destroy(&fc->inst_mutex); - kfree(fc); - return NULL; + return err; } +EXPORT_SYMBOL_GPL(fuse_conn_init); void fuse_conn_put(struct fuse_conn *fc) { @@ -532,7 +533,7 @@ void fuse_conn_put(struct fuse_conn *fc) fuse_request_free(fc->destroy_req); mutex_destroy(&fc->inst_mutex); bdi_destroy(&fc->bdi); - kfree(fc); + fc->release(fc); } } @@ -542,7 +543,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) return fc; } -static struct inode *get_root_inode(struct super_block *sb, unsigned mode) +static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) { struct fuse_attr attr; memset(&attr, 0, sizeof(attr)); @@ -553,8 +554,7 @@ static struct inode *get_root_inode(struct super_block *sb, unsigned mode) return fuse_iget(sb, 1, 0, &attr, 0, 0); } -struct fuse_inode_handle -{ +struct fuse_inode_handle { u64 nodeid; u32 generation; }; @@ -761,7 +761,6 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->max_write = max_t(unsigned, 4096, fc->max_write); fc->conn_init = 1; } - fuse_put_request(fc, req); fc->blocked = 0; wake_up_all(&fc->blocked_waitq); } @@ -787,7 +786,12 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) req->out.args[0].size = sizeof(struct fuse_init_out); req->out.args[0].value = &req->misc.init_out; req->end = process_init_reply; - request_send_background(fc, req); + fuse_request_send_background(fc, req); +} + +static void fuse_free_conn(struct fuse_conn *fc) +{ + kfree(fc); } static int fuse_fill_super(struct super_block *sb, void *data, int silent) @@ -828,10 +832,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (file->f_op != &fuse_dev_operations) return -EINVAL; - fc = new_conn(sb); + fc = kmalloc(sizeof(*fc), GFP_KERNEL); if (!fc) return -ENOMEM; + err = fuse_conn_init(fc, sb); + if (err) { + kfree(fc); + return err; + } + + fc->release = fuse_free_conn; fc->flags = d.flags; fc->user_id = d.user_id; fc->group_id = d.group_id; @@ -841,7 +852,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = fc; err = -ENOMEM; - root = get_root_inode(sb, d.rootmode); + root = fuse_get_root_inode(sb, d.rootmode); if (!root) goto err; @@ -952,7 +963,7 @@ static inline void unregister_fuseblk(void) static void fuse_inode_init_once(void *foo) { - struct inode * inode = foo; + struct inode *inode = foo; inode_init_once(inode); } @@ -1031,7 +1042,7 @@ static int __init fuse_init(void) { int res; - printk("fuse init (API version %i.%i)\n", + printk(KERN_INFO "fuse init (API version %i.%i)\n", FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); INIT_LIST_HEAD(&fuse_conn_list); diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index ab2f57e3fb8..e563a644981 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -1,6 +1,6 @@ config GFS2_FS tristate "GFS2 file system support" - depends on EXPERIMENTAL && (64BIT || (LSF && LBD)) + depends on EXPERIMENTAL && (64BIT || LBD) select FS_POSIX_ACL select CRC32 help diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index ec65851ec80..c1b4ec6a965 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -1,5 +1,5 @@ obj-$(CONFIG_GFS2_FS) += gfs2.o -gfs2-y := acl.o bmap.o daemon.o dir.o eaops.o eattr.o glock.o \ +gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ glops.o inode.o log.o lops.o locking.o main.o meta_io.o \ mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ ops_fstype.o ops_inode.o ops_super.o quota.o \ diff --git a/fs/gfs2/acl.c b/fs/gfs2/acl.c index 3e9bd46f27e..e335dceb6a4 100644 --- a/fs/gfs2/acl.c +++ b/fs/gfs2/acl.c @@ -91,7 +91,7 @@ static int acl_get(struct gfs2_inode *ip, int access, struct posix_acl **acl, struct gfs2_ea_location el_this; int error; - if (!ip->i_di.di_eattr) + if (!ip->i_eattr) return 0; memset(&er, 0, sizeof(struct gfs2_ea_request)); diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index bec76b1c2bb..11ffc56f1f8 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -75,9 +75,9 @@ static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh, void *kaddr = kmap(page); memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), - ip->i_di.di_size); - memset(kaddr + ip->i_di.di_size, 0, - PAGE_CACHE_SIZE - ip->i_di.di_size); + ip->i_disksize); + memset(kaddr + ip->i_disksize, 0, + PAGE_CACHE_SIZE - ip->i_disksize); kunmap(page); SetPageUptodate(page); @@ -132,7 +132,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) if (error) goto out; - if (ip->i_di.di_size) { + if (ip->i_disksize) { /* Get a free block, fill it with the stuffed data, and write it out to disk */ @@ -159,7 +159,7 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) di = (struct gfs2_dinode *)dibh->b_data; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); - if (ip->i_di.di_size) { + if (ip->i_disksize) { *(__be64 *)(di + 1) = cpu_to_be64(block); gfs2_add_inode_blocks(&ip->i_inode, 1); di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); @@ -926,7 +926,7 @@ static int do_grow(struct gfs2_inode *ip, u64 size) } } - ip->i_di.di_size = size; + ip->i_disksize = size; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -1033,7 +1033,7 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) goto out; if (gfs2_is_stuffed(ip)) { - ip->i_di.di_size = size; + ip->i_disksize = size; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -1045,9 +1045,9 @@ static int trunc_start(struct gfs2_inode *ip, u64 size) error = gfs2_block_truncate_page(ip->i_inode.i_mapping); if (!error) { - ip->i_di.di_size = size; + ip->i_disksize = size; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - ip->i_di.di_flags |= GFS2_DIF_TRUNC_IN_PROG; + ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); } @@ -1114,13 +1114,13 @@ static int trunc_end(struct gfs2_inode *ip) if (error) goto out; - if (!ip->i_di.di_size) { + if (!ip->i_disksize) { ip->i_height = 0; ip->i_goal = ip->i_no_addr; gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode)); } ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; - ip->i_di.di_flags &= ~GFS2_DIF_TRUNC_IN_PROG; + ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG; gfs2_trans_add_bh(ip->i_gl, dibh, 1); gfs2_dinode_out(ip, dibh->b_data); @@ -1205,9 +1205,9 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size) if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), S_ISREG(ip->i_inode.i_mode))) return -EINVAL; - if (size > ip->i_di.di_size) + if (size > ip->i_disksize) error = do_grow(ip, size); - else if (size < ip->i_di.di_size) + else if (size < ip->i_disksize) error = do_shrink(ip, size); else /* update time stamps */ @@ -1219,7 +1219,7 @@ int gfs2_truncatei(struct gfs2_inode *ip, u64 size) int gfs2_truncatei_resume(struct gfs2_inode *ip) { int error; - error = trunc_dealloc(ip, ip->i_di.di_size); + error = trunc_dealloc(ip, ip->i_disksize); if (!error) error = trunc_end(ip); return error; @@ -1231,35 +1231,6 @@ int gfs2_file_dealloc(struct gfs2_inode *ip) } /** - * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file - * @ip: the file - * @len: the number of bytes to be written to the file - * @data_blocks: returns the number of data blocks required - * @ind_blocks: returns the number of indirect blocks required - * - */ - -void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len, - unsigned int *data_blocks, unsigned int *ind_blocks) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - unsigned int tmp; - - if (gfs2_is_dir(ip)) { - *data_blocks = DIV_ROUND_UP(len, sdp->sd_jbsize) + 2; - *ind_blocks = 3 * (sdp->sd_max_jheight - 1); - } else { - *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3; - *ind_blocks = 3 * (sdp->sd_max_height - 1); - } - - for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) { - tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); - *ind_blocks += tmp; - } -} - -/** * gfs2_write_alloc_required - figure out if a write will require an allocation * @ip: the file being written to * @offset: the offset to write to @@ -1276,6 +1247,7 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, struct buffer_head bh; unsigned int shift; u64 lblock, lblock_stop, size; + u64 end_of_file; *alloc_required = 0; @@ -1291,19 +1263,12 @@ int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, *alloc_required = 1; shift = sdp->sd_sb.sb_bsize_shift; - if (gfs2_is_dir(ip)) { - unsigned int bsize = sdp->sd_jbsize; - lblock = offset; - do_div(lblock, bsize); - lblock_stop = offset + len + bsize - 1; - do_div(lblock_stop, bsize); - } else { - u64 end_of_file = (ip->i_di.di_size + sdp->sd_sb.sb_bsize - 1) >> shift; - lblock = offset >> shift; - lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; - if (lblock_stop > end_of_file) - return 0; - } + BUG_ON(gfs2_is_dir(ip)); + end_of_file = (ip->i_disksize + sdp->sd_sb.sb_bsize - 1) >> shift; + lblock = offset >> shift; + lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift; + if (lblock_stop > end_of_file) + return 0; size = (lblock_stop - lblock) << shift; do { diff --git a/fs/gfs2/bmap.h b/fs/gfs2/bmap.h index 4e6cde2943b..c983177e05a 100644 --- a/fs/gfs2/bmap.h +++ b/fs/gfs2/bmap.h @@ -10,10 +10,40 @@ #ifndef __BMAP_DOT_H__ #define __BMAP_DOT_H__ +#include "inode.h" + struct inode; struct gfs2_inode; struct page; + +/** + * gfs2_write_calc_reserv - calculate number of blocks needed to write to a file + * @ip: the file + * @len: the number of bytes to be written to the file + * @data_blocks: returns the number of data blocks required + * @ind_blocks: returns the number of indirect blocks required + * + */ + +static inline void gfs2_write_calc_reserv(const struct gfs2_inode *ip, + unsigned int len, + unsigned int *data_blocks, + unsigned int *ind_blocks) +{ + const struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + unsigned int tmp; + + BUG_ON(gfs2_is_dir(ip)); + *data_blocks = (len >> sdp->sd_sb.sb_bsize_shift) + 3; + *ind_blocks = 3 * (sdp->sd_max_height - 1); + + for (tmp = *data_blocks; tmp > sdp->sd_diptrs;) { + tmp = DIV_ROUND_UP(tmp, sdp->sd_inptrs); + *ind_blocks += tmp; + } +} + int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page); int gfs2_block_map(struct inode *inode, sector_t lblock, struct buffer_head *bh, int create); int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsigned *extlen); @@ -21,10 +51,6 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi int gfs2_truncatei(struct gfs2_inode *ip, u64 size); int gfs2_truncatei_resume(struct gfs2_inode *ip); int gfs2_file_dealloc(struct gfs2_inode *ip); - -void gfs2_write_calc_reserv(struct gfs2_inode *ip, unsigned int len, - unsigned int *data_blocks, - unsigned int *ind_blocks); int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset, unsigned int len, int *alloc_required); diff --git a/fs/gfs2/daemon.c b/fs/gfs2/daemon.c deleted file mode 100644 index e51991947d2..00000000000 --- a/fs/gfs2/daemon.c +++ /dev/null @@ -1,136 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/spinlock.h> -#include <linux/completion.h> -#include <linux/buffer_head.h> -#include <linux/kthread.h> -#include <linux/delay.h> -#include <linux/gfs2_ondisk.h> -#include <linux/lm_interface.h> -#include <linux/freezer.h> - -#include "gfs2.h" -#include "incore.h" -#include "daemon.h" -#include "glock.h" -#include "log.h" -#include "quota.h" -#include "recovery.h" -#include "super.h" -#include "util.h" - -/* This uses schedule_timeout() instead of msleep() because it's good for - the daemons to wake up more often than the timeout when unmounting so - the user's unmount doesn't sit there forever. - - The kthread functions used to start these daemons block and flush signals. */ - -/** - * gfs2_glockd - Reclaim unused glock structures - * @sdp: Pointer to GFS2 superblock - * - * One or more of these daemons run, reclaiming glocks on sd_reclaim_list. - * Number of daemons can be set by user, with num_glockd mount option. - */ - -int gfs2_glockd(void *data) -{ - struct gfs2_sbd *sdp = data; - - while (!kthread_should_stop()) { - while (atomic_read(&sdp->sd_reclaim_count)) - gfs2_reclaim_glock(sdp); - - wait_event_interruptible(sdp->sd_reclaim_wq, - (atomic_read(&sdp->sd_reclaim_count) || - kthread_should_stop())); - if (freezing(current)) - refrigerator(); - } - - return 0; -} - -/** - * gfs2_recoverd - Recover dead machine's journals - * @sdp: Pointer to GFS2 superblock - * - */ - -int gfs2_recoverd(void *data) -{ - struct gfs2_sbd *sdp = data; - unsigned long t; - - while (!kthread_should_stop()) { - gfs2_check_journals(sdp); - t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ; - if (freezing(current)) - refrigerator(); - schedule_timeout_interruptible(t); - } - - return 0; -} - -/** - * gfs2_quotad - Write cached quota changes into the quota file - * @sdp: Pointer to GFS2 superblock - * - */ - -int gfs2_quotad(void *data) -{ - struct gfs2_sbd *sdp = data; - unsigned long t; - int error; - - while (!kthread_should_stop()) { - /* Update the master statfs file */ - - t = sdp->sd_statfs_sync_time + - gfs2_tune_get(sdp, gt_statfs_quantum) * HZ; - - if (time_after_eq(jiffies, t)) { - error = gfs2_statfs_sync(sdp); - if (error && - error != -EROFS && - !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - fs_err(sdp, "quotad: (1) error=%d\n", error); - sdp->sd_statfs_sync_time = jiffies; - } - - /* Update quota file */ - - t = sdp->sd_quota_sync_time + - gfs2_tune_get(sdp, gt_quota_quantum) * HZ; - - if (time_after_eq(jiffies, t)) { - error = gfs2_quota_sync(sdp); - if (error && - error != -EROFS && - !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - fs_err(sdp, "quotad: (2) error=%d\n", error); - sdp->sd_quota_sync_time = jiffies; - } - - gfs2_quota_scan(sdp); - - t = gfs2_tune_get(sdp, gt_quotad_secs) * HZ; - if (freezing(current)) - refrigerator(); - schedule_timeout_interruptible(t); - } - - return 0; -} - diff --git a/fs/gfs2/daemon.h b/fs/gfs2/daemon.h deleted file mode 100644 index 4be084fb6a6..00000000000 --- a/fs/gfs2/daemon.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __DAEMON_DOT_H__ -#define __DAEMON_DOT_H__ - -int gfs2_glockd(void *data); -int gfs2_recoverd(void *data); -int gfs2_quotad(void *data); - -#endif /* __DAEMON_DOT_H__ */ diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index eed040d8ba3..b7c8e5c7079 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -36,7 +36,7 @@ * the block. In leaves, they begin at offset sizeof(struct gfs2_leaf) from the * beginning of the leaf block. The dirents reside in leaves when * - * dip->i_di.di_flags & GFS2_DIF_EXHASH is true + * dip->i_diskflags & GFS2_DIF_EXHASH is true * * Otherwise, the dirents are "linear", within a single stuffed dinode block. * @@ -128,8 +128,8 @@ static int gfs2_dir_write_stuffed(struct gfs2_inode *ip, const char *buf, gfs2_trans_add_bh(ip->i_gl, dibh, 1); memcpy(dibh->b_data + offset + sizeof(struct gfs2_dinode), buf, size); - if (ip->i_di.di_size < offset + size) - ip->i_di.di_size = offset + size; + if (ip->i_disksize < offset + size) + ip->i_disksize = offset + size; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(ip, dibh->b_data); @@ -226,8 +226,8 @@ out: if (error) return error; - if (ip->i_di.di_size < offset + copied) - ip->i_di.di_size = offset + copied; + if (ip->i_disksize < offset + copied) + ip->i_disksize = offset + copied; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_trans_add_bh(ip->i_gl, dibh, 1); @@ -277,11 +277,11 @@ static int gfs2_dir_read_data(struct gfs2_inode *ip, char *buf, u64 offset, int copied = 0; int error = 0; - if (offset >= ip->i_di.di_size) + if (offset >= ip->i_disksize) return 0; - if (offset + size > ip->i_di.di_size) - size = ip->i_di.di_size - offset; + if (offset + size > ip->i_disksize) + size = ip->i_disksize - offset; if (!size) return 0; @@ -755,12 +755,12 @@ static struct gfs2_dirent *gfs2_dirent_search(struct inode *inode, struct gfs2_inode *ip = GFS2_I(inode); int error; - if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { + if (ip->i_diskflags & GFS2_DIF_EXHASH) { struct gfs2_leaf *leaf; unsigned hsize = 1 << ip->i_depth; unsigned index; u64 ln; - if (hsize * sizeof(u64) != ip->i_di.di_size) { + if (hsize * sizeof(u64) != ip->i_disksize) { gfs2_consist_inode(ip); return ERR_PTR(-EIO); } @@ -858,8 +858,8 @@ static int dir_make_exhash(struct inode *inode) return -ENOSPC; bn = bh->b_blocknr; - gfs2_assert(sdp, dip->i_di.di_entries < (1 << 16)); - leaf->lf_entries = cpu_to_be16(dip->i_di.di_entries); + gfs2_assert(sdp, dip->i_entries < (1 << 16)); + leaf->lf_entries = cpu_to_be16(dip->i_entries); /* Copy dirents */ @@ -905,9 +905,9 @@ static int dir_make_exhash(struct inode *inode) for (x = sdp->sd_hash_ptrs; x--; lp++) *lp = cpu_to_be64(bn); - dip->i_di.di_size = sdp->sd_sb.sb_bsize / 2; + dip->i_disksize = sdp->sd_sb.sb_bsize / 2; gfs2_add_inode_blocks(&dip->i_inode, 1); - dip->i_di.di_flags |= GFS2_DIF_EXHASH; + dip->i_diskflags |= GFS2_DIF_EXHASH; for (x = sdp->sd_hash_ptrs, y = -1; x; x >>= 1, y++) ; dip->i_depth = y; @@ -1082,7 +1082,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) int error = 0; hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != dip->i_di.di_size) { + if (hsize * sizeof(u64) != dip->i_disksize) { gfs2_consist_inode(dip); return -EIO; } @@ -1091,7 +1091,7 @@ static int dir_double_exhash(struct gfs2_inode *dip) buf = kcalloc(3, sdp->sd_hash_bsize, GFP_NOFS | __GFP_NOFAIL); - for (block = dip->i_di.di_size >> sdp->sd_hash_bsize_shift; block--;) { + for (block = dip->i_disksize >> sdp->sd_hash_bsize_shift; block--;) { error = gfs2_dir_read_data(dip, (char *)buf, block * sdp->sd_hash_bsize, sdp->sd_hash_bsize, 1); @@ -1370,7 +1370,7 @@ static int dir_e_read(struct inode *inode, u64 *offset, void *opaque, unsigned depth = 0; hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != dip->i_di.di_size) { + if (hsize * sizeof(u64) != dip->i_disksize) { gfs2_consist_inode(dip); return -EIO; } @@ -1426,10 +1426,10 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, int copied = 0; int error; - if (!dip->i_di.di_entries) + if (!dip->i_entries) return 0; - if (dip->i_di.di_flags & GFS2_DIF_EXHASH) + if (dip->i_diskflags & GFS2_DIF_EXHASH) return dir_e_read(inode, offset, opaque, filldir); if (!gfs2_is_stuffed(dip)) { @@ -1453,17 +1453,17 @@ int gfs2_dir_read(struct inode *inode, u64 *offset, void *opaque, error = PTR_ERR(dent); goto out; } - if (dip->i_di.di_entries != g.offset) { + if (dip->i_entries != g.offset) { fs_warn(sdp, "Number of entries corrupt in dir %llu, " - "ip->i_di.di_entries (%u) != g.offset (%u)\n", + "ip->i_entries (%u) != g.offset (%u)\n", (unsigned long long)dip->i_no_addr, - dip->i_di.di_entries, + dip->i_entries, g.offset); error = -EIO; goto out; } error = do_filldir_main(dip, offset, opaque, filldir, darr, - dip->i_di.di_entries, &copied); + dip->i_entries, &copied); out: kfree(darr); } @@ -1612,7 +1612,7 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, dent = gfs2_init_dirent(inode, dent, name, bh); gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(type); - if (ip->i_di.di_flags & GFS2_DIF_EXHASH) { + if (ip->i_diskflags & GFS2_DIF_EXHASH) { leaf = (struct gfs2_leaf *)bh->b_data; be16_add_cpu(&leaf->lf_entries, 1); } @@ -1621,14 +1621,14 @@ int gfs2_dir_add(struct inode *inode, const struct qstr *name, if (error) break; gfs2_trans_add_bh(ip->i_gl, bh, 1); - ip->i_di.di_entries++; + ip->i_entries++; ip->i_inode.i_mtime = ip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(ip, bh->b_data); brelse(bh); error = 0; break; } - if (!(ip->i_di.di_flags & GFS2_DIF_EXHASH)) { + if (!(ip->i_diskflags & GFS2_DIF_EXHASH)) { error = dir_make_exhash(inode); if (error) break; @@ -1691,7 +1691,7 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name) } dirent_del(dip, bh, prev, dent); - if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { + if (dip->i_diskflags & GFS2_DIF_EXHASH) { struct gfs2_leaf *leaf = (struct gfs2_leaf *)bh->b_data; u16 entries = be16_to_cpu(leaf->lf_entries); if (!entries) @@ -1704,10 +1704,10 @@ int gfs2_dir_del(struct gfs2_inode *dip, const struct qstr *name) if (error) return error; - if (!dip->i_di.di_entries) + if (!dip->i_entries) gfs2_consist_inode(dip); gfs2_trans_add_bh(dip->i_gl, bh, 1); - dip->i_di.di_entries--; + dip->i_entries--; dip->i_inode.i_mtime = dip->i_inode.i_ctime = CURRENT_TIME; gfs2_dinode_out(dip, bh->b_data); brelse(bh); @@ -1748,7 +1748,7 @@ int gfs2_dir_mvino(struct gfs2_inode *dip, const struct qstr *filename, gfs2_inum_out(nip, dent); dent->de_type = cpu_to_be16(new_type); - if (dip->i_di.di_flags & GFS2_DIF_EXHASH) { + if (dip->i_diskflags & GFS2_DIF_EXHASH) { brelse(bh); error = gfs2_meta_inode_buffer(dip, &bh); if (error) @@ -1784,7 +1784,7 @@ static int foreach_leaf(struct gfs2_inode *dip, leaf_call_t lc, void *data) int error = 0; hsize = 1 << dip->i_depth; - if (hsize * sizeof(u64) != dip->i_di.di_size) { + if (hsize * sizeof(u64) != dip->i_disksize) { gfs2_consist_inode(dip); return -EIO; } diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 8a468cac932..4f919440c3b 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -11,6 +11,7 @@ #define __DIR_DOT_H__ #include <linux/dcache.h> +#include <linux/crc32.h> struct inode; struct gfs2_inode; diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c index e3f76f451b0..0d1c76d906a 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/eattr.c @@ -114,11 +114,11 @@ static int ea_foreach(struct gfs2_inode *ip, ea_call_t ea_call, void *data) __be64 *eablk, *end; int error; - error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &bh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &bh); if (error) return error; - if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) { + if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) { error = ea_foreach_i(ip, bh, ea_call, data); goto out; } @@ -414,7 +414,7 @@ int gfs2_ea_list(struct gfs2_inode *ip, struct gfs2_ea_request *er) if (error) return error; - if (ip->i_di.di_eattr) { + if (ip->i_eattr) { struct ea_list ei = { .ei_er = er, .ei_size = 0 }; error = ea_foreach(ip, ea_list_i, &ei); @@ -514,7 +514,7 @@ int gfs2_ea_get_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) struct gfs2_ea_location el; int error; - if (!ip->i_di.di_eattr) + if (!ip->i_eattr) return -ENODATA; error = gfs2_ea_find(ip, er, &el); @@ -741,7 +741,7 @@ static int ea_init_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (error) return error; - ip->i_di.di_eattr = bh->b_blocknr; + ip->i_eattr = bh->b_blocknr; error = ea_write(ip, GFS2_EA_BH2FIRST(bh), er); brelse(bh); @@ -935,10 +935,10 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, int error; int mh_size = sizeof(struct gfs2_meta_header); - if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { + if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { __be64 *end; - error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh); if (error) return error; @@ -972,9 +972,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, gfs2_buffer_clear_tail(indbh, mh_size); eablk = (__be64 *)(indbh->b_data + mh_size); - *eablk = cpu_to_be64(ip->i_di.di_eattr); - ip->i_di.di_eattr = blk; - ip->i_di.di_flags |= GFS2_DIF_EA_INDIRECT; + *eablk = cpu_to_be64(ip->i_eattr); + ip->i_eattr = blk; + ip->i_diskflags |= GFS2_DIF_EA_INDIRECT; gfs2_add_inode_blocks(&ip->i_inode, 1); eablk++; @@ -1015,7 +1015,7 @@ static int ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er, if (error) return error; - if (!(ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT)) + if (!(ip->i_diskflags & GFS2_DIF_EA_INDIRECT)) blks++; if (GFS2_EAREQ_SIZE_STUFFED(er) > GFS2_SB(&ip->i_inode)->sd_jbsize) blks += DIV_ROUND_UP(er->er_data_len, GFS2_SB(&ip->i_inode)->sd_jbsize); @@ -1040,7 +1040,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) struct gfs2_ea_location el; int error; - if (!ip->i_di.di_eattr) { + if (!ip->i_eattr) { if (er->er_flags & XATTR_REPLACE) return -ENODATA; return ea_init(ip, er); @@ -1051,7 +1051,7 @@ int gfs2_ea_set_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) return error; if (el.el_ea) { - if (ip->i_di.di_flags & GFS2_DIF_APPENDONLY) { + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) { brelse(el.el_bh); return -EPERM; } @@ -1145,7 +1145,7 @@ int gfs2_ea_remove_i(struct gfs2_inode *ip, struct gfs2_ea_request *er) struct gfs2_ea_location el; int error; - if (!ip->i_di.di_eattr) + if (!ip->i_eattr) return -ENODATA; error = gfs2_ea_find(ip, er, &el); @@ -1309,7 +1309,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) memset(&rlist, 0, sizeof(struct gfs2_rgrp_list)); - error = gfs2_meta_read(ip->i_gl, ip->i_di.di_eattr, DIO_WAIT, &indbh); + error = gfs2_meta_read(ip->i_gl, ip->i_eattr, DIO_WAIT, &indbh); if (error) return error; @@ -1388,7 +1388,7 @@ static int ea_dealloc_indirect(struct gfs2_inode *ip) if (bstart) gfs2_free_meta(ip, bstart, blen); - ip->i_di.di_flags &= ~GFS2_DIF_EA_INDIRECT; + ip->i_diskflags &= ~GFS2_DIF_EA_INDIRECT; error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { @@ -1416,7 +1416,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip) struct buffer_head *dibh; int error; - rgd = gfs2_blk2rgrpd(sdp, ip->i_di.di_eattr); + rgd = gfs2_blk2rgrpd(sdp, ip->i_eattr); if (!rgd) { gfs2_consist_inode(ip); return -EIO; @@ -1432,9 +1432,9 @@ static int ea_dealloc_block(struct gfs2_inode *ip) if (error) goto out_gunlock; - gfs2_free_meta(ip, ip->i_di.di_eattr, 1); + gfs2_free_meta(ip, ip->i_eattr, 1); - ip->i_di.di_eattr = 0; + ip->i_eattr = 0; gfs2_add_inode_blocks(&ip->i_inode, -1); error = gfs2_meta_inode_buffer(ip, &dibh); @@ -1479,7 +1479,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip) if (error) goto out_rindex; - if (ip->i_di.di_flags & GFS2_DIF_EA_INDIRECT) { + if (ip->i_diskflags & GFS2_DIF_EA_INDIRECT) { error = ea_dealloc_indirect(ip); if (error) goto out_rindex; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index c962283d4e7..6b983aef785 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -40,6 +40,7 @@ #include "quota.h" #include "super.h" #include "util.h" +#include "bmap.h" struct gfs2_gl_hash_bucket { struct hlist_head hb_list; @@ -61,9 +62,10 @@ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int static DECLARE_RWSEM(gfs2_umount_flush_sem); static struct dentry *gfs2_root; -static struct task_struct *scand_process; -static unsigned int scand_secs = 5; static struct workqueue_struct *glock_workqueue; +static LIST_HEAD(lru_list); +static atomic_t lru_count = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(lru_lock); #define GFS2_GL_HASH_SHIFT 15 #define GFS2_GL_HASH_SIZE (1 << GFS2_GL_HASH_SHIFT) @@ -174,6 +176,22 @@ static void gfs2_glock_hold(struct gfs2_glock *gl) } /** + * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list + * @gl: the glock + * + */ + +static void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) +{ + spin_lock(&lru_lock); + if (list_empty(&gl->gl_lru) && gl->gl_state != LM_ST_UNLOCKED) { + list_add_tail(&gl->gl_lru, &lru_list); + atomic_inc(&lru_count); + } + spin_unlock(&lru_lock); +} + +/** * gfs2_glock_put() - Decrement reference count on glock * @gl: The glock to put * @@ -187,14 +205,23 @@ int gfs2_glock_put(struct gfs2_glock *gl) if (atomic_dec_and_test(&gl->gl_ref)) { hlist_del(&gl->gl_list); write_unlock(gl_lock_addr(gl->gl_hash)); + spin_lock(&lru_lock); + if (!list_empty(&gl->gl_lru)) { + list_del_init(&gl->gl_lru); + atomic_dec(&lru_count); + } + spin_unlock(&lru_lock); GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_UNLOCKED); - GLOCK_BUG_ON(gl, !list_empty(&gl->gl_reclaim)); + GLOCK_BUG_ON(gl, !list_empty(&gl->gl_lru)); GLOCK_BUG_ON(gl, !list_empty(&gl->gl_holders)); glock_free(gl); rv = 1; goto out; } write_unlock(gl_lock_addr(gl->gl_hash)); + /* 1 for being hashed, 1 for having state != LM_ST_UNLOCKED */ + if (atomic_read(&gl->gl_ref) == 2) + gfs2_glock_schedule_for_reclaim(gl); out: return rv; } @@ -289,10 +316,13 @@ static void gfs2_holder_wake(struct gfs2_holder *gh) * do_promote - promote as many requests as possible on the current queue * @gl: The glock * - * Returns: true if there is a blocked holder at the head of the list + * Returns: 1 if there is a blocked holder at the head of the list, or 2 + * if a type specific operation is underway. */ static int do_promote(struct gfs2_glock *gl) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) { const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_holder *gh, *tmp; @@ -310,6 +340,8 @@ restart: ret = glops->go_lock(gh); spin_lock(&gl->gl_spin); if (ret) { + if (ret == 1) + return 2; gh->gh_error = ret; list_del_init(&gh->gh_list); gfs2_holder_wake(gh); @@ -414,6 +446,7 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_holder *gh; unsigned state = ret & LM_OUT_ST_MASK; + int rv; spin_lock(&gl->gl_spin); state_change(gl, state); @@ -468,7 +501,6 @@ retry: gfs2_demote_wake(gl); if (state != LM_ST_UNLOCKED) { if (glops->go_xmote_bh) { - int rv; spin_unlock(&gl->gl_spin); rv = glops->go_xmote_bh(gl, gh); if (rv == -EAGAIN) @@ -479,10 +511,13 @@ retry: goto out; } } - do_promote(gl); + rv = do_promote(gl); + if (rv == 2) + goto out_locked; } out: clear_bit(GLF_LOCK, &gl->gl_flags); +out_locked: spin_unlock(&gl->gl_spin); gfs2_glock_put(gl); } @@ -511,6 +546,8 @@ static unsigned int gfs2_lm_lock(struct gfs2_sbd *sdp, void *lock, */ static void do_xmote(struct gfs2_glock *gl, struct gfs2_holder *gh, unsigned int target) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) { const struct gfs2_glock_operations *glops = gl->gl_ops; struct gfs2_sbd *sdp = gl->gl_sbd; @@ -576,8 +613,11 @@ static inline struct gfs2_holder *find_first_holder(const struct gfs2_glock *gl) */ static void run_queue(struct gfs2_glock *gl, const int nonblock) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) { struct gfs2_holder *gh = NULL; + int ret; if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) return; @@ -596,8 +636,11 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock) } else { if (test_bit(GLF_DEMOTE, &gl->gl_flags)) gfs2_demote_wake(gl); - if (do_promote(gl) == 0) + ret = do_promote(gl); + if (ret == 0) goto out; + if (ret == 2) + return; gh = find_first_waiter(gl); gl->gl_target = gh->gh_state; if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) @@ -820,7 +863,7 @@ static void wait_on_demote(struct gfs2_glock *gl) */ static void handle_callback(struct gfs2_glock *gl, unsigned int state, - int remote, unsigned long delay) + unsigned long delay) { int bit = delay ? GLF_PENDING_DEMOTE : GLF_DEMOTE; @@ -828,9 +871,6 @@ static void handle_callback(struct gfs2_glock *gl, unsigned int state, if (gl->gl_demote_state == LM_ST_EXCLUSIVE) { gl->gl_demote_state = state; gl->gl_demote_time = jiffies; - if (remote && gl->gl_ops->go_type == LM_TYPE_IOPEN && - gl->gl_object) - gfs2_glock_schedule_for_reclaim(gl); } else if (gl->gl_demote_state != LM_ST_UNLOCKED && gl->gl_demote_state != state) { gl->gl_demote_state = LM_ST_UNLOCKED; @@ -877,6 +917,8 @@ void gfs2_print_dbg(struct seq_file *seq, const char *fmt, ...) */ static inline void add_to_queue(struct gfs2_holder *gh) +__releases(&gl->gl_spin) +__acquires(&gl->gl_spin) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_sbd; @@ -998,7 +1040,7 @@ void gfs2_glock_dq(struct gfs2_holder *gh) spin_lock(&gl->gl_spin); if (gh->gh_flags & GL_NOCACHE) - handle_callback(gl, LM_ST_UNLOCKED, 0, 0); + handle_callback(gl, LM_ST_UNLOCKED, 0); list_del_init(&gh->gh_list); if (find_first_holder(gl) == NULL) { @@ -1269,12 +1311,26 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name, delay = gl->gl_ops->go_min_hold_time; spin_lock(&gl->gl_spin); - handle_callback(gl, state, 1, delay); + handle_callback(gl, state, delay); spin_unlock(&gl->gl_spin); if (queue_delayed_work(glock_workqueue, &gl->gl_work, delay) == 0) gfs2_glock_put(gl); } +static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid) +{ + struct gfs2_jdesc *jd; + + spin_lock(&sdp->sd_jindex_spin); + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (jd->jd_jid != jid) + continue; + jd->jd_dirty = 1; + break; + } + spin_unlock(&sdp->sd_jindex_spin); +} + /** * gfs2_glock_cb - Callback used by locking module * @sdp: Pointer to the superblock @@ -1338,80 +1394,83 @@ void gfs2_glock_cb(void *cb_data, unsigned int type, void *data) * Returns: 1 if it's ok */ -static int demote_ok(struct gfs2_glock *gl) +static int demote_ok(const struct gfs2_glock *gl) { const struct gfs2_glock_operations *glops = gl->gl_ops; - int demote = 1; - - if (test_bit(GLF_STICKY, &gl->gl_flags)) - demote = 0; - else if (glops->go_demote_ok) - demote = glops->go_demote_ok(gl); - - return demote; -} - -/** - * gfs2_glock_schedule_for_reclaim - Add a glock to the reclaim list - * @gl: the glock - * - */ - -void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl) -{ - struct gfs2_sbd *sdp = gl->gl_sbd; - spin_lock(&sdp->sd_reclaim_lock); - if (list_empty(&gl->gl_reclaim)) { - gfs2_glock_hold(gl); - list_add(&gl->gl_reclaim, &sdp->sd_reclaim_list); - atomic_inc(&sdp->sd_reclaim_count); - spin_unlock(&sdp->sd_reclaim_lock); - wake_up(&sdp->sd_reclaim_wq); - } else - spin_unlock(&sdp->sd_reclaim_lock); + if (gl->gl_state == LM_ST_UNLOCKED) + return 0; + if (!list_empty(&gl->gl_holders)) + return 0; + if (glops->go_demote_ok) + return glops->go_demote_ok(gl); + return 1; } -/** - * gfs2_reclaim_glock - process the next glock on the filesystem's reclaim list - * @sdp: the filesystem - * - * Called from gfs2_glockd() glock reclaim daemon, or when promoting a - * different glock and we notice that there are a lot of glocks in the - * reclaim list. - * - */ -void gfs2_reclaim_glock(struct gfs2_sbd *sdp) +static int gfs2_shrink_glock_memory(int nr, gfp_t gfp_mask) { struct gfs2_glock *gl; - int done_callback = 0; + int may_demote; + int nr_skipped = 0; + int got_ref = 0; + LIST_HEAD(skipped); - spin_lock(&sdp->sd_reclaim_lock); - if (list_empty(&sdp->sd_reclaim_list)) { - spin_unlock(&sdp->sd_reclaim_lock); - return; - } - gl = list_entry(sdp->sd_reclaim_list.next, - struct gfs2_glock, gl_reclaim); - list_del_init(&gl->gl_reclaim); - spin_unlock(&sdp->sd_reclaim_lock); + if (nr == 0) + goto out; - atomic_dec(&sdp->sd_reclaim_count); - atomic_inc(&sdp->sd_reclaimed); + if (!(gfp_mask & __GFP_FS)) + return -1; - spin_lock(&gl->gl_spin); - if (find_first_holder(gl) == NULL && - gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) { - handle_callback(gl, LM_ST_UNLOCKED, 0, 0); - done_callback = 1; + spin_lock(&lru_lock); + while(nr && !list_empty(&lru_list)) { + gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru); + list_del_init(&gl->gl_lru); + atomic_dec(&lru_count); + + /* Test for being demotable */ + if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) { + gfs2_glock_hold(gl); + got_ref = 1; + spin_unlock(&lru_lock); + spin_lock(&gl->gl_spin); + may_demote = demote_ok(gl); + spin_unlock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); + if (may_demote) { + handle_callback(gl, LM_ST_UNLOCKED, 0); + nr--; + if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) + gfs2_glock_put(gl); + } + spin_lock(&lru_lock); + if (may_demote) + continue; + } + if (list_empty(&gl->gl_lru) && + (atomic_read(&gl->gl_ref) <= (2 + got_ref))) { + nr_skipped++; + list_add(&gl->gl_lru, &skipped); + } + if (got_ref) { + spin_unlock(&lru_lock); + gfs2_glock_put(gl); + spin_lock(&lru_lock); + got_ref = 0; + } } - spin_unlock(&gl->gl_spin); - if (!done_callback || - queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) - gfs2_glock_put(gl); + list_splice(&skipped, &lru_list); + atomic_add(nr_skipped, &lru_count); + spin_unlock(&lru_lock); +out: + return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure; } +static struct shrinker glock_shrinker = { + .shrink = gfs2_shrink_glock_memory, + .seeks = DEFAULT_SEEKS, +}; + /** * examine_bucket - Call a function for glock in a hash bucket * @examiner: the function @@ -1457,26 +1516,6 @@ out: } /** - * scan_glock - look at a glock and see if we can reclaim it - * @gl: the glock to look at - * - */ - -static void scan_glock(struct gfs2_glock *gl) -{ - if (gl->gl_ops == &gfs2_inode_glops && gl->gl_object) - return; - if (test_bit(GLF_LOCK, &gl->gl_flags)) - return; - - spin_lock(&gl->gl_spin); - if (find_first_holder(gl) == NULL && - gl->gl_state != LM_ST_UNLOCKED && demote_ok(gl)) - gfs2_glock_schedule_for_reclaim(gl); - spin_unlock(&gl->gl_spin); -} - -/** * clear_glock - look at a glock and see if we can free it from glock cache * @gl: the glock to look at * @@ -1484,23 +1523,16 @@ static void scan_glock(struct gfs2_glock *gl) static void clear_glock(struct gfs2_glock *gl) { - struct gfs2_sbd *sdp = gl->gl_sbd; - int released; - - spin_lock(&sdp->sd_reclaim_lock); - if (!list_empty(&gl->gl_reclaim)) { - list_del_init(&gl->gl_reclaim); - atomic_dec(&sdp->sd_reclaim_count); - spin_unlock(&sdp->sd_reclaim_lock); - released = gfs2_glock_put(gl); - gfs2_assert(sdp, !released); - } else { - spin_unlock(&sdp->sd_reclaim_lock); + spin_lock(&lru_lock); + if (!list_empty(&gl->gl_lru)) { + list_del_init(&gl->gl_lru); + atomic_dec(&lru_count); } + spin_unlock(&lru_lock); spin_lock(&gl->gl_spin); if (find_first_holder(gl) == NULL && gl->gl_state != LM_ST_UNLOCKED) - handle_callback(gl, LM_ST_UNLOCKED, 0, 0); + handle_callback(gl, LM_ST_UNLOCKED, 0); spin_unlock(&gl->gl_spin); gfs2_glock_hold(gl); if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0) @@ -1548,6 +1580,20 @@ void gfs2_gl_hash_clear(struct gfs2_sbd *sdp) } } +void gfs2_glock_finish_truncate(struct gfs2_inode *ip) +{ + struct gfs2_glock *gl = ip->i_gl; + int ret; + + ret = gfs2_truncatei_resume(ip); + gfs2_assert_withdraw(gl->gl_sbd, ret == 0); + + spin_lock(&gl->gl_spin); + clear_bit(GLF_LOCK, &gl->gl_flags); + run_queue(gl, 1); + spin_unlock(&gl->gl_spin); +} + static const char *state2str(unsigned state) { switch(state) { @@ -1623,8 +1669,6 @@ static const char *gflags2str(char *buf, const unsigned long *gflags) char *p = buf; if (test_bit(GLF_LOCK, gflags)) *p++ = 'l'; - if (test_bit(GLF_STICKY, gflags)) - *p++ = 's'; if (test_bit(GLF_DEMOTE, gflags)) *p++ = 'D'; if (test_bit(GLF_PENDING_DEMOTE, gflags)) @@ -1743,34 +1787,6 @@ static int gfs2_dump_lockstate(struct gfs2_sbd *sdp) return error; } -/** - * gfs2_scand - Look for cached glocks and inodes to toss from memory - * @sdp: Pointer to GFS2 superblock - * - * One of these daemons runs, finding candidates to add to sd_reclaim_list. - * See gfs2_glockd() - */ - -static int gfs2_scand(void *data) -{ - unsigned x; - unsigned delay; - - while (!kthread_should_stop()) { - for (x = 0; x < GFS2_GL_HASH_SIZE; x++) - examine_bucket(scan_glock, NULL, x); - if (freezing(current)) - refrigerator(); - delay = scand_secs; - if (delay < 1) - delay = 1; - schedule_timeout_interruptible(delay * HZ); - } - - return 0; -} - - int __init gfs2_glock_init(void) { @@ -1784,28 +1800,21 @@ int __init gfs2_glock_init(void) } #endif - scand_process = kthread_run(gfs2_scand, NULL, "gfs2_scand"); - if (IS_ERR(scand_process)) - return PTR_ERR(scand_process); - glock_workqueue = create_workqueue("glock_workqueue"); - if (IS_ERR(glock_workqueue)) { - kthread_stop(scand_process); + if (IS_ERR(glock_workqueue)) return PTR_ERR(glock_workqueue); - } + + register_shrinker(&glock_shrinker); return 0; } void gfs2_glock_exit(void) { + unregister_shrinker(&glock_shrinker); destroy_workqueue(glock_workqueue); - kthread_stop(scand_process); } -module_param(scand_secs, uint, S_IRUGO|S_IWUSR); -MODULE_PARM_DESC(scand_secs, "The number of seconds between scand runs"); - static int gfs2_glock_iter_next(struct gfs2_glock_iter *gi) { struct gfs2_glock *gl; diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 695c6b19361..543ec7ecfbd 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -129,9 +129,9 @@ int gfs2_lvb_hold(struct gfs2_glock *gl); void gfs2_lvb_unhold(struct gfs2_glock *gl); void gfs2_glock_cb(void *cb_data, unsigned int type, void *data); -void gfs2_glock_schedule_for_reclaim(struct gfs2_glock *gl); void gfs2_reclaim_glock(struct gfs2_sbd *sdp); void gfs2_gl_hash_clear(struct gfs2_sbd *sdp); +void gfs2_glock_finish_truncate(struct gfs2_inode *ip); int __init gfs2_glock_init(void); void gfs2_glock_exit(void); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index c6c318c2a0f..8522d3aa64f 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -201,19 +201,12 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags) * Returns: 1 if it's ok */ -static int inode_go_demote_ok(struct gfs2_glock *gl) +static int inode_go_demote_ok(const struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_sbd; - int demote = 0; - - if (!gl->gl_object && !gl->gl_aspace->i_mapping->nrpages) - demote = 1; - else if (!sdp->sd_args.ar_localcaching && - time_after_eq(jiffies, gl->gl_stamp + - gfs2_tune_get(sdp, gt_demote_secs) * HZ)) - demote = 1; - - return demote; + if (sdp->sd_jindex == gl->gl_object || sdp->sd_rindex == gl->gl_object) + return 0; + return 1; } /** @@ -227,6 +220,7 @@ static int inode_go_demote_ok(struct gfs2_glock *gl) static int inode_go_lock(struct gfs2_holder *gh) { struct gfs2_glock *gl = gh->gh_gl; + struct gfs2_sbd *sdp = gl->gl_sbd; struct gfs2_inode *ip = gl->gl_object; int error = 0; @@ -239,10 +233,16 @@ static int inode_go_lock(struct gfs2_holder *gh) return error; } - if ((ip->i_di.di_flags & GFS2_DIF_TRUNC_IN_PROG) && + if ((ip->i_diskflags & GFS2_DIF_TRUNC_IN_PROG) && (gl->gl_state == LM_ST_EXCLUSIVE) && - (gh->gh_state == LM_ST_EXCLUSIVE)) - error = gfs2_truncatei_resume(ip); + (gh->gh_state == LM_ST_EXCLUSIVE)) { + spin_lock(&sdp->sd_trunc_lock); + if (list_empty(&ip->i_trunc_list)) + list_add(&sdp->sd_trunc_list, &ip->i_trunc_list); + spin_unlock(&sdp->sd_trunc_lock); + wake_up(&sdp->sd_quota_wait); + return 1; + } return error; } @@ -260,10 +260,13 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) const struct gfs2_inode *ip = gl->gl_object; if (ip == NULL) return 0; - gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%08lx\n", + gfs2_print_dbg(seq, " I: n:%llu/%llu t:%u f:0x%02lx d:0x%08x s:%llu/%llu\n", (unsigned long long)ip->i_no_formal_ino, (unsigned long long)ip->i_no_addr, - IF2DT(ip->i_inode.i_mode), ip->i_flags); + IF2DT(ip->i_inode.i_mode), ip->i_flags, + (unsigned int)ip->i_diskflags, + (unsigned long long)ip->i_inode.i_size, + (unsigned long long)ip->i_disksize); return 0; } @@ -274,7 +277,7 @@ static int inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) * Returns: 1 if it's ok */ -static int rgrp_go_demote_ok(struct gfs2_glock *gl) +static int rgrp_go_demote_ok(const struct gfs2_glock *gl) { return !gl->gl_aspace->i_mapping->nrpages; } @@ -318,7 +321,9 @@ static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) const struct gfs2_rgrpd *rgd = gl->gl_object; if (rgd == NULL) return 0; - gfs2_print_dbg(seq, " R: n:%llu\n", (unsigned long long)rgd->rd_addr); + gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n", + (unsigned long long)rgd->rd_addr, rgd->rd_flags, + rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes); return 0; } @@ -374,13 +379,25 @@ static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh) } /** + * trans_go_demote_ok + * @gl: the glock + * + * Always returns 0 + */ + +static int trans_go_demote_ok(const struct gfs2_glock *gl) +{ + return 0; +} + +/** * quota_go_demote_ok - Check to see if it's ok to unlock a quota glock * @gl: the glock * * Returns: 1 if it's ok */ -static int quota_go_demote_ok(struct gfs2_glock *gl) +static int quota_go_demote_ok(const struct gfs2_glock *gl) { return !atomic_read(&gl->gl_lvb_count); } @@ -414,6 +431,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { const struct gfs2_glock_operations gfs2_trans_glops = { .go_xmote_th = trans_go_sync, .go_xmote_bh = trans_go_xmote_bh, + .go_demote_ok = trans_go_demote_ok, .go_type = LM_TYPE_NONDISK, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index f566ec1b4e8..608849d0002 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -68,12 +68,6 @@ struct gfs2_bitmap { u32 bi_len; }; -struct gfs2_rgrp_host { - u32 rg_free; - u32 rg_dinodes; - u64 rg_igeneration; -}; - struct gfs2_rgrpd { struct list_head rd_list; /* Link with superblock */ struct list_head rd_list_mru; @@ -83,14 +77,16 @@ struct gfs2_rgrpd { u32 rd_length; /* length of rgrp header in fs blocks */ u32 rd_data; /* num of data blocks in rgrp */ u32 rd_bitbytes; /* number of bytes in data bitmaps */ - struct gfs2_rgrp_host rd_rg; + u32 rd_free; + u32 rd_free_clone; + u32 rd_dinodes; + u64 rd_igeneration; struct gfs2_bitmap *rd_bits; - unsigned int rd_bh_count; struct mutex rd_mutex; - u32 rd_free_clone; struct gfs2_log_element rd_le; - u32 rd_last_alloc; struct gfs2_sbd *rd_sbd; + unsigned int rd_bh_count; + u32 rd_last_alloc; unsigned char rd_flags; #define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */ #define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */ @@ -129,7 +125,7 @@ struct gfs2_glock_operations { void (*go_xmote_th) (struct gfs2_glock *gl); int (*go_xmote_bh) (struct gfs2_glock *gl, struct gfs2_holder *gh); void (*go_inval) (struct gfs2_glock *gl, int flags); - int (*go_demote_ok) (struct gfs2_glock *gl); + int (*go_demote_ok) (const struct gfs2_glock *gl); int (*go_lock) (struct gfs2_holder *gh); void (*go_unlock) (struct gfs2_holder *gh); int (*go_dump)(struct seq_file *seq, const struct gfs2_glock *gl); @@ -159,7 +155,6 @@ struct gfs2_holder { enum { GLF_LOCK = 1, - GLF_STICKY = 2, GLF_DEMOTE = 3, GLF_PENDING_DEMOTE = 4, GLF_DEMOTE_IN_PROGRESS = 5, @@ -194,7 +189,7 @@ struct gfs2_glock { unsigned long gl_tchange; void *gl_object; - struct list_head gl_reclaim; + struct list_head gl_lru; struct gfs2_sbd *gl_sbd; @@ -233,29 +228,24 @@ enum { GIF_USER = 4, /* user inode, not metadata addr space */ }; -struct gfs2_dinode_host { - u64 di_size; /* number of bytes in file */ - u64 di_generation; /* generation number for NFS */ - u32 di_flags; /* GFS2_DIF_... */ - /* These only apply to directories */ - u32 di_entries; /* The number of entries in the directory */ - u64 di_eattr; /* extended attribute block number */ -}; struct gfs2_inode { struct inode i_inode; u64 i_no_addr; u64 i_no_formal_ino; + u64 i_generation; + u64 i_eattr; + loff_t i_disksize; unsigned long i_flags; /* GIF_... */ - - struct gfs2_dinode_host i_di; /* To be replaced by ref to block */ - struct gfs2_glock *i_gl; /* Move into i_gh? */ struct gfs2_holder i_iopen_gh; struct gfs2_holder i_gh; /* for prepare/commit_write only */ struct gfs2_alloc *i_alloc; u64 i_goal; /* goal block for allocations */ struct rw_semaphore i_rw_mutex; + struct list_head i_trunc_list; + u32 i_entries; + u32 i_diskflags; u8 i_height; u8 i_depth; }; @@ -406,13 +396,11 @@ struct gfs2_args { struct gfs2_tune { spinlock_t gt_spin; - unsigned int gt_demote_secs; /* Cache retention for unheld glock */ unsigned int gt_incore_log_blocks; unsigned int gt_log_flush_secs; unsigned int gt_recoverd_secs; unsigned int gt_logd_secs; - unsigned int gt_quotad_secs; unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ unsigned int gt_quota_warn_period; /* Secs between quota warn msgs */ @@ -488,10 +476,6 @@ struct gfs2_sbd { /* Lock Stuff */ struct lm_lockstruct sd_lockstruct; - struct list_head sd_reclaim_list; - spinlock_t sd_reclaim_lock; - wait_queue_head_t sd_reclaim_wq; - atomic_t sd_reclaim_count; struct gfs2_holder sd_live_gh; struct gfs2_glock *sd_rename_gl; struct gfs2_glock *sd_trans_gl; @@ -519,7 +503,6 @@ struct gfs2_sbd { spinlock_t sd_statfs_spin; struct gfs2_statfs_change_host sd_statfs_master; struct gfs2_statfs_change_host sd_statfs_local; - unsigned long sd_statfs_sync_time; /* Resource group stuff */ @@ -552,8 +535,6 @@ struct gfs2_sbd { struct task_struct *sd_recoverd_process; struct task_struct *sd_logd_process; struct task_struct *sd_quotad_process; - struct task_struct *sd_glockd_process[GFS2_GLOCKD_MAX]; - unsigned int sd_glockd_num; /* Quota stuff */ @@ -561,13 +542,15 @@ struct gfs2_sbd { atomic_t sd_quota_count; spinlock_t sd_quota_spin; struct mutex sd_quota_mutex; + wait_queue_head_t sd_quota_wait; + struct list_head sd_trunc_list; + spinlock_t sd_trunc_lock; unsigned int sd_quota_slots; unsigned int sd_quota_chunks; unsigned char **sd_quota_bitmap; u64 sd_quota_sync_gen; - unsigned long sd_quota_sync_time; /* Log stuff */ @@ -624,10 +607,6 @@ struct gfs2_sbd { struct mutex sd_freeze_lock; unsigned int sd_freeze_count; - /* Counters */ - - atomic_t sd_reclaimed; - char sd_fsname[GFS2_FSNAME_LEN]; char sd_table_name[GFS2_FSNAME_LEN]; char sd_proto_name[GFS2_FSNAME_LEN]; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index d57616840e8..3b87c188da4 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -32,7 +32,6 @@ #include "log.h" #include "meta_io.h" #include "ops_address.h" -#include "ops_inode.h" #include "quota.h" #include "rgrp.h" #include "trans.h" @@ -248,7 +247,6 @@ fail: static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) { - struct gfs2_dinode_host *di = &ip->i_di; const struct gfs2_dinode *str = buf; struct timespec atime; u16 height, depth; @@ -274,8 +272,8 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) * to do that. */ ip->i_inode.i_nlink = be32_to_cpu(str->di_nlink); - di->di_size = be64_to_cpu(str->di_size); - i_size_write(&ip->i_inode, di->di_size); + ip->i_disksize = be64_to_cpu(str->di_size); + i_size_write(&ip->i_inode, ip->i_disksize); gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); atime.tv_sec = be64_to_cpu(str->di_atime); atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); @@ -287,9 +285,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) ip->i_inode.i_ctime.tv_nsec = be32_to_cpu(str->di_ctime_nsec); ip->i_goal = be64_to_cpu(str->di_goal_meta); - di->di_generation = be64_to_cpu(str->di_generation); + ip->i_generation = be64_to_cpu(str->di_generation); - di->di_flags = be32_to_cpu(str->di_flags); + ip->i_diskflags = be32_to_cpu(str->di_flags); gfs2_set_inode_flags(&ip->i_inode); height = be16_to_cpu(str->di_height); if (unlikely(height > GFS2_MAX_META_HEIGHT)) @@ -300,9 +298,9 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) if (unlikely(depth > GFS2_DIR_MAX_DEPTH)) goto corrupt; ip->i_depth = (u8)depth; - di->di_entries = be32_to_cpu(str->di_entries); + ip->i_entries = be32_to_cpu(str->di_entries); - di->di_eattr = be64_to_cpu(str->di_eattr); + ip->i_eattr = be64_to_cpu(str->di_eattr); if (S_ISREG(ip->i_inode.i_mode)) gfs2_set_aops(&ip->i_inode); @@ -388,7 +386,6 @@ int gfs2_dinode_dealloc(struct gfs2_inode *ip) gfs2_free_di(rgd, ip); gfs2_trans_end(sdp); - clear_bit(GLF_STICKY, &ip->i_gl->gl_flags); out_rg_gunlock: gfs2_glock_dq_uninit(&al->al_rgd_gh); @@ -690,7 +687,7 @@ static int create_ok(struct gfs2_inode *dip, const struct qstr *name, return error; } - if (dip->i_di.di_entries == (u32)-1) + if (dip->i_entries == (u32)-1) return -EFBIG; if (S_ISDIR(mode) && dip->i_inode.i_nlink == (u32)-1) return -EMLINK; @@ -790,11 +787,11 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_glock *gl, di->di_flags = 0; if (S_ISREG(mode)) { - if ((dip->i_di.di_flags & GFS2_DIF_INHERIT_JDATA) || + if ((dip->i_diskflags & GFS2_DIF_INHERIT_JDATA) || gfs2_tune_get(sdp, gt_new_files_jdata)) di->di_flags |= cpu_to_be32(GFS2_DIF_JDATA); } else if (S_ISDIR(mode)) { - di->di_flags |= cpu_to_be32(dip->i_di.di_flags & + di->di_flags |= cpu_to_be32(dip->i_diskflags & GFS2_DIF_INHERIT_JDATA); } @@ -1068,7 +1065,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, struct qstr dotname; int error; - if (ip->i_di.di_entries != 2) { + if (ip->i_entries != 2) { if (gfs2_consist_inode(ip)) gfs2_dinode_print(ip); return -EIO; @@ -1168,7 +1165,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) return error; } - if (!ip->i_di.di_size) { + if (!ip->i_disksize) { gfs2_consist_inode(ip); error = -EIO; goto out; @@ -1178,7 +1175,7 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) if (error) goto out; - x = ip->i_di.di_size + 1; + x = ip->i_disksize + 1; if (x > *len) { *buf = kmalloc(x, GFP_NOFS); if (!*buf) { @@ -1242,7 +1239,6 @@ int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) { - const struct gfs2_dinode_host *di = &ip->i_di; struct gfs2_dinode *str = buf; str->di_header.mh_magic = cpu_to_be32(GFS2_MAGIC); @@ -1256,7 +1252,7 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_uid = cpu_to_be32(ip->i_inode.i_uid); str->di_gid = cpu_to_be32(ip->i_inode.i_gid); str->di_nlink = cpu_to_be32(ip->i_inode.i_nlink); - str->di_size = cpu_to_be64(di->di_size); + str->di_size = cpu_to_be64(ip->i_disksize); str->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode)); str->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); str->di_mtime = cpu_to_be64(ip->i_inode.i_mtime.tv_sec); @@ -1264,17 +1260,17 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) str->di_goal_meta = cpu_to_be64(ip->i_goal); str->di_goal_data = cpu_to_be64(ip->i_goal); - str->di_generation = cpu_to_be64(di->di_generation); + str->di_generation = cpu_to_be64(ip->i_generation); - str->di_flags = cpu_to_be32(di->di_flags); + str->di_flags = cpu_to_be32(ip->i_diskflags); str->di_height = cpu_to_be16(ip->i_height); str->di_payload_format = cpu_to_be32(S_ISDIR(ip->i_inode.i_mode) && - !(ip->i_di.di_flags & GFS2_DIF_EXHASH) ? + !(ip->i_diskflags & GFS2_DIF_EXHASH) ? GFS2_FORMAT_DE : 0); str->di_depth = cpu_to_be16(ip->i_depth); - str->di_entries = cpu_to_be32(di->di_entries); + str->di_entries = cpu_to_be32(ip->i_entries); - str->di_eattr = cpu_to_be64(di->di_eattr); + str->di_eattr = cpu_to_be64(ip->i_eattr); str->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); str->di_mtime_nsec = cpu_to_be32(ip->i_inode.i_mtime.tv_nsec); str->di_ctime_nsec = cpu_to_be32(ip->i_inode.i_ctime.tv_nsec); @@ -1282,22 +1278,21 @@ void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf) void gfs2_dinode_print(const struct gfs2_inode *ip) { - const struct gfs2_dinode_host *di = &ip->i_di; - printk(KERN_INFO " no_formal_ino = %llu\n", (unsigned long long)ip->i_no_formal_ino); printk(KERN_INFO " no_addr = %llu\n", (unsigned long long)ip->i_no_addr); - printk(KERN_INFO " di_size = %llu\n", (unsigned long long)di->di_size); + printk(KERN_INFO " i_disksize = %llu\n", + (unsigned long long)ip->i_disksize); printk(KERN_INFO " blocks = %llu\n", (unsigned long long)gfs2_get_inode_blocks(&ip->i_inode)); printk(KERN_INFO " i_goal = %llu\n", (unsigned long long)ip->i_goal); - printk(KERN_INFO " di_flags = 0x%.8X\n", di->di_flags); + printk(KERN_INFO " i_diskflags = 0x%.8X\n", ip->i_diskflags); printk(KERN_INFO " i_height = %u\n", ip->i_height); printk(KERN_INFO " i_depth = %u\n", ip->i_depth); - printk(KERN_INFO " di_entries = %u\n", di->di_entries); - printk(KERN_INFO " di_eattr = %llu\n", - (unsigned long long)di->di_eattr); + printk(KERN_INFO " i_entries = %u\n", ip->i_entries); + printk(KERN_INFO " i_eattr = %llu\n", + (unsigned long long)ip->i_eattr); } diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 2d43f69610a..d5329364cdf 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -10,6 +10,7 @@ #ifndef __INODE_DOT_H__ #define __INODE_DOT_H__ +#include <linux/fs.h> #include "util.h" static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) @@ -19,7 +20,7 @@ static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) static inline int gfs2_is_jdata(const struct gfs2_inode *ip) { - return ip->i_di.di_flags & GFS2_DIF_JDATA; + return ip->i_diskflags & GFS2_DIF_JDATA; } static inline int gfs2_is_writeback(const struct gfs2_inode *ip) @@ -97,5 +98,15 @@ struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); void gfs2_dinode_print(const struct gfs2_inode *ip); +extern const struct inode_operations gfs2_file_iops; +extern const struct inode_operations gfs2_dir_iops; +extern const struct inode_operations gfs2_symlink_iops; +extern const struct file_operations gfs2_file_fops; +extern const struct file_operations gfs2_dir_fops; +extern const struct file_operations gfs2_file_fops_nolock; +extern const struct file_operations gfs2_dir_fops_nolock; + +extern void gfs2_set_inode_flags(struct inode *inode); + #endif /* __INODE_DOT_H__ */ diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c index 0c4cbe6c828..1aa7eb6a022 100644 --- a/fs/gfs2/locking/dlm/mount.c +++ b/fs/gfs2/locking/dlm/mount.c @@ -194,17 +194,25 @@ out: static void gdlm_recovery_done(void *lockspace, unsigned int jid, unsigned int message) { + char env_jid[20]; + char env_status[20]; + char *envp[] = { env_jid, env_status, NULL }; struct gdlm_ls *ls = lockspace; ls->recover_jid_done = jid; ls->recover_jid_status = message; - kobject_uevent(&ls->kobj, KOBJ_CHANGE); + sprintf(env_jid, "JID=%d", jid); + sprintf(env_status, "RECOVERY=%s", + message == LM_RD_SUCCESS ? "Done" : "Failed"); + kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp); } static void gdlm_others_may_mount(void *lockspace) { + char *message = "FIRSTMOUNT=Done"; + char *envp[] = { message, NULL }; struct gdlm_ls *ls = lockspace; ls->first_done = 1; - kobject_uevent(&ls->kobj, KOBJ_CHANGE); + kobject_uevent_env(&ls->kobj, KOBJ_CHANGE, envp); } /* Userspace gets the offline uevent, blocks new gfs locks on diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c index 4ec571c3d8a..9b7edcf7bd4 100644 --- a/fs/gfs2/locking/dlm/sysfs.c +++ b/fs/gfs2/locking/dlm/sysfs.c @@ -195,9 +195,23 @@ void gdlm_kobject_release(struct gdlm_ls *ls) kobject_put(&ls->kobj); } +static int gdlm_uevent(struct kset *kset, struct kobject *kobj, + struct kobj_uevent_env *env) +{ + struct gdlm_ls *ls = container_of(kobj, struct gdlm_ls, kobj); + add_uevent_var(env, "LOCKTABLE=%s:%s", ls->clustername, ls->fsname); + add_uevent_var(env, "LOCKPROTO=lock_dlm"); + return 0; +} + +static struct kset_uevent_ops gdlm_uevent_ops = { + .uevent = gdlm_uevent, +}; + + int gdlm_sysfs_init(void) { - gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj); + gdlm_kset = kset_create_and_add("lock_dlm", &gdlm_uevent_ops, kernel_kobj); if (!gdlm_kset) { printk(KERN_WARNING "%s: can not create kset\n", __func__); return -ENOMEM; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index bb2cc303ac2..7cacfde3219 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -19,7 +19,7 @@ #include "gfs2.h" #include "incore.h" -#include "ops_fstype.h" +#include "super.h" #include "sys.h" #include "util.h" #include "glock.h" @@ -30,6 +30,7 @@ static void gfs2_init_inode_once(void *foo) inode_init_once(&ip->i_inode); init_rwsem(&ip->i_rw_mutex); + INIT_LIST_HEAD(&ip->i_trunc_list); ip->i_alloc = NULL; } @@ -42,7 +43,7 @@ static void gfs2_init_glock_once(void *foo) INIT_LIST_HEAD(&gl->gl_holders); gl->gl_lvb = NULL; atomic_set(&gl->gl_lvb_count, 0); - INIT_LIST_HEAD(&gl->gl_reclaim); + INIT_LIST_HEAD(&gl->gl_lru); INIT_LIST_HEAD(&gl->gl_ail_list); atomic_set(&gl->gl_ail_count, 0); } @@ -93,6 +94,12 @@ static int __init init_gfs2_fs(void) if (!gfs2_rgrpd_cachep) goto fail; + gfs2_quotad_cachep = kmem_cache_create("gfs2_quotad", + sizeof(struct gfs2_quota_data), + 0, 0, NULL); + if (!gfs2_quotad_cachep) + goto fail; + error = register_filesystem(&gfs2_fs_type); if (error) goto fail; @@ -112,6 +119,9 @@ fail_unregister: fail: gfs2_glock_exit(); + if (gfs2_quotad_cachep) + kmem_cache_destroy(gfs2_quotad_cachep); + if (gfs2_rgrpd_cachep) kmem_cache_destroy(gfs2_rgrpd_cachep); @@ -140,6 +150,7 @@ static void __exit exit_gfs2_fs(void) unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); + kmem_cache_destroy(gfs2_quotad_cachep); kmem_cache_destroy(gfs2_rgrpd_cachep); kmem_cache_destroy(gfs2_bufdata_cachep); kmem_cache_destroy(gfs2_inode_cachep); diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c index f96eb90a2cf..3cb0a44ba02 100644 --- a/fs/gfs2/mount.c +++ b/fs/gfs2/mount.c @@ -32,7 +32,6 @@ enum { Opt_debug, Opt_nodebug, Opt_upgrade, - Opt_num_glockd, Opt_acl, Opt_noacl, Opt_quota_off, @@ -57,7 +56,6 @@ static const match_table_t tokens = { {Opt_debug, "debug"}, {Opt_nodebug, "nodebug"}, {Opt_upgrade, "upgrade"}, - {Opt_num_glockd, "num_glockd=%d"}, {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, {Opt_quota_off, "quota=off"}, @@ -87,16 +85,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) int error = 0; if (!remount) { - /* If someone preloaded options, use those instead */ - spin_lock(&gfs2_sys_margs_lock); - if (gfs2_sys_margs) { - data = gfs2_sys_margs; - gfs2_sys_margs = NULL; - } - spin_unlock(&gfs2_sys_margs_lock); - /* Set some defaults */ - args->ar_num_glockd = GFS2_GLOCKD_DEFAULT; args->ar_quota = GFS2_QUOTA_DEFAULT; args->ar_data = GFS2_DATA_DEFAULT; } @@ -105,7 +94,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) process them */ for (options = data; (o = strsep(&options, ",")); ) { - int token, option; + int token; substring_t tmp[MAX_OPT_ARGS]; if (!*o) @@ -196,22 +185,6 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) goto cant_remount; args->ar_upgrade = 1; break; - case Opt_num_glockd: - if ((error = match_int(&tmp[0], &option))) { - fs_info(sdp, "problem getting num_glockd\n"); - goto out_error; - } - - if (remount && option != args->ar_num_glockd) - goto cant_remount; - if (!option || option > GFS2_GLOCKD_MAX) { - fs_info(sdp, "0 < num_glockd <= %u (not %u)\n", - GFS2_GLOCKD_MAX, option); - error = -EINVAL; - goto out_error; - } - args->ar_num_glockd = option; - break; case Opt_acl: args->ar_posix_acl = 1; sdp->sd_vfs->s_flags |= MS_POSIXACL; diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index 27563816e1c..4ddab67867e 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -210,25 +210,23 @@ static int gfs2_jdata_writepage(struct page *page, struct writeback_control *wbc { struct inode *inode = page->mapping->host; struct gfs2_sbd *sdp = GFS2_SB(inode); - int error; + int ret; int done_trans = 0; - error = gfs2_writepage_common(page, wbc); - if (error <= 0) - return error; - if (PageChecked(page)) { if (wbc->sync_mode != WB_SYNC_ALL) goto out_ignore; - error = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); - if (error) + ret = gfs2_trans_begin(sdp, RES_DINODE + 1, 0); + if (ret) goto out_ignore; done_trans = 1; } - error = __gfs2_jdata_writepage(page, wbc); + ret = gfs2_writepage_common(page, wbc); + if (ret > 0) + ret = __gfs2_jdata_writepage(page, wbc); if (done_trans) gfs2_trans_end(sdp); - return error; + return ret; out_ignore: redirty_page_for_writepage(wbc, page); @@ -453,8 +451,8 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) kaddr = kmap_atomic(page, KM_USER0); memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), - ip->i_di.di_size); - memset(kaddr + ip->i_di.di_size, 0, PAGE_CACHE_SIZE - ip->i_di.di_size); + ip->i_disksize); + memset(kaddr + ip->i_disksize, 0, PAGE_CACHE_SIZE - ip->i_disksize); kunmap_atomic(kaddr, KM_USER0); flush_dcache_page(page); brelse(dibh); @@ -627,7 +625,7 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, { struct gfs2_inode *ip = GFS2_I(mapping->host); struct gfs2_sbd *sdp = GFS2_SB(mapping->host); - unsigned int data_blocks, ind_blocks, rblocks; + unsigned int data_blocks = 0, ind_blocks = 0, rblocks; int alloc_required; int error = 0; struct gfs2_alloc *al; @@ -641,11 +639,13 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, if (unlikely(error)) goto out_uninit; - gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); error = gfs2_write_alloc_required(ip, pos, len, &alloc_required); if (error) goto out_unlock; + if (alloc_required || gfs2_is_jdata(ip)) + gfs2_write_calc_reserv(ip, len, &data_blocks, &ind_blocks); + if (alloc_required) { al = gfs2_alloc_get(ip); if (!al) { @@ -675,7 +675,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, goto out_trans_fail; error = -ENOMEM; - page = __grab_cache_page(mapping, index); + flags |= AOP_FLAG_NOFS; + page = grab_cache_page_write_begin(mapping, index, flags); *pagep = page; if (unlikely(!page)) goto out_endtrans; @@ -782,7 +783,7 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, if (inode->i_size < to) { i_size_write(inode, to); - ip->i_di.di_size = inode->i_size; + ip->i_disksize = inode->i_size; di->di_size = cpu_to_be64(inode->i_size); mark_inode_dirty(inode); } @@ -847,9 +848,9 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (likely(ret >= 0) && (inode->i_size > ip->i_di.di_size)) { + if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) { di = (struct gfs2_dinode *)dibh->b_data; - ip->i_di.di_size = inode->i_size; + ip->i_disksize = inode->i_size; di->di_size = cpu_to_be64(inode->i_size); mark_inode_dirty(inode); } diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/ops_dentry.c index 4a5e676b442..c2ad36330ca 100644 --- a/fs/gfs2/ops_dentry.c +++ b/fs/gfs2/ops_dentry.c @@ -19,7 +19,7 @@ #include "incore.h" #include "dir.h" #include "glock.h" -#include "ops_dentry.h" +#include "super.h" #include "util.h" #include "inode.h" diff --git a/fs/gfs2/ops_dentry.h b/fs/gfs2/ops_dentry.h deleted file mode 100644 index 5caa3db4d3f..00000000000 --- a/fs/gfs2/ops_dentry.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_DENTRY_DOT_H__ -#define __OPS_DENTRY_DOT_H__ - -#include <linux/dcache.h> - -extern struct dentry_operations gfs2_dops; - -#endif /* __OPS_DENTRY_DOT_H__ */ diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/ops_export.c index bbb8c36403a..7fdeb14ddd1 100644 --- a/fs/gfs2/ops_export.c +++ b/fs/gfs2/ops_export.c @@ -22,8 +22,7 @@ #include "glock.h" #include "glops.h" #include "inode.h" -#include "ops_dentry.h" -#include "ops_fstype.h" +#include "super.h" #include "rgrp.h" #include "util.h" @@ -214,7 +213,7 @@ static struct dentry *gfs2_get_dentry(struct super_block *sb, } error = -EIO; - if (GFS2_I(inode)->i_di.di_flags & GFS2_DIF_SYSTEM) { + if (GFS2_I(inode)->i_diskflags & GFS2_DIF_SYSTEM) { iput(inode); goto fail; } diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 3a747f8e218..93fe41b67f9 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -39,7 +39,6 @@ #include "util.h" #include "eaops.h" #include "ops_address.h" -#include "ops_inode.h" /** * gfs2_llseek - seek to a location in a file @@ -158,8 +157,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr) if (error) return error; - fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_di.di_flags); - if (!S_ISDIR(inode->i_mode) && ip->i_di.di_flags & GFS2_DIF_JDATA) + fsflags = fsflags_cvt(gfs2_to_fsflags, ip->i_diskflags); + if (!S_ISDIR(inode->i_mode) && ip->i_diskflags & GFS2_DIF_JDATA) fsflags |= FS_JOURNAL_DATA_FL; if (put_user(fsflags, ptr)) error = -EFAULT; @@ -172,17 +171,16 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr) void gfs2_set_inode_flags(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_dinode_host *di = &ip->i_di; unsigned int flags = inode->i_flags; flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - if (di->di_flags & GFS2_DIF_IMMUTABLE) + if (ip->i_diskflags & GFS2_DIF_IMMUTABLE) flags |= S_IMMUTABLE; - if (di->di_flags & GFS2_DIF_APPENDONLY) + if (ip->i_diskflags & GFS2_DIF_APPENDONLY) flags |= S_APPEND; - if (di->di_flags & GFS2_DIF_NOATIME) + if (ip->i_diskflags & GFS2_DIF_NOATIME) flags |= S_NOATIME; - if (di->di_flags & GFS2_DIF_SYNC) + if (ip->i_diskflags & GFS2_DIF_SYNC) flags |= S_SYNC; inode->i_flags = flags; } @@ -221,7 +219,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) if (error) goto out_drop_write; - flags = ip->i_di.di_flags; + flags = ip->i_diskflags; new_flags = (flags & ~mask) | (reqflags & mask); if ((new_flags ^ flags) == 0) goto out; @@ -260,7 +258,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask) if (error) goto out_trans_end; gfs2_trans_add_bh(ip->i_gl, bh, 1); - ip->i_di.di_flags = new_flags; + ip->i_diskflags = new_flags; gfs2_dinode_out(ip, bh->b_data); brelse(bh); gfs2_set_inode_flags(inode); @@ -344,7 +342,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned long last_index; - u64 pos = page->index << (PAGE_CACHE_SIZE - inode->i_blkbits); + u64 pos = page->index << PAGE_CACHE_SHIFT; unsigned int data_blocks, ind_blocks, rblocks; int alloc_required = 0; struct gfs2_holder gh; @@ -357,7 +355,6 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) goto out; set_bit(GIF_SW_PAGED, &ip->i_flags); - gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); ret = gfs2_write_alloc_required(ip, pos, PAGE_CACHE_SIZE, &alloc_required); if (ret || !alloc_required) goto out_unlock; @@ -369,6 +366,7 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) ret = gfs2_quota_lock_check(ip); if (ret) goto out_alloc_put; + gfs2_write_calc_reserv(ip, PAGE_CACHE_SIZE, &data_blocks, &ind_blocks); al->al_requested = data_blocks + ind_blocks; ret = gfs2_inplace_reserve(ip); if (ret) @@ -479,7 +477,7 @@ static int gfs2_open(struct inode *inode, struct file *file) goto fail; if (!(file->f_flags & O_LARGEFILE) && - ip->i_di.di_size > MAX_NON_LFS) { + ip->i_disksize > MAX_NON_LFS) { error = -EOVERFLOW; goto fail_gunlock; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b117fcf2c4f..f91eebdde58 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -22,20 +22,18 @@ #include "gfs2.h" #include "incore.h" #include "bmap.h" -#include "daemon.h" #include "glock.h" #include "glops.h" #include "inode.h" #include "mount.h" -#include "ops_fstype.h" -#include "ops_dentry.h" -#include "ops_super.h" #include "recovery.h" #include "rgrp.h" #include "super.h" #include "sys.h" #include "util.h" #include "log.h" +#include "quota.h" +#include "dir.h" #define DO 0 #define UNDO 1 @@ -58,12 +56,10 @@ static void gfs2_tune_init(struct gfs2_tune *gt) { spin_lock_init(>->gt_spin); - gt->gt_demote_secs = 300; gt->gt_incore_log_blocks = 1024; gt->gt_log_flush_secs = 60; gt->gt_recoverd_secs = 60; gt->gt_logd_secs = 1; - gt->gt_quotad_secs = 5; gt->gt_quota_simul_sync = 64; gt->gt_quota_warn_period = 10; gt->gt_quota_scale_num = 1; @@ -91,10 +87,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) gfs2_tune_init(&sdp->sd_tune); - INIT_LIST_HEAD(&sdp->sd_reclaim_list); - spin_lock_init(&sdp->sd_reclaim_lock); - init_waitqueue_head(&sdp->sd_reclaim_wq); - mutex_init(&sdp->sd_inum_mutex); spin_lock_init(&sdp->sd_statfs_spin); @@ -110,6 +102,9 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) INIT_LIST_HEAD(&sdp->sd_quota_list); spin_lock_init(&sdp->sd_quota_spin); mutex_init(&sdp->sd_quota_mutex); + init_waitqueue_head(&sdp->sd_quota_wait); + INIT_LIST_HEAD(&sdp->sd_trunc_list); + spin_lock_init(&sdp->sd_trunc_lock); spin_lock_init(&sdp->sd_log_lock); @@ -443,24 +438,11 @@ out: static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, int undo) { - struct task_struct *p; int error = 0; if (undo) goto fail_trans; - for (sdp->sd_glockd_num = 0; - sdp->sd_glockd_num < sdp->sd_args.ar_num_glockd; - sdp->sd_glockd_num++) { - p = kthread_run(gfs2_glockd, sdp, "gfs2_glockd"); - error = IS_ERR(p); - if (error) { - fs_err(sdp, "can't start glockd thread: %d\n", error); - goto fail; - } - sdp->sd_glockd_process[sdp->sd_glockd_num] = p; - } - error = gfs2_glock_nq_num(sdp, GFS2_MOUNT_LOCK, &gfs2_nondisk_glops, LM_ST_EXCLUSIVE, LM_FLAG_NOEXP | GL_NOCACHE, @@ -493,7 +475,6 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh, fs_err(sdp, "can't create transaction glock: %d\n", error); goto fail_rename; } - set_bit(GLF_STICKY, &sdp->sd_trans_gl->gl_flags); return 0; @@ -506,9 +487,6 @@ fail_live: fail_mount: gfs2_glock_dq_uninit(mount_gh); fail: - while (sdp->sd_glockd_num--) - kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]); - return error; } @@ -620,7 +598,7 @@ static int map_journal_extents(struct gfs2_sbd *sdp) prev_db = 0; - for (lb = 0; lb < ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; lb++) { + for (lb = 0; lb < ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; lb++) { bh.b_state = 0; bh.b_blocknr = 0; bh.b_size = 1 << ip->i_inode.i_blkbits; @@ -661,6 +639,72 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) sdp->sd_lockstruct.ls_lockspace); } +/** + * gfs2_jindex_hold - Grab a lock on the jindex + * @sdp: The GFS2 superblock + * @ji_gh: the holder for the jindex glock + * + * Returns: errno + */ + +static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) +{ + struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex); + struct qstr name; + char buf[20]; + struct gfs2_jdesc *jd; + int error; + + name.name = buf; + + mutex_lock(&sdp->sd_jindex_mutex); + + for (;;) { + error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh); + if (error) + break; + + name.len = sprintf(buf, "journal%u", sdp->sd_journals); + name.hash = gfs2_disk_hash(name.name, name.len); + + error = gfs2_dir_check(sdp->sd_jindex, &name, NULL); + if (error == -ENOENT) { + error = 0; + break; + } + + gfs2_glock_dq_uninit(ji_gh); + + if (error) + break; + + error = -ENOMEM; + jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL); + if (!jd) + break; + + INIT_LIST_HEAD(&jd->extent_list); + jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); + if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { + if (!jd->jd_inode) + error = -ENOENT; + else + error = PTR_ERR(jd->jd_inode); + kfree(jd); + break; + } + + spin_lock(&sdp->sd_jindex_spin); + jd->jd_jid = sdp->sd_journals++; + list_add_tail(&jd->jd_list, &sdp->sd_jindex_list); + spin_unlock(&sdp->sd_jindex_spin); + } + + mutex_unlock(&sdp->sd_jindex_mutex); + + return error; +} + static int init_journal(struct gfs2_sbd *sdp, int undo) { struct inode *master = sdp->sd_master_dir->d_inode; @@ -681,7 +725,6 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) return PTR_ERR(sdp->sd_jindex); } ip = GFS2_I(sdp->sd_jindex); - set_bit(GLF_STICKY, &ip->i_gl->gl_flags); /* Load in the journal index special file */ @@ -832,7 +875,6 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) goto fail_statfs; } ip = GFS2_I(sdp->sd_rindex); - set_bit(GLF_STICKY, &ip->i_gl->gl_flags); sdp->sd_rindex_uptodate = 0; /* Read in the quota inode */ @@ -973,9 +1015,6 @@ static int init_threads(struct gfs2_sbd *sdp, int undo) } sdp->sd_logd_process = p; - sdp->sd_statfs_sync_time = jiffies; - sdp->sd_quota_sync_time = jiffies; - p = kthread_run(gfs2_quotad, sdp, "gfs2_quotad"); error = IS_ERR(p); if (error) { @@ -1224,17 +1263,21 @@ static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, static void gfs2_kill_sb(struct super_block *sb) { struct gfs2_sbd *sdp = sb->s_fs_info; - if (sdp) { - gfs2_meta_syncfs(sdp); - dput(sdp->sd_root_dir); - dput(sdp->sd_master_dir); - sdp->sd_root_dir = NULL; - sdp->sd_master_dir = NULL; + + if (sdp == NULL) { + kill_block_super(sb); + return; } + + gfs2_meta_syncfs(sdp); + dput(sdp->sd_root_dir); + dput(sdp->sd_master_dir); + sdp->sd_root_dir = NULL; + sdp->sd_master_dir = NULL; shrink_dcache_sb(sb); kill_block_super(sb); - if (sdp) - gfs2_delete_debugfs_file(sdp); + gfs2_delete_debugfs_file(sdp); + kfree(sdp); } struct file_system_type gfs2_fs_type = { diff --git a/fs/gfs2/ops_fstype.h b/fs/gfs2/ops_fstype.h deleted file mode 100644 index da849051183..00000000000 --- a/fs/gfs2/ops_fstype.h +++ /dev/null @@ -1,19 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_FSTYPE_DOT_H__ -#define __OPS_FSTYPE_DOT_H__ - -#include <linux/fs.h> - -extern struct file_system_type gfs2_fs_type; -extern struct file_system_type gfs2meta_fs_type; -extern const struct export_operations gfs2_export_ops; - -#endif /* __OPS_FSTYPE_DOT_H__ */ diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index d232991b904..49877546beb 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -19,6 +19,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/lm_interface.h> +#include <linux/fiemap.h> #include <asm/uaccess.h> #include "gfs2.h" @@ -31,12 +32,11 @@ #include "glock.h" #include "inode.h" #include "meta_io.h" -#include "ops_dentry.h" -#include "ops_inode.h" #include "quota.h" #include "rgrp.h" #include "trans.h" #include "util.h" +#include "super.h" /** * gfs2_create - Create a file @@ -185,7 +185,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, if (!dip->i_inode.i_nlink) goto out_gunlock; error = -EFBIG; - if (dip->i_di.di_entries == (u32)-1) + if (dip->i_entries == (u32)-1) goto out_gunlock; error = -EPERM; if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) @@ -371,7 +371,7 @@ static int gfs2_symlink(struct inode *dir, struct dentry *dentry, ip = ghs[1].gh_gl->gl_object; - ip->i_di.di_size = size; + ip->i_disksize = size; error = gfs2_meta_inode_buffer(ip, &dibh); @@ -425,9 +425,9 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) ip = ghs[1].gh_gl->gl_object; ip->i_inode.i_nlink = 2; - ip->i_di.di_size = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); - ip->i_di.di_flags |= GFS2_DIF_JDATA; - ip->i_di.di_entries = 2; + ip->i_disksize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_dinode); + ip->i_diskflags |= GFS2_DIF_JDATA; + ip->i_entries = 2; error = gfs2_meta_inode_buffer(ip, &dibh); @@ -517,13 +517,13 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry) if (error) goto out_gunlock; - if (ip->i_di.di_entries < 2) { + if (ip->i_entries < 2) { if (gfs2_consist_inode(ip)) gfs2_dinode_print(ip); error = -EIO; goto out_gunlock; } - if (ip->i_di.di_entries > 2) { + if (ip->i_entries > 2) { error = -ENOTEMPTY; goto out_gunlock; } @@ -726,13 +726,13 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, goto out_gunlock; if (S_ISDIR(nip->i_inode.i_mode)) { - if (nip->i_di.di_entries < 2) { + if (nip->i_entries < 2) { if (gfs2_consist_inode(nip)) gfs2_dinode_print(nip); error = -EIO; goto out_gunlock; } - if (nip->i_di.di_entries > 2) { + if (nip->i_entries > 2) { error = -ENOTEMPTY; goto out_gunlock; } @@ -758,7 +758,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, error = -EINVAL; goto out_gunlock; } - if (ndip->i_di.di_entries == (u32)-1) { + if (ndip->i_entries == (u32)-1) { error = -EFBIG; goto out_gunlock; } @@ -990,7 +990,7 @@ static int setattr_size(struct inode *inode, struct iattr *attr) struct gfs2_sbd *sdp = GFS2_SB(inode); int error; - if (attr->ia_size != ip->i_di.di_size) { + if (attr->ia_size != ip->i_disksize) { error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); if (error) return error; @@ -1001,8 +1001,8 @@ static int setattr_size(struct inode *inode, struct iattr *attr) } error = gfs2_truncatei(ip, attr->ia_size); - if (error && (inode->i_size != ip->i_di.di_size)) - i_size_write(inode, ip->i_di.di_size); + if (error && (inode->i_size != ip->i_disksize)) + i_size_write(inode, ip->i_disksize); return error; } @@ -1212,6 +1212,48 @@ static int gfs2_removexattr(struct dentry *dentry, const char *name) return gfs2_ea_remove(GFS2_I(dentry->d_inode), &er); } +static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int ret; + + ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + if (ret) + return ret; + + mutex_lock(&inode->i_mutex); + + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + if (ret) + goto out; + + if (gfs2_is_stuffed(ip)) { + u64 phys = ip->i_no_addr << inode->i_blkbits; + u64 size = i_size_read(inode); + u32 flags = FIEMAP_EXTENT_LAST|FIEMAP_EXTENT_NOT_ALIGNED| + FIEMAP_EXTENT_DATA_INLINE; + phys += sizeof(struct gfs2_dinode); + phys += start; + if (start + len > size) + len = size - start; + if (start < size) + ret = fiemap_fill_next_extent(fieinfo, start, phys, + len, flags); + if (ret == 1) + ret = 0; + } else { + ret = __generic_block_fiemap(inode, fieinfo, start, len, + gfs2_block_map); + } + + gfs2_glock_dq_uninit(&gh); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + const struct inode_operations gfs2_file_iops = { .permission = gfs2_permission, .setattr = gfs2_setattr, @@ -1220,6 +1262,7 @@ const struct inode_operations gfs2_file_iops = { .getxattr = gfs2_getxattr, .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, }; const struct inode_operations gfs2_dir_iops = { @@ -1239,6 +1282,7 @@ const struct inode_operations gfs2_dir_iops = { .getxattr = gfs2_getxattr, .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, }; const struct inode_operations gfs2_symlink_iops = { @@ -1251,5 +1295,6 @@ const struct inode_operations gfs2_symlink_iops = { .getxattr = gfs2_getxattr, .listxattr = gfs2_listxattr, .removexattr = gfs2_removexattr, + .fiemap = gfs2_fiemap, }; diff --git a/fs/gfs2/ops_inode.h b/fs/gfs2/ops_inode.h deleted file mode 100644 index 14b4b797622..00000000000 --- a/fs/gfs2/ops_inode.h +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_INODE_DOT_H__ -#define __OPS_INODE_DOT_H__ - -#include <linux/fs.h> - -extern const struct inode_operations gfs2_file_iops; -extern const struct inode_operations gfs2_dir_iops; -extern const struct inode_operations gfs2_symlink_iops; -extern const struct file_operations gfs2_file_fops; -extern const struct file_operations gfs2_dir_fops; -extern const struct file_operations gfs2_file_fops_nolock; -extern const struct file_operations gfs2_dir_fops_nolock; - -extern void gfs2_set_inode_flags(struct inode *inode); - -#endif /* __OPS_INODE_DOT_H__ */ diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index d5355d9b592..777783deddc 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -28,7 +28,6 @@ #include "inode.h" #include "log.h" #include "mount.h" -#include "ops_super.h" #include "quota.h" #include "recovery.h" #include "rgrp.h" @@ -143,8 +142,6 @@ static void gfs2_put_super(struct super_block *sb) kthread_stop(sdp->sd_quotad_process); kthread_stop(sdp->sd_logd_process); kthread_stop(sdp->sd_recoverd_process); - while (sdp->sd_glockd_num--) - kthread_stop(sdp->sd_glockd_process[sdp->sd_glockd_num]); if (!(sb->s_flags & MS_RDONLY)) { error = gfs2_make_fs_ro(sdp); @@ -185,7 +182,6 @@ static void gfs2_put_super(struct super_block *sb) /* At this point, we're through participating in the lockspace */ gfs2_sys_fs_del(sdp); - kfree(sdp); } /** @@ -260,6 +256,137 @@ static void gfs2_unlockfs(struct super_block *sb) } /** + * statfs_fill - fill in the sg for a given RG + * @rgd: the RG + * @sc: the sc structure + * + * Returns: 0 on success, -ESTALE if the LVB is invalid + */ + +static int statfs_slow_fill(struct gfs2_rgrpd *rgd, + struct gfs2_statfs_change_host *sc) +{ + gfs2_rgrp_verify(rgd); + sc->sc_total += rgd->rd_data; + sc->sc_free += rgd->rd_free; + sc->sc_dinodes += rgd->rd_dinodes; + return 0; +} + +/** + * gfs2_statfs_slow - Stat a filesystem using asynchronous locking + * @sdp: the filesystem + * @sc: the sc info that will be returned + * + * Any error (other than a signal) will cause this routine to fall back + * to the synchronous version. + * + * FIXME: This really shouldn't busy wait like this. + * + * Returns: errno + */ + +static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_holder ri_gh; + struct gfs2_rgrpd *rgd_next; + struct gfs2_holder *gha, *gh; + unsigned int slots = 64; + unsigned int x; + int done; + int error = 0, err; + + memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); + gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); + if (!gha) + return -ENOMEM; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto out; + + rgd_next = gfs2_rgrpd_get_first(sdp); + + for (;;) { + done = 1; + + for (x = 0; x < slots; x++) { + gh = gha + x; + + if (gh->gh_gl && gfs2_glock_poll(gh)) { + err = gfs2_glock_wait(gh); + if (err) { + gfs2_holder_uninit(gh); + error = err; + } else { + if (!error) + error = statfs_slow_fill( + gh->gh_gl->gl_object, sc); + gfs2_glock_dq_uninit(gh); + } + } + + if (gh->gh_gl) + done = 0; + else if (rgd_next && !error) { + error = gfs2_glock_nq_init(rgd_next->rd_gl, + LM_ST_SHARED, + GL_ASYNC, + gh); + rgd_next = gfs2_rgrpd_get_next(rgd_next); + done = 0; + } + + if (signal_pending(current)) + error = -ERESTARTSYS; + } + + if (done) + break; + + yield(); + } + + gfs2_glock_dq_uninit(&ri_gh); + +out: + kfree(gha); + return error; +} + +/** + * gfs2_statfs_i - Do a statfs + * @sdp: the filesystem + * @sg: the sg structure + * + * Returns: errno + */ + +static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + + spin_lock(&sdp->sd_statfs_spin); + + *sc = *m_sc; + sc->sc_total += l_sc->sc_total; + sc->sc_free += l_sc->sc_free; + sc->sc_dinodes += l_sc->sc_dinodes; + + spin_unlock(&sdp->sd_statfs_spin); + + if (sc->sc_free < 0) + sc->sc_free = 0; + if (sc->sc_free > sc->sc_total) + sc->sc_free = sc->sc_total; + if (sc->sc_dinodes < 0) + sc->sc_dinodes = 0; + + return 0; +} + +/** * gfs2_statfs - Gather and return stats about the filesystem * @sb: The superblock * @statfsbuf: The buffer @@ -370,7 +497,6 @@ static void gfs2_clear_inode(struct inode *inode) */ if (test_bit(GIF_USER, &ip->i_flags)) { ip->i_gl->gl_object = NULL; - gfs2_glock_schedule_for_reclaim(ip->i_gl); gfs2_glock_put(ip->i_gl); ip->i_gl = NULL; if (ip->i_iopen_gh.gh_gl) { @@ -423,8 +549,6 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) seq_printf(s, ",debug"); if (args->ar_upgrade) seq_printf(s, ",upgrade"); - if (args->ar_num_glockd != GFS2_GLOCKD_DEFAULT) - seq_printf(s, ",num_glockd=%u", args->ar_num_glockd); if (args->ar_posix_acl) seq_printf(s, ",acl"); if (args->ar_quota != GFS2_QUOTA_DEFAULT) { @@ -494,16 +618,16 @@ static void gfs2_delete_inode(struct inode *inode) gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); error = gfs2_glock_nq(&ip->i_iopen_gh); if (error) - goto out_uninit; + goto out_truncate; if (S_ISDIR(inode->i_mode) && - (ip->i_di.di_flags & GFS2_DIF_EXHASH)) { + (ip->i_diskflags & GFS2_DIF_EXHASH)) { error = gfs2_dir_exhash_dealloc(ip); if (error) goto out_unlock; } - if (ip->i_di.di_eattr) { + if (ip->i_eattr) { error = gfs2_ea_dealloc(ip); if (error) goto out_unlock; @@ -519,6 +643,7 @@ static void gfs2_delete_inode(struct inode *inode) if (error) goto out_unlock; +out_truncate: error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); if (error) goto out_unlock; @@ -527,8 +652,8 @@ static void gfs2_delete_inode(struct inode *inode) gfs2_trans_end(sdp); out_unlock: - gfs2_glock_dq(&ip->i_iopen_gh); -out_uninit: + if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) + gfs2_glock_dq(&ip->i_iopen_gh); gfs2_holder_uninit(&ip->i_iopen_gh); gfs2_glock_dq_uninit(&gh); if (error && error != GLR_TRYFAILED) diff --git a/fs/gfs2/ops_super.h b/fs/gfs2/ops_super.h deleted file mode 100644 index 442a274c627..00000000000 --- a/fs/gfs2/ops_super.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_SUPER_DOT_H__ -#define __OPS_SUPER_DOT_H__ - -#include <linux/fs.h> - -extern const struct super_operations gfs2_super_ops; - -#endif /* __OPS_SUPER_DOT_H__ */ diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 3e073f5144f..b08d09696b3 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -46,6 +46,8 @@ #include <linux/bio.h> #include <linux/gfs2_ondisk.h> #include <linux/lm_interface.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include "gfs2.h" #include "incore.h" @@ -94,7 +96,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, struct gfs2_quota_data *qd; int error; - qd = kzalloc(sizeof(struct gfs2_quota_data), GFP_NOFS); + qd = kmem_cache_zalloc(gfs2_quotad_cachep, GFP_NOFS); if (!qd) return -ENOMEM; @@ -119,7 +121,7 @@ static int qd_alloc(struct gfs2_sbd *sdp, int user, u32 id, return 0; fail: - kfree(qd); + kmem_cache_free(gfs2_quotad_cachep, qd); return error; } @@ -158,7 +160,7 @@ static int qd_get(struct gfs2_sbd *sdp, int user, u32 id, int create, if (qd || !create) { if (new_qd) { gfs2_lvb_unhold(new_qd->qd_gl); - kfree(new_qd); + kmem_cache_free(gfs2_quotad_cachep, new_qd); } *qdp = qd; return 0; @@ -1013,7 +1015,7 @@ void gfs2_quota_change(struct gfs2_inode *ip, s64 change, if (gfs2_assert_warn(GFS2_SB(&ip->i_inode), change)) return; - if (ip->i_di.di_flags & GFS2_DIF_SYSTEM) + if (ip->i_diskflags & GFS2_DIF_SYSTEM) return; for (x = 0; x < al->al_qd_num; x++) { @@ -1100,15 +1102,15 @@ static void gfs2_quota_change_in(struct gfs2_quota_change_host *qc, const void * int gfs2_quota_init(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_qc_inode); - unsigned int blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; + unsigned int blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; unsigned int x, slot = 0; unsigned int found = 0; u64 dblock; u32 extlen = 0; int error; - if (!ip->i_di.di_size || ip->i_di.di_size > (64 << 20) || - ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1)) { + if (!ip->i_disksize || ip->i_disksize > (64 << 20) || + ip->i_disksize & (sdp->sd_sb.sb_bsize - 1)) { gfs2_consist_inode(ip); return -EIO; } @@ -1195,7 +1197,7 @@ fail: return error; } -void gfs2_quota_scan(struct gfs2_sbd *sdp) +static void gfs2_quota_scan(struct gfs2_sbd *sdp) { struct gfs2_quota_data *qd, *safe; LIST_HEAD(dead); @@ -1222,7 +1224,7 @@ void gfs2_quota_scan(struct gfs2_sbd *sdp) gfs2_assert_warn(sdp, !qd->qd_bh_count); gfs2_lvb_unhold(qd->qd_gl); - kfree(qd); + kmem_cache_free(gfs2_quotad_cachep, qd); } } @@ -1257,7 +1259,7 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) gfs2_assert_warn(sdp, !qd->qd_bh_count); gfs2_lvb_unhold(qd->qd_gl); - kfree(qd); + kmem_cache_free(gfs2_quotad_cachep, qd); spin_lock(&sdp->sd_quota_spin); } @@ -1272,3 +1274,94 @@ void gfs2_quota_cleanup(struct gfs2_sbd *sdp) } } +static void quotad_error(struct gfs2_sbd *sdp, const char *msg, int error) +{ + if (error == 0 || error == -EROFS) + return; + if (!test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + fs_err(sdp, "gfs2_quotad: %s error %d\n", msg, error); +} + +static void quotad_check_timeo(struct gfs2_sbd *sdp, const char *msg, + int (*fxn)(struct gfs2_sbd *sdp), + unsigned long t, unsigned long *timeo, + unsigned int *new_timeo) +{ + if (t >= *timeo) { + int error = fxn(sdp); + quotad_error(sdp, msg, error); + *timeo = gfs2_tune_get_i(&sdp->sd_tune, new_timeo) * HZ; + } else { + *timeo -= t; + } +} + +static void quotad_check_trunc_list(struct gfs2_sbd *sdp) +{ + struct gfs2_inode *ip; + + while(1) { + ip = NULL; + spin_lock(&sdp->sd_trunc_lock); + if (!list_empty(&sdp->sd_trunc_list)) { + ip = list_entry(sdp->sd_trunc_list.next, + struct gfs2_inode, i_trunc_list); + list_del_init(&ip->i_trunc_list); + } + spin_unlock(&sdp->sd_trunc_lock); + if (ip == NULL) + return; + gfs2_glock_finish_truncate(ip); + } +} + +/** + * gfs2_quotad - Write cached quota changes into the quota file + * @sdp: Pointer to GFS2 superblock + * + */ + +int gfs2_quotad(void *data) +{ + struct gfs2_sbd *sdp = data; + struct gfs2_tune *tune = &sdp->sd_tune; + unsigned long statfs_timeo = 0; + unsigned long quotad_timeo = 0; + unsigned long t = 0; + DEFINE_WAIT(wait); + int empty; + + while (!kthread_should_stop()) { + + /* Update the master statfs file */ + quotad_check_timeo(sdp, "statfs", gfs2_statfs_sync, t, + &statfs_timeo, &tune->gt_statfs_quantum); + + /* Update quota file */ + quotad_check_timeo(sdp, "sync", gfs2_quota_sync, t, + "ad_timeo, &tune->gt_quota_quantum); + + /* FIXME: This should be turned into a shrinker */ + gfs2_quota_scan(sdp); + + /* Check for & recover partially truncated inodes */ + quotad_check_trunc_list(sdp); + + if (freezing(current)) + refrigerator(); + t = min(quotad_timeo, statfs_timeo); + + prepare_to_wait(&sdp->sd_quota_wait, &wait, TASK_UNINTERRUPTIBLE); + spin_lock(&sdp->sd_trunc_lock); + empty = list_empty(&sdp->sd_trunc_list); + spin_unlock(&sdp->sd_trunc_lock); + if (empty) + t -= schedule_timeout(t); + else + t = 0; + finish_wait(&sdp->sd_quota_wait, &wait); + } + + return 0; +} + diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h index 3b7f4b0e5df..cec9032be97 100644 --- a/fs/gfs2/quota.h +++ b/fs/gfs2/quota.h @@ -15,22 +15,22 @@ struct gfs2_sbd; #define NO_QUOTA_CHANGE ((u32)-1) -int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid); -void gfs2_quota_unhold(struct gfs2_inode *ip); +extern int gfs2_quota_hold(struct gfs2_inode *ip, u32 uid, u32 gid); +extern void gfs2_quota_unhold(struct gfs2_inode *ip); -int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid); -void gfs2_quota_unlock(struct gfs2_inode *ip); +extern int gfs2_quota_lock(struct gfs2_inode *ip, u32 uid, u32 gid); +extern void gfs2_quota_unlock(struct gfs2_inode *ip); -int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); -void gfs2_quota_change(struct gfs2_inode *ip, s64 change, - u32 uid, u32 gid); +extern int gfs2_quota_check(struct gfs2_inode *ip, u32 uid, u32 gid); +extern void gfs2_quota_change(struct gfs2_inode *ip, s64 change, + u32 uid, u32 gid); -int gfs2_quota_sync(struct gfs2_sbd *sdp); -int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); +extern int gfs2_quota_sync(struct gfs2_sbd *sdp); +extern int gfs2_quota_refresh(struct gfs2_sbd *sdp, int user, u32 id); -int gfs2_quota_init(struct gfs2_sbd *sdp); -void gfs2_quota_scan(struct gfs2_sbd *sdp); -void gfs2_quota_cleanup(struct gfs2_sbd *sdp); +extern int gfs2_quota_init(struct gfs2_sbd *sdp); +extern void gfs2_quota_cleanup(struct gfs2_sbd *sdp); +extern int gfs2_quotad(void *data); static inline int gfs2_quota_lock_check(struct gfs2_inode *ip) { diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index d5e91f4f6a0..efd09c3d2b2 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -14,6 +14,8 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/lm_interface.h> +#include <linux/kthread.h> +#include <linux/freezer.h> #include "gfs2.h" #include "incore.h" @@ -583,13 +585,35 @@ fail: return error; } +static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp) +{ + struct gfs2_jdesc *jd; + int found = 0; + + spin_lock(&sdp->sd_jindex_spin); + + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (jd->jd_dirty) { + jd->jd_dirty = 0; + found = 1; + break; + } + } + spin_unlock(&sdp->sd_jindex_spin); + + if (!found) + jd = NULL; + + return jd; +} + /** * gfs2_check_journals - Recover any dirty journals * @sdp: the filesystem * */ -void gfs2_check_journals(struct gfs2_sbd *sdp) +static void gfs2_check_journals(struct gfs2_sbd *sdp) { struct gfs2_jdesc *jd; @@ -603,3 +627,25 @@ void gfs2_check_journals(struct gfs2_sbd *sdp) } } +/** + * gfs2_recoverd - Recover dead machine's journals + * @sdp: Pointer to GFS2 superblock + * + */ + +int gfs2_recoverd(void *data) +{ + struct gfs2_sbd *sdp = data; + unsigned long t; + + while (!kthread_should_stop()) { + gfs2_check_journals(sdp); + t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ; + if (freezing(current)) + refrigerator(); + schedule_timeout_interruptible(t); + } + + return 0; +} + diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index f7235e61c72..a8218ea15b5 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -18,17 +18,17 @@ static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk) *blk = 0; } -int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, +extern int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, struct buffer_head **bh); -int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); -int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); -void gfs2_revoke_clean(struct gfs2_sbd *sdp); +extern int gfs2_revoke_add(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); +extern int gfs2_revoke_check(struct gfs2_sbd *sdp, u64 blkno, unsigned int where); +extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); -int gfs2_find_jhead(struct gfs2_jdesc *jd, +extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head); -int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); -void gfs2_check_journals(struct gfs2_sbd *sdp); +extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); +extern int gfs2_recoverd(void *data); #endif /* __RECOVERY_DOT_H__ */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 2d90fb25350..8b01c635d92 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -269,16 +269,14 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) bi->bi_len, x); } - if (count[0] != rgd->rd_rg.rg_free) { + if (count[0] != rgd->rd_free) { if (gfs2_consist_rgrpd(rgd)) fs_err(sdp, "free data mismatch: %u != %u\n", - count[0], rgd->rd_rg.rg_free); + count[0], rgd->rd_free); return; } - tmp = rgd->rd_data - - rgd->rd_rg.rg_free - - rgd->rd_rg.rg_dinodes; + tmp = rgd->rd_data - rgd->rd_free - rgd->rd_dinodes; if (count[1] + count[2] != tmp) { if (gfs2_consist_rgrpd(rgd)) fs_err(sdp, "used data mismatch: %u != %u\n", @@ -286,10 +284,10 @@ void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd) return; } - if (count[3] != rgd->rd_rg.rg_dinodes) { + if (count[3] != rgd->rd_dinodes) { if (gfs2_consist_rgrpd(rgd)) fs_err(sdp, "used metadata mismatch: %u != %u\n", - count[3], rgd->rd_rg.rg_dinodes); + count[3], rgd->rd_dinodes); return; } @@ -501,7 +499,7 @@ u64 gfs2_ri_total(struct gfs2_sbd *sdp) for (rgrps = 0;; rgrps++) { loff_t pos = rgrps * sizeof(struct gfs2_rindex); - if (pos + sizeof(struct gfs2_rindex) >= ip->i_di.di_size) + if (pos + sizeof(struct gfs2_rindex) >= ip->i_disksize) break; error = gfs2_internal_read(ip, &ra_state, buf, &pos, sizeof(struct gfs2_rindex)); @@ -590,7 +588,7 @@ static int gfs2_ri_update(struct gfs2_inode *ip) struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct inode *inode = &ip->i_inode; struct file_ra_state ra_state; - u64 rgrp_count = ip->i_di.di_size; + u64 rgrp_count = ip->i_disksize; int error; if (do_div(rgrp_count, sizeof(struct gfs2_rindex))) { @@ -634,7 +632,7 @@ static int gfs2_ri_update_special(struct gfs2_inode *ip) for (sdp->sd_rgrps = 0;; sdp->sd_rgrps++) { /* Ignore partials */ if ((sdp->sd_rgrps + 1) * sizeof(struct gfs2_rindex) > - ip->i_di.di_size) + ip->i_disksize) break; error = read_rindex_entry(ip, &ra_state); if (error) { @@ -692,7 +690,6 @@ int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh) static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) { const struct gfs2_rgrp *str = buf; - struct gfs2_rgrp_host *rg = &rgd->rd_rg; u32 rg_flags; rg_flags = be32_to_cpu(str->rg_flags); @@ -700,24 +697,23 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) rgd->rd_flags |= GFS2_RDF_NOALLOC; else rgd->rd_flags &= ~GFS2_RDF_NOALLOC; - rg->rg_free = be32_to_cpu(str->rg_free); - rg->rg_dinodes = be32_to_cpu(str->rg_dinodes); - rg->rg_igeneration = be64_to_cpu(str->rg_igeneration); + rgd->rd_free = be32_to_cpu(str->rg_free); + rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); + rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); } static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) { struct gfs2_rgrp *str = buf; - struct gfs2_rgrp_host *rg = &rgd->rd_rg; u32 rg_flags = 0; if (rgd->rd_flags & GFS2_RDF_NOALLOC) rg_flags |= GFS2_RGF_NOALLOC; str->rg_flags = cpu_to_be32(rg_flags); - str->rg_free = cpu_to_be32(rg->rg_free); - str->rg_dinodes = cpu_to_be32(rg->rg_dinodes); + str->rg_free = cpu_to_be32(rgd->rd_free); + str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes); str->__pad = cpu_to_be32(0); - str->rg_igeneration = cpu_to_be64(rg->rg_igeneration); + str->rg_igeneration = cpu_to_be64(rgd->rd_igeneration); memset(&str->rg_reserved, 0, sizeof(str->rg_reserved)); } @@ -776,7 +772,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) } spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone = rgd->rd_rg.rg_free; + rgd->rd_free_clone = rgd->rd_free; rgd->rd_bh_count++; spin_unlock(&sdp->sd_rindex_spin); @@ -850,7 +846,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) } spin_lock(&sdp->sd_rindex_spin); - rgd->rd_free_clone = rgd->rd_rg.rg_free; + rgd->rd_free_clone = rgd->rd_free; spin_unlock(&sdp->sd_rindex_spin); } @@ -1403,8 +1399,8 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) block = rgd->rd_data0 + blk; ip->i_goal = block; - gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free >= *n); - rgd->rd_rg.rg_free -= *n; + gfs2_assert_withdraw(sdp, rgd->rd_free >= *n); + rgd->rd_free -= *n; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); @@ -1445,10 +1441,10 @@ u64 gfs2_alloc_di(struct gfs2_inode *dip, u64 *generation) block = rgd->rd_data0 + blk; - gfs2_assert_withdraw(sdp, rgd->rd_rg.rg_free); - rgd->rd_rg.rg_free--; - rgd->rd_rg.rg_dinodes++; - *generation = rgd->rd_rg.rg_igeneration++; + gfs2_assert_withdraw(sdp, rgd->rd_free); + rgd->rd_free--; + rgd->rd_dinodes++; + *generation = rgd->rd_igeneration++; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); @@ -1481,7 +1477,7 @@ void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen) if (!rgd) return; - rgd->rd_rg.rg_free += blen; + rgd->rd_free += blen; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); @@ -1509,7 +1505,7 @@ void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen) if (!rgd) return; - rgd->rd_rg.rg_free += blen; + rgd->rd_free += blen; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); @@ -1546,10 +1542,10 @@ static void gfs2_free_uninit_di(struct gfs2_rgrpd *rgd, u64 blkno) return; gfs2_assert_withdraw(sdp, rgd == tmp_rgd); - if (!rgd->rd_rg.rg_dinodes) + if (!rgd->rd_dinodes) gfs2_consist_rgrpd(rgd); - rgd->rd_rg.rg_dinodes--; - rgd->rd_rg.rg_free++; + rgd->rd_dinodes--; + rgd->rd_free++; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); gfs2_rgrp_out(rgd, rgd->rd_bits[0].bi_bh->b_data); diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index c3ba3d9d0aa..141b781f2fc 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -34,76 +34,6 @@ #include "util.h" /** - * gfs2_jindex_hold - Grab a lock on the jindex - * @sdp: The GFS2 superblock - * @ji_gh: the holder for the jindex glock - * - * This is very similar to the gfs2_rindex_hold() function, except that - * in general we hold the jindex lock for longer periods of time and - * we grab it far less frequently (in general) then the rgrp lock. - * - * Returns: errno - */ - -int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) -{ - struct gfs2_inode *dip = GFS2_I(sdp->sd_jindex); - struct qstr name; - char buf[20]; - struct gfs2_jdesc *jd; - int error; - - name.name = buf; - - mutex_lock(&sdp->sd_jindex_mutex); - - for (;;) { - error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0, ji_gh); - if (error) - break; - - name.len = sprintf(buf, "journal%u", sdp->sd_journals); - name.hash = gfs2_disk_hash(name.name, name.len); - - error = gfs2_dir_check(sdp->sd_jindex, &name, NULL); - if (error == -ENOENT) { - error = 0; - break; - } - - gfs2_glock_dq_uninit(ji_gh); - - if (error) - break; - - error = -ENOMEM; - jd = kzalloc(sizeof(struct gfs2_jdesc), GFP_KERNEL); - if (!jd) - break; - - INIT_LIST_HEAD(&jd->extent_list); - jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); - if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { - if (!jd->jd_inode) - error = -ENOENT; - else - error = PTR_ERR(jd->jd_inode); - kfree(jd); - break; - } - - spin_lock(&sdp->sd_jindex_spin); - jd->jd_jid = sdp->sd_journals++; - list_add_tail(&jd->jd_list, &sdp->sd_jindex_list); - spin_unlock(&sdp->sd_jindex_spin); - } - - mutex_unlock(&sdp->sd_jindex_mutex); - - return error; -} - -/** * gfs2_jindex_free - Clear all the journal index information * @sdp: The GFS2 superblock * @@ -166,39 +96,6 @@ struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid) return jd; } -void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid) -{ - struct gfs2_jdesc *jd; - - spin_lock(&sdp->sd_jindex_spin); - jd = jdesc_find_i(&sdp->sd_jindex_list, jid); - if (jd) - jd->jd_dirty = 1; - spin_unlock(&sdp->sd_jindex_spin); -} - -struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp) -{ - struct gfs2_jdesc *jd; - int found = 0; - - spin_lock(&sdp->sd_jindex_spin); - - list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { - if (jd->jd_dirty) { - jd->jd_dirty = 0; - found = 1; - break; - } - } - spin_unlock(&sdp->sd_jindex_spin); - - if (!found) - jd = NULL; - - return jd; -} - int gfs2_jdesc_check(struct gfs2_jdesc *jd) { struct gfs2_inode *ip = GFS2_I(jd->jd_inode); @@ -206,14 +103,14 @@ int gfs2_jdesc_check(struct gfs2_jdesc *jd) int ar; int error; - if (ip->i_di.di_size < (8 << 20) || ip->i_di.di_size > (1 << 30) || - (ip->i_di.di_size & (sdp->sd_sb.sb_bsize - 1))) { + if (ip->i_disksize < (8 << 20) || ip->i_disksize > (1 << 30) || + (ip->i_disksize & (sdp->sd_sb.sb_bsize - 1))) { gfs2_consist_inode(ip); return -EIO; } - jd->jd_blocks = ip->i_di.di_size >> sdp->sd_sb.sb_bsize_shift; + jd->jd_blocks = ip->i_disksize >> sdp->sd_sb.sb_bsize_shift; - error = gfs2_write_alloc_required(ip, 0, ip->i_di.di_size, &ar); + error = gfs2_write_alloc_required(ip, 0, ip->i_disksize, &ar); if (!error && ar) { gfs2_consist_inode(ip); error = -EIO; @@ -423,137 +320,6 @@ out: return error; } -/** - * gfs2_statfs_i - Do a statfs - * @sdp: the filesystem - * @sg: the sg structure - * - * Returns: errno - */ - -int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) -{ - struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; - struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; - - spin_lock(&sdp->sd_statfs_spin); - - *sc = *m_sc; - sc->sc_total += l_sc->sc_total; - sc->sc_free += l_sc->sc_free; - sc->sc_dinodes += l_sc->sc_dinodes; - - spin_unlock(&sdp->sd_statfs_spin); - - if (sc->sc_free < 0) - sc->sc_free = 0; - if (sc->sc_free > sc->sc_total) - sc->sc_free = sc->sc_total; - if (sc->sc_dinodes < 0) - sc->sc_dinodes = 0; - - return 0; -} - -/** - * statfs_fill - fill in the sg for a given RG - * @rgd: the RG - * @sc: the sc structure - * - * Returns: 0 on success, -ESTALE if the LVB is invalid - */ - -static int statfs_slow_fill(struct gfs2_rgrpd *rgd, - struct gfs2_statfs_change_host *sc) -{ - gfs2_rgrp_verify(rgd); - sc->sc_total += rgd->rd_data; - sc->sc_free += rgd->rd_rg.rg_free; - sc->sc_dinodes += rgd->rd_rg.rg_dinodes; - return 0; -} - -/** - * gfs2_statfs_slow - Stat a filesystem using asynchronous locking - * @sdp: the filesystem - * @sc: the sc info that will be returned - * - * Any error (other than a signal) will cause this routine to fall back - * to the synchronous version. - * - * FIXME: This really shouldn't busy wait like this. - * - * Returns: errno - */ - -int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) -{ - struct gfs2_holder ri_gh; - struct gfs2_rgrpd *rgd_next; - struct gfs2_holder *gha, *gh; - unsigned int slots = 64; - unsigned int x; - int done; - int error = 0, err; - - memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); - gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); - if (!gha) - return -ENOMEM; - - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto out; - - rgd_next = gfs2_rgrpd_get_first(sdp); - - for (;;) { - done = 1; - - for (x = 0; x < slots; x++) { - gh = gha + x; - - if (gh->gh_gl && gfs2_glock_poll(gh)) { - err = gfs2_glock_wait(gh); - if (err) { - gfs2_holder_uninit(gh); - error = err; - } else { - if (!error) - error = statfs_slow_fill( - gh->gh_gl->gl_object, sc); - gfs2_glock_dq_uninit(gh); - } - } - - if (gh->gh_gl) - done = 0; - else if (rgd_next && !error) { - error = gfs2_glock_nq_init(rgd_next->rd_gl, - LM_ST_SHARED, - GL_ASYNC, - gh); - rgd_next = gfs2_rgrpd_get_next(rgd_next); - done = 0; - } - - if (signal_pending(current)) - error = -ERESTARTSYS; - } - - if (done) - break; - - yield(); - } - - gfs2_glock_dq_uninit(&ri_gh); - -out: - kfree(gha); - return error; -} - struct lfcc { struct list_head list; struct gfs2_holder gh; @@ -580,10 +346,6 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp, struct gfs2_log_header_host lh; int error; - error = gfs2_jindex_hold(sdp, &ji_gh); - if (error) - return error; - list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL); if (!lfcc) { diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 50a4c9b1215..f6b8b00ad88 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -10,6 +10,8 @@ #ifndef __SUPER_DOT_H__ #define __SUPER_DOT_H__ +#include <linux/fs.h> +#include <linux/dcache.h> #include "incore.h" void gfs2_lm_unmount(struct gfs2_sbd *sdp); @@ -23,12 +25,9 @@ static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) return x; } -int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh); void gfs2_jindex_free(struct gfs2_sbd *sdp); struct gfs2_jdesc *gfs2_jdesc_find(struct gfs2_sbd *sdp, unsigned int jid); -void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid); -struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp); int gfs2_jdesc_check(struct gfs2_jdesc *jd); int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, @@ -40,11 +39,15 @@ int gfs2_statfs_init(struct gfs2_sbd *sdp); void gfs2_statfs_change(struct gfs2_sbd *sdp, s64 total, s64 free, s64 dinodes); int gfs2_statfs_sync(struct gfs2_sbd *sdp); -int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc); -int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc); int gfs2_freeze_fs(struct gfs2_sbd *sdp); void gfs2_unfreeze_fs(struct gfs2_sbd *sdp); +extern struct file_system_type gfs2_fs_type; +extern struct file_system_type gfs2meta_fs_type; +extern const struct export_operations gfs2_export_ops; +extern const struct super_operations gfs2_super_ops; +extern struct dentry_operations gfs2_dops; + #endif /* __SUPER_DOT_H__ */ diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 7e1879f1a02..26c1fa777a9 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -26,9 +26,6 @@ #include "quota.h" #include "util.h" -char *gfs2_sys_margs; -spinlock_t gfs2_sys_margs_lock; - static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) { return snprintf(buf, PAGE_SIZE, "%u:%u\n", @@ -263,7 +260,6 @@ ARGS_ATTR(localcaching, "%d\n"); ARGS_ATTR(localflocks, "%d\n"); ARGS_ATTR(debug, "%d\n"); ARGS_ATTR(upgrade, "%d\n"); -ARGS_ATTR(num_glockd, "%u\n"); ARGS_ATTR(posix_acl, "%d\n"); ARGS_ATTR(quota, "%u\n"); ARGS_ATTR(suiddir, "%d\n"); @@ -279,7 +275,6 @@ static struct attribute *args_attrs[] = { &args_attr_localflocks.attr, &args_attr_debug.attr, &args_attr_upgrade.attr, - &args_attr_num_glockd.attr, &args_attr_posix_acl.attr, &args_attr_quota.attr, &args_attr_suiddir.attr, @@ -288,30 +283,6 @@ static struct attribute *args_attrs[] = { }; /* - * display counters from superblock - */ - -struct counters_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); -}; - -#define COUNTERS_ATTR(name, fmt) \ -static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ -{ \ - return snprintf(buf, PAGE_SIZE, fmt, \ - (unsigned int)atomic_read(&sdp->sd_##name)); \ -} \ -static struct counters_attr counters_attr_##name = __ATTR_RO(name) - -COUNTERS_ATTR(reclaimed, "%u\n"); - -static struct attribute *counters_attrs[] = { - &counters_attr_reclaimed.attr, - NULL, -}; - -/* * get and set struct gfs2_tune fields */ @@ -393,7 +364,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ } \ TUNE_ATTR_2(name, name##_store) -TUNE_ATTR(demote_secs, 0); TUNE_ATTR(incore_log_blocks, 0); TUNE_ATTR(log_flush_secs, 0); TUNE_ATTR(quota_warn_period, 0); @@ -408,11 +378,9 @@ TUNE_ATTR(stall_secs, 1); TUNE_ATTR(statfs_quantum, 1); TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); TUNE_ATTR_DAEMON(logd_secs, logd_process); -TUNE_ATTR_DAEMON(quotad_secs, quotad_process); TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); static struct attribute *tune_attrs[] = { - &tune_attr_demote_secs.attr, &tune_attr_incore_log_blocks.attr, &tune_attr_log_flush_secs.attr, &tune_attr_quota_warn_period.attr, @@ -426,7 +394,6 @@ static struct attribute *tune_attrs[] = { &tune_attr_statfs_quantum.attr, &tune_attr_recoverd_secs.attr, &tune_attr_logd_secs.attr, - &tune_attr_quotad_secs.attr, &tune_attr_quota_scale.attr, &tune_attr_new_files_jdata.attr, NULL, @@ -437,11 +404,6 @@ static struct attribute_group lockstruct_group = { .attrs = lockstruct_attrs, }; -static struct attribute_group counters_group = { - .name = "counters", - .attrs = counters_attrs, -}; - static struct attribute_group args_group = { .name = "args", .attrs = args_attrs, @@ -466,13 +428,9 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) if (error) goto fail_reg; - error = sysfs_create_group(&sdp->sd_kobj, &counters_group); - if (error) - goto fail_lockstruct; - error = sysfs_create_group(&sdp->sd_kobj, &args_group); if (error) - goto fail_counters; + goto fail_lockstruct; error = sysfs_create_group(&sdp->sd_kobj, &tune_group); if (error) @@ -483,8 +441,6 @@ int gfs2_sys_fs_add(struct gfs2_sbd *sdp) fail_args: sysfs_remove_group(&sdp->sd_kobj, &args_group); -fail_counters: - sysfs_remove_group(&sdp->sd_kobj, &counters_group); fail_lockstruct: sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); fail_reg: @@ -498,16 +454,27 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp) { sysfs_remove_group(&sdp->sd_kobj, &tune_group); sysfs_remove_group(&sdp->sd_kobj, &args_group); - sysfs_remove_group(&sdp->sd_kobj, &counters_group); sysfs_remove_group(&sdp->sd_kobj, &lockstruct_group); kobject_put(&sdp->sd_kobj); } +static int gfs2_uevent(struct kset *kset, struct kobject *kobj, + struct kobj_uevent_env *env) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + add_uevent_var(env, "LOCKTABLE=%s", sdp->sd_table_name); + add_uevent_var(env, "LOCKPROTO=%s", sdp->sd_proto_name); + return 0; +} + +static struct kset_uevent_ops gfs2_uevent_ops = { + .uevent = gfs2_uevent, +}; + + int gfs2_sys_init(void) { - gfs2_sys_margs = NULL; - spin_lock_init(&gfs2_sys_margs_lock); - gfs2_kset = kset_create_and_add("gfs2", NULL, fs_kobj); + gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj); if (!gfs2_kset) return -ENOMEM; return 0; @@ -515,7 +482,6 @@ int gfs2_sys_init(void) void gfs2_sys_uninit(void) { - kfree(gfs2_sys_margs); kset_unregister(gfs2_kset); } diff --git a/fs/gfs2/sys.h b/fs/gfs2/sys.h index 1ca8cdac530..e94560e836d 100644 --- a/fs/gfs2/sys.h +++ b/fs/gfs2/sys.h @@ -13,10 +13,6 @@ #include <linux/spinlock.h> struct gfs2_sbd; -/* Allow args to be passed to GFS2 when using an initial ram disk */ -extern char *gfs2_sys_margs; -extern spinlock_t gfs2_sys_margs_lock; - int gfs2_sys_fs_add(struct gfs2_sbd *sdp); void gfs2_sys_fs_del(struct gfs2_sbd *sdp); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index d31e355c61f..374f50e9549 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -25,6 +25,7 @@ struct kmem_cache *gfs2_glock_cachep __read_mostly; struct kmem_cache *gfs2_inode_cachep __read_mostly; struct kmem_cache *gfs2_bufdata_cachep __read_mostly; struct kmem_cache *gfs2_rgrpd_cachep __read_mostly; +struct kmem_cache *gfs2_quotad_cachep __read_mostly; void gfs2_assert_i(struct gfs2_sbd *sdp) { diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 7f48576289c..33e96b0ce9a 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -148,6 +148,7 @@ extern struct kmem_cache *gfs2_glock_cachep; extern struct kmem_cache *gfs2_inode_cachep; extern struct kmem_cache *gfs2_bufdata_cachep; extern struct kmem_cache *gfs2_rgrpd_cachep; +extern struct kmem_cache *gfs2_quotad_cachep; static inline unsigned int gfs2_tune_get_i(struct gfs2_tune *gt, unsigned int *p) diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 3a31451ac17..5c538e0ec14 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -501,7 +501,7 @@ int hostfs_write_begin(struct file *file, struct address_space *mapping, { pgoff_t index = pos >> PAGE_CACHE_SHIFT; - *pagep = __grab_cache_page(mapping, index); + *pagep = grab_cache_page_write_begin(mapping, index, flags); if (!*pagep) return -ENOMEM; return 0; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7d479ce3ace..6903d37af03 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -252,6 +252,7 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, for (;;) { struct page *page; unsigned long nr, ret; + int ra; /* nr is the maximum number of bytes to copy from this page */ nr = huge_page_size(h); @@ -274,16 +275,19 @@ static ssize_t hugetlbfs_read(struct file *filp, char __user *buf, */ ret = len < nr ? len : nr; if (clear_user(buf, ret)) - ret = -EFAULT; + ra = -EFAULT; + else + ra = 0; } else { /* * We have the page, copy it to user space buffer. */ - ret = hugetlbfs_read_actor(page, offset, buf, len, nr); + ra = hugetlbfs_read_actor(page, offset, buf, len, nr); + ret = ra; } - if (ret < 0) { + if (ra < 0) { if (retval == 0) - retval = ret; + retval = ra; if (page) page_cache_release(page); goto out; @@ -506,7 +510,6 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, inode->i_mode = mode; inode->i_uid = uid; inode->i_gid = gid; - inode->i_blocks = 0; inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_mapping->backing_dev_info =&hugetlbfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/inode.c b/fs/inode.c index 7de1cda9248..913ab2d9a5d 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -22,6 +22,7 @@ #include <linux/bootmem.h> #include <linux/inotify.h> #include <linux/mount.h> +#include <linux/async.h> /* * This is needed for the following functions: @@ -110,8 +111,8 @@ static void wake_up_inode(struct inode *inode) /** * inode_init_always - perform inode structure intialisation - * @sb - superblock inode belongs to. - * @inode - inode to initialise + * @sb: superblock inode belongs to + * @inode: inode to initialise * * These are initializations that need to be done on every inode * allocation as the fields are not initialised by slab allocation. @@ -131,6 +132,8 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->i_op = &empty_iops; inode->i_fop = &empty_fops; inode->i_nlink = 1; + inode->i_uid = 0; + inode->i_gid = 0; atomic_set(&inode->i_writecount, 0); inode->i_size = 0; inode->i_blocks = 0; @@ -164,7 +167,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) mapping->a_ops = &empty_aops; mapping->host = inode; mapping->flags = 0; - mapping_set_gfp_mask(mapping, GFP_HIGHUSER_PAGECACHE); + mapping_set_gfp_mask(mapping, GFP_HIGHUSER_MOVABLE); mapping->assoc_mapping = NULL; mapping->backing_dev_info = &default_backing_dev_info; mapping->writeback_index = 0; @@ -574,8 +577,8 @@ __inode_add_to_lists(struct super_block *sb, struct hlist_head *head, /** * inode_add_to_lists - add a new inode to relevant lists - * @sb - superblock inode belongs to. - * @inode - inode to mark in use + * @sb: superblock inode belongs to + * @inode: inode to mark in use * * When an inode is allocated it needs to be accounted for, added to the in use * list, the owning superblock and the inode hash. This needs to be done under @@ -599,7 +602,7 @@ EXPORT_SYMBOL_GPL(inode_add_to_lists); * @sb: superblock * * Allocates a new inode for given superblock. The default gfp_mask - * for allocations related to inode->i_mapping is GFP_HIGHUSER_PAGECACHE. + * for allocations related to inode->i_mapping is GFP_HIGHUSER_MOVABLE. * If HIGHMEM pages are unsuitable or it is known that pages allocated * for the page cache are not reclaimable or migratable, * mapping_set_gfp_mask() must be called with suitable flags on the diff --git a/fs/ioctl.c b/fs/ioctl.c index 43e8b2c0664..cc3f1aa1cf7 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -231,7 +231,8 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) #define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) #define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); -/* +/** + * __generic_block_fiemap - FIEMAP for block based inodes (no locking) * @inode - the inode to map * @arg - the pointer to userspace where we copy everything to * @get_block - the fs's get_block function @@ -242,11 +243,15 @@ static int ioctl_fiemap(struct file *filp, unsigned long arg) * * If it is possible to have data blocks beyond a hole past @inode->i_size, then * please do not use this function, it will stop at the first unmapped block - * beyond i_size + * beyond i_size. + * + * If you use this function directly, you need to do your own locking. Use + * generic_block_fiemap if you want the locking done for you. */ -int generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, u64 start, - u64 len, get_block_t *get_block) + +int __generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, u64 start, + u64 len, get_block_t *get_block) { struct buffer_head tmp; unsigned int start_blk; @@ -260,9 +265,6 @@ int generic_block_fiemap(struct inode *inode, start_blk = logical_to_blk(inode, start); - /* guard against change */ - mutex_lock(&inode->i_mutex); - length = (long long)min_t(u64, len, i_size_read(inode)); map_len = length; @@ -334,14 +336,36 @@ int generic_block_fiemap(struct inode *inode, cond_resched(); } while (1); - mutex_unlock(&inode->i_mutex); - /* if ret is 1 then we just hit the end of the extent array */ if (ret == 1) ret = 0; return ret; } +EXPORT_SYMBOL(__generic_block_fiemap); + +/** + * generic_block_fiemap - FIEMAP for block based inodes + * @inode: The inode to map + * @fieinfo: The mapping information + * @start: The initial block to map + * @len: The length of the extect to attempt to map + * @get_block: The block mapping function for the fs + * + * Calls __generic_block_fiemap to map the inode, after taking + * the inode's mutex lock. + */ + +int generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, u64 start, + u64 len, get_block_t *get_block) +{ + int ret; + mutex_lock(&inode->i_mutex); + ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block); + mutex_unlock(&inode->i_mutex); + return ret; +} EXPORT_SYMBOL(generic_block_fiemap); #endif /* CONFIG_BLOCK */ diff --git a/fs/ioprio.c b/fs/ioprio.c index 3569e0ad86a..1a39ac37094 100644 --- a/fs/ioprio.c +++ b/fs/ioprio.c @@ -27,7 +27,7 @@ #include <linux/security.h> #include <linux/pid_namespace.h> -static int set_task_ioprio(struct task_struct *task, int ioprio) +int set_task_ioprio(struct task_struct *task, int ioprio) { int err; struct io_context *ioc; @@ -70,6 +70,7 @@ static int set_task_ioprio(struct task_struct *task, int ioprio) task_unlock(task); return err; } +EXPORT_SYMBOL_GPL(set_task_ioprio); asmlinkage long sys_ioprio_set(int which, int who, int ioprio) { diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 3f8af0f1505..6147ec3643a 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -855,10 +855,6 @@ root_found: } sbi->s_joliet_level = joliet_level; - /* check the root inode */ - if (!inode->i_op) - goto out_bad_root; - /* Make sure the root inode is a directory */ if (!S_ISDIR(inode->i_mode)) { printk(KERN_WARNING @@ -886,8 +882,6 @@ root_found: /* * Display error messages and free resources. */ -out_bad_root: - printk(KERN_WARNING "%s: root inode not initialized\n", __func__); out_iput: iput(inode); goto out_no_inode; diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 25719d902c5..3fbffb1ea71 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -306,6 +306,8 @@ void journal_commit_transaction(journal_t *journal) int flags; int err; unsigned long blocknr; + ktime_t start_time; + u64 commit_time; char *tagp = NULL; journal_header_t *header; journal_block_tag_t *tag = NULL; @@ -418,6 +420,7 @@ void journal_commit_transaction(journal_t *journal) commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; + start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; wake_up(&journal->j_wait_transaction_locked); spin_unlock(&journal->j_state_lock); @@ -913,6 +916,18 @@ restart_loop: J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; journal->j_committing_transaction = NULL; + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + + /* + * weight the commit time higher than the average time so we don't + * react too strongly to vast changes in commit time + */ + if (likely(journal->j_average_commit_time)) + journal->j_average_commit_time = (commit_time*3 + + journal->j_average_commit_time) / 4; + else + journal->j_average_commit_time = commit_time; + spin_unlock(&journal->j_state_lock); if (commit_transaction->t_checkpoint_list == NULL && diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 60d4c32c880..e6a11743127 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -25,6 +25,7 @@ #include <linux/timer.h> #include <linux/mm.h> #include <linux/highmem.h> +#include <linux/hrtimer.h> static void __journal_temp_unlink_buffer(struct journal_head *jh); @@ -49,6 +50,7 @@ get_transaction(journal_t *journal, transaction_t *transaction) { transaction->t_journal = journal; transaction->t_state = T_RUNNING; + transaction->t_start_time = ktime_get(); transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); @@ -752,7 +754,6 @@ out: * int journal_get_write_access() - notify intent to modify a buffer for metadata (not data) update. * @handle: transaction to add buffer modifications to * @bh: bh to be used for metadata writes - * @credits: variable that will receive credits for the buffer * * Returns an error code or 0 on success. * @@ -1370,7 +1371,7 @@ int journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - int old_handle_count, err; + int err; pid_t pid; J_ASSERT(journal_current_handle() == handle); @@ -1399,6 +1400,17 @@ int journal_stop(handle_t *handle) * on IO anyway. Speeds up many-threaded, many-dir operations * by 30x or more... * + * We try and optimize the sleep time against what the underlying disk + * can do, instead of having a static sleep time. This is usefull for + * the case where our storage is so fast that it is more optimal to go + * ahead and force a flush and wait for the transaction to be committed + * than it is to wait for an arbitrary amount of time for new writers to + * join the transaction. We acheive this by measuring how long it takes + * to commit a transaction, and compare it with how long this + * transaction has been running, and if run time < commit time then we + * sleep for the delta and commit. This greatly helps super fast disks + * that would see slowdowns as more threads started doing fsyncs. + * * But don't do this if this process was the most recent one to * perform a synchronous write. We do this to detect the case where a * single process is doing a stream of sync writes. No point in waiting @@ -1406,11 +1418,26 @@ int journal_stop(handle_t *handle) */ pid = current->pid; if (handle->h_sync && journal->j_last_sync_writer != pid) { + u64 commit_time, trans_time; + journal->j_last_sync_writer = pid; - do { - old_handle_count = transaction->t_handle_count; - schedule_timeout_uninterruptible(1); - } while (old_handle_count != transaction->t_handle_count); + + spin_lock(&journal->j_state_lock); + commit_time = journal->j_average_commit_time; + spin_unlock(&journal->j_state_lock); + + trans_time = ktime_to_ns(ktime_sub(ktime_get(), + transaction->t_start_time)); + + commit_time = min_t(u64, commit_time, + 1000*jiffies_to_usecs(1)); + + if (trans_time < commit_time) { + ktime_t expires = ktime_add_ns(ktime_get(), + commit_time); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } } current->journal_info = NULL; diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 9497718fe92..17159cacbd9 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -249,16 +249,14 @@ restart: return ret; } -#define NR_BATCH 64 - static void -__flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) +__flush_batch(journal_t *journal, int *batch_count) { int i; - ll_rw_block(SWRITE, *batch_count, bhs); + ll_rw_block(SWRITE, *batch_count, journal->j_chkpt_bhs); for (i = 0; i < *batch_count; i++) { - struct buffer_head *bh = bhs[i]; + struct buffer_head *bh = journal->j_chkpt_bhs[i]; clear_buffer_jwrite(bh); BUFFER_TRACE(bh, "brelse"); __brelse(bh); @@ -277,8 +275,7 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it */ static int __process_buffer(journal_t *journal, struct journal_head *jh, - struct buffer_head **bhs, int *batch_count, - transaction_t *transaction) + int *batch_count, transaction_t *transaction) { struct buffer_head *bh = jh2bh(jh); int ret = 0; @@ -325,14 +322,14 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, get_bh(bh); J_ASSERT_BH(bh, !buffer_jwrite(bh)); set_buffer_jwrite(bh); - bhs[*batch_count] = bh; + journal->j_chkpt_bhs[*batch_count] = bh; __buffer_relink_io(jh); jbd_unlock_bh_state(bh); transaction->t_chp_stats.cs_written++; (*batch_count)++; - if (*batch_count == NR_BATCH) { + if (*batch_count == JBD2_NR_BATCH) { spin_unlock(&journal->j_list_lock); - __flush_batch(journal, bhs, batch_count); + __flush_batch(journal, batch_count); ret = 1; } } @@ -388,7 +385,6 @@ restart: if (journal->j_checkpoint_transactions == transaction && transaction->t_tid == this_tid) { int batch_count = 0; - struct buffer_head *bhs[NR_BATCH]; struct journal_head *jh; int retry = 0, err; @@ -402,7 +398,7 @@ restart: retry = 1; break; } - retry = __process_buffer(journal, jh, bhs, &batch_count, + retry = __process_buffer(journal, jh, &batch_count, transaction); if (retry < 0 && !result) result = retry; @@ -419,7 +415,7 @@ restart: spin_unlock(&journal->j_list_lock); retry = 1; } - __flush_batch(journal, bhs, &batch_count); + __flush_batch(journal, &batch_count); } if (retry) { @@ -686,6 +682,7 @@ int __jbd2_journal_remove_checkpoint(struct journal_head *jh) safely remove this transaction from the log */ __jbd2_journal_drop_transaction(journal, transaction); + kfree(transaction); /* Just in case anybody was waiting for more transactions to be checkpointed... */ @@ -760,5 +757,4 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(journal->j_running_transaction != transaction); jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid); - kfree(transaction); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index ebc667bc54a..62804e57a44 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -25,6 +25,7 @@ #include <linux/crc32.h> #include <linux/writeback.h> #include <linux/backing-dev.h> +#include <linux/bio.h> /* * Default IO end handler for temporary BJ_IO buffer_heads. @@ -137,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal, set_buffer_ordered(bh); barrier_done = 1; } - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); if (barrier_done) clear_buffer_ordered(bh); @@ -158,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal, lock_buffer(bh); set_buffer_uptodate(bh); clear_buffer_dirty(bh); - ret = submit_bh(WRITE, bh); + ret = submit_bh(WRITE_SYNC, bh); } *cbh = bh; return ret; @@ -168,12 +169,34 @@ static int journal_submit_commit_record(journal_t *journal, * This function along with journal_submit_commit_record * allows to write the commit record asynchronously. */ -static int journal_wait_on_commit_record(struct buffer_head *bh) +static int journal_wait_on_commit_record(journal_t *journal, + struct buffer_head *bh) { int ret = 0; +retry: clear_buffer_dirty(bh); wait_on_buffer(bh); + if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) { + printk(KERN_WARNING + "JBD2: wait_on_commit_record: sync failed on %s - " + "disabling barriers\n", journal->j_devname); + spin_lock(&journal->j_state_lock); + journal->j_flags &= ~JBD2_BARRIER; + spin_unlock(&journal->j_state_lock); + + lock_buffer(bh); + clear_buffer_dirty(bh); + set_buffer_uptodate(bh); + bh->b_end_io = journal_end_buffer_io_sync; + + ret = submit_bh(WRITE_SYNC, bh); + if (ret) { + unlock_buffer(bh); + return ret; + } + goto retry; + } if (unlikely(!buffer_uptodate(bh))) ret = -EIO; @@ -332,13 +355,15 @@ void jbd2_journal_commit_transaction(journal_t *journal) int flags; int err; unsigned long long blocknr; + ktime_t start_time; + u64 commit_time; char *tagp = NULL; journal_header_t *header; journal_block_tag_t *tag = NULL; int space_left = 0; int first_tag = 0; int tag_flag; - int i; + int i, to_free = 0; int tag_bytes = journal_tag_bytes(journal); struct buffer_head *cbh = NULL; /* For transactional checksums */ __u32 crc32_sum = ~0; @@ -458,6 +483,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction->t_state = T_FLUSH; journal->j_committing_transaction = commit_transaction; journal->j_running_transaction = NULL; + start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; wake_up(&journal->j_wait_transaction_locked); spin_unlock(&journal->j_state_lock); @@ -509,6 +535,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) if (is_journal_aborted(journal)) { clear_buffer_jbddirty(jh2bh(jh)); JBUFFER_TRACE(jh, "journal is aborting: refile"); + jbd2_buffer_abort_trigger(jh, + jh->b_frozen_data ? + jh->b_frozen_triggers : + jh->b_triggers); jbd2_journal_refile_buffer(journal, jh); /* If that was the last one, we need to clean up * any descriptor buffers which may have been @@ -799,7 +829,7 @@ wait_for_iobuf: __jbd2_journal_abort_hard(journal); } if (!err && !is_journal_aborted(journal)) - err = journal_wait_on_commit_record(cbh); + err = journal_wait_on_commit_record(journal, cbh); if (err) jbd2_journal_abort(journal, err); @@ -844,6 +874,9 @@ restart_loop: * data. * * Otherwise, we can just throw away the frozen data now. + * + * We also know that the frozen data has already fired + * its triggers if they exist, so we can clear that too. */ if (jh->b_committed_data) { jbd2_free(jh->b_committed_data, bh->b_size); @@ -851,10 +884,12 @@ restart_loop: if (jh->b_frozen_data) { jh->b_committed_data = jh->b_frozen_data; jh->b_frozen_data = NULL; + jh->b_frozen_triggers = NULL; } } else if (jh->b_frozen_data) { jbd2_free(jh->b_frozen_data, bh->b_size); jh->b_frozen_data = NULL; + jh->b_frozen_triggers = NULL; } spin_lock(&journal->j_list_lock); @@ -972,14 +1007,23 @@ restart_loop: J_ASSERT(commit_transaction == journal->j_committing_transaction); journal->j_commit_sequence = commit_transaction->t_tid; journal->j_committing_transaction = NULL; - spin_unlock(&journal->j_state_lock); + commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); - if (journal->j_commit_callback) - journal->j_commit_callback(journal, commit_transaction); + /* + * weight the commit time higher than the average time so we don't + * react too strongly to vast changes in the commit time + */ + if (likely(journal->j_average_commit_time)) + journal->j_average_commit_time = (commit_time + + journal->j_average_commit_time*3) / 4; + else + journal->j_average_commit_time = commit_time; + spin_unlock(&journal->j_state_lock); if (commit_transaction->t_checkpoint_list == NULL && commit_transaction->t_checkpoint_io_list == NULL) { __jbd2_journal_drop_transaction(journal, commit_transaction); + to_free = 1; } else { if (journal->j_checkpoint_transactions == NULL) { journal->j_checkpoint_transactions = commit_transaction; @@ -998,11 +1042,16 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + if (journal->j_commit_callback) + journal->j_commit_callback(journal, commit_transaction); + trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", - journal->j_devname, journal->j_commit_sequence, + journal->j_devname, commit_transaction->t_tid, journal->j_tail_sequence); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); + if (to_free) + kfree(commit_transaction); wake_up(&journal->j_wait_done_commit); } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index e70d657a19f..56675306ed8 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -40,6 +40,7 @@ #include <asm/uaccess.h> #include <asm/page.h> +#include <asm/div64.h> EXPORT_SYMBOL(jbd2_journal_start); EXPORT_SYMBOL(jbd2_journal_restart); @@ -50,6 +51,7 @@ EXPORT_SYMBOL(jbd2_journal_unlock_updates); EXPORT_SYMBOL(jbd2_journal_get_write_access); EXPORT_SYMBOL(jbd2_journal_get_create_access); EXPORT_SYMBOL(jbd2_journal_get_undo_access); +EXPORT_SYMBOL(jbd2_journal_set_triggers); EXPORT_SYMBOL(jbd2_journal_dirty_metadata); EXPORT_SYMBOL(jbd2_journal_release_buffer); EXPORT_SYMBOL(jbd2_journal_forget); @@ -65,7 +67,6 @@ EXPORT_SYMBOL(jbd2_journal_update_format); EXPORT_SYMBOL(jbd2_journal_check_used_features); EXPORT_SYMBOL(jbd2_journal_check_available_features); EXPORT_SYMBOL(jbd2_journal_set_features); -EXPORT_SYMBOL(jbd2_journal_create); EXPORT_SYMBOL(jbd2_journal_load); EXPORT_SYMBOL(jbd2_journal_destroy); EXPORT_SYMBOL(jbd2_journal_abort); @@ -131,8 +132,9 @@ static int kjournald2(void *arg) journal->j_task = current; wake_up(&journal->j_wait_done_commit); - printk(KERN_INFO "kjournald2 starting. Commit interval %ld seconds\n", - journal->j_commit_interval / HZ); + printk(KERN_INFO "kjournald2 starting: pid %d, dev %s, " + "commit interval %ld seconds\n", current->pid, + journal->j_devname, journal->j_commit_interval / HZ); /* * And now, wait forever for commit wakeup events. @@ -290,6 +292,7 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, struct page *new_page; unsigned int new_offset; struct buffer_head *bh_in = jh2bh(jh_in); + struct jbd2_buffer_trigger_type *triggers; /* * The buffer really shouldn't be locked: only the current committing @@ -314,13 +317,23 @@ repeat: done_copy_out = 1; new_page = virt_to_page(jh_in->b_frozen_data); new_offset = offset_in_page(jh_in->b_frozen_data); + triggers = jh_in->b_frozen_triggers; } else { new_page = jh2bh(jh_in)->b_page; new_offset = offset_in_page(jh2bh(jh_in)->b_data); + triggers = jh_in->b_triggers; } mapped_data = kmap_atomic(new_page, KM_USER0); /* + * Fire any commit trigger. Do this before checking for escaping, + * as the trigger may modify the magic offset. If a copy-out + * happens afterwards, it will have the correct data in the buffer. + */ + jbd2_buffer_commit_trigger(jh_in, mapped_data + new_offset, + triggers); + + /* * Check for escaping */ if (*((__be32 *)(mapped_data + new_offset)) == @@ -352,6 +365,13 @@ repeat: new_page = virt_to_page(tmp); new_offset = offset_in_page(tmp); done_copy_out = 1; + + /* + * This isn't strictly necessary, as we're using frozen + * data for the escaping, but it keeps consistency with + * b_frozen_data usage. + */ + jh_in->b_frozen_triggers = jh_in->b_triggers; } /* @@ -631,6 +651,8 @@ struct journal_head *jbd2_journal_get_descriptor_buffer(journal_t *journal) return NULL; bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + if (!bh) + return NULL; lock_buffer(bh); memset(bh->b_data, 0, journal->j_blocksize); set_buffer_uptodate(bh); @@ -824,6 +846,8 @@ static int jbd2_seq_info_show(struct seq_file *seq, void *v) jiffies_to_msecs(s->stats->u.run.rs_flushing / s->stats->ts_tid)); seq_printf(seq, " %ums logging transaction\n", jiffies_to_msecs(s->stats->u.run.rs_logging / s->stats->ts_tid)); + seq_printf(seq, " %luus average transaction commit time\n", + do_div(s->journal->j_average_commit_time, 1000)); seq_printf(seq, " %lu handles per transaction\n", s->stats->u.run.rs_handle_count / s->stats->ts_tid); seq_printf(seq, " %lu blocks per transaction\n", @@ -961,6 +985,8 @@ static journal_t * journal_init_common (void) spin_lock_init(&journal->j_state_lock); journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); + journal->j_min_batch_time = 0; + journal->j_max_batch_time = 15000; /* 15ms */ /* The journal is marked for error until we succeed with recovery! */ journal->j_flags = JBD2_ABORT; @@ -1016,15 +1042,14 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, /* journal descriptor can store up to n blocks -bzzz */ journal->j_blocksize = blocksize; + jbd2_stats_proc_init(journal); n = journal->j_blocksize / sizeof(journal_block_tag_t); journal->j_wbufsize = n; journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); if (!journal->j_wbuf) { printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", __func__); - kfree(journal); - journal = NULL; - goto out; + goto out_err; } journal->j_dev = bdev; journal->j_fs_dev = fs_dev; @@ -1034,14 +1059,22 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, p = journal->j_devname; while ((p = strchr(p, '/'))) *p = '!'; - jbd2_stats_proc_init(journal); bh = __getblk(journal->j_dev, start, journal->j_blocksize); - J_ASSERT(bh != NULL); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data; -out: + return journal; +out_err: + jbd2_stats_proc_exit(journal); + kfree(journal); + return NULL; } /** @@ -1089,9 +1122,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) if (!journal->j_wbuf) { printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", __func__); - jbd2_stats_proc_exit(journal); - kfree(journal); - return NULL; + goto out_err; } err = jbd2_journal_bmap(journal, 0, &blocknr); @@ -1099,17 +1130,24 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) if (err) { printk(KERN_ERR "%s: Cannnot locate journal superblock\n", __func__); - jbd2_stats_proc_exit(journal); - kfree(journal); - return NULL; + goto out_err; } bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - J_ASSERT(bh != NULL); + if (!bh) { + printk(KERN_ERR + "%s: Cannot get buffer for journal superblock\n", + __func__); + goto out_err; + } journal->j_sb_buffer = bh; journal->j_superblock = (journal_superblock_t *)bh->b_data; return journal; +out_err: + jbd2_stats_proc_exit(journal); + kfree(journal); + return NULL; } /* @@ -1158,77 +1196,6 @@ static int journal_reset(journal_t *journal) } /** - * int jbd2_journal_create() - Initialise the new journal file - * @journal: Journal to create. This structure must have been initialised - * - * Given a journal_t structure which tells us which disk blocks we can - * use, create a new journal superblock and initialise all of the - * journal fields from scratch. - **/ -int jbd2_journal_create(journal_t *journal) -{ - unsigned long long blocknr; - struct buffer_head *bh; - journal_superblock_t *sb; - int i, err; - - if (journal->j_maxlen < JBD2_MIN_JOURNAL_BLOCKS) { - printk (KERN_ERR "Journal length (%d blocks) too short.\n", - journal->j_maxlen); - journal_fail_superblock(journal); - return -EINVAL; - } - - if (journal->j_inode == NULL) { - /* - * We don't know what block to start at! - */ - printk(KERN_EMERG - "%s: creation of journal on external device!\n", - __func__); - BUG(); - } - - /* Zero out the entire journal on disk. We cannot afford to - have any blocks on disk beginning with JBD2_MAGIC_NUMBER. */ - jbd_debug(1, "JBD: Zeroing out journal blocks...\n"); - for (i = 0; i < journal->j_maxlen; i++) { - err = jbd2_journal_bmap(journal, i, &blocknr); - if (err) - return err; - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - lock_buffer(bh); - memset (bh->b_data, 0, journal->j_blocksize); - BUFFER_TRACE(bh, "marking dirty"); - mark_buffer_dirty(bh); - BUFFER_TRACE(bh, "marking uptodate"); - set_buffer_uptodate(bh); - unlock_buffer(bh); - __brelse(bh); - } - - sync_blockdev(journal->j_dev); - jbd_debug(1, "JBD: journal cleared.\n"); - - /* OK, fill in the initial static fields in the new superblock */ - sb = journal->j_superblock; - - sb->s_header.h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER); - sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2); - - sb->s_blocksize = cpu_to_be32(journal->j_blocksize); - sb->s_maxlen = cpu_to_be32(journal->j_maxlen); - sb->s_first = cpu_to_be32(1); - - journal->j_transaction_sequence = 1; - - journal->j_flags &= ~JBD2_ABORT; - journal->j_format_version = 2; - - return journal_reset(journal); -} - -/** * void jbd2_journal_update_superblock() - Update journal sb on disk. * @journal: The journal to update. * @wait: Set to '0' if you don't want to wait for IO completion. @@ -1472,7 +1439,9 @@ int jbd2_journal_destroy(journal_t *journal) spin_lock(&journal->j_list_lock); while (journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); + mutex_lock(&journal->j_checkpoint_mutex); jbd2_log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_list_lock); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 39b7805a599..46b4e347ed7 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -25,6 +25,7 @@ #include <linux/timer.h> #include <linux/mm.h> #include <linux/highmem.h> +#include <linux/hrtimer.h> static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); @@ -48,6 +49,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) { transaction->t_journal = journal; transaction->t_state = T_RUNNING; + transaction->t_start_time = ktime_get(); transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); @@ -741,6 +743,12 @@ done: source = kmap_atomic(page, KM_USER0); memcpy(jh->b_frozen_data, source+offset, jh2bh(jh)->b_size); kunmap_atomic(source, KM_USER0); + + /* + * Now that the frozen data is saved off, we need to store + * any matching triggers. + */ + jh->b_frozen_triggers = jh->b_triggers; } jbd_unlock_bh_state(bh); @@ -944,6 +952,47 @@ out: } /** + * void jbd2_journal_set_triggers() - Add triggers for commit writeout + * @bh: buffer to trigger on + * @type: struct jbd2_buffer_trigger_type containing the trigger(s). + * + * Set any triggers on this journal_head. This is always safe, because + * triggers for a committing buffer will be saved off, and triggers for + * a running transaction will match the buffer in that transaction. + * + * Call with NULL to clear the triggers. + */ +void jbd2_journal_set_triggers(struct buffer_head *bh, + struct jbd2_buffer_trigger_type *type) +{ + struct journal_head *jh = bh2jh(bh); + + jh->b_triggers = type; +} + +void jbd2_buffer_commit_trigger(struct journal_head *jh, void *mapped_data, + struct jbd2_buffer_trigger_type *triggers) +{ + struct buffer_head *bh = jh2bh(jh); + + if (!triggers || !triggers->t_commit) + return; + + triggers->t_commit(triggers, bh, mapped_data, bh->b_size); +} + +void jbd2_buffer_abort_trigger(struct journal_head *jh, + struct jbd2_buffer_trigger_type *triggers) +{ + if (!triggers || !triggers->t_abort) + return; + + triggers->t_abort(triggers, jh2bh(jh)); +} + + + +/** * int jbd2_journal_dirty_metadata() - mark a buffer as containing dirty metadata * @handle: transaction to add buffer to. * @bh: buffer to mark @@ -1193,7 +1242,7 @@ int jbd2_journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - int old_handle_count, err; + int err; pid_t pid; J_ASSERT(journal_current_handle() == handle); @@ -1216,24 +1265,54 @@ int jbd2_journal_stop(handle_t *handle) /* * Implement synchronous transaction batching. If the handle * was synchronous, don't force a commit immediately. Let's - * yield and let another thread piggyback onto this transaction. - * Keep doing that while new threads continue to arrive. - * It doesn't cost much - we're about to run a commit and sleep - * on IO anyway. Speeds up many-threaded, many-dir operations - * by 30x or more... + * yield and let another thread piggyback onto this + * transaction. Keep doing that while new threads continue to + * arrive. It doesn't cost much - we're about to run a commit + * and sleep on IO anyway. Speeds up many-threaded, many-dir + * operations by 30x or more... * - * But don't do this if this process was the most recent one to - * perform a synchronous write. We do this to detect the case where a - * single process is doing a stream of sync writes. No point in waiting - * for joiners in that case. + * We try and optimize the sleep time against what the + * underlying disk can do, instead of having a static sleep + * time. This is useful for the case where our storage is so + * fast that it is more optimal to go ahead and force a flush + * and wait for the transaction to be committed than it is to + * wait for an arbitrary amount of time for new writers to + * join the transaction. We achieve this by measuring how + * long it takes to commit a transaction, and compare it with + * how long this transaction has been running, and if run time + * < commit time then we sleep for the delta and commit. This + * greatly helps super fast disks that would see slowdowns as + * more threads started doing fsyncs. + * + * But don't do this if this process was the most recent one + * to perform a synchronous write. We do this to detect the + * case where a single process is doing a stream of sync + * writes. No point in waiting for joiners in that case. */ pid = current->pid; if (handle->h_sync && journal->j_last_sync_writer != pid) { + u64 commit_time, trans_time; + journal->j_last_sync_writer = pid; - do { - old_handle_count = transaction->t_handle_count; - schedule_timeout_uninterruptible(1); - } while (old_handle_count != transaction->t_handle_count); + + spin_lock(&journal->j_state_lock); + commit_time = journal->j_average_commit_time; + spin_unlock(&journal->j_state_lock); + + trans_time = ktime_to_ns(ktime_sub(ktime_get(), + transaction->t_start_time)); + + commit_time = max_t(u64, commit_time, + 1000*journal->j_min_batch_time); + commit_time = min_t(u64, commit_time, + 1000*journal->j_max_batch_time); + + if (trans_time < commit_time) { + ktime_t expires = ktime_add_ns(ktime_get(), + commit_time); + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_hrtimeout(&expires, HRTIMER_MODE_ABS); + } } current->journal_info = NULL; diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c index c73fa89b5f8..170d289ac78 100644 --- a/fs/jffs2/compr_rubin.c +++ b/fs/jffs2/compr_rubin.c @@ -22,9 +22,7 @@ #define BIT_DIVIDER_MIPS 1043 -static int bits_mips[8] = { 277,249,290,267,229,341,212,241}; /* mips32 */ - -#include <linux/errno.h> +static int bits_mips[8] = { 277, 249, 290, 267, 229, 341, 212, 241}; struct pushpull { unsigned char *buf; @@ -43,7 +41,9 @@ struct rubin_state { int bits[8]; }; -static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen, unsigned ofs, unsigned reserve) +static inline void init_pushpull(struct pushpull *pp, char *buf, + unsigned buflen, unsigned ofs, + unsigned reserve) { pp->buf = buf; pp->buflen = buflen; @@ -53,16 +53,14 @@ static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen static inline int pushbit(struct pushpull *pp, int bit, int use_reserved) { - if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) { + if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) return -ENOSPC; - } - if (bit) { - pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs &7))); - } - else { - pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs &7))); - } + if (bit) + pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs & 7))); + else + pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs & 7))); + pp->ofs++; return 0; @@ -97,6 +95,7 @@ static void init_rubin(struct rubin_state *rs, int div, int *bits) rs->p = (long) (2 * UPPER_BIT_RUBIN); rs->bit_number = (long) 0; rs->bit_divider = div; + for (c=0; c<8; c++) rs->bits[c] = bits[c]; } @@ -108,7 +107,8 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol) long i0, i1; int ret; - while ((rs->q >= UPPER_BIT_RUBIN) || ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) { + while ((rs->q >= UPPER_BIT_RUBIN) || + ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) { rs->bit_number++; ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 1 : 0, 0); @@ -119,12 +119,12 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol) rs->p <<= 1; } i0 = A * rs->p / (A + B); - if (i0 <= 0) { + if (i0 <= 0) i0 = 1; - } - if (i0 >= rs->p) { + + if (i0 >= rs->p) i0 = rs->p - 1; - } + i1 = rs->p - i0; if (symbol == 0) @@ -157,11 +157,13 @@ static void init_decode(struct rubin_state *rs, int div, int *bits) /* behalve lower */ rs->rec_q = 0; - for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp))) + for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; + rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp))) ; } -static void __do_decode(struct rubin_state *rs, unsigned long p, unsigned long q) +static void __do_decode(struct rubin_state *rs, unsigned long p, + unsigned long q) { register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN; unsigned long rec_q; @@ -207,12 +209,11 @@ static int decode(struct rubin_state *rs, long A, long B) __do_decode(rs, p, q); i0 = A * rs->p / (A + B); - if (i0 <= 0) { + if (i0 <= 0) i0 = 1; - } - if (i0 >= rs->p) { + + if (i0 >= rs->p) i0 = rs->p - 1; - } threshold = rs->q + i0; symbol = rs->rec_q >= threshold; @@ -234,14 +235,15 @@ static int out_byte(struct rubin_state *rs, unsigned char byte) struct rubin_state rs_copy; rs_copy = *rs; - for (i=0;i<8;i++) { - ret = encode(rs, rs->bit_divider-rs->bits[i],rs->bits[i],byte&1); + for (i=0; i<8; i++) { + ret = encode(rs, rs->bit_divider-rs->bits[i], + rs->bits[i], byte & 1); if (ret) { /* Failed. Restore old state */ *rs = rs_copy; return ret; } - byte=byte>>1; + byte >>= 1 ; } return 0; } @@ -251,7 +253,8 @@ static int in_byte(struct rubin_state *rs) int i, result = 0, bit_divider = rs->bit_divider; for (i = 0; i < 8; i++) - result |= decode(rs, bit_divider - rs->bits[i], rs->bits[i]) << i; + result |= decode(rs, bit_divider - rs->bits[i], + rs->bits[i]) << i; return result; } @@ -259,7 +262,8 @@ static int in_byte(struct rubin_state *rs) static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, - unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen) + unsigned char *cpage_out, uint32_t *sourcelen, + uint32_t *dstlen) { int outpos = 0; int pos=0; @@ -295,7 +299,8 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen, void *model) { - return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); + return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, + cpage_out, sourcelen, dstlen); } #endif static int jffs2_dynrubin_compress(unsigned char *data_in, @@ -316,9 +321,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in, return -1; memset(histo, 0, 256); - for (i=0; i<mysrclen; i++) { + for (i=0; i<mysrclen; i++) histo[data_in[i]]++; - } memset(bits, 0, sizeof(int)*8); for (i=0; i<256; i++) { if (i&128) @@ -346,7 +350,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in, cpage_out[i] = bits[i]; } - ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, &mydstlen); + ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, + &mydstlen); if (ret) return ret; @@ -363,8 +368,10 @@ static int jffs2_dynrubin_compress(unsigned char *data_in, return 0; } -static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata_in, - unsigned char *page_out, uint32_t srclen, uint32_t destlen) +static void rubin_do_decompress(int bit_divider, int *bits, + unsigned char *cdata_in, + unsigned char *page_out, uint32_t srclen, + uint32_t destlen) { int outpos = 0; struct rubin_state rs; @@ -372,9 +379,8 @@ static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata init_pushpull(&rs.pp, cdata_in, srclen, 0, 0); init_decode(&rs, bit_divider, bits); - while (outpos < destlen) { + while (outpos < destlen) page_out[outpos++] = in_byte(&rs); - } } @@ -383,7 +389,8 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in, uint32_t sourcelen, uint32_t dstlen, void *model) { - rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); + rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, + cpage_out, sourcelen, dstlen); return 0; } @@ -398,52 +405,53 @@ static int jffs2_dynrubin_decompress(unsigned char *data_in, for (c=0; c<8; c++) bits[c] = data_in[c]; - rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, dstlen); + rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, + dstlen); return 0; } static struct jffs2_compressor jffs2_rubinmips_comp = { - .priority = JFFS2_RUBINMIPS_PRIORITY, - .name = "rubinmips", - .compr = JFFS2_COMPR_DYNRUBIN, - .compress = NULL, /*&jffs2_rubinmips_compress,*/ - .decompress = &jffs2_rubinmips_decompress, + .priority = JFFS2_RUBINMIPS_PRIORITY, + .name = "rubinmips", + .compr = JFFS2_COMPR_DYNRUBIN, + .compress = NULL, /*&jffs2_rubinmips_compress,*/ + .decompress = &jffs2_rubinmips_decompress, #ifdef JFFS2_RUBINMIPS_DISABLED - .disabled = 1, + .disabled = 1, #else - .disabled = 0, + .disabled = 0, #endif }; int jffs2_rubinmips_init(void) { - return jffs2_register_compressor(&jffs2_rubinmips_comp); + return jffs2_register_compressor(&jffs2_rubinmips_comp); } void jffs2_rubinmips_exit(void) { - jffs2_unregister_compressor(&jffs2_rubinmips_comp); + jffs2_unregister_compressor(&jffs2_rubinmips_comp); } static struct jffs2_compressor jffs2_dynrubin_comp = { - .priority = JFFS2_DYNRUBIN_PRIORITY, - .name = "dynrubin", - .compr = JFFS2_COMPR_RUBINMIPS, - .compress = jffs2_dynrubin_compress, - .decompress = &jffs2_dynrubin_decompress, + .priority = JFFS2_DYNRUBIN_PRIORITY, + .name = "dynrubin", + .compr = JFFS2_COMPR_RUBINMIPS, + .compress = jffs2_dynrubin_compress, + .decompress = &jffs2_dynrubin_decompress, #ifdef JFFS2_DYNRUBIN_DISABLED - .disabled = 1, + .disabled = 1, #else - .disabled = 0, + .disabled = 0, #endif }; int jffs2_dynrubin_init(void) { - return jffs2_register_compressor(&jffs2_dynrubin_comp); + return jffs2_register_compressor(&jffs2_dynrubin_comp); } void jffs2_dynrubin_exit(void) { - jffs2_unregister_compressor(&jffs2_dynrubin_comp); + jffs2_unregister_compressor(&jffs2_dynrubin_comp); } diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 259461b910a..c32b4a1ad6c 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock { /* For NAND, if the failure did not occur at the device level for a specific physical page, don't bother updating the bad block table. */ - if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) { + if (jffs2_cleanmarker_oob(c) && (bad_offset != (uint32_t)MTD_FAIL_ADDR_UNKNOWN)) { /* We had a device-level failure to erase. Let's see if we've failed too many times. */ if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { @@ -209,7 +209,8 @@ static void jffs2_erase_callback(struct erase_info *instr) struct erase_priv_struct *priv = (void *)instr->priv; if(instr->state != MTD_ERASE_DONE) { - printk(KERN_WARNING "Erase at 0x%08x finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", instr->addr, instr->state); + printk(KERN_WARNING "Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", + (unsigned long long)instr->addr, instr->state); jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr); } else { jffs2_erase_succeeded(priv->c, priv->jeb); diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 5a98aa87c85..5edc2bf2058 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -132,7 +132,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, uint32_t pageofs = index << PAGE_CACHE_SHIFT; int ret = 0; - pg = __grab_cache_page(mapping, index); + pg = grab_cache_page_write_begin(mapping, index, flags); if (!pg) return -ENOMEM; *pagep = pg; diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index 1750445556c..507ed6ec184 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -366,9 +366,6 @@ void jffs2_free_ino_caches(struct jffs2_sb_info *c); void jffs2_free_raw_node_refs(struct jffs2_sb_info *c); struct jffs2_node_frag *jffs2_lookup_node_frag(struct rb_root *fragtree, uint32_t offset); void jffs2_kill_fragtree(struct rb_root *root, struct jffs2_sb_info *c_delete); -struct rb_node *rb_next(struct rb_node *); -struct rb_node *rb_prev(struct rb_node *); -void rb_replace_node(struct rb_node *victim, struct rb_node *new, struct rb_root *root); int jffs2_add_full_dnode_to_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn); uint32_t jffs2_truncate_fragtree (struct jffs2_sb_info *c, struct rb_root *list, uint32_t size); struct jffs2_raw_node_ref *jffs2_link_node_ref(struct jffs2_sb_info *c, diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index d6363d8309d..0f94381ca6d 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -58,9 +58,9 @@ /* * __mark_inode_dirty expects inodes to be hashed. Since we don't want - * special inodes in the fileset inode space, we hash them to a dummy head + * special inodes in the fileset inode space, we make them appear hashed, + * but do not put on any lists. */ -static HLIST_HEAD(aggregate_hash); /* * imap locks @@ -496,7 +496,11 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) /* release the page */ release_metapage(mp); - hlist_add_head(&ip->i_hash, &aggregate_hash); + /* + * that will look hashed, but won't be on any list; hlist_del() + * will work fine and require no locking. + */ + ip->i_hash.pprev = &ip->i_hash.next; return (ip); } diff --git a/fs/libfs.c b/fs/libfs.c index e960a832190..49b44099dab 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -231,7 +231,6 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name, */ root->i_ino = 1; root->i_mode = S_IFDIR | S_IRUSR | S_IWUSR; - root->i_uid = root->i_gid = 0; root->i_atime = root->i_mtime = root->i_ctime = CURRENT_TIME; dentry = d_alloc(NULL, &d_name); if (!dentry) { @@ -360,7 +359,7 @@ int simple_write_begin(struct file *file, struct address_space *mapping, index = pos >> PAGE_CACHE_SHIFT; from = pos & (PAGE_CACHE_SIZE - 1); - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; @@ -436,8 +435,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files */ inode->i_ino = 1; inode->i_mode = S_IFDIR | 0755; - inode->i_uid = inode->i_gid = 0; - inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; @@ -464,8 +461,6 @@ int simple_fill_super(struct super_block *s, int magic, struct tree_descr *files if (!inode) goto out; inode->i_mode = S_IFREG | files->mode; - inode->i_uid = inode->i_gid = 0; - inode->i_blocks = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_fop = files->ops; inode->i_ino = i; diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index 31668b690e0..dd7957064a8 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -16,7 +16,6 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> -#include <linux/lockd/sm_inter.h> #define NLMDBG_FACILITY NLMDBG_CLIENT #define NLMCLNT_GRACE_WAIT (5*HZ) @@ -518,11 +517,9 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) unsigned char fl_type; int status = -ENOLCK; - if (nsm_monitor(host) < 0) { - printk(KERN_NOTICE "lockd: failed to monitor %s\n", - host->h_name); + if (nsm_monitor(host) < 0) goto out; - } + fl->fl_flags |= FL_ACCESS; status = do_vfs_lock(fl); fl->fl_flags = fl_flags; diff --git a/fs/lockd/host.c b/fs/lockd/host.c index abdebf76b82..99d737bd432 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -15,7 +15,6 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> -#include <linux/lockd/sm_inter.h> #include <linux/mutex.h> #include <net/ipv6.h> @@ -32,11 +31,6 @@ static int nrhosts; static DEFINE_MUTEX(nlm_host_mutex); static void nlm_gc_hosts(void); -static struct nsm_handle *nsm_find(const struct sockaddr *sap, - const size_t salen, - const char *hostname, - const size_t hostname_len, - const int create); struct nlm_lookup_host_info { const int server; /* search for server|client */ @@ -105,32 +99,6 @@ static void nlm_clear_port(struct sockaddr *sap) } } -static void nlm_display_address(const struct sockaddr *sap, - char *buf, const size_t len) -{ - const struct sockaddr_in *sin = (struct sockaddr_in *)sap; - const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; - - switch (sap->sa_family) { - case AF_UNSPEC: - snprintf(buf, len, "unspecified"); - break; - case AF_INET: - snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr); - break; - case AF_INET6: - if (ipv6_addr_v4mapped(&sin6->sin6_addr)) - snprintf(buf, len, "%pI4", - &sin6->sin6_addr.s6_addr32[3]); - else - snprintf(buf, len, "%pI6", &sin6->sin6_addr); - break; - default: - snprintf(buf, len, "unsupported address family"); - break; - } -} - /* * Common host lookup routine for server & client */ @@ -190,8 +158,8 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) atomic_inc(&nsm->sm_count); else { host = NULL; - nsm = nsm_find(ni->sap, ni->salen, - ni->hostname, ni->hostname_len, 1); + nsm = nsm_get_handle(ni->sap, ni->salen, + ni->hostname, ni->hostname_len); if (!nsm) { dprintk("lockd: nlm_lookup_host failed; " "no nsm handle\n"); @@ -206,6 +174,7 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) goto out; } host->h_name = nsm->sm_name; + host->h_addrbuf = nsm->sm_addrbuf; memcpy(nlm_addr(host), ni->sap, ni->salen); host->h_addrlen = ni->salen; nlm_clear_port(nlm_addr(host)); @@ -232,11 +201,6 @@ static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) nrhosts++; - nlm_display_address((struct sockaddr *)&host->h_addr, - host->h_addrbuf, sizeof(host->h_addrbuf)); - nlm_display_address((struct sockaddr *)&host->h_srcaddr, - host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf)); - dprintk("lockd: nlm_lookup_host created host %s\n", host->h_name); @@ -256,10 +220,8 @@ nlm_destroy_host(struct nlm_host *host) BUG_ON(!list_empty(&host->h_lockowners)); BUG_ON(atomic_read(&host->h_count)); - /* - * Release NSM handle and unmonitor host. - */ nsm_unmonitor(host); + nsm_release(host->h_nsmhandle); clnt = host->h_rpcclnt; if (clnt != NULL) @@ -378,8 +340,8 @@ nlm_bind_host(struct nlm_host *host) { struct rpc_clnt *clnt; - dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n", - host->h_name, host->h_addrbuf, host->h_srcaddrbuf); + dprintk("lockd: nlm_bind_host %s (%s)\n", + host->h_name, host->h_addrbuf); /* Lock host handle */ mutex_lock(&host->h_mutex); @@ -481,35 +443,23 @@ void nlm_release_host(struct nlm_host *host) } } -/* - * We were notified that the host indicated by address &sin - * has rebooted. - * Release all resources held by that peer. +/** + * nlm_host_rebooted - Release all resources held by rebooted host + * @info: pointer to decoded results of NLM_SM_NOTIFY call + * + * We were notified that the specified host has rebooted. Release + * all resources held by that peer. */ -void nlm_host_rebooted(const struct sockaddr_in *sin, - const char *hostname, - unsigned int hostname_len, - u32 new_state) +void nlm_host_rebooted(const struct nlm_reboot *info) { struct hlist_head *chain; struct hlist_node *pos; struct nsm_handle *nsm; struct nlm_host *host; - nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin), - hostname, hostname_len, 0); - if (nsm == NULL) { - dprintk("lockd: never saw rebooted peer '%.*s' before\n", - hostname_len, hostname); + nsm = nsm_reboot_lookup(info); + if (unlikely(nsm == NULL)) return; - } - - dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n", - hostname_len, hostname, nsm->sm_addrbuf); - - /* When reclaiming locks on this peer, make sure that - * we set up a new notification */ - nsm->sm_monitored = 0; /* Mark all hosts tied to this NSM state as having rebooted. * We run the loop repeatedly, because we drop the host table @@ -520,8 +470,8 @@ again: mutex_lock(&nlm_host_mutex); for (chain = nlm_hosts; chain < nlm_hosts + NLM_HOST_NRHASH; ++chain) { hlist_for_each_entry(host, pos, chain, h_hash) { if (host->h_nsmhandle == nsm - && host->h_nsmstate != new_state) { - host->h_nsmstate = new_state; + && host->h_nsmstate != info->state) { + host->h_nsmstate = info->state; host->h_state++; nlm_get_host(host); @@ -629,89 +579,3 @@ nlm_gc_hosts(void) next_gc = jiffies + NLM_HOST_COLLECT; } - - -/* - * Manage NSM handles - */ -static LIST_HEAD(nsm_handles); -static DEFINE_SPINLOCK(nsm_lock); - -static struct nsm_handle *nsm_find(const struct sockaddr *sap, - const size_t salen, - const char *hostname, - const size_t hostname_len, - const int create) -{ - struct nsm_handle *nsm = NULL; - struct nsm_handle *pos; - - if (!sap) - return NULL; - - if (hostname && memchr(hostname, '/', hostname_len) != NULL) { - if (printk_ratelimit()) { - printk(KERN_WARNING "Invalid hostname \"%.*s\" " - "in NFS lock request\n", - (int)hostname_len, hostname); - } - return NULL; - } - -retry: - spin_lock(&nsm_lock); - list_for_each_entry(pos, &nsm_handles, sm_link) { - - if (hostname && nsm_use_hostnames) { - if (strlen(pos->sm_name) != hostname_len - || memcmp(pos->sm_name, hostname, hostname_len)) - continue; - } else if (!nlm_cmp_addr(nsm_addr(pos), sap)) - continue; - atomic_inc(&pos->sm_count); - kfree(nsm); - nsm = pos; - goto found; - } - if (nsm) { - list_add(&nsm->sm_link, &nsm_handles); - goto found; - } - spin_unlock(&nsm_lock); - - if (!create) - return NULL; - - nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL); - if (nsm == NULL) - return NULL; - - memcpy(nsm_addr(nsm), sap, salen); - nsm->sm_addrlen = salen; - nsm->sm_name = (char *) (nsm + 1); - memcpy(nsm->sm_name, hostname, hostname_len); - nsm->sm_name[hostname_len] = '\0'; - nlm_display_address((struct sockaddr *)&nsm->sm_addr, - nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf)); - atomic_set(&nsm->sm_count, 1); - goto retry; - -found: - spin_unlock(&nsm_lock); - return nsm; -} - -/* - * Release an NSM handle - */ -void -nsm_release(struct nsm_handle *nsm) -{ - if (!nsm) - return; - if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) { - list_del(&nsm->sm_link); - spin_unlock(&nsm_lock); - kfree(nsm); - } -} diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index ffd3461f75e..5e2c4d5ac82 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -9,35 +9,123 @@ #include <linux/types.h> #include <linux/utsname.h> #include <linux/kernel.h> +#include <linux/ktime.h> + #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/xprtsock.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> -#include <linux/lockd/sm_inter.h> - #define NLMDBG_FACILITY NLMDBG_MONITOR +#define NSM_PROGRAM 100024 +#define NSM_VERSION 1 + +enum { + NSMPROC_NULL, + NSMPROC_STAT, + NSMPROC_MON, + NSMPROC_UNMON, + NSMPROC_UNMON_ALL, + NSMPROC_SIMU_CRASH, + NSMPROC_NOTIFY, +}; + +struct nsm_args { + struct nsm_private *priv; + u32 prog; /* RPC callback info */ + u32 vers; + u32 proc; -#define XDR_ADDRBUF_LEN (20) + char *mon_name; +}; -static struct rpc_clnt * nsm_create(void); +struct nsm_res { + u32 status; + u32 state; +}; static struct rpc_program nsm_program; +static LIST_HEAD(nsm_handles); +static DEFINE_SPINLOCK(nsm_lock); /* * Local NSM state */ -int nsm_local_state; +int __read_mostly nsm_local_state; +int __read_mostly nsm_use_hostnames; -/* - * Common procedure for SM_MON/SM_UNMON calls - */ -static int -nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) +static inline struct sockaddr *nsm_addr(const struct nsm_handle *nsm) +{ + return (struct sockaddr *)&nsm->sm_addr; +} + +static void nsm_display_ipv4_address(const struct sockaddr *sap, char *buf, + const size_t len) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + snprintf(buf, len, "%pI4", &sin->sin_addr.s_addr); +} + +static void nsm_display_ipv6_address(const struct sockaddr *sap, char *buf, + const size_t len) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + + if (ipv6_addr_v4mapped(&sin6->sin6_addr)) + snprintf(buf, len, "%pI4", &sin6->sin6_addr.s6_addr32[3]); + else if (sin6->sin6_scope_id != 0) + snprintf(buf, len, "%pI6%%%u", &sin6->sin6_addr, + sin6->sin6_scope_id); + else + snprintf(buf, len, "%pI6", &sin6->sin6_addr); +} + +static void nsm_display_address(const struct sockaddr *sap, + char *buf, const size_t len) +{ + switch (sap->sa_family) { + case AF_INET: + nsm_display_ipv4_address(sap, buf, len); + break; + case AF_INET6: + nsm_display_ipv6_address(sap, buf, len); + break; + default: + snprintf(buf, len, "unsupported address family"); + break; + } +} + +static struct rpc_clnt *nsm_create(void) +{ + struct sockaddr_in sin = { + .sin_family = AF_INET, + .sin_addr.s_addr = htonl(INADDR_LOOPBACK), + }; + struct rpc_create_args args = { + .protocol = XPRT_TRANSPORT_UDP, + .address = (struct sockaddr *)&sin, + .addrsize = sizeof(sin), + .servername = "rpc.statd", + .program = &nsm_program, + .version = NSM_VERSION, + .authflavor = RPC_AUTH_NULL, + }; + + return rpc_create(&args); +} + +static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) { struct rpc_clnt *clnt; int status; - struct nsm_args args; + struct nsm_args args = { + .priv = &nsm->sm_priv, + .prog = NLM_PROGRAM, + .vers = 3, + .proc = NLMPROC_NSM_NOTIFY, + .mon_name = nsm->sm_mon_name, + }; struct rpc_message msg = { .rpc_argp = &args, .rpc_resp = res, @@ -46,22 +134,18 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) clnt = nsm_create(); if (IS_ERR(clnt)) { status = PTR_ERR(clnt); + dprintk("lockd: failed to create NSM upcall transport, " + "status=%d\n", status); goto out; } - memset(&args, 0, sizeof(args)); - args.mon_name = nsm->sm_name; - args.addr = nsm_addr_in(nsm)->sin_addr.s_addr; - args.prog = NLM_PROGRAM; - args.vers = 3; - args.proc = NLMPROC_NSM_NOTIFY; memset(res, 0, sizeof(*res)); msg.rpc_proc = &clnt->cl_procinfo[proc]; status = rpc_call_sync(clnt, &msg, 0); if (status < 0) - printk(KERN_DEBUG "nsm_mon_unmon: rpc failed, status=%d\n", - status); + dprintk("lockd: NSM upcall RPC failed, status=%d\n", + status); else status = 0; rpc_shutdown_client(clnt); @@ -69,82 +153,272 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) return status; } -/* - * Set up monitoring of a remote host +/** + * nsm_monitor - Notify a peer in case we reboot + * @host: pointer to nlm_host of peer to notify + * + * If this peer is not already monitored, this function sends an + * upcall to the local rpc.statd to record the name/address of + * the peer to notify in case we reboot. + * + * Returns zero if the peer is monitored by the local rpc.statd; + * otherwise a negative errno value is returned. */ -int -nsm_monitor(struct nlm_host *host) +int nsm_monitor(const struct nlm_host *host) { struct nsm_handle *nsm = host->h_nsmhandle; struct nsm_res res; int status; - dprintk("lockd: nsm_monitor(%s)\n", host->h_name); - BUG_ON(nsm == NULL); + dprintk("lockd: nsm_monitor(%s)\n", nsm->sm_name); if (nsm->sm_monitored) return 0; - status = nsm_mon_unmon(nsm, SM_MON, &res); + /* + * Choose whether to record the caller_name or IP address of + * this peer in the local rpc.statd's database. + */ + nsm->sm_mon_name = nsm_use_hostnames ? nsm->sm_name : nsm->sm_addrbuf; - if (status < 0 || res.status != 0) - printk(KERN_NOTICE "lockd: cannot monitor %s\n", host->h_name); + status = nsm_mon_unmon(nsm, NSMPROC_MON, &res); + if (res.status != 0) + status = -EIO; + if (status < 0) + printk(KERN_NOTICE "lockd: cannot monitor %s\n", nsm->sm_name); else nsm->sm_monitored = 1; return status; } -/* - * Cease to monitor remote host +/** + * nsm_unmonitor - Unregister peer notification + * @host: pointer to nlm_host of peer to stop monitoring + * + * If this peer is monitored, this function sends an upcall to + * tell the local rpc.statd not to send this peer a notification + * when we reboot. */ -int -nsm_unmonitor(struct nlm_host *host) +void nsm_unmonitor(const struct nlm_host *host) { struct nsm_handle *nsm = host->h_nsmhandle; struct nsm_res res; - int status = 0; - - if (nsm == NULL) - return 0; - host->h_nsmhandle = NULL; + int status; if (atomic_read(&nsm->sm_count) == 1 && nsm->sm_monitored && !nsm->sm_sticky) { - dprintk("lockd: nsm_unmonitor(%s)\n", host->h_name); + dprintk("lockd: nsm_unmonitor(%s)\n", nsm->sm_name); - status = nsm_mon_unmon(nsm, SM_UNMON, &res); + status = nsm_mon_unmon(nsm, NSMPROC_UNMON, &res); + if (res.status != 0) + status = -EIO; if (status < 0) printk(KERN_NOTICE "lockd: cannot unmonitor %s\n", - host->h_name); + nsm->sm_name); else nsm->sm_monitored = 0; } - nsm_release(nsm); - return status; +} + +static struct nsm_handle *nsm_lookup_hostname(const char *hostname, + const size_t len) +{ + struct nsm_handle *nsm; + + list_for_each_entry(nsm, &nsm_handles, sm_link) + if (strlen(nsm->sm_name) == len && + memcmp(nsm->sm_name, hostname, len) == 0) + return nsm; + return NULL; +} + +static struct nsm_handle *nsm_lookup_addr(const struct sockaddr *sap) +{ + struct nsm_handle *nsm; + + list_for_each_entry(nsm, &nsm_handles, sm_link) + if (nlm_cmp_addr(nsm_addr(nsm), sap)) + return nsm; + return NULL; +} + +static struct nsm_handle *nsm_lookup_priv(const struct nsm_private *priv) +{ + struct nsm_handle *nsm; + + list_for_each_entry(nsm, &nsm_handles, sm_link) + if (memcmp(nsm->sm_priv.data, priv->data, + sizeof(priv->data)) == 0) + return nsm; + return NULL; } /* - * Create NSM client for the local host + * Construct a unique cookie to match this nsm_handle to this monitored + * host. It is passed to the local rpc.statd via NSMPROC_MON, and + * returned via NLMPROC_SM_NOTIFY, in the "priv" field of these + * requests. + * + * The NSM protocol requires that these cookies be unique while the + * system is running. We prefer a stronger requirement of making them + * unique across reboots. If user space bugs cause a stale cookie to + * be sent to the kernel, it could cause the wrong host to lose its + * lock state if cookies were not unique across reboots. + * + * The cookies are exposed only to local user space via loopback. They + * do not appear on the physical network. If we want greater security + * for some reason, nsm_init_private() could perform a one-way hash to + * obscure the contents of the cookie. */ -static struct rpc_clnt * -nsm_create(void) +static void nsm_init_private(struct nsm_handle *nsm) { - struct sockaddr_in sin = { - .sin_family = AF_INET, - .sin_addr.s_addr = htonl(INADDR_LOOPBACK), - .sin_port = 0, - }; - struct rpc_create_args args = { - .protocol = XPRT_TRANSPORT_UDP, - .address = (struct sockaddr *)&sin, - .addrsize = sizeof(sin), - .servername = "localhost", - .program = &nsm_program, - .version = SM_VERSION, - .authflavor = RPC_AUTH_NULL, - }; + u64 *p = (u64 *)&nsm->sm_priv.data; + struct timespec ts; - return rpc_create(&args); + ktime_get_ts(&ts); + *p++ = timespec_to_ns(&ts); + *p = (unsigned long)nsm; +} + +static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap, + const size_t salen, + const char *hostname, + const size_t hostname_len) +{ + struct nsm_handle *new; + + new = kzalloc(sizeof(*new) + hostname_len + 1, GFP_KERNEL); + if (unlikely(new == NULL)) + return NULL; + + atomic_set(&new->sm_count, 1); + new->sm_name = (char *)(new + 1); + memcpy(nsm_addr(new), sap, salen); + new->sm_addrlen = salen; + nsm_init_private(new); + nsm_display_address((const struct sockaddr *)&new->sm_addr, + new->sm_addrbuf, sizeof(new->sm_addrbuf)); + memcpy(new->sm_name, hostname, hostname_len); + new->sm_name[hostname_len] = '\0'; + + return new; +} + +/** + * nsm_get_handle - Find or create a cached nsm_handle + * @sap: pointer to socket address of handle to find + * @salen: length of socket address + * @hostname: pointer to C string containing hostname to find + * @hostname_len: length of C string + * + * Behavior is modulated by the global nsm_use_hostnames variable. + * + * Returns a cached nsm_handle after bumping its ref count, or + * returns a fresh nsm_handle if a handle that matches @sap and/or + * @hostname cannot be found in the handle cache. Returns NULL if + * an error occurs. + */ +struct nsm_handle *nsm_get_handle(const struct sockaddr *sap, + const size_t salen, const char *hostname, + const size_t hostname_len) +{ + struct nsm_handle *cached, *new = NULL; + + if (hostname && memchr(hostname, '/', hostname_len) != NULL) { + if (printk_ratelimit()) { + printk(KERN_WARNING "Invalid hostname \"%.*s\" " + "in NFS lock request\n", + (int)hostname_len, hostname); + } + return NULL; + } + +retry: + spin_lock(&nsm_lock); + + if (nsm_use_hostnames && hostname != NULL) + cached = nsm_lookup_hostname(hostname, hostname_len); + else + cached = nsm_lookup_addr(sap); + + if (cached != NULL) { + atomic_inc(&cached->sm_count); + spin_unlock(&nsm_lock); + kfree(new); + dprintk("lockd: found nsm_handle for %s (%s), " + "cnt %d\n", cached->sm_name, + cached->sm_addrbuf, + atomic_read(&cached->sm_count)); + return cached; + } + + if (new != NULL) { + list_add(&new->sm_link, &nsm_handles); + spin_unlock(&nsm_lock); + dprintk("lockd: created nsm_handle for %s (%s)\n", + new->sm_name, new->sm_addrbuf); + return new; + } + + spin_unlock(&nsm_lock); + + new = nsm_create_handle(sap, salen, hostname, hostname_len); + if (unlikely(new == NULL)) + return NULL; + goto retry; +} + +/** + * nsm_reboot_lookup - match NLMPROC_SM_NOTIFY arguments to an nsm_handle + * @info: pointer to NLMPROC_SM_NOTIFY arguments + * + * Returns a matching nsm_handle if found in the nsm cache; the returned + * nsm_handle's reference count is bumped and sm_monitored is cleared. + * Otherwise returns NULL if some error occurred. + */ +struct nsm_handle *nsm_reboot_lookup(const struct nlm_reboot *info) +{ + struct nsm_handle *cached; + + spin_lock(&nsm_lock); + + cached = nsm_lookup_priv(&info->priv); + if (unlikely(cached == NULL)) { + spin_unlock(&nsm_lock); + dprintk("lockd: never saw rebooted peer '%.*s' before\n", + info->len, info->mon); + return cached; + } + + atomic_inc(&cached->sm_count); + spin_unlock(&nsm_lock); + + /* + * During subsequent lock activity, force a fresh + * notification to be set up for this host. + */ + cached->sm_monitored = 0; + + dprintk("lockd: host %s (%s) rebooted, cnt %d\n", + cached->sm_name, cached->sm_addrbuf, + atomic_read(&cached->sm_count)); + return cached; +} + +/** + * nsm_release - Release an NSM handle + * @nsm: pointer to handle to be released + * + */ +void nsm_release(struct nsm_handle *nsm) +{ + if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) { + list_del(&nsm->sm_link); + spin_unlock(&nsm_lock); + dprintk("lockd: destroyed nsm_handle for %s (%s)\n", + nsm->sm_name, nsm->sm_addrbuf); + kfree(nsm); + } } /* @@ -154,127 +428,132 @@ nsm_create(void) * Status Monitor wire protocol. */ -static __be32 *xdr_encode_nsm_string(__be32 *p, char *string) +static int encode_nsm_string(struct xdr_stream *xdr, const char *string) { - size_t len = strlen(string); - - if (len > SM_MAXSTRLEN) - len = SM_MAXSTRLEN; - return xdr_encode_opaque(p, string, len); + const u32 len = strlen(string); + __be32 *p; + + if (unlikely(len > SM_MAXSTRLEN)) + return -EIO; + p = xdr_reserve_space(xdr, sizeof(u32) + len); + if (unlikely(p == NULL)) + return -EIO; + xdr_encode_opaque(p, string, len); + return 0; } /* * "mon_name" specifies the host to be monitored. - * - * Linux uses a text version of the IP address of the remote - * host as the host identifier (the "mon_name" argument). - * - * Linux statd always looks up the canonical hostname first for - * whatever remote hostname it receives, so this works alright. */ -static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp) +static int encode_mon_name(struct xdr_stream *xdr, const struct nsm_args *argp) { - char buffer[XDR_ADDRBUF_LEN + 1]; - char *name = argp->mon_name; - - if (!nsm_use_hostnames) { - snprintf(buffer, XDR_ADDRBUF_LEN, - "%pI4", &argp->addr); - name = buffer; - } - - return xdr_encode_nsm_string(p, name); + return encode_nsm_string(xdr, argp->mon_name); } /* * The "my_id" argument specifies the hostname and RPC procedure * to be called when the status manager receives notification - * (via the SM_NOTIFY call) that the state of host "mon_name" + * (via the NLMPROC_SM_NOTIFY call) that the state of host "mon_name" * has changed. */ -static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp) +static int encode_my_id(struct xdr_stream *xdr, const struct nsm_args *argp) { - p = xdr_encode_nsm_string(p, utsname()->nodename); - if (!p) - return ERR_PTR(-EIO); - + int status; + __be32 *p; + + status = encode_nsm_string(xdr, utsname()->nodename); + if (unlikely(status != 0)) + return status; + p = xdr_reserve_space(xdr, 3 * sizeof(u32)); + if (unlikely(p == NULL)) + return -EIO; *p++ = htonl(argp->prog); *p++ = htonl(argp->vers); *p++ = htonl(argp->proc); - - return p; + return 0; } /* * The "mon_id" argument specifies the non-private arguments - * of an SM_MON or SM_UNMON call. + * of an NSMPROC_MON or NSMPROC_UNMON call. */ -static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp) +static int encode_mon_id(struct xdr_stream *xdr, const struct nsm_args *argp) { - p = xdr_encode_mon_name(p, argp); - if (!p) - return ERR_PTR(-EIO); + int status; - return xdr_encode_my_id(p, argp); + status = encode_mon_name(xdr, argp); + if (unlikely(status != 0)) + return status; + return encode_my_id(xdr, argp); } /* * The "priv" argument may contain private information required - * by the SM_MON call. This information will be supplied in the - * SM_NOTIFY call. - * - * Linux provides the raw IP address of the monitored host, - * left in network byte order. + * by the NSMPROC_MON call. This information will be supplied in the + * NLMPROC_SM_NOTIFY call. */ -static __be32 *xdr_encode_priv(__be32 *p, struct nsm_args *argp) +static int encode_priv(struct xdr_stream *xdr, const struct nsm_args *argp) { - *p++ = argp->addr; - *p++ = 0; - *p++ = 0; - *p++ = 0; + __be32 *p; - return p; + p = xdr_reserve_space(xdr, SM_PRIV_SIZE); + if (unlikely(p == NULL)) + return -EIO; + xdr_encode_opaque_fixed(p, argp->priv->data, SM_PRIV_SIZE); + return 0; } -static int -xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) +static int xdr_enc_mon(struct rpc_rqst *req, __be32 *p, + const struct nsm_args *argp) { - p = xdr_encode_mon_id(p, argp); - if (IS_ERR(p)) - return PTR_ERR(p); - - p = xdr_encode_priv(p, argp); - if (IS_ERR(p)) - return PTR_ERR(p); - - rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); - return 0; + struct xdr_stream xdr; + int status; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + status = encode_mon_id(&xdr, argp); + if (unlikely(status)) + return status; + return encode_priv(&xdr, argp); } -static int -xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) +static int xdr_enc_unmon(struct rpc_rqst *req, __be32 *p, + const struct nsm_args *argp) { - p = xdr_encode_mon_id(p, argp); - if (IS_ERR(p)) - return PTR_ERR(p); - rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); - return 0; + struct xdr_stream xdr; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + return encode_mon_id(&xdr, argp); } -static int -xdr_decode_stat_res(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) +static int xdr_dec_stat_res(struct rpc_rqst *rqstp, __be32 *p, + struct nsm_res *resp) { + struct xdr_stream xdr; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + p = xdr_inline_decode(&xdr, 2 * sizeof(u32)); + if (unlikely(p == NULL)) + return -EIO; resp->status = ntohl(*p++); - resp->state = ntohl(*p++); - dprintk("nsm: xdr_decode_stat_res status %d state %d\n", + resp->state = ntohl(*p); + + dprintk("lockd: xdr_dec_stat_res status %d state %d\n", resp->status, resp->state); return 0; } -static int -xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) +static int xdr_dec_stat(struct rpc_rqst *rqstp, __be32 *p, + struct nsm_res *resp) { - resp->state = ntohl(*p++); + struct xdr_stream xdr; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + p = xdr_inline_decode(&xdr, sizeof(u32)); + if (unlikely(p == NULL)) + return -EIO; + resp->state = ntohl(*p); + + dprintk("lockd: xdr_dec_stat state %d\n", resp->state); return 0; } @@ -288,22 +567,22 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) #define SM_unmonres_sz 1 static struct rpc_procinfo nsm_procedures[] = { -[SM_MON] = { - .p_proc = SM_MON, - .p_encode = (kxdrproc_t) xdr_encode_mon, - .p_decode = (kxdrproc_t) xdr_decode_stat_res, +[NSMPROC_MON] = { + .p_proc = NSMPROC_MON, + .p_encode = (kxdrproc_t)xdr_enc_mon, + .p_decode = (kxdrproc_t)xdr_dec_stat_res, .p_arglen = SM_mon_sz, .p_replen = SM_monres_sz, - .p_statidx = SM_MON, + .p_statidx = NSMPROC_MON, .p_name = "MONITOR", }, -[SM_UNMON] = { - .p_proc = SM_UNMON, - .p_encode = (kxdrproc_t) xdr_encode_unmon, - .p_decode = (kxdrproc_t) xdr_decode_stat, +[NSMPROC_UNMON] = { + .p_proc = NSMPROC_UNMON, + .p_encode = (kxdrproc_t)xdr_enc_unmon, + .p_decode = (kxdrproc_t)xdr_dec_stat, .p_arglen = SM_mon_id_sz, .p_replen = SM_unmonres_sz, - .p_statidx = SM_UNMON, + .p_statidx = NSMPROC_UNMON, .p_name = "UNMONITOR", }, }; @@ -322,7 +601,7 @@ static struct rpc_stat nsm_stats; static struct rpc_program nsm_program = { .name = "statd", - .number = SM_PROGRAM, + .number = NSM_PROGRAM, .nrvers = ARRAY_SIZE(nsm_version), .version = nsm_version, .stats = &nsm_stats diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 252d80163d0..64f1c31b585 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -35,7 +35,6 @@ #include <linux/sunrpc/svcsock.h> #include <net/ip.h> #include <linux/lockd/lockd.h> -#include <linux/lockd/sm_inter.h> #include <linux/nfs.h> #define NLMDBG_FACILITY NLMDBG_SVC @@ -54,13 +53,26 @@ static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; /* + * If the kernel has IPv6 support available, always listen for + * both AF_INET and AF_INET6 requests. + */ +#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \ + defined(CONFIG_SUNRPC_REGISTER_V4) +static const sa_family_t nlmsvc_family = AF_INET6; +#else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */ +static const sa_family_t nlmsvc_family = AF_INET; +#endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */ + +/* * These can be set at insmod time (useful for NFS as root filesystem), * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 */ static unsigned long nlm_grace_period; static unsigned long nlm_timeout = LOCKD_DFLT_TIMEO; static int nlm_udpport, nlm_tcpport; -int nsm_use_hostnames = 0; + +/* RLIM_NOFILE defaults to 1024. That seems like a reasonable default here. */ +static unsigned int nlm_max_connections = 1024; /* * Constants needed for the sysctl interface. @@ -143,6 +155,9 @@ lockd(void *vrqstp) long timeout = MAX_SCHEDULE_TIMEOUT; RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); + /* update sv_maxconn if it has changed */ + rqstp->rq_server->sv_maxconn = nlm_max_connections; + if (signalled()) { flush_signals(current); if (nlmsvc_ops) { @@ -189,6 +204,19 @@ lockd(void *vrqstp) return 0; } +static int create_lockd_listener(struct svc_serv *serv, char *name, + unsigned short port) +{ + struct svc_xprt *xprt; + + xprt = svc_find_xprt(serv, name, 0, 0); + if (xprt == NULL) + return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS); + + svc_xprt_put(xprt); + return 0; +} + /* * Ensure there are active UDP and TCP listeners for lockd. * @@ -202,29 +230,23 @@ lockd(void *vrqstp) static int make_socks(struct svc_serv *serv) { static int warned; - struct svc_xprt *xprt; - int err = 0; + int err; - xprt = svc_find_xprt(serv, "udp", 0, 0); - if (!xprt) - err = svc_create_xprt(serv, "udp", nlm_udpport, - SVC_SOCK_DEFAULTS); - else - svc_xprt_put(xprt); - if (err >= 0) { - xprt = svc_find_xprt(serv, "tcp", 0, 0); - if (!xprt) - err = svc_create_xprt(serv, "tcp", nlm_tcpport, - SVC_SOCK_DEFAULTS); - else - svc_xprt_put(xprt); - } - if (err >= 0) { - warned = 0; - err = 0; - } else if (warned++ == 0) + err = create_lockd_listener(serv, "udp", nlm_udpport); + if (err < 0) + goto out_err; + + err = create_lockd_listener(serv, "tcp", nlm_tcpport); + if (err < 0) + goto out_err; + + warned = 0; + return 0; + +out_err: + if (warned++ == 0) printk(KERN_WARNING - "lockd_up: makesock failed, error=%d\n", err); + "lockd_up: makesock failed, error=%d\n", err); return err; } @@ -252,7 +274,7 @@ int lockd_up(void) "lockd_up: no pid, %d users??\n", nlmsvc_users); error = -ENOMEM; - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); goto out; @@ -276,6 +298,7 @@ int lockd_up(void) } svc_sock_update_bufs(serv); + serv->sv_maxconn = nlm_max_connections; nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, serv->sv_name); if (IS_ERR(nlmsvc_task)) { @@ -485,6 +508,7 @@ module_param_call(nlm_udpport, param_set_port, param_get_int, module_param_call(nlm_tcpport, param_set_port, param_get_int, &nlm_tcpport, 0644); module_param(nsm_use_hostnames, bool, 0644); +module_param(nlm_max_connections, uint, 0644); /* * Initialising and terminating the module. diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4dfdcbc6bf6..1725037374c 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -16,8 +16,6 @@ #include <linux/nfsd/nfsd.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> -#include <linux/lockd/sm_inter.h> - #define NLMDBG_FACILITY NLMDBG_CLIENT @@ -419,8 +417,6 @@ static __be32 nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, void *resp) { - struct sockaddr_in saddr; - dprintk("lockd: SM_NOTIFY called\n"); if (!nlm_privileged_requester(rqstp)) { @@ -430,14 +426,7 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, return rpc_system_err; } - /* Obtain the host pointer for this NFS server and try to - * reclaim all locks we hold on this server. - */ - memset(&saddr, 0, sizeof(saddr)); - saddr.sin_family = AF_INET; - saddr.sin_addr.s_addr = argp->addr; - nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state); - + nlm_host_rebooted(argp); return rpc_success; } diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 3ca89e2a938..3688e55901f 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -16,8 +16,6 @@ #include <linux/nfsd/nfsd.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> -#include <linux/lockd/sm_inter.h> - #define NLMDBG_FACILITY NLMDBG_CLIENT @@ -451,8 +449,6 @@ static __be32 nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, void *resp) { - struct sockaddr_in saddr; - dprintk("lockd: SM_NOTIFY called\n"); if (!nlm_privileged_requester(rqstp)) { @@ -462,14 +458,7 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, return rpc_system_err; } - /* Obtain the host pointer for this NFS server and try to - * reclaim all locks we hold on this server. - */ - memset(&saddr, 0, sizeof(saddr)); - saddr.sin_family = AF_INET; - saddr.sin_addr.s_addr = argp->addr; - nlm_host_rebooted(&saddr, argp->mon, argp->len, argp->state); - + nlm_host_rebooted(argp); return rpc_success; } diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 34c2766e27c..9e4d6aab611 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -17,7 +17,6 @@ #include <linux/nfsd/export.h> #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> -#include <linux/lockd/sm_inter.h> #include <linux/module.h> #include <linux/mount.h> diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 1f226290c67..0336f2beacd 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -16,7 +16,6 @@ #include <linux/sunrpc/svc.h> #include <linux/sunrpc/stats.h> #include <linux/lockd/lockd.h> -#include <linux/lockd/sm_inter.h> #define NLMDBG_FACILITY NLMDBG_XDR @@ -349,8 +348,8 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp) if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) return 0; argp->state = ntohl(*p++); - /* Preserve the address in network byte order */ - argp->addr = *p++; + memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); + p += XDR_QUADLEN(SM_PRIV_SIZE); return xdr_argsize_check(rqstp, p); } diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 50c493a8ad8..e1d52865319 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -17,7 +17,6 @@ #include <linux/sunrpc/svc.h> #include <linux/sunrpc/stats.h> #include <linux/lockd/lockd.h> -#include <linux/lockd/sm_inter.h> #define NLMDBG_FACILITY NLMDBG_XDR @@ -356,8 +355,8 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp if (!(p = xdr_decode_string_inplace(p, &argp->mon, &argp->len, SM_MAXSTRLEN))) return 0; argp->state = ntohl(*p++); - /* Preserve the address in network byte order */ - argp->addr = *p++; + memcpy(&argp->priv.data, p, sizeof(argp->priv.data)); + p += XDR_QUADLEN(SM_PRIV_SIZE); return xdr_argsize_check(rqstp, p); } diff --git a/fs/minix/dir.c b/fs/minix/dir.c index f70433816a3..d4946c4c90e 100644 --- a/fs/minix/dir.c +++ b/fs/minix/dir.c @@ -280,7 +280,7 @@ int minix_add_link(struct dentry *dentry, struct inode *inode) return -EINVAL; got_it: - pos = (page->index >> PAGE_CACHE_SHIFT) + p - (char*)page_address(page); + pos = page_offset(page) + p - (char *)page_address(page); err = __minix_write_begin(NULL, page->mapping, pos, sbi->s_dirsize, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); if (err) diff --git a/fs/mpage.c b/fs/mpage.c index 552b80b3fac..16c3ef37eae 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -241,7 +241,6 @@ do_mpage_readpage(struct bio *bio, struct page *page, unsigned nr_pages, first_hole = page_block; page_block++; block_in_file++; - clear_buffer_mapped(map_bh); continue; } @@ -308,7 +307,10 @@ alloc_new: goto alloc_new; } - if (buffer_boundary(map_bh) || (first_hole != blocks_per_page)) + relative_block = block_in_file - *first_logical_block; + nblocks = map_bh->b_size >> blkbits; + if ((buffer_boundary(map_bh) && relative_block == nblocks) || + (first_hole != blocks_per_page)) bio = mpage_bio_submit(READ, bio); else *last_block_in_bio = blocks[blocks_per_page - 1]; diff --git a/fs/namei.c b/fs/namei.c index dd5c9f0bf82..f05bed24242 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -257,7 +257,7 @@ int inode_permission(struct inode *inode, int mask) return -EACCES; } - if (inode->i_op && inode->i_op->permission) + if (inode->i_op->permission) retval = inode->i_op->permission(inode, mask); else retval = generic_permission(inode, mask, NULL); @@ -432,7 +432,7 @@ static int exec_permission_lite(struct inode *inode) { umode_t mode = inode->i_mode; - if (inode->i_op && inode->i_op->permission) + if (inode->i_op->permission) return -EAGAIN; if (current_fsuid() == inode->i_uid) @@ -908,9 +908,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd) inode = next.dentry->d_inode; if (!inode) goto out_dput; - err = -ENOTDIR; - if (!inode->i_op) - goto out_dput; if (inode->i_op->follow_link) { err = do_follow_link(&next, nd); @@ -920,9 +917,6 @@ static int __link_path_walk(const char *name, struct nameidata *nd) inode = nd->path.dentry->d_inode; if (!inode) break; - err = -ENOTDIR; - if (!inode->i_op) - break; } else path_to_nameidata(&next, nd); err = -ENOTDIR; @@ -961,7 +955,7 @@ last_component: break; inode = next.dentry->d_inode; if ((lookup_flags & LOOKUP_FOLLOW) - && inode && inode->i_op && inode->i_op->follow_link) { + && inode && inode->i_op->follow_link) { err = do_follow_link(&next, nd); if (err) goto return_err; @@ -973,7 +967,7 @@ last_component: break; if (lookup_flags & LOOKUP_DIRECTORY) { err = -ENOTDIR; - if (!inode->i_op || !inode->i_op->lookup) + if (!inode->i_op->lookup) break; } goto return_base; @@ -1469,7 +1463,7 @@ int vfs_create(struct inode *dir, struct dentry *dentry, int mode, if (error) return error; - if (!dir->i_op || !dir->i_op->create) + if (!dir->i_op->create) return -EACCES; /* shouldn't it be ENOSYS? */ mode &= S_IALLUGO; mode |= S_IFREG; @@ -1752,7 +1746,7 @@ do_last: error = -ENOENT; if (!path.dentry->d_inode) goto exit_dput; - if (path.dentry->d_inode->i_op && path.dentry->d_inode->i_op->follow_link) + if (path.dentry->d_inode->i_op->follow_link) goto do_link; path_to_nameidata(&path, &nd); @@ -1933,7 +1927,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD)) return -EPERM; - if (!dir->i_op || !dir->i_op->mknod) + if (!dir->i_op->mknod) return -EPERM; error = devcgroup_inode_mknod(mode, dev); @@ -2035,7 +2029,7 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) if (error) return error; - if (!dir->i_op || !dir->i_op->mkdir) + if (!dir->i_op->mkdir) return -EPERM; mode &= (S_IRWXUGO|S_ISVTX); @@ -2126,7 +2120,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) if (error) return error; - if (!dir->i_op || !dir->i_op->rmdir) + if (!dir->i_op->rmdir) return -EPERM; DQUOT_INIT(dir); @@ -2213,7 +2207,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry) if (error) return error; - if (!dir->i_op || !dir->i_op->unlink) + if (!dir->i_op->unlink) return -EPERM; DQUOT_INIT(dir); @@ -2320,7 +2314,7 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname) if (error) return error; - if (!dir->i_op || !dir->i_op->symlink) + if (!dir->i_op->symlink) return -EPERM; error = security_inode_symlink(dir, dentry, oldname); @@ -2401,7 +2395,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de */ if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; - if (!dir->i_op || !dir->i_op->link) + if (!dir->i_op->link) return -EPERM; if (S_ISDIR(inode->i_mode)) return -EPERM; @@ -2608,7 +2602,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (error) return error; - if (!old_dir->i_op || !old_dir->i_op->rename) + if (!old_dir->i_op->rename) return -EPERM; DQUOT_INIT(old_dir); @@ -2817,18 +2811,23 @@ void page_put_link(struct dentry *dentry, struct nameidata *nd, void *cookie) } } -int __page_symlink(struct inode *inode, const char *symname, int len, - gfp_t gfp_mask) +/* + * The nofs argument instructs pagecache_write_begin to pass AOP_FLAG_NOFS + */ +int __page_symlink(struct inode *inode, const char *symname, int len, int nofs) { struct address_space *mapping = inode->i_mapping; struct page *page; void *fsdata; int err; char *kaddr; + unsigned int flags = AOP_FLAG_UNINTERRUPTIBLE; + if (nofs) + flags |= AOP_FLAG_NOFS; retry: err = pagecache_write_begin(NULL, mapping, 0, len-1, - AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); + flags, &page, &fsdata); if (err) goto fail; @@ -2852,7 +2851,7 @@ fail: int page_symlink(struct inode *inode, const char *symname, int len) { return __page_symlink(inode, symname, len, - mapping_gfp_mask(inode->i_mapping)); + !(mapping_gfp_mask(inode->i_mapping) & __GFP_FS)); } const struct inode_operations page_symlink_inode_operations = { diff --git a/fs/ncpfs/getopt.c b/fs/ncpfs/getopt.c index 335b003dddf..0af3349de85 100644 --- a/fs/ncpfs/getopt.c +++ b/fs/ncpfs/getopt.c @@ -16,7 +16,6 @@ * @opts: an array of &struct option entries controlling parser operations * @optopt: output; will contain the current option * @optarg: output; will contain the value (if one exists) - * @flag: output; may be NULL; should point to a long for or'ing flags * @value: output; may be NULL; will be overwritten with the integer value * of the current argument. * diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 6d04e050c74..f54360f50a9 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -98,7 +98,7 @@ struct compat_ncp_objectname_ioctl { s32 auth_type; u32 object_name_len; - compat_caddr_t object_name; /* an userspace data, in most cases user name */ + compat_caddr_t object_name; /* a userspace data, in most cases user name */ }; struct compat_ncp_fs_info_v2 { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index d319b49f8f0..90f292b520d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -354,7 +354,7 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping, file->f_path.dentry->d_name.name, mapping->host->i_ino, len, (long long) pos); - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index 0184fe9b514..c903e04aa21 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -76,10 +76,10 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) ret = set_groups(new, gi); put_group_info(gi); - if (!ret) + if (ret < 0) goto error; - if (new->uid) + if (new->fsuid) new->cap_effective = cap_drop_nfsd_set(new->cap_effective); else new->cap_effective = cap_raise_nfsd_set(new->cap_effective, diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 6d7d8c02c19..c464181b599 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -53,9 +53,6 @@ #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 -/* declarations */ -static const struct rpc_call_ops nfs4_cb_null_ops; - /* Index of predefined Linux callback client operations */ enum { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 669461e291a..9fa60a3ad48 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -946,6 +946,11 @@ encode_op: nfsd4_encode_operation(resp, op); status = op->status; } + + dprintk("nfsv4 compound op %p opcnt %d #%d: %d: status %d\n", + args->ops, args->opcnt, resp->opcnt, op->opnum, + be32_to_cpu(status)); + if (cstate->replay_owner) { nfs4_put_stateowner(cstate->replay_owner); cstate->replay_owner = NULL; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 0f9d6efaa62..74f7b67567f 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -116,9 +116,9 @@ nfs4_make_rec_clidname(char *dname, struct xdr_netobj *clname) md5_to_hex(dname, cksum.data); - kfree(cksum.data); status = nfs_ok; out: + kfree(cksum.data); crypto_free_hash(desc.tfm); out_no_tfm: return status; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 13e0e074dbb..88db7d3ec12 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -2416,6 +2416,26 @@ out: #define LOCK_HASH_SIZE (1 << LOCK_HASH_BITS) #define LOCK_HASH_MASK (LOCK_HASH_SIZE - 1) +static inline u64 +end_offset(u64 start, u64 len) +{ + u64 end; + + end = start + len; + return end >= start ? end: NFS4_MAX_UINT64; +} + +/* last octet in a range */ +static inline u64 +last_byte_offset(u64 start, u64 len) +{ + u64 end; + + BUG_ON(!len); + end = start + len; + return end > start ? end - 1: NFS4_MAX_UINT64; +} + #define lockownerid_hashval(id) \ ((id) & LOCK_HASH_MASK) @@ -2435,13 +2455,13 @@ static struct list_head lockstateid_hashtbl[STATEID_HASH_SIZE]; static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags) { - struct nfs4_stateid *local = NULL; + struct nfs4_stateid *local; u32 st_id = stid->si_stateownerid; u32 f_id = stid->si_fileid; unsigned int hashval; dprintk("NFSD: find_stateid flags 0x%x\n",flags); - if ((flags & LOCK_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { + if (flags & (LOCK_STATE | RD_STATE | WR_STATE)) { hashval = stateid_hashval(st_id, f_id); list_for_each_entry(local, &lockstateid_hashtbl[hashval], st_hash) { if ((local->st_stateid.si_stateownerid == st_id) && @@ -2449,7 +2469,8 @@ find_stateid(stateid_t *stid, int flags) return local; } } - if ((flags & OPEN_STATE) || (flags & RD_STATE) || (flags & WR_STATE)) { + + if (flags & (OPEN_STATE | RD_STATE | WR_STATE)) { hashval = stateid_hashval(st_id, f_id); list_for_each_entry(local, &stateid_hashtbl[hashval], st_hash) { if ((local->st_stateid.si_stateownerid == st_id) && @@ -2518,8 +2539,8 @@ nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) deny->ld_clientid.cl_id = 0; } deny->ld_start = fl->fl_start; - deny->ld_length = ~(u64)0; - if (fl->fl_end != ~(u64)0) + deny->ld_length = NFS4_MAX_UINT64; + if (fl->fl_end != NFS4_MAX_UINT64) deny->ld_length = fl->fl_end - fl->fl_start + 1; deny->ld_type = NFS4_READ_LT; if (fl->fl_type != F_RDLCK) @@ -2616,7 +2637,7 @@ out: static int check_lock_length(u64 offset, u64 length) { - return ((length == 0) || ((length != ~(u64)0) && + return ((length == 0) || ((length != NFS4_MAX_UINT64) && LOFF_OVERFLOW(offset, length))); } @@ -2736,11 +2757,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = lock->lk_offset; - if ((lock->lk_length == ~(u64)0) || - LOFF_OVERFLOW(lock->lk_offset, lock->lk_length)) - file_lock.fl_end = ~(u64)0; - else - file_lock.fl_end = lock->lk_offset + lock->lk_length - 1; + file_lock.fl_end = last_byte_offset(lock->lk_offset, lock->lk_length); nfs4_transform_lock_offset(&file_lock); /* @@ -2781,6 +2798,25 @@ out: } /* + * The NFSv4 spec allows a client to do a LOCKT without holding an OPEN, + * so we do a temporary open here just to get an open file to pass to + * vfs_test_lock. (Arguably perhaps test_lock should be done with an + * inode operation.) + */ +static int nfsd_test_lock(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file_lock *lock) +{ + struct file *file; + int err; + + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + if (err) + return err; + err = vfs_test_lock(file, lock); + nfsd_close(file); + return err; +} + +/* * LOCKT operation */ __be32 @@ -2788,7 +2824,6 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_lockt *lockt) { struct inode *inode; - struct file file; struct file_lock file_lock; int error; __be32 status; @@ -2839,23 +2874,12 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = lockt->lt_offset; - if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length)) - file_lock.fl_end = ~(u64)0; - else - file_lock.fl_end = lockt->lt_offset + lockt->lt_length - 1; + file_lock.fl_end = last_byte_offset(lockt->lt_offset, lockt->lt_length); nfs4_transform_lock_offset(&file_lock); - /* vfs_test_lock uses the struct file _only_ to resolve the inode. - * since LOCKT doesn't require an OPEN, and therefore a struct - * file may not exist, pass vfs_test_lock a struct file with - * only the dentry:inode set. - */ - memset(&file, 0, sizeof (struct file)); - file.f_path.dentry = cstate->current_fh.fh_dentry; - status = nfs_ok; - error = vfs_test_lock(&file, &file_lock); + error = nfsd_test_lock(rqstp, &cstate->current_fh, &file_lock); if (error) { status = nfserrno(error); goto out; @@ -2906,10 +2930,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = locku->lu_offset; - if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length)) - file_lock.fl_end = ~(u64)0; - else - file_lock.fl_end = locku->lu_offset + locku->lu_length - 1; + file_lock.fl_end = last_byte_offset(locku->lu_offset, locku->lu_length); nfs4_transform_lock_offset(&file_lock); /* diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index afcdf4b7684..f65953be39c 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -1,6 +1,4 @@ /* - * fs/nfs/nfs4xdr.c - * * Server-side XDR for NFSv4 * * Copyright (c) 2002 The Regents of the University of Michigan. diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 77d7b8c531a..3d93b2064ce 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -84,6 +84,8 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size); static ssize_t write_getfd(struct file *file, char *buf, size_t size); static ssize_t write_getfs(struct file *file, char *buf, size_t size); static ssize_t write_filehandle(struct file *file, char *buf, size_t size); +static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size); +static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size); static ssize_t write_threads(struct file *file, char *buf, size_t size); static ssize_t write_pool_threads(struct file *file, char *buf, size_t size); static ssize_t write_versions(struct file *file, char *buf, size_t size); @@ -94,9 +96,6 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size); static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); #endif -static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size); -static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size); - static ssize_t (*write_op[])(struct file *, char *, size_t) = { [NFSD_Svc] = write_svc, [NFSD_Add] = write_add, @@ -106,8 +105,8 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = { [NFSD_Getfd] = write_getfd, [NFSD_Getfs] = write_getfs, [NFSD_Fh] = write_filehandle, - [NFSD_FO_UnlockIP] = failover_unlock_ip, - [NFSD_FO_UnlockFS] = failover_unlock_fs, + [NFSD_FO_UnlockIP] = write_unlock_ip, + [NFSD_FO_UnlockFS] = write_unlock_fs, [NFSD_Threads] = write_threads, [NFSD_Pool_Threads] = write_pool_threads, [NFSD_Versions] = write_versions, @@ -176,10 +175,24 @@ static const struct file_operations exports_operations = { /*----------------------------------------------------------------------------*/ /* * payload - write methods - * If the method has a response, the response should be put in buf, - * and the length returned. Otherwise return 0 or and -error. */ +/** + * write_svc - Start kernel's NFSD server + * + * Deprecated. /proc/fs/nfsd/threads is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_svc + * svc_port: port number of this + * server's listener + * svc_nthreads: number of threads to start + * size: size in bytes of passed in nfsctl_svc + * Output: + * On success: returns zero + * On error: return code is negative errno value + */ static ssize_t write_svc(struct file *file, char *buf, size_t size) { struct nfsctl_svc *data; @@ -189,6 +202,30 @@ static ssize_t write_svc(struct file *file, char *buf, size_t size) return nfsd_svc(data->svc_port, data->svc_nthreads); } +/** + * write_add - Add or modify client entry in auth unix cache + * + * Deprecated. /proc/net/rpc/auth.unix.ip is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_client + * cl_ident: '\0'-terminated C string + * containing domain name + * of client + * cl_naddr: no. of items in cl_addrlist + * cl_addrlist: array of client addresses + * cl_fhkeytype: ignored + * cl_fhkeylen: ignored + * cl_fhkey: ignored + * size: size in bytes of passed in nfsctl_client + * Output: + * On success: returns zero + * On error: return code is negative errno value + * + * Note: Only AF_INET client addresses are passed in, since + * nfsctl_client.cl_addrlist contains only in_addr fields for addresses. + */ static ssize_t write_add(struct file *file, char *buf, size_t size) { struct nfsctl_client *data; @@ -198,6 +235,30 @@ static ssize_t write_add(struct file *file, char *buf, size_t size) return exp_addclient(data); } +/** + * write_del - Remove client from auth unix cache + * + * Deprecated. /proc/net/rpc/auth.unix.ip is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_client + * cl_ident: '\0'-terminated C string + * containing domain name + * of client + * cl_naddr: ignored + * cl_addrlist: ignored + * cl_fhkeytype: ignored + * cl_fhkeylen: ignored + * cl_fhkey: ignored + * size: size in bytes of passed in nfsctl_client + * Output: + * On success: returns zero + * On error: return code is negative errno value + * + * Note: Only AF_INET client addresses are passed in, since + * nfsctl_client.cl_addrlist contains only in_addr fields for addresses. + */ static ssize_t write_del(struct file *file, char *buf, size_t size) { struct nfsctl_client *data; @@ -207,6 +268,33 @@ static ssize_t write_del(struct file *file, char *buf, size_t size) return exp_delclient(data); } +/** + * write_export - Export part or all of a local file system + * + * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_export + * ex_client: '\0'-terminated C string + * containing domain name + * of client allowed to access + * this export + * ex_path: '\0'-terminated C string + * containing pathname of + * directory in local file system + * ex_dev: fsid to use for this export + * ex_ino: ignored + * ex_flags: export flags for this export + * ex_anon_uid: UID to use for anonymous + * requests + * ex_anon_gid: GID to use for anonymous + * requests + * size: size in bytes of passed in nfsctl_export + * Output: + * On success: returns zero + * On error: return code is negative errno value + */ static ssize_t write_export(struct file *file, char *buf, size_t size) { struct nfsctl_export *data; @@ -216,6 +304,31 @@ static ssize_t write_export(struct file *file, char *buf, size_t size) return exp_export(data); } +/** + * write_unexport - Unexport a previously exported file system + * + * Deprecated. /proc/net/rpc/{nfsd.export,nfsd.fh} are preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_export + * ex_client: '\0'-terminated C string + * containing domain name + * of client no longer allowed + * to access this export + * ex_path: '\0'-terminated C string + * containing pathname of + * directory in local file system + * ex_dev: ignored + * ex_ino: ignored + * ex_flags: ignored + * ex_anon_uid: ignored + * ex_anon_gid: ignored + * size: size in bytes of passed in nfsctl_export + * Output: + * On success: returns zero + * On error: return code is negative errno value + */ static ssize_t write_unexport(struct file *file, char *buf, size_t size) { struct nfsctl_export *data; @@ -226,6 +339,30 @@ static ssize_t write_unexport(struct file *file, char *buf, size_t size) return exp_unexport(data); } +/** + * write_getfs - Get a variable-length NFS file handle by path + * + * Deprecated. /proc/fs/nfsd/filehandle is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_fsparm + * gd_addr: socket address of client + * gd_path: '\0'-terminated C string + * containing pathname of + * directory in local file system + * gd_maxlen: maximum size of returned file + * handle + * size: size in bytes of passed in nfsctl_fsparm + * Output: + * On success: passed-in buffer filled with a knfsd_fh structure + * (a variable-length raw NFS file handle); + * return code is the size in bytes of the file handle + * On error: return code is negative errno value + * + * Note: Only AF_INET client addresses are passed in, since gd_addr + * is the same size as a struct sockaddr_in. + */ static ssize_t write_getfs(struct file *file, char *buf, size_t size) { struct nfsctl_fsparm *data; @@ -265,6 +402,29 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size) return err; } +/** + * write_getfd - Get a fixed-length NFS file handle by path (used by mountd) + * + * Deprecated. /proc/fs/nfsd/filehandle is preferred. + * Function remains to support old versions of nfs-utils. + * + * Input: + * buf: struct nfsctl_fdparm + * gd_addr: socket address of client + * gd_path: '\0'-terminated C string + * containing pathname of + * directory in local file system + * gd_version: fdparm structure version + * size: size in bytes of passed in nfsctl_fdparm + * Output: + * On success: passed-in buffer filled with nfsctl_res + * (a fixed-length raw NFS file handle); + * return code is the size in bytes of the file handle + * On error: return code is negative errno value + * + * Note: Only AF_INET client addresses are passed in, since gd_addr + * is the same size as a struct sockaddr_in. + */ static ssize_t write_getfd(struct file *file, char *buf, size_t size) { struct nfsctl_fdparm *data; @@ -309,7 +469,23 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size) return err; } -static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size) +/** + * write_unlock_ip - Release all locks used by a client + * + * Experimental. + * + * Input: + * buf: '\n'-terminated C string containing a + * presentation format IPv4 address + * size: length of C string in @buf + * Output: + * On success: returns zero if all specified locks were released; + * returns one if one or more locks were not released + * On error: return code is negative errno value + * + * Note: Only AF_INET client addresses are passed in + */ +static ssize_t write_unlock_ip(struct file *file, char *buf, size_t size) { struct sockaddr_in sin = { .sin_family = AF_INET, @@ -339,7 +515,21 @@ static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size) return nlmsvc_unlock_all_by_ip((struct sockaddr *)&sin); } -static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size) +/** + * write_unlock_fs - Release all locks on a local file system + * + * Experimental. + * + * Input: + * buf: '\n'-terminated C string containing the + * absolute pathname of a local file system + * size: length of C string in @buf + * Output: + * On success: returns zero if all specified locks were released; + * returns one if one or more locks were not released + * On error: return code is negative errno value + */ +static ssize_t write_unlock_fs(struct file *file, char *buf, size_t size) { struct path path; char *fo_path; @@ -360,21 +550,44 @@ static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size) if (error) return error; + /* + * XXX: Needs better sanity checking. Otherwise we could end up + * releasing locks on the wrong file system. + * + * For example: + * 1. Does the path refer to a directory? + * 2. Is that directory a mount point, or + * 3. Is that directory the root of an exported file system? + */ error = nlmsvc_unlock_all_by_sb(path.mnt->mnt_sb); path_put(&path); return error; } +/** + * write_filehandle - Get a variable-length NFS file handle by path + * + * On input, the buffer contains a '\n'-terminated C string comprised of + * three alphanumeric words separated by whitespace. The string may + * contain escape sequences. + * + * Input: + * buf: + * domain: client domain name + * path: export pathname + * maxsize: numeric maximum size of + * @buf + * size: length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing a ASCII hex text version + * of the NFS file handle; + * return code is the size in bytes of the string + * On error: return code is negative errno value + */ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) { - /* request is: - * domain path maxsize - * response is - * filehandle - * - * qword quoting is used, so filehandle will be \x.... - */ char *dname, *path; int uninitialized_var(maxsize); char *mesg = buf; @@ -391,11 +604,13 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) dname = mesg; len = qword_get(&mesg, dname, size); - if (len <= 0) return -EINVAL; + if (len <= 0) + return -EINVAL; path = dname+len+1; len = qword_get(&mesg, path, size); - if (len <= 0) return -EINVAL; + if (len <= 0) + return -EINVAL; len = get_int(&mesg, &maxsize); if (len) @@ -419,17 +634,43 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) if (len) return len; - mesg = buf; len = SIMPLE_TRANSACTION_LIMIT; + mesg = buf; + len = SIMPLE_TRANSACTION_LIMIT; qword_addhex(&mesg, &len, (char*)&fh.fh_base, fh.fh_size); mesg[-1] = '\n'; return mesg - buf; } +/** + * write_threads - Start NFSD, or report the current number of running threads + * + * Input: + * buf: ignored + * size: zero + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string numeric value representing the number of + * running NFSD threads; + * return code is the size in bytes of the string + * On error: return code is zero + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the + * number of NFSD threads to start + * size: non-zero length of C string in @buf + * Output: + * On success: NFS service is started; + * passed-in buffer filled with '\n'-terminated C + * string numeric value representing the number of + * running NFSD threads; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ static ssize_t write_threads(struct file *file, char *buf, size_t size) { - /* if size > 0, look for a number of threads and call nfsd_svc - * then write out number of threads as reply - */ char *mesg = buf; int rv; if (size > 0) { @@ -437,9 +678,9 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) rv = get_int(&mesg, &newthreads); if (rv) return rv; - if (newthreads <0) + if (newthreads < 0) return -EINVAL; - rv = nfsd_svc(2049, newthreads); + rv = nfsd_svc(NFS_PORT, newthreads); if (rv) return rv; } @@ -447,6 +688,28 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) return strlen(buf); } +/** + * write_pool_threads - Set or report the current number of threads per pool + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing whitespace- + * separated unsigned integer values + * representing the number of NFSD + * threads to start in each pool + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing integer values representing the + * number of NFSD threads in each pool; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) { /* if size > 0, look for an array of number of threads per node @@ -517,10 +780,6 @@ out_free: static ssize_t __write_versions(struct file *file, char *buf, size_t size) { - /* - * Format: - * [-/+]vers [-/+]vers ... - */ char *mesg = buf; char *vers, sign; int len, num; @@ -578,6 +837,38 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size) return len; } +/** + * write_versions - Set or report the available NFS protocol versions + * + * Input: + * buf: ignored + * size: zero + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing positive or negative integer + * values representing the current status of each + * protocol version; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + * + * OR + * + * Input: + * buf: C string containing whitespace- + * separated positive or negative + * integer values representing NFS + * protocol versions to enable ("+n") + * or disable ("-n") + * size: non-zero length of C string in @buf + * Output: + * On success: status of zero or more protocol versions has + * been updated; passed-in buffer filled with + * '\n'-terminated C string containing positive + * or negative integer values representing the + * current status of each protocol version; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ static ssize_t write_versions(struct file *file, char *buf, size_t size) { ssize_t rv; @@ -687,6 +978,75 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) return -EINVAL; } +/** + * write_ports - Pass a socket file descriptor or transport name to listen on + * + * Input: + * buf: ignored + * size: zero + * Output: + * On success: passed-in buffer filled with a '\n'-terminated C + * string containing a whitespace-separated list of + * named NFSD listeners; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing a bound + * but unconnected socket that is to be + * used as an NFSD listener + * size: non-zero length of C string in @buf + * Output: + * On success: NFS service is started; + * passed-in buffer filled with a '\n'-terminated C + * string containing a unique alphanumeric name of + * the listener; + * return code is the size in bytes of the string + * On error: return code is a negative errno value + * + * OR + * + * Input: + * buf: C string containing a "-" followed + * by an integer value representing a + * previously passed in socket file + * descriptor + * size: non-zero length of C string in @buf + * Output: + * On success: NFS service no longer listens on that socket; + * passed-in buffer filled with a '\n'-terminated C + * string containing a unique name of the listener; + * return code is the size in bytes of the string + * On error: return code is a negative errno value + * + * OR + * + * Input: + * buf: C string containing a transport + * name and an unsigned integer value + * representing the port to listen on, + * separated by whitespace + * size: non-zero length of C string in @buf + * Output: + * On success: returns zero; NFS service is started + * On error: return code is a negative errno value + * + * OR + * + * Input: + * buf: C string containing a "-" followed + * by a transport name and an unsigned + * integer value representing the port + * to listen on, separated by whitespace + * size: non-zero length of C string in @buf + * Output: + * On success: returns zero; NFS service no longer listens + * on that transport + * On error: return code is a negative errno value + */ static ssize_t write_ports(struct file *file, char *buf, size_t size) { ssize_t rv; @@ -700,6 +1060,27 @@ static ssize_t write_ports(struct file *file, char *buf, size_t size) int nfsd_max_blksize; +/** + * write_maxblksize - Set or report the current NFS blksize + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the new + * NFS blksize + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C string + * containing numeric value of the current NFS blksize + * setting; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) { char *mesg = buf; @@ -752,6 +1133,27 @@ static ssize_t __write_leasetime(struct file *file, char *buf, size_t size) return strlen(buf); } +/** + * write_leasetime - Set or report the current NFSv4 lease time + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing an unsigned + * integer value representing the new + * NFSv4 lease expiry time + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C + * string containing unsigned integer value of the + * current lease expiry time; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ static ssize_t write_leasetime(struct file *file, char *buf, size_t size) { ssize_t rv; @@ -788,6 +1190,27 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) return strlen(buf); } +/** + * write_recoverydir - Set or report the pathname of the recovery directory + * + * Input: + * buf: ignored + * size: zero + * + * OR + * + * Input: + * buf: C string containing the pathname + * of the directory on a local file + * system containing permanent NFSv4 + * recovery data + * size: non-zero length of C string in @buf + * Output: + * On success: passed-in buffer filled with '\n'-terminated C string + * containing the current recovery pathname setting; + * return code is the size in bytes of the string + * On error: return code is zero or a negative errno value + */ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) { ssize_t rv; diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index f0da7d9c3a9..9f1ca17293d 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -258,14 +258,32 @@ out: return error; } -/* - * Perform sanity checks on the dentry in a client's file handle. +/** + * fh_verify - filehandle lookup and access checking + * @rqstp: pointer to current rpc request + * @fhp: filehandle to be verified + * @type: expected type of object pointed to by filehandle + * @access: type of access needed to object + * + * Look up a dentry from the on-the-wire filehandle, check the client's + * access to the export, and set the current task's credentials. + * + * Regardless of success or failure of fh_verify(), fh_put() should be + * called on @fhp when the caller is finished with the filehandle. * - * Note that the file handle dentry may need to be freed even after - * an error return. + * fh_verify() may be called multiple times on a given filehandle, for + * example, when processing an NFSv4 compound. The first call will look + * up a dentry using the on-the-wire filehandle. Subsequent calls will + * skip the lookup and just perform the other checks and possibly change + * the current task's credentials. * - * This is only called at the start of an nfsproc call, so fhp points to - * a svc_fh which is all 0 except for the over-the-wire file handle. + * @type specifies the type of object expected using one of the S_IF* + * constants defined in include/linux/stat.h. The caller may use zero + * to indicate that it doesn't care, or a negative integer to indicate + * that it expects something not of the given type. + * + * @access is formed from the NFSD_MAY_* constants defined in + * include/linux/nfsd/nfsd.h. */ __be32 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) @@ -466,6 +484,8 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, goto retry; break; } + } else if (exp->ex_flags & NFSEXP_FSID) { + fsid_type = FSID_NUM; } else if (exp->ex_uuid) { if (fhp->fh_maxsize >= 64) { if (root_export) @@ -478,9 +498,7 @@ fh_compose(struct svc_fh *fhp, struct svc_export *exp, struct dentry *dentry, else fsid_type = FSID_UUID4_INUM; } - } else if (exp->ex_flags & NFSEXP_FSID) - fsid_type = FSID_NUM; - else if (!old_valid_dev(ex_dev)) + } else if (!old_valid_dev(ex_dev)) /* for newer device numbers, we must use a newer fsid format */ fsid_type = FSID_ENCODE_DEV; else diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 5cffeca7ace..6f7f2635122 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -622,6 +622,7 @@ nfserrno (int errno) { nfserr_badname, -ESRCH }, { nfserr_io, -ETXTBSY }, { nfserr_notsupp, -EOPNOTSUPP }, + { nfserr_toosmall, -ETOOSMALL }, }; int i; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index d1c5f787b36..6e50aaa56ca 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -764,7 +764,6 @@ static inline int nfsd_dosync(struct file *filp, struct dentry *dp, return err; } - static int nfsd_sync(struct file *filp) @@ -1211,7 +1210,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, dirp = dentry->d_inode; err = nfserr_notdir; - if(!dirp->i_op || !dirp->i_op->lookup) + if (!dirp->i_op->lookup) goto out; /* * Check whether the response file handle has been verified yet. @@ -1347,7 +1346,7 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp, /* Get all the sanity checks out of the way before * we lock the parent. */ err = nfserr_notdir; - if(!dirp->i_op || !dirp->i_op->lookup) + if (!dirp->i_op->lookup) goto out; fh_lock_nested(fhp, I_MUTEX_PARENT); @@ -1482,7 +1481,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) inode = dentry->d_inode; err = nfserr_inval; - if (!inode->i_op || !inode->i_op->readlink) + if (!inode->i_op->readlink) goto out; touch_atime(fhp->fh_export->ex_path.mnt, dentry); @@ -2162,7 +2161,7 @@ nfsd_set_posix_acl(struct svc_fh *fhp, int type, struct posix_acl *acl) size_t size; int error; - if (!IS_POSIXACL(inode) || !inode->i_op || + if (!IS_POSIXACL(inode) || !inode->i_op->setxattr || !inode->i_op->removexattr) return -EOPNOTSUPP; switch(type) { diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index 400f8064a54..81b8644b013 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -704,7 +704,7 @@ fput_and_out: return ret; } -asmlinkage long sys_inotify_rm_watch(int fd, u32 wd) +asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd) { struct file *filp; struct inotify_device *dev; diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index e9da092e277..86bef156cf0 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1406,9 +1406,6 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) ni->allocated_size = sle64_to_cpu( a->data.non_resident.allocated_size); } - /* Setup the operations for this attribute inode. */ - vi->i_op = NULL; - vi->i_fop = NULL; if (NInoMstProtected(ni)) vi->i_mapping->a_ops = &ntfs_mst_aops; else diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index 589dcdfdfe3..01596079dd6 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -12,6 +12,7 @@ obj-$(CONFIG_OCFS2_FS_USERSPACE_CLUSTER) += ocfs2_stack_user.o ocfs2-objs := \ alloc.o \ aops.o \ + blockcheck.o \ buffer_head_io.o \ dcache.o \ dir.o \ @@ -35,8 +36,14 @@ ocfs2-objs := \ sysfile.o \ uptodate.o \ ver.o \ + quota_local.o \ + quota_global.o \ xattr.o +ifeq ($(CONFIG_OCFS2_FS_POSIX_ACL),y) +ocfs2-objs += acl.o +endif + ocfs2_stackglue-objs := stackglue.o ocfs2_stack_o2cb-objs := stack_o2cb.o ocfs2_stack_user-objs := stack_user.o diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c new file mode 100644 index 00000000000..12dfb44c22e --- /dev/null +++ b/fs/ocfs2/acl.c @@ -0,0 +1,479 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * acl.c + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * CREDITS: + * Lots of code in this file is copy from linux/fs/ext3/acl.c. + * Copyright (C) 2001-2003 Andreas Gruenbacher, <agruen@suse.de> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/string.h> + +#define MLOG_MASK_PREFIX ML_INODE +#include <cluster/masklog.h> + +#include "ocfs2.h" +#include "alloc.h" +#include "dlmglue.h" +#include "file.h" +#include "ocfs2_fs.h" + +#include "xattr.h" +#include "acl.h" + +/* + * Convert from xattr value to acl struct. + */ +static struct posix_acl *ocfs2_acl_from_xattr(const void *value, size_t size) +{ + int n, count; + struct posix_acl *acl; + + if (!value) + return NULL; + if (size < sizeof(struct posix_acl_entry)) + return ERR_PTR(-EINVAL); + + count = size / sizeof(struct posix_acl_entry); + if (count < 0) + return ERR_PTR(-EINVAL); + if (count == 0) + return NULL; + + acl = posix_acl_alloc(count, GFP_NOFS); + if (!acl) + return ERR_PTR(-ENOMEM); + for (n = 0; n < count; n++) { + struct ocfs2_acl_entry *entry = + (struct ocfs2_acl_entry *)value; + + acl->a_entries[n].e_tag = le16_to_cpu(entry->e_tag); + acl->a_entries[n].e_perm = le16_to_cpu(entry->e_perm); + acl->a_entries[n].e_id = le32_to_cpu(entry->e_id); + value += sizeof(struct posix_acl_entry); + + } + return acl; +} + +/* + * Convert acl struct to xattr value. + */ +static void *ocfs2_acl_to_xattr(const struct posix_acl *acl, size_t *size) +{ + struct ocfs2_acl_entry *entry = NULL; + char *ocfs2_acl; + size_t n; + + *size = acl->a_count * sizeof(struct posix_acl_entry); + + ocfs2_acl = kmalloc(*size, GFP_NOFS); + if (!ocfs2_acl) + return ERR_PTR(-ENOMEM); + + entry = (struct ocfs2_acl_entry *)ocfs2_acl; + for (n = 0; n < acl->a_count; n++, entry++) { + entry->e_tag = cpu_to_le16(acl->a_entries[n].e_tag); + entry->e_perm = cpu_to_le16(acl->a_entries[n].e_perm); + entry->e_id = cpu_to_le32(acl->a_entries[n].e_id); + } + return ocfs2_acl; +} + +static struct posix_acl *ocfs2_get_acl_nolock(struct inode *inode, + int type, + struct buffer_head *di_bh) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int name_index; + char *value = NULL; + struct posix_acl *acl; + int retval; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return NULL; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, "", NULL, 0); + if (retval > 0) { + value = kmalloc(retval, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + retval = ocfs2_xattr_get_nolock(inode, di_bh, name_index, + "", value, retval); + } + + if (retval > 0) + acl = ocfs2_acl_from_xattr(value, retval); + else if (retval == -ENODATA || retval == 0) + acl = NULL; + else + acl = ERR_PTR(retval); + + kfree(value); + + return acl; +} + + +/* + * Get posix acl. + */ +static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct buffer_head *di_bh = NULL; + struct posix_acl *acl; + int ret; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return NULL; + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + acl = ERR_PTR(ret); + return acl; + } + + acl = ocfs2_get_acl_nolock(inode, type, di_bh); + + ocfs2_inode_unlock(inode, 0); + + brelse(di_bh); + + return acl; +} + +/* + * Set the access or default ACL of an inode. + */ +static int ocfs2_set_acl(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + int type, + struct posix_acl *acl, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + int name_index; + void *value = NULL; + size_t size = 0; + int ret; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + switch (type) { + case ACL_TYPE_ACCESS: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS; + if (acl) { + mode_t mode = inode->i_mode; + ret = posix_acl_equiv_mode(acl, &mode); + if (ret < 0) + return ret; + else { + inode->i_mode = mode; + if (ret == 0) + acl = NULL; + } + } + break; + case ACL_TYPE_DEFAULT: + name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT; + if (!S_ISDIR(inode->i_mode)) + return acl ? -EACCES : 0; + break; + default: + return -EINVAL; + } + + if (acl) { + value = ocfs2_acl_to_xattr(acl, &size); + if (IS_ERR(value)) + return (int)PTR_ERR(value); + } + + if (handle) + ret = ocfs2_xattr_set_handle(handle, inode, di_bh, name_index, + "", value, size, 0, + meta_ac, data_ac); + else + ret = ocfs2_xattr_set(inode, name_index, "", value, size, 0); + + kfree(value); + + return ret; +} + +int ocfs2_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + int ret = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + return ret; + } + + return -EAGAIN; +} + +int ocfs2_acl_chmod(struct inode *inode) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl, *clone; + int ret; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return 0; + + acl = ocfs2_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + ret = posix_acl_chmod_masq(clone, inode->i_mode); + if (!ret) + ret = ocfs2_set_acl(NULL, inode, NULL, ACL_TYPE_ACCESS, + clone, NULL, NULL); + posix_acl_release(clone); + return ret; +} + +/* + * Initialize the ACLs of a new inode. If parent directory has default ACL, + * then clone to new inode. Called from ocfs2_mknod. + */ +int ocfs2_init_acl(handle_t *handle, + struct inode *inode, + struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dir_bh, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl = NULL; + int ret = 0; + + if (!S_ISLNK(inode->i_mode)) { + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, + dir_bh); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + if (!acl) + inode->i_mode &= ~current->fs->umask; + } + if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + ret = ocfs2_set_acl(handle, inode, di_bh, + ACL_TYPE_DEFAULT, acl, + meta_ac, data_ac); + if (ret) + goto cleanup; + } + clone = posix_acl_clone(acl, GFP_NOFS); + ret = -ENOMEM; + if (!clone) + goto cleanup; + + mode = inode->i_mode; + ret = posix_acl_create_masq(clone, &mode); + if (ret >= 0) { + inode->i_mode = mode; + if (ret > 0) { + ret = ocfs2_set_acl(handle, inode, + di_bh, ACL_TYPE_ACCESS, + clone, meta_ac, data_ac); + } + } + posix_acl_release(clone); + } +cleanup: + posix_acl_release(acl); + return ret; +} + +static size_t ocfs2_xattr_list_acl_access(struct inode *inode, + char *list, + size_t list_len, + const char *name, + size_t name_len) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + const size_t size = sizeof(POSIX_ACL_XATTR_ACCESS); + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return 0; + + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_ACCESS, size); + return size; +} + +static size_t ocfs2_xattr_list_acl_default(struct inode *inode, + char *list, + size_t list_len, + const char *name, + size_t name_len) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + const size_t size = sizeof(POSIX_ACL_XATTR_DEFAULT); + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return 0; + + if (list && size <= list_len) + memcpy(list, POSIX_ACL_XATTR_DEFAULT, size); + return size; +} + +static int ocfs2_xattr_get_acl(struct inode *inode, + int type, + void *buffer, + size_t size) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl; + int ret; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return -EOPNOTSUPP; + + acl = ocfs2_get_acl(inode, type); + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + ret = posix_acl_to_xattr(acl, buffer, size); + posix_acl_release(acl); + + return ret; +} + +static int ocfs2_xattr_get_acl_access(struct inode *inode, + const char *name, + void *buffer, + size_t size) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ocfs2_xattr_get_acl(inode, ACL_TYPE_ACCESS, buffer, size); +} + +static int ocfs2_xattr_get_acl_default(struct inode *inode, + const char *name, + void *buffer, + size_t size) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ocfs2_xattr_get_acl(inode, ACL_TYPE_DEFAULT, buffer, size); +} + +static int ocfs2_xattr_set_acl(struct inode *inode, + int type, + const void *value, + size_t size) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct posix_acl *acl; + int ret = 0; + + if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) + return -EOPNOTSUPP; + + if (!is_owner_or_cap(inode)) + return -EPERM; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return PTR_ERR(acl); + else if (acl) { + ret = posix_acl_valid(acl); + if (ret) + goto cleanup; + } + } else + acl = NULL; + + ret = ocfs2_set_acl(NULL, inode, NULL, type, acl, NULL, NULL); + +cleanup: + posix_acl_release(acl); + return ret; +} + +static int ocfs2_xattr_set_acl_access(struct inode *inode, + const char *name, + const void *value, + size_t size, + int flags) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ocfs2_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); +} + +static int ocfs2_xattr_set_acl_default(struct inode *inode, + const char *name, + const void *value, + size_t size, + int flags) +{ + if (strcmp(name, "") != 0) + return -EINVAL; + return ocfs2_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); +} + +struct xattr_handler ocfs2_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .list = ocfs2_xattr_list_acl_access, + .get = ocfs2_xattr_get_acl_access, + .set = ocfs2_xattr_set_acl_access, +}; + +struct xattr_handler ocfs2_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .list = ocfs2_xattr_list_acl_default, + .get = ocfs2_xattr_get_acl_default, + .set = ocfs2_xattr_set_acl_default, +}; diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h new file mode 100644 index 00000000000..8f6389ed4da --- /dev/null +++ b/fs/ocfs2/acl.h @@ -0,0 +1,58 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * acl.h + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_ACL_H +#define OCFS2_ACL_H + +#include <linux/posix_acl_xattr.h> + +struct ocfs2_acl_entry { + __le16 e_tag; + __le16 e_perm; + __le32 e_id; +}; + +#ifdef CONFIG_OCFS2_FS_POSIX_ACL + +extern int ocfs2_check_acl(struct inode *, int); +extern int ocfs2_acl_chmod(struct inode *); +extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, + struct buffer_head *, struct buffer_head *, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); + +#else /* CONFIG_OCFS2_FS_POSIX_ACL*/ + +#define ocfs2_check_acl NULL +static inline int ocfs2_acl_chmod(struct inode *inode) +{ + return 0; +} +static inline int ocfs2_init_acl(handle_t *handle, + struct inode *inode, + struct inode *dir, + struct buffer_head *di_bh, + struct buffer_head *dir_bh, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + return 0; +} + +#endif /* CONFIG_OCFS2_FS_POSIX_ACL*/ + +#endif /* OCFS2_ACL_H */ diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 0cc2deb9394..d861096c9d8 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -28,6 +28,7 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/swap.h> +#include <linux/quotaops.h> #define MLOG_MASK_PREFIX ML_DISK_ALLOC #include <cluster/masklog.h> @@ -36,6 +37,7 @@ #include "alloc.h" #include "aops.h" +#include "blockcheck.h" #include "dlmglue.h" #include "extent_map.h" #include "inode.h" @@ -46,6 +48,7 @@ #include "file.h" #include "super.h" #include "uptodate.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -187,20 +190,12 @@ static int ocfs2_dinode_insert_check(struct inode *inode, static int ocfs2_dinode_sanity_check(struct inode *inode, struct ocfs2_extent_tree *et) { - int ret = 0; - struct ocfs2_dinode *di; + struct ocfs2_dinode *di = et->et_object; BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); - di = et->et_object; - if (!OCFS2_IS_VALID_DINODE(di)) { - ret = -EIO; - ocfs2_error(inode->i_sb, - "Inode %llu has invalid path root", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - } - - return ret; + return 0; } static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) @@ -213,36 +208,33 @@ static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et) { - struct ocfs2_xattr_value_root *xv = et->et_object; + struct ocfs2_xattr_value_buf *vb = et->et_object; - et->et_root_el = &xv->xr_list; + et->et_root_el = &vb->vb_xv->xr_list; } static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et, u64 blkno) { - struct ocfs2_xattr_value_root *xv = - (struct ocfs2_xattr_value_root *)et->et_object; + struct ocfs2_xattr_value_buf *vb = et->et_object; - xv->xr_last_eb_blk = cpu_to_le64(blkno); + vb->vb_xv->xr_last_eb_blk = cpu_to_le64(blkno); } static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) { - struct ocfs2_xattr_value_root *xv = - (struct ocfs2_xattr_value_root *) et->et_object; + struct ocfs2_xattr_value_buf *vb = et->et_object; - return le64_to_cpu(xv->xr_last_eb_blk); + return le64_to_cpu(vb->vb_xv->xr_last_eb_blk); } static void ocfs2_xattr_value_update_clusters(struct inode *inode, struct ocfs2_extent_tree *et, u32 clusters) { - struct ocfs2_xattr_value_root *xv = - (struct ocfs2_xattr_value_root *)et->et_object; + struct ocfs2_xattr_value_buf *vb = et->et_object; - le32_add_cpu(&xv->xr_clusters, clusters); + le32_add_cpu(&vb->vb_xv->xr_clusters, clusters); } static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { @@ -304,11 +296,13 @@ static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, struct inode *inode, struct buffer_head *bh, + ocfs2_journal_access_func access, void *obj, struct ocfs2_extent_tree_operations *ops) { et->et_ops = ops; et->et_root_bh = bh; + et->et_root_journal_access = access; if (!obj) obj = (void *)bh->b_data; et->et_object = obj; @@ -324,23 +318,23 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, struct inode *inode, struct buffer_head *bh) { - __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops); + __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_di, + NULL, &ocfs2_dinode_et_ops); } void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, struct inode *inode, struct buffer_head *bh) { - __ocfs2_init_extent_tree(et, inode, bh, NULL, - &ocfs2_xattr_tree_et_ops); + __ocfs2_init_extent_tree(et, inode, bh, ocfs2_journal_access_xb, + NULL, &ocfs2_xattr_tree_et_ops); } void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, struct inode *inode, - struct buffer_head *bh, - struct ocfs2_xattr_value_root *xv) + struct ocfs2_xattr_value_buf *vb) { - __ocfs2_init_extent_tree(et, inode, bh, xv, + __ocfs2_init_extent_tree(et, inode, vb->vb_bh, vb->vb_access, vb, &ocfs2_xattr_value_et_ops); } @@ -362,6 +356,15 @@ static inline void ocfs2_et_update_clusters(struct inode *inode, et->et_ops->eo_update_clusters(inode, et, clusters); } +static inline int ocfs2_et_root_journal_access(handle_t *handle, + struct inode *inode, + struct ocfs2_extent_tree *et, + int type) +{ + return et->et_root_journal_access(handle, inode, et->et_root_bh, + type); +} + static inline int ocfs2_et_insert_check(struct inode *inode, struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec) @@ -402,12 +405,14 @@ struct ocfs2_path_item { #define OCFS2_MAX_PATH_DEPTH 5 struct ocfs2_path { - int p_tree_depth; - struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; + int p_tree_depth; + ocfs2_journal_access_func p_root_access; + struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; }; #define path_root_bh(_path) ((_path)->p_node[0].bh) #define path_root_el(_path) ((_path)->p_node[0].el) +#define path_root_access(_path)((_path)->p_root_access) #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) #define path_num_items(_path) ((_path)->p_tree_depth + 1) @@ -440,6 +445,8 @@ static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) */ if (keep_root) depth = le16_to_cpu(path_root_el(path)->l_tree_depth); + else + path_root_access(path) = NULL; path->p_tree_depth = depth; } @@ -465,6 +472,7 @@ static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src) BUG_ON(path_root_bh(dest) != path_root_bh(src)); BUG_ON(path_root_el(dest) != path_root_el(src)); + BUG_ON(path_root_access(dest) != path_root_access(src)); ocfs2_reinit_path(dest, 1); @@ -486,6 +494,7 @@ static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) int i; BUG_ON(path_root_bh(dest) != path_root_bh(src)); + BUG_ON(path_root_access(dest) != path_root_access(src)); for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { brelse(dest->p_node[i].bh); @@ -521,7 +530,8 @@ static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, } static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, - struct ocfs2_extent_list *root_el) + struct ocfs2_extent_list *root_el, + ocfs2_journal_access_func access) { struct ocfs2_path *path; @@ -533,11 +543,48 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, get_bh(root_bh); path_root_bh(path) = root_bh; path_root_el(path) = root_el; + path_root_access(path) = access; } return path; } +static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path) +{ + return ocfs2_new_path(path_root_bh(path), path_root_el(path), + path_root_access(path)); +} + +static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) +{ + return ocfs2_new_path(et->et_root_bh, et->et_root_el, + et->et_root_journal_access); +} + +/* + * Journal the buffer at depth idx. All idx>0 are extent_blocks, + * otherwise it's the root_access function. + * + * I don't like the way this function's name looks next to + * ocfs2_journal_access_path(), but I don't have a better one. + */ +static int ocfs2_path_bh_journal_access(handle_t *handle, + struct inode *inode, + struct ocfs2_path *path, + int idx) +{ + ocfs2_journal_access_func access = path_root_access(path); + + if (!access) + access = ocfs2_journal_access; + + if (idx) + access = ocfs2_journal_access_eb; + + return access(handle, inode, path->p_node[idx].bh, + OCFS2_JOURNAL_ACCESS_WRITE); +} + /* * Convenience function to journal all components in a path. */ @@ -550,8 +597,7 @@ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, goto out; for(i = 0; i < path_num_items(path); i++) { - ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, path, i); if (ret < 0) { mlog_errno(ret); goto out; @@ -686,6 +732,80 @@ struct ocfs2_merge_ctxt { int c_split_covers_rec; }; +static int ocfs2_validate_extent_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_extent_block *eb = + (struct ocfs2_extent_block *)bh->b_data; + + mlog(0, "Validating extent block %llu\n", + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check); + if (rc) { + mlog(ML_ERROR, "Checksum failed for extent block %llu\n", + (unsigned long long)bh->b_blocknr); + return rc; + } + + /* + * Errors after here are fatal. + */ + + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + ocfs2_error(sb, + "Extent block #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + eb->h_signature); + return -EINVAL; + } + + if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { + ocfs2_error(sb, + "Extent block #%llu has an invalid h_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(eb->h_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Extent block #%llu has an invalid " + "h_fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(eb->h_fs_generation)); + return -EINVAL; + } + + return 0; +} + +int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, + struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(inode, eb_blkno, &tmp, + ocfs2_validate_extent_block); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + + /* * How many free extents have we got before we need more meta data? */ @@ -705,8 +825,7 @@ int ocfs2_num_free_extents(struct ocfs2_super *osb, last_eb_blk = ocfs2_et_get_last_eb_blk(et); if (last_eb_blk) { - retval = ocfs2_read_block(inode, last_eb_blk, - &eb_bh); + retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh); if (retval < 0) { mlog_errno(retval); goto bail; @@ -768,8 +887,8 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, } ocfs2_set_new_buffer_uptodate(inode, bhs[i]); - status = ocfs2_journal_access(handle, inode, bhs[i], - OCFS2_JOURNAL_ACCESS_CREATE); + status = ocfs2_journal_access_eb(handle, inode, bhs[i], + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; @@ -908,15 +1027,12 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, for(i = 0; i < new_blocks; i++) { bh = new_eb_bhs[i]; eb = (struct ocfs2_extent_block *) bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - status = -EIO; - goto bail; - } + /* ocfs2_create_new_meta_bhs() should create it right! */ + BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); eb_el = &eb->h_list; - status = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_CREATE); + status = ocfs2_journal_access_eb(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; @@ -955,21 +1071,21 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, * journal_dirty erroring as it won't unless we've aborted the * handle (in which case we would never be here) so reserving * the write with journal_access is all we need to do. */ - status = ocfs2_journal_access(handle, inode, *last_eb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_eb(handle, inode, *last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; } - status = ocfs2_journal_access(handle, inode, et->et_root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_et_root_journal_access(handle, inode, et, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; } if (eb_bh) { - status = ocfs2_journal_access(handle, inode, eb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_eb(handle, inode, eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -1052,17 +1168,14 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, } eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - status = -EIO; - goto bail; - } + /* ocfs2_create_new_meta_bhs() should create it right! */ + BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); eb_el = &eb->h_list; root_el = et->et_root_el; - status = ocfs2_journal_access(handle, inode, new_eb_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + status = ocfs2_journal_access_eb(handle, inode, new_eb_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; @@ -1080,8 +1193,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, goto bail; } - status = ocfs2_journal_access(handle, inode, et->et_root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_et_root_journal_access(handle, inode, et, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -1176,18 +1289,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, brelse(bh); bh = NULL; - status = ocfs2_read_block(inode, blkno, &bh); + status = ocfs2_read_extent_block(inode, blkno, &bh); if (status < 0) { mlog_errno(status); goto bail; } eb = (struct ocfs2_extent_block *) bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - status = -EIO; - goto bail; - } el = &eb->h_list; if (le16_to_cpu(el->l_next_free_rec) < @@ -1540,7 +1648,7 @@ static int __ocfs2_find_path(struct inode *inode, brelse(bh); bh = NULL; - ret = ocfs2_read_block(inode, blkno, &bh); + ret = ocfs2_read_extent_block(inode, blkno, &bh); if (ret) { mlog_errno(ret); goto out; @@ -1548,11 +1656,6 @@ static int __ocfs2_find_path(struct inode *inode, eb = (struct ocfs2_extent_block *) bh->b_data; el = &eb->h_list; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - ret = -EIO; - goto out; - } if (le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)) { @@ -1860,25 +1963,23 @@ static int ocfs2_rotate_subtree_right(struct inode *inode, root_bh = left_path->p_node[subtree_index].bh; BUG_ON(root_bh != right_path->p_node[subtree_index].bh); - ret = ocfs2_journal_access(handle, inode, root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + subtree_index); if (ret) { mlog_errno(ret); goto out; } for(i = subtree_index + 1; i < path_num_items(right_path); i++) { - ret = ocfs2_journal_access(handle, inode, - right_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, + right_path, i); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, - left_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, + left_path, i); if (ret) { mlog_errno(ret); goto out; @@ -2102,8 +2203,7 @@ static int ocfs2_rotate_tree_right(struct inode *inode, *ret_left_path = NULL; - left_path = ocfs2_new_path(path_root_bh(right_path), - path_root_el(right_path)); + left_path = ocfs2_new_path_from_path(right_path); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -2398,9 +2498,9 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, return -EAGAIN; if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { - ret = ocfs2_journal_access(handle, inode, - path_leaf_bh(right_path), - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_eb(handle, inode, + path_leaf_bh(right_path), + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -2417,8 +2517,8 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, * We have to update i_last_eb_blk during the meta * data delete. */ - ret = ocfs2_journal_access(handle, inode, et_root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_et_root_journal_access(handle, inode, et, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -2433,25 +2533,23 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, */ BUG_ON(right_has_empty && !del_right_subtree); - ret = ocfs2_journal_access(handle, inode, root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + subtree_index); if (ret) { mlog_errno(ret); goto out; } for(i = subtree_index + 1; i < path_num_items(right_path); i++) { - ret = ocfs2_journal_access(handle, inode, - right_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, + right_path, i); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, - left_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, + left_path, i); if (ret) { mlog_errno(ret); goto out; @@ -2596,16 +2694,17 @@ out: static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, handle_t *handle, - struct buffer_head *bh, - struct ocfs2_extent_list *el) + struct ocfs2_path *path) { int ret; + struct buffer_head *bh = path_leaf_bh(path); + struct ocfs2_extent_list *el = path_leaf_el(path); if (!ocfs2_is_empty_extent(&el->l_recs[0])) return 0; - ret = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, path, + path_num_items(path) - 1); if (ret) { mlog_errno(ret); goto out; @@ -2644,8 +2743,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, goto out; } - left_path = ocfs2_new_path(path_root_bh(path), - path_root_el(path)); + left_path = ocfs2_new_path_from_path(path); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -2654,8 +2752,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, ocfs2_cp_path(left_path, path); - right_path = ocfs2_new_path(path_root_bh(path), - path_root_el(path)); + right_path = ocfs2_new_path_from_path(path); if (!right_path) { ret = -ENOMEM; mlog_errno(ret); @@ -2689,9 +2786,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, * Caller might still want to make changes to the * tree root, so re-add it to the journal here. */ - ret = ocfs2_journal_access(handle, inode, - path_root_bh(left_path), - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, + left_path, 0); if (ret) { mlog_errno(ret); goto out; @@ -2785,8 +2881,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, * We have a path to the left of this one - it needs * an update too. */ - left_path = ocfs2_new_path(path_root_bh(path), - path_root_el(path)); + left_path = ocfs2_new_path_from_path(path); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -2875,8 +2970,7 @@ rightmost_no_delete: * it up front. */ ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, - path_leaf_bh(path), - path_leaf_el(path)); + path); if (ret) mlog_errno(ret); goto out; @@ -3027,8 +3121,7 @@ static int ocfs2_get_right_path(struct inode *inode, /* This function shouldn't be called for the rightmost leaf. */ BUG_ON(right_cpos == 0); - right_path = ocfs2_new_path(path_root_bh(left_path), - path_root_el(left_path)); + right_path = ocfs2_new_path_from_path(left_path); if (!right_path) { ret = -ENOMEM; mlog_errno(ret); @@ -3111,8 +3204,8 @@ static int ocfs2_merge_rec_right(struct inode *inode, root_bh = left_path->p_node[subtree_index].bh; BUG_ON(root_bh != right_path->p_node[subtree_index].bh); - ret = ocfs2_journal_access(handle, inode, root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + subtree_index); if (ret) { mlog_errno(ret); goto out; @@ -3120,17 +3213,15 @@ static int ocfs2_merge_rec_right(struct inode *inode, for (i = subtree_index + 1; i < path_num_items(right_path); i++) { - ret = ocfs2_journal_access(handle, inode, - right_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, + right_path, i); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, - left_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, + left_path, i); if (ret) { mlog_errno(ret); goto out; @@ -3142,8 +3233,8 @@ static int ocfs2_merge_rec_right(struct inode *inode, right_rec = &el->l_recs[index + 1]; } - ret = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, left_path, + path_num_items(left_path) - 1); if (ret) { mlog_errno(ret); goto out; @@ -3199,8 +3290,7 @@ static int ocfs2_get_left_path(struct inode *inode, /* This function shouldn't be called for the leftmost leaf. */ BUG_ON(left_cpos == 0); - left_path = ocfs2_new_path(path_root_bh(right_path), - path_root_el(right_path)); + left_path = ocfs2_new_path_from_path(right_path); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -3283,8 +3373,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, root_bh = left_path->p_node[subtree_index].bh; BUG_ON(root_bh != right_path->p_node[subtree_index].bh); - ret = ocfs2_journal_access(handle, inode, root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + subtree_index); if (ret) { mlog_errno(ret); goto out; @@ -3292,17 +3382,15 @@ static int ocfs2_merge_rec_left(struct inode *inode, for (i = subtree_index + 1; i < path_num_items(right_path); i++) { - ret = ocfs2_journal_access(handle, inode, - right_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, + right_path, i); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, - left_path->p_node[i].bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, + left_path, i); if (ret) { mlog_errno(ret); goto out; @@ -3314,8 +3402,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, has_empty_extent = 1; } - ret = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_path_bh_journal_access(handle, inode, right_path, + path_num_items(right_path) - 1); if (ret) { mlog_errno(ret); goto out; @@ -3732,8 +3820,7 @@ static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, * leftmost leaf. */ if (left_cpos) { - left_path = ocfs2_new_path(path_root_bh(right_path), - path_root_el(right_path)); + left_path = ocfs2_new_path_from_path(right_path); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -3781,7 +3868,7 @@ static void ocfs2_split_record(struct inode *inode, struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; struct ocfs2_extent_rec *rec, *tmprec; - right_el = path_leaf_el(right_path);; + right_el = path_leaf_el(right_path); if (left_path) left_el = path_leaf_el(left_path); @@ -3958,8 +4045,8 @@ static int ocfs2_do_insert_extent(struct inode *inode, el = et->et_root_el; - ret = ocfs2_journal_access(handle, inode, et->et_root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_et_root_journal_access(handle, inode, et, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -3970,7 +4057,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, goto out_update_clusters; } - right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); + right_path = ocfs2_new_path_from_et(et); if (!right_path) { ret = -ENOMEM; mlog_errno(ret); @@ -4020,8 +4107,8 @@ static int ocfs2_do_insert_extent(struct inode *inode, * ocfs2_rotate_tree_right() might have extended the * transaction without re-journaling our tree root. */ - ret = ocfs2_journal_access(handle, inode, et->et_root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_et_root_journal_access(handle, inode, et, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -4082,8 +4169,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, goto out; if (left_cpos != 0) { - left_path = ocfs2_new_path(path_root_bh(path), - path_root_el(path)); + left_path = ocfs2_new_path_from_path(path); if (!left_path) goto out; @@ -4097,8 +4183,15 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, le16_to_cpu(new_el->l_count)) { bh = path_leaf_bh(left_path); eb = (struct ocfs2_extent_block *)bh->b_data; - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, - eb); + ocfs2_error(inode->i_sb, + "Extent block #%llu has an " + "invalid l_next_free_rec of " + "%d. It should have " + "matched the l_count of %d", + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(new_el->l_next_free_rec), + le16_to_cpu(new_el->l_count)); + status = -EINVAL; goto out; } rec = &new_el->l_recs[ @@ -4132,8 +4225,7 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, if (right_cpos == 0) goto out; - right_path = ocfs2_new_path(path_root_bh(path), - path_root_el(path)); + right_path = ocfs2_new_path_from_path(path); if (!right_path) goto out; @@ -4147,8 +4239,12 @@ ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { bh = path_leaf_bh(right_path); eb = (struct ocfs2_extent_block *)bh->b_data; - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, - eb); + ocfs2_error(inode->i_sb, + "Extent block #%llu has an " + "invalid l_next_free_rec of %d", + (unsigned long long)le64_to_cpu(eb->h_blkno), + le16_to_cpu(new_el->l_next_free_rec)); + status = -EINVAL; goto out; } rec = &new_el->l_recs[1]; @@ -4294,7 +4390,9 @@ static int ocfs2_figure_insert_type(struct inode *inode, * ocfs2_figure_insert_type() and ocfs2_add_branch() * may want it later. */ - ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh); + ret = ocfs2_read_extent_block(inode, + ocfs2_et_get_last_eb_blk(et), + &bh); if (ret) { mlog_exit(ret); goto out; @@ -4320,7 +4418,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, return 0; } - path = ocfs2_new_path(et->et_root_bh, et->et_root_el); + path = ocfs2_new_path_from_et(et); if (!path) { ret = -ENOMEM; mlog_errno(ret); @@ -4531,9 +4629,9 @@ int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, BUG_ON(num_bits > clusters_to_add); - /* reserve our write early -- insert_extent may update the inode */ - status = ocfs2_journal_access(handle, inode, et->et_root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + /* reserve our write early -- insert_extent may update the tree root */ + status = ocfs2_et_root_journal_access(handle, inode, et, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; @@ -4760,20 +4858,15 @@ static int __ocfs2_mark_extent_written(struct inode *inode, if (path->p_tree_depth) { struct ocfs2_extent_block *eb; - ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), - &last_eb_bh); + ret = ocfs2_read_extent_block(inode, + ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); if (ret) { mlog_exit(ret); goto out; } eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - ret = -EROFS; - goto out; - } - rightmost_el = &eb->h_list; } else rightmost_el = path_root_el(path); @@ -4854,7 +4947,7 @@ int ocfs2_mark_extent_written(struct inode *inode, if (et->et_ops == &ocfs2_dinode_et_ops) ocfs2_extent_map_trunc(inode, 0); - left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); + left_path = ocfs2_new_path_from_et(et); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -4918,8 +5011,9 @@ static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, depth = path->p_tree_depth; if (depth > 0) { - ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), - &last_eb_bh); + ret = ocfs2_read_extent_block(inode, + ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -5025,8 +5119,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, } if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) { - left_path = ocfs2_new_path(path_root_bh(path), - path_root_el(path)); + left_path = ocfs2_new_path_from_path(path); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -5135,7 +5228,7 @@ int ocfs2_remove_extent(struct inode *inode, ocfs2_extent_map_trunc(inode, 0); - path = ocfs2_new_path(et->et_root_bh, et->et_root_el); + path = ocfs2_new_path_from_et(et); if (!path) { ret = -ENOMEM; mlog_errno(ret); @@ -5255,6 +5348,78 @@ out: return ret; } +int ocfs2_remove_btree_range(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 cpos, u32 phys_cpos, u32 len, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + struct ocfs2_alloc_context *meta_ac = NULL; + + ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_et_root_journal_access(handle, inode, et, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac, + dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ocfs2_et_update_clusters(inode, et, -len); + + ret = ocfs2_journal_dirty(handle, et->et_root_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + mutex_unlock(&tl_inode->i_mutex); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return ret; +} + int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) { struct buffer_head *tl_bh = osb->osb_tl_bh; @@ -5308,13 +5473,13 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); di = (struct ocfs2_dinode *) tl_bh->b_data; - tl = &di->id2.i_dealloc; - if (!OCFS2_IS_VALID_DINODE(di)) { - OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); - status = -EIO; - goto bail; - } + /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated + * by the underlying call to ocfs2_read_inode_block(), so any + * corruption is a code bug */ + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + tl = &di->id2.i_dealloc; tl_count = le16_to_cpu(tl->tl_count); mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || tl_count == 0, @@ -5332,8 +5497,8 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, goto bail; } - status = ocfs2_journal_access(handle, tl_inode, tl_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, tl_inode, tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -5394,8 +5559,8 @@ static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, while (i >= 0) { /* Caller has given us at least enough credits to * update the truncate log dinode */ - status = ocfs2_journal_access(handle, tl_inode, tl_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, tl_inode, tl_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -5464,13 +5629,13 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) BUG_ON(mutex_trylock(&tl_inode->i_mutex)); di = (struct ocfs2_dinode *) tl_bh->b_data; - tl = &di->id2.i_dealloc; - if (!OCFS2_IS_VALID_DINODE(di)) { - OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); - status = -EIO; - goto out; - } + /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated + * by the underlying call to ocfs2_read_inode_block(), so any + * corruption is a code bug */ + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + tl = &di->id2.i_dealloc; num_to_flush = le16_to_cpu(tl->tl_used); mlog(0, "Flush %u records from truncate log #%llu\n", num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); @@ -5586,7 +5751,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, goto bail; } - status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); + status = ocfs2_read_inode_block(inode, &bh); if (status < 0) { iput(inode); mlog_errno(status); @@ -5625,13 +5790,13 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, } di = (struct ocfs2_dinode *) tl_bh->b_data; - tl = &di->id2.i_dealloc; - if (!OCFS2_IS_VALID_DINODE(di)) { - OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di); - status = -EIO; - goto bail; - } + /* tl_bh is loaded from ocfs2_get_truncate_log_info(). It's + * validated by the underlying call to ocfs2_read_inode_block(), + * so any corruption is a code bug */ + BUG_ON(!OCFS2_IS_VALID_DINODE(di)); + + tl = &di->id2.i_dealloc; if (le16_to_cpu(tl->tl_used)) { mlog(0, "We'll have %u logs to recover\n", le16_to_cpu(tl->tl_used)); @@ -5651,6 +5816,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, * tl_used. */ tl->tl_used = 0; + ocfs2_compute_meta_ecc(osb->sb, tl_bh->b_data, &di->i_check); status = ocfs2_write_block(osb, tl_bh, tl_inode); if (status < 0) { mlog_errno(status); @@ -5800,7 +5966,10 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb) */ /* - * Describes a single block free from a suballocator + * Describe a single bit freed from a suballocator. For the block + * suballocators, it represents one block. For the global cluster + * allocator, it represents some clusters and free_bit indicates + * clusters number. */ struct ocfs2_cached_block_free { struct ocfs2_cached_block_free *free_next; @@ -5815,10 +5984,10 @@ struct ocfs2_per_slot_free_list { struct ocfs2_cached_block_free *f_first; }; -static int ocfs2_free_cached_items(struct ocfs2_super *osb, - int sysfile_type, - int slot, - struct ocfs2_cached_block_free *head) +static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, + int sysfile_type, + int slot, + struct ocfs2_cached_block_free *head) { int ret; u64 bg_blkno; @@ -5893,6 +6062,82 @@ out: return ret; } +int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + u64 blkno, unsigned int bit) +{ + int ret = 0; + struct ocfs2_cached_block_free *item; + + item = kmalloc(sizeof(*item), GFP_NOFS); + if (item == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + return ret; + } + + mlog(0, "Insert clusters: (bit %u, blk %llu)\n", + bit, (unsigned long long)blkno); + + item->free_blk = blkno; + item->free_bit = bit; + item->free_next = ctxt->c_global_allocator; + + ctxt->c_global_allocator = item; + return ret; +} + +static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, + struct ocfs2_cached_block_free *head) +{ + struct ocfs2_cached_block_free *tmp; + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + int ret = 0; + + mutex_lock(&tl_inode->i_mutex); + + while (head) { + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + break; + } + + ret = ocfs2_truncate_log_append(osb, handle, head->free_blk, + head->free_bit); + + ocfs2_commit_trans(osb, handle); + tmp = head; + head = head->free_next; + kfree(tmp); + + if (ret < 0) { + mlog_errno(ret); + break; + } + } + + mutex_unlock(&tl_inode->i_mutex); + + while (head) { + /* Premature exit may have left some dangling items. */ + tmp = head; + head = head->free_next; + kfree(tmp); + } + + return ret; +} + int ocfs2_run_deallocs(struct ocfs2_super *osb, struct ocfs2_cached_dealloc_ctxt *ctxt) { @@ -5908,8 +6153,10 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb, if (fl->f_first) { mlog(0, "Free items: (type %u, slot %d)\n", fl->f_inode_type, fl->f_slot); - ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type, - fl->f_slot, fl->f_first); + ret2 = ocfs2_free_cached_blocks(osb, + fl->f_inode_type, + fl->f_slot, + fl->f_first); if (ret2) mlog_errno(ret2); if (!ret) @@ -5920,6 +6167,17 @@ int ocfs2_run_deallocs(struct ocfs2_super *osb, kfree(fl); } + if (ctxt->c_global_allocator) { + ret2 = ocfs2_free_cached_clusters(osb, + ctxt->c_global_allocator); + if (ret2) + mlog_errno(ret2); + if (!ret) + ret = ret2; + + ctxt->c_global_allocator = NULL; + } + return ret; } @@ -6075,11 +6333,10 @@ static int ocfs2_find_new_last_ext_blk(struct inode *inode, eb = (struct ocfs2_extent_block *) bh->b_data; el = &eb->h_list; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - ret = -EROFS; - goto out; - } + + /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block(). + * Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); *new_last_eb = bh; get_bh(*new_last_eb); @@ -6326,8 +6583,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, } if (last_eb_bh) { - status = ocfs2_journal_access(handle, inode, last_eb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_eb(handle, inode, last_eb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -6350,6 +6607,8 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, goto bail; } + vfs_dq_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_del)); spin_lock(&OCFS2_I(inode)->ip_lock); OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - clusters_to_del; @@ -6436,11 +6695,6 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, mlog_errno(ret); else if (ocfs2_should_order_data(inode)) { ret = ocfs2_jbd2_file_inode(handle, inode); -#ifdef CONFIG_OCFS2_COMPAT_JBD - ret = walk_page_buffers(handle, page_buffers(page), - from, to, &partial, - ocfs2_journal_dirty_data); -#endif if (ret < 0) mlog_errno(ret); } @@ -6663,6 +6917,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct page **pages = NULL; loff_t end = osb->s_clustersize; struct ocfs2_extent_tree et; + int did_quota = 0; has_data = i_size_read(inode) ? 1 : 0; @@ -6682,15 +6937,16 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } } - handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS); + handle = ocfs2_start_trans(osb, + ocfs2_inline_to_extents_credits(osb->sb)); if (IS_ERR(handle)) { ret = PTR_ERR(handle); mlog_errno(ret); goto out_unlock; } - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; @@ -6701,6 +6957,13 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, unsigned int page_end; u64 phys; + if (vfs_dq_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1))) { + ret = -EDQUOT; + goto out_commit; + } + did_quota = 1; + ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &num); if (ret) { @@ -6774,6 +7037,10 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, } out_commit: + if (ret < 0 && did_quota) + vfs_dq_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + ocfs2_commit_trans(osb, handle); out_unlock: @@ -6813,7 +7080,8 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, i_size_read(inode)); - path = ocfs2_new_path(fe_bh, &di->id2.i_list); + path = ocfs2_new_path(fe_bh, &di->id2.i_list, + ocfs2_journal_access_di); if (!path) { status = -ENOMEM; mlog_errno(status); @@ -6984,20 +7252,14 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); if (fe->id2.i_list.l_tree_depth) { - status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk), - &last_eb_bh); + status = ocfs2_read_extent_block(inode, + le64_to_cpu(fe->i_last_eb_blk), + &last_eb_bh); if (status < 0) { mlog_errno(status); goto bail; } eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - - brelse(last_eb_bh); - status = -EIO; - goto bail; - } } (*tc)->tc_last_eb_bh = last_eb_bh; @@ -7052,8 +7314,8 @@ int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, goto out; } - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 70257c84cfb..cceff5c37f4 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -45,7 +45,9 @@ * * ocfs2_extent_tree contains info for the root of the b-tree, it must have a * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree - * functions. + * functions. With metadata ecc, we now call different journal_access + * functions for each type of metadata, so it must have the + * root_journal_access function. * ocfs2_extent_tree_operations abstract the normal operations we do for * the root of extent b-tree. */ @@ -54,6 +56,7 @@ struct ocfs2_extent_tree { struct ocfs2_extent_tree_operations *et_ops; struct buffer_head *et_root_bh; struct ocfs2_extent_list *et_root_el; + ocfs2_journal_access_func et_root_journal_access; void *et_object; unsigned int et_max_leaf_clusters; }; @@ -68,10 +71,18 @@ void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, struct inode *inode, struct buffer_head *bh); +struct ocfs2_xattr_value_buf; void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, struct inode *inode, - struct buffer_head *bh, - struct ocfs2_xattr_value_root *xv); + struct ocfs2_xattr_value_buf *vb); + +/* + * Read an extent block into *bh. If *bh is NULL, a bh will be + * allocated. This is a cached read. The extent block will be validated + * with ocfs2_validate_extent_block(). + */ +int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, + struct buffer_head **bh); struct ocfs2_alloc_context; int ocfs2_insert_extent(struct ocfs2_super *osb, @@ -110,6 +121,11 @@ int ocfs2_remove_extent(struct inode *inode, u32 cpos, u32 len, handle_t *handle, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc); +int ocfs2_remove_btree_range(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 cpos, u32 phys_cpos, u32 len, + struct ocfs2_cached_dealloc_ctxt *dealloc); + int ocfs2_num_free_extents(struct ocfs2_super *osb, struct inode *inode, struct ocfs2_extent_tree *et); @@ -167,10 +183,18 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb); */ struct ocfs2_cached_dealloc_ctxt { struct ocfs2_per_slot_free_list *c_first_suballocator; + struct ocfs2_cached_block_free *c_global_allocator; }; static inline void ocfs2_init_dealloc_ctxt(struct ocfs2_cached_dealloc_ctxt *c) { c->c_first_suballocator = NULL; + c->c_global_allocator = NULL; +} +int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, + u64 blkno, unsigned int bit); +static inline int ocfs2_dealloc_has_cluster(struct ocfs2_cached_dealloc_ctxt *c) +{ + return c->c_global_allocator != NULL; } int ocfs2_run_deallocs(struct ocfs2_super *osb, struct ocfs2_cached_dealloc_ctxt *ctxt); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c22543b3342..a067a6cffb0 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -27,6 +27,7 @@ #include <linux/swap.h> #include <linux/pipe_fs_i.h> #include <linux/mpage.h> +#include <linux/quotaops.h> #define MLOG_MASK_PREFIX ML_FILE_IO #include <cluster/masklog.h> @@ -68,20 +69,13 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, goto bail; } - status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); + status = ocfs2_read_inode_block(inode, &bh); if (status < 0) { mlog_errno(status); goto bail; } fe = (struct ocfs2_dinode *) bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - mlog(ML_ERROR, "Invalid dinode #%llu: signature = %.*s\n", - (unsigned long long)le64_to_cpu(fe->i_blkno), 7, - fe->i_signature); - goto bail; - } - if ((u64)iblock >= ocfs2_clusters_to_blocks(inode->i_sb, le32_to_cpu(fe->i_clusters))) { mlog(ML_ERROR, "block offset is outside the allocated size: " @@ -262,7 +256,7 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page) BUG_ON(!PageLocked(page)); BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); - ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); + ret = ocfs2_read_inode_block(inode, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -481,12 +475,6 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, if (ocfs2_should_order_data(inode)) { ret = ocfs2_jbd2_file_inode(handle, inode); -#ifdef CONFIG_OCFS2_COMPAT_JBD - ret = walk_page_buffers(handle, - page_buffers(page), - from, to, NULL, - ocfs2_journal_dirty_data); -#endif if (ret < 0) mlog_errno(ret); } @@ -1072,15 +1060,8 @@ static void ocfs2_write_failure(struct inode *inode, tmppage = wc->w_pages[i]; if (page_has_buffers(tmppage)) { - if (ocfs2_should_order_data(inode)) { + if (ocfs2_should_order_data(inode)) ocfs2_jbd2_file_inode(wc->w_handle, inode); -#ifdef CONFIG_OCFS2_COMPAT_JBD - walk_page_buffers(wc->w_handle, - page_buffers(tmppage), - from, to, NULL, - ocfs2_journal_dirty_data); -#endif - } block_commit_write(tmppage, from, to); } @@ -1531,8 +1512,8 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, goto out; } - ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { ocfs2_commit_trans(osb, handle); @@ -1750,15 +1731,20 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, wc->w_handle = handle; + if (clusters_to_alloc && vfs_dq_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc))) { + ret = -EDQUOT; + goto out_commit; + } /* * We don't want this to fail in ocfs2_write_end(), so do it * here. */ - ret = ocfs2_journal_access(handle, inode, wc->w_di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, wc->w_di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto out_commit; + goto out_quota; } /* @@ -1771,14 +1757,14 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, mmap_page); if (ret) { mlog_errno(ret); - goto out_commit; + goto out_quota; } ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, len); if (ret) { mlog_errno(ret); - goto out_commit; + goto out_quota; } if (data_ac) @@ -1790,6 +1776,10 @@ success: *pagep = wc->w_target_page; *fsdata = wc; return 0; +out_quota: + if (clusters_to_alloc) + vfs_dq_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_alloc)); out_commit: ocfs2_commit_trans(osb, handle); @@ -1919,15 +1909,8 @@ int ocfs2_write_end_nolock(struct address_space *mapping, } if (page_has_buffers(tmppage)) { - if (ocfs2_should_order_data(inode)) { + if (ocfs2_should_order_data(inode)) ocfs2_jbd2_file_inode(wc->w_handle, inode); -#ifdef CONFIG_OCFS2_COMPAT_JBD - walk_page_buffers(wc->w_handle, - page_buffers(tmppage), - from, to, NULL, - ocfs2_journal_dirty_data); -#endif - } block_commit_write(tmppage, from, to); } } diff --git a/fs/ocfs2/blockcheck.c b/fs/ocfs2/blockcheck.c new file mode 100644 index 00000000000..2a947c44e59 --- /dev/null +++ b/fs/ocfs2/blockcheck.c @@ -0,0 +1,477 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * blockcheck.c + * + * Checksum and ECC codes for the OCFS2 userspace library. + * + * Copyright (C) 2006, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/crc32.h> +#include <linux/buffer_head.h> +#include <linux/bitops.h> +#include <asm/byteorder.h> + +#include <cluster/masklog.h> + +#include "ocfs2.h" + +#include "blockcheck.h" + + +/* + * We use the following conventions: + * + * d = # data bits + * p = # parity bits + * c = # total code bits (d + p) + */ + + +/* + * Calculate the bit offset in the hamming code buffer based on the bit's + * offset in the data buffer. Since the hamming code reserves all + * power-of-two bits for parity, the data bit number and the code bit + * number are offest by all the parity bits beforehand. + * + * Recall that bit numbers in hamming code are 1-based. This function + * takes the 0-based data bit from the caller. + * + * An example. Take bit 1 of the data buffer. 1 is a power of two (2^0), + * so it's a parity bit. 2 is a power of two (2^1), so it's a parity bit. + * 3 is not a power of two. So bit 1 of the data buffer ends up as bit 3 + * in the code buffer. + * + * The caller can pass in *p if it wants to keep track of the most recent + * number of parity bits added. This allows the function to start the + * calculation at the last place. + */ +static unsigned int calc_code_bit(unsigned int i, unsigned int *p_cache) +{ + unsigned int b, p = 0; + + /* + * Data bits are 0-based, but we're talking code bits, which + * are 1-based. + */ + b = i + 1; + + /* Use the cache if it is there */ + if (p_cache) + p = *p_cache; + b += p; + + /* + * For every power of two below our bit number, bump our bit. + * + * We compare with (b + 1) because we have to compare with what b + * would be _if_ it were bumped up by the parity bit. Capice? + * + * p is set above. + */ + for (; (1 << p) < (b + 1); p++) + b++; + + if (p_cache) + *p_cache = p; + + return b; +} + +/* + * This is the low level encoder function. It can be called across + * multiple hunks just like the crc32 code. 'd' is the number of bits + * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had + * two 512B buffers, you would do it like so: + * + * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0); + * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8); + * + * If you just have one buffer, use ocfs2_hamming_encode_block(). + */ +u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, unsigned int nr) +{ + unsigned int i, b, p = 0; + + BUG_ON(!d); + + /* + * b is the hamming code bit number. Hamming code specifies a + * 1-based array, but C uses 0-based. So 'i' is for C, and 'b' is + * for the algorithm. + * + * The i++ in the for loop is so that the start offset passed + * to ocfs2_find_next_bit_set() is one greater than the previously + * found bit. + */ + for (i = 0; (i = ocfs2_find_next_bit(data, d, i)) < d; i++) + { + /* + * i is the offset in this hunk, nr + i is the total bit + * offset. + */ + b = calc_code_bit(nr + i, &p); + + /* + * Data bits in the resultant code are checked by + * parity bits that are part of the bit number + * representation. Huh? + * + * <wikipedia href="http://en.wikipedia.org/wiki/Hamming_code"> + * In other words, the parity bit at position 2^k + * checks bits in positions having bit k set in + * their binary representation. Conversely, for + * instance, bit 13, i.e. 1101(2), is checked by + * bits 1000(2) = 8, 0100(2)=4 and 0001(2) = 1. + * </wikipedia> + * + * Note that 'k' is the _code_ bit number. 'b' in + * our loop. + */ + parity ^= b; + } + + /* While the data buffer was treated as little endian, the + * return value is in host endian. */ + return parity; +} + +u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize) +{ + return ocfs2_hamming_encode(0, data, blocksize * 8, 0); +} + +/* + * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit + * offset of the current hunk. If bit to be fixed is not part of the + * current hunk, this does nothing. + * + * If you only have one hunk, use ocfs2_hamming_fix_block(). + */ +void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr, + unsigned int fix) +{ + unsigned int i, b; + + BUG_ON(!d); + + /* + * If the bit to fix has an hweight of 1, it's a parity bit. One + * busted parity bit is its own error. Nothing to do here. + */ + if (hweight32(fix) == 1) + return; + + /* + * nr + d is the bit right past the data hunk we're looking at. + * If fix after that, nothing to do + */ + if (fix >= calc_code_bit(nr + d, NULL)) + return; + + /* + * nr is the offset in the data hunk we're starting at. Let's + * start b at the offset in the code buffer. See hamming_encode() + * for a more detailed description of 'b'. + */ + b = calc_code_bit(nr, NULL); + /* If the fix is before this hunk, nothing to do */ + if (fix < b) + return; + + for (i = 0; i < d; i++, b++) + { + /* Skip past parity bits */ + while (hweight32(b) == 1) + b++; + + /* + * i is the offset in this data hunk. + * nr + i is the offset in the total data buffer. + * b is the offset in the total code buffer. + * + * Thus, when b == fix, bit i in the current hunk needs + * fixing. + */ + if (b == fix) + { + if (ocfs2_test_bit(i, data)) + ocfs2_clear_bit(i, data); + else + ocfs2_set_bit(i, data); + break; + } + } +} + +void ocfs2_hamming_fix_block(void *data, unsigned int blocksize, + unsigned int fix) +{ + ocfs2_hamming_fix(data, blocksize * 8, 0, fix); +} + +/* + * This function generates check information for a block. + * data is the block to be checked. bc is a pointer to the + * ocfs2_block_check structure describing the crc32 and the ecc. + * + * bc should be a pointer inside data, as the function will + * take care of zeroing it before calculating the check information. If + * bc does not point inside data, the caller must make sure any inline + * ocfs2_block_check structures are zeroed. + * + * The data buffer must be in on-disk endian (little endian for ocfs2). + * bc will be filled with little-endian values and will be ready to go to + * disk. + */ +void ocfs2_block_check_compute(void *data, size_t blocksize, + struct ocfs2_block_check *bc) +{ + u32 crc; + u32 ecc; + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + crc = crc32_le(~0, data, blocksize); + ecc = ocfs2_hamming_encode_block(data, blocksize); + + /* + * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no + * larger than 16 bits. + */ + BUG_ON(ecc > USHORT_MAX); + + bc->bc_crc32e = cpu_to_le32(crc); + bc->bc_ecc = cpu_to_le16((u16)ecc); +} + +/* + * This function validates existing check information. Like _compute, + * the function will take care of zeroing bc before calculating check codes. + * If bc is not a pointer inside data, the caller must have zeroed any + * inline ocfs2_block_check structures. + * + * Again, the data passed in should be the on-disk endian. + */ +int ocfs2_block_check_validate(void *data, size_t blocksize, + struct ocfs2_block_check *bc) +{ + int rc = 0; + struct ocfs2_block_check check; + u32 crc, ecc; + + check.bc_crc32e = le32_to_cpu(bc->bc_crc32e); + check.bc_ecc = le16_to_cpu(bc->bc_ecc); + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + /* Fast path - if the crc32 validates, we're good to go */ + crc = crc32_le(~0, data, blocksize); + if (crc == check.bc_crc32e) + goto out; + + mlog(ML_ERROR, + "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", + (unsigned int)check.bc_crc32e, (unsigned int)crc); + + /* Ok, try ECC fixups */ + ecc = ocfs2_hamming_encode_block(data, blocksize); + ocfs2_hamming_fix_block(data, blocksize, ecc ^ check.bc_ecc); + + /* And check the crc32 again */ + crc = crc32_le(~0, data, blocksize); + if (crc == check.bc_crc32e) + goto out; + + mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", + (unsigned int)check.bc_crc32e, (unsigned int)crc); + + rc = -EIO; + +out: + bc->bc_crc32e = cpu_to_le32(check.bc_crc32e); + bc->bc_ecc = cpu_to_le16(check.bc_ecc); + + return rc; +} + +/* + * This function generates check information for a list of buffer_heads. + * bhs is the blocks to be checked. bc is a pointer to the + * ocfs2_block_check structure describing the crc32 and the ecc. + * + * bc should be a pointer inside data, as the function will + * take care of zeroing it before calculating the check information. If + * bc does not point inside data, the caller must make sure any inline + * ocfs2_block_check structures are zeroed. + * + * The data buffer must be in on-disk endian (little endian for ocfs2). + * bc will be filled with little-endian values and will be ready to go to + * disk. + */ +void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc) +{ + int i; + u32 crc, ecc; + + BUG_ON(nr < 0); + + if (!nr) + return; + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + for (i = 0, crc = ~0, ecc = 0; i < nr; i++) { + crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); + /* + * The number of bits in a buffer is obviously b_size*8. + * The offset of this buffer is b_size*i, so the bit offset + * of this buffer is b_size*8*i. + */ + ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data, + bhs[i]->b_size * 8, + bhs[i]->b_size * 8 * i); + } + + /* + * No ecc'd ocfs2 structure is larger than 4K, so ecc will be no + * larger than 16 bits. + */ + BUG_ON(ecc > USHORT_MAX); + + bc->bc_crc32e = cpu_to_le32(crc); + bc->bc_ecc = cpu_to_le16((u16)ecc); +} + +/* + * This function validates existing check information on a list of + * buffer_heads. Like _compute_bhs, the function will take care of + * zeroing bc before calculating check codes. If bc is not a pointer + * inside data, the caller must have zeroed any inline + * ocfs2_block_check structures. + * + * Again, the data passed in should be the on-disk endian. + */ +int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc) +{ + int i, rc = 0; + struct ocfs2_block_check check; + u32 crc, ecc, fix; + + BUG_ON(nr < 0); + + if (!nr) + return 0; + + check.bc_crc32e = le32_to_cpu(bc->bc_crc32e); + check.bc_ecc = le16_to_cpu(bc->bc_ecc); + + memset(bc, 0, sizeof(struct ocfs2_block_check)); + + /* Fast path - if the crc32 validates, we're good to go */ + for (i = 0, crc = ~0; i < nr; i++) + crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); + if (crc == check.bc_crc32e) + goto out; + + mlog(ML_ERROR, + "CRC32 failed: stored: %u, computed %u. Applying ECC.\n", + (unsigned int)check.bc_crc32e, (unsigned int)crc); + + /* Ok, try ECC fixups */ + for (i = 0, ecc = 0; i < nr; i++) { + /* + * The number of bits in a buffer is obviously b_size*8. + * The offset of this buffer is b_size*i, so the bit offset + * of this buffer is b_size*8*i. + */ + ecc = (u16)ocfs2_hamming_encode(ecc, bhs[i]->b_data, + bhs[i]->b_size * 8, + bhs[i]->b_size * 8 * i); + } + fix = ecc ^ check.bc_ecc; + for (i = 0; i < nr; i++) { + /* + * Try the fix against each buffer. It will only affect + * one of them. + */ + ocfs2_hamming_fix(bhs[i]->b_data, bhs[i]->b_size * 8, + bhs[i]->b_size * 8 * i, fix); + } + + /* And check the crc32 again */ + for (i = 0, crc = ~0; i < nr; i++) + crc = crc32_le(crc, bhs[i]->b_data, bhs[i]->b_size); + if (crc == check.bc_crc32e) + goto out; + + mlog(ML_ERROR, "Fixed CRC32 failed: stored: %u, computed %u\n", + (unsigned int)check.bc_crc32e, (unsigned int)crc); + + rc = -EIO; + +out: + bc->bc_crc32e = cpu_to_le32(check.bc_crc32e); + bc->bc_ecc = cpu_to_le16(check.bc_ecc); + + return rc; +} + +/* + * These are the main API. They check the superblock flag before + * calling the underlying operations. + * + * They expect the buffer(s) to be in disk format. + */ +void ocfs2_compute_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc) +{ + if (ocfs2_meta_ecc(OCFS2_SB(sb))) + ocfs2_block_check_compute(data, sb->s_blocksize, bc); +} + +int ocfs2_validate_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc) +{ + int rc = 0; + + if (ocfs2_meta_ecc(OCFS2_SB(sb))) + rc = ocfs2_block_check_validate(data, sb->s_blocksize, bc); + + return rc; +} + +void ocfs2_compute_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc) +{ + if (ocfs2_meta_ecc(OCFS2_SB(sb))) + ocfs2_block_check_compute_bhs(bhs, nr, bc); +} + +int ocfs2_validate_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc) +{ + int rc = 0; + + if (ocfs2_meta_ecc(OCFS2_SB(sb))) + rc = ocfs2_block_check_validate_bhs(bhs, nr, bc); + + return rc; +} + diff --git a/fs/ocfs2/blockcheck.h b/fs/ocfs2/blockcheck.h new file mode 100644 index 00000000000..70ec3feda32 --- /dev/null +++ b/fs/ocfs2/blockcheck.h @@ -0,0 +1,82 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * blockcheck.h + * + * Checksum and ECC codes for the OCFS2 userspace library. + * + * Copyright (C) 2004, 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License, version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_BLOCKCHECK_H +#define OCFS2_BLOCKCHECK_H + + +/* High level block API */ +void ocfs2_compute_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc); +int ocfs2_validate_meta_ecc(struct super_block *sb, void *data, + struct ocfs2_block_check *bc); +void ocfs2_compute_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc); +int ocfs2_validate_meta_ecc_bhs(struct super_block *sb, + struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc); + +/* Lower level API */ +void ocfs2_block_check_compute(void *data, size_t blocksize, + struct ocfs2_block_check *bc); +int ocfs2_block_check_validate(void *data, size_t blocksize, + struct ocfs2_block_check *bc); +void ocfs2_block_check_compute_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc); +int ocfs2_block_check_validate_bhs(struct buffer_head **bhs, int nr, + struct ocfs2_block_check *bc); + +/* + * Hamming code functions + */ + +/* + * Encoding hamming code parity bits for a buffer. + * + * This is the low level encoder function. It can be called across + * multiple hunks just like the crc32 code. 'd' is the number of bits + * _in_this_hunk_. nr is the bit offset of this hunk. So, if you had + * two 512B buffers, you would do it like so: + * + * parity = ocfs2_hamming_encode(0, buf1, 512 * 8, 0); + * parity = ocfs2_hamming_encode(parity, buf2, 512 * 8, 512 * 8); + * + * If you just have one buffer, use ocfs2_hamming_encode_block(). + */ +u32 ocfs2_hamming_encode(u32 parity, void *data, unsigned int d, + unsigned int nr); +/* + * Fix a buffer with a bit error. The 'fix' is the original parity + * xor'd with the parity calculated now. + * + * Like ocfs2_hamming_encode(), this can handle hunks. nr is the bit + * offset of the current hunk. If bit to be fixed is not part of the + * current hunk, this does nothing. + * + * If you only have one buffer, use ocfs2_hamming_fix_block(). + */ +void ocfs2_hamming_fix(void *data, unsigned int d, unsigned int nr, + unsigned int fix); + +/* Convenience wrappers for a single buffer of data */ +extern u32 ocfs2_hamming_encode_block(void *data, unsigned int blocksize); +extern void ocfs2_hamming_fix_block(void *data, unsigned int blocksize, + unsigned int fix); +#endif diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 3a178ec48d7..15c8e6deee2 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -39,6 +39,18 @@ #include "buffer_head_io.h" +/* + * Bits on bh->b_state used by ocfs2. + * + * These MUST be after the JBD2 bits. Hence, we use BH_JBDPrivateStart. + */ +enum ocfs2_state_bits { + BH_NeedsValidate = BH_JBDPrivateStart, +}; + +/* Expand the magic b_state functions */ +BUFFER_FNS(NeedsValidate, needs_validate); + int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, struct inode *inode) { @@ -166,7 +178,9 @@ bail: } int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, - struct buffer_head *bhs[], int flags) + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) { int status = 0; int i, ignore_cache = 0; @@ -298,6 +312,8 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, clear_buffer_uptodate(bh); get_bh(bh); /* for end_buffer_read_sync() */ + if (validate) + set_buffer_needs_validate(bh); bh->b_end_io = end_buffer_read_sync; submit_bh(READ, bh); continue; @@ -328,6 +344,20 @@ int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, bhs[i] = NULL; continue; } + + if (buffer_needs_validate(bh)) { + /* We never set NeedsValidate if the + * buffer was held by the journal, so + * that better not have changed */ + BUG_ON(buffer_jbd(bh)); + clear_buffer_needs_validate(bh); + status = validate(inode->i_sb, bh); + if (status) { + put_bh(bh); + bhs[i] = NULL; + continue; + } + } } /* Always set the buffer in the cache, even if it was diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h index 75e1dcb1ade..c75d682dadd 100644 --- a/fs/ocfs2/buffer_head_io.h +++ b/fs/ocfs2/buffer_head_io.h @@ -31,21 +31,24 @@ void ocfs2_end_buffer_io_sync(struct buffer_head *bh, int uptodate); -static inline int ocfs2_read_block(struct inode *inode, - u64 off, - struct buffer_head **bh); - int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, struct inode *inode); -int ocfs2_read_blocks(struct inode *inode, - u64 block, - int nr, - struct buffer_head *bhs[], - int flags); int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, unsigned int nr, struct buffer_head *bhs[]); +/* + * If not NULL, validate() will be called on a buffer that is freshly + * read from disk. It will not be called if the buffer was in cache. + * Note that if validate() is being used for this buffer, it needs to + * be set even for a READAHEAD call, as it marks the buffer for later + * validation. + */ +int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)); + int ocfs2_write_super_or_backup(struct ocfs2_super *osb, struct buffer_head *bh); @@ -53,7 +56,9 @@ int ocfs2_write_super_or_backup(struct ocfs2_super *osb, #define OCFS2_BH_READAHEAD 8 static inline int ocfs2_read_block(struct inode *inode, u64 off, - struct buffer_head **bh) + struct buffer_head **bh, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) { int status = 0; @@ -63,7 +68,7 @@ static inline int ocfs2_read_block(struct inode *inode, u64 off, goto bail; } - status = ocfs2_read_blocks(inode, off, 1, bh, 0); + status = ocfs2_read_blocks(inode, off, 1, bh, 0, validate); bail: return status; diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 6ebaa58e2c0..04697ba7f73 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -854,7 +854,7 @@ static int o2hb_thread(void *data) while (!kthread_should_stop() && !reg->hr_unclean_stop) { /* We track the time spent inside - * o2hb_do_disk_heartbeat so that we avoid more then + * o2hb_do_disk_heartbeat so that we avoid more than * hr_timeout_ms between disk writes. On busy systems * this should result in a heartbeat which is less * likely to time itself out. */ diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index d8a0cb92cef..96df5416993 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -110,6 +110,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(QUORUM), define_mask(EXPORT), define_mask(XATTR), + define_mask(QUOTA), define_mask(ERROR), define_mask(NOTICE), define_mask(KTHREAD), diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 57670c68047..7e72a81bc2d 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -113,6 +113,7 @@ #define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */ #define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ #define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ +#define ML_QUOTA 0x0000000040000000ULL /* ocfs2 quota operations */ /* bits that are infrequently given and frequently matched in the high word */ #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 026e6eb8518..f2c4098cf33 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -40,6 +40,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> +#include <linux/quotaops.h> #define MLOG_MASK_PREFIX ML_NAMEI #include <cluster/masklog.h> @@ -47,6 +48,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "blockcheck.h" #include "dir.h" #include "dlmglue.h" #include "extent_map.h" @@ -82,47 +84,72 @@ static int ocfs2_do_extend_dir(struct super_block *sb, struct ocfs2_alloc_context *meta_ac, struct buffer_head **new_bh); -static struct buffer_head *ocfs2_bread(struct inode *inode, - int block, int *err, int reada) +/* + * These are distinct checks because future versions of the file system will + * want to have a trailing dirent structure independent of indexing. + */ +static int ocfs2_dir_has_trailer(struct inode *dir) { - struct buffer_head *bh = NULL; - int tmperr; - u64 p_blkno; - int readflags = 0; + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + return 0; - if (reada) - readflags |= OCFS2_BH_READAHEAD; + return ocfs2_meta_ecc(OCFS2_SB(dir->i_sb)); +} - if (((u64)block << inode->i_sb->s_blocksize_bits) >= - i_size_read(inode)) { - BUG_ON(!reada); - return NULL; - } +static int ocfs2_supports_dir_trailer(struct ocfs2_super *osb) +{ + return ocfs2_meta_ecc(osb); +} - down_read(&OCFS2_I(inode)->ip_alloc_sem); - tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, - NULL); - up_read(&OCFS2_I(inode)->ip_alloc_sem); - if (tmperr < 0) { - mlog_errno(tmperr); - goto fail; - } +static inline unsigned int ocfs2_dir_trailer_blk_off(struct super_block *sb) +{ + return sb->s_blocksize - sizeof(struct ocfs2_dir_block_trailer); +} - tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags); - if (tmperr < 0) - goto fail; +#define ocfs2_trailer_from_bh(_bh, _sb) ((struct ocfs2_dir_block_trailer *) ((_bh)->b_data + ocfs2_dir_trailer_blk_off((_sb)))) - tmperr = 0; +/* XXX ocfs2_block_dqtrailer() is similar but not quite - can we make + * them more consistent? */ +struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, + void *data) +{ + char *p = data; - *err = 0; - return bh; + p += blocksize - sizeof(struct ocfs2_dir_block_trailer); + return (struct ocfs2_dir_block_trailer *)p; +} -fail: - brelse(bh); - bh = NULL; +/* + * XXX: This is executed once on every dirent. We should consider optimizing + * it. + */ +static int ocfs2_skip_dir_trailer(struct inode *dir, + struct ocfs2_dir_entry *de, + unsigned long offset, + unsigned long blklen) +{ + unsigned long toff = blklen - sizeof(struct ocfs2_dir_block_trailer); - *err = -EIO; - return NULL; + if (!ocfs2_dir_has_trailer(dir)) + return 0; + + if (offset != toff) + return 0; + + return 1; +} + +static void ocfs2_init_dir_trailer(struct inode *inode, + struct buffer_head *bh) +{ + struct ocfs2_dir_block_trailer *trailer; + + trailer = ocfs2_trailer_from_bh(bh, inode->i_sb); + strcpy(trailer->db_signature, OCFS2_DIR_TRAILER_SIGNATURE); + trailer->db_compat_rec_len = + cpu_to_le16(sizeof(struct ocfs2_dir_block_trailer)); + trailer->db_parent_dinode = cpu_to_le64(OCFS2_I(inode)->ip_blkno); + trailer->db_blkno = cpu_to_le64(bh->b_blocknr); } /* @@ -231,7 +258,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name, struct ocfs2_dinode *di; struct ocfs2_inline_data *data; - ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh); + ret = ocfs2_read_inode_block(dir, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -250,6 +277,108 @@ out: return NULL; } +static int ocfs2_validate_dir_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_dir_block_trailer *trailer = + ocfs2_trailer_from_bh(bh, sb); + + + /* + * We don't validate dirents here, that's handled + * in-place when the code walks them. + */ + mlog(0, "Validating dirblock %llu\n", + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + * + * Note that we are safe to call this even if the directory + * doesn't have a trailer. Filesystems without metaecc will do + * nothing, and filesystems with it will have one. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &trailer->db_check); + if (rc) + mlog(ML_ERROR, "Checksum failed for dinode %llu\n", + (unsigned long long)bh->b_blocknr); + + return rc; +} + +/* + * This function forces all errors to -EIO for consistency with its + * predecessor, ocfs2_bread(). We haven't audited what returning the + * real error codes would do to callers. We log the real codes with + * mlog_errno() before we squash them. + */ +static int ocfs2_read_dir_block(struct inode *inode, u64 v_block, + struct buffer_head **bh, int flags) +{ + int rc = 0; + struct buffer_head *tmp = *bh; + struct ocfs2_dir_block_trailer *trailer; + + rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, flags, + ocfs2_validate_dir_block); + if (rc) { + mlog_errno(rc); + goto out; + } + + /* + * We check the trailer here rather than in + * ocfs2_validate_dir_block() because that function doesn't have + * the inode to test. + */ + if (!(flags & OCFS2_BH_READAHEAD) && + ocfs2_dir_has_trailer(inode)) { + trailer = ocfs2_trailer_from_bh(tmp, inode->i_sb); + if (!OCFS2_IS_VALID_DIR_TRAILER(trailer)) { + rc = -EINVAL; + ocfs2_error(inode->i_sb, + "Invalid dirblock #%llu: " + "signature = %.*s\n", + (unsigned long long)tmp->b_blocknr, 7, + trailer->db_signature); + goto out; + } + if (le64_to_cpu(trailer->db_blkno) != tmp->b_blocknr) { + rc = -EINVAL; + ocfs2_error(inode->i_sb, + "Directory block #%llu has an invalid " + "db_blkno of %llu", + (unsigned long long)tmp->b_blocknr, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); + goto out; + } + if (le64_to_cpu(trailer->db_parent_dinode) != + OCFS2_I(inode)->ip_blkno) { + rc = -EINVAL; + ocfs2_error(inode->i_sb, + "Directory block #%llu on dinode " + "#%llu has an invalid parent_dinode " + "of %llu", + (unsigned long long)tmp->b_blocknr, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)le64_to_cpu(trailer->db_blkno)); + goto out; + } + } + + /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */ + if (!*bh) + *bh = tmp; + +out: + return rc ? -EIO : 0; +} + static struct buffer_head *ocfs2_find_entry_el(const char *name, int namelen, struct inode *dir, struct ocfs2_dir_entry **res_dir) @@ -296,15 +425,17 @@ restart: } num++; - bh = ocfs2_bread(dir, b++, &err, 1); + bh = NULL; + err = ocfs2_read_dir_block(dir, b++, &bh, + OCFS2_BH_READAHEAD); bh_use[ra_max] = bh; } } if ((bh = bh_use[ra_ptr++]) == NULL) goto next; - if (ocfs2_read_block(dir, block, &bh)) { + if (ocfs2_read_dir_block(dir, block, &bh, 0)) { /* read error, skip block & hope for the best. - * ocfs2_read_block() has released the bh. */ + * ocfs2_read_dir_block() has released the bh. */ ocfs2_error(dir->i_sb, "reading directory %llu, " "offset %lu\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, @@ -381,14 +512,18 @@ int ocfs2_update_entry(struct inode *dir, handle_t *handle, struct inode *new_entry_inode) { int ret; + ocfs2_journal_access_func access = ocfs2_journal_access_db; /* * The same code works fine for both inline-data and extent - * based directories, so no need to split this up. + * based directories, so no need to split this up. The only + * difference is the journal_access function. */ - ret = ocfs2_journal_access(handle, dir, de_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + access = ocfs2_journal_access_di; + + ret = access(handle, dir, de_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -410,9 +545,13 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, { struct ocfs2_dir_entry *de, *pde; int i, status = -ENOENT; + ocfs2_journal_access_func access = ocfs2_journal_access_db; mlog_entry("(0x%p, 0x%p, 0x%p, 0x%p)\n", handle, dir, de_del, bh); + if (OCFS2_I(dir)->ip_dyn_features & OCFS2_INLINE_DATA_FL) + access = ocfs2_journal_access_di; + i = 0; pde = NULL; de = (struct ocfs2_dir_entry *) first_de; @@ -423,8 +562,8 @@ static int __ocfs2_delete_entry(handle_t *handle, struct inode *dir, goto bail; } if (de == de_del) { - status = ocfs2_journal_access(handle, dir, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = access(handle, dir, bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { status = -EIO; mlog_errno(status); @@ -458,7 +597,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle, struct ocfs2_dinode *di; struct ocfs2_inline_data *data; - ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh); + ret = ocfs2_read_inode_block(dir, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -576,6 +715,16 @@ int __ocfs2_add_entry(handle_t *handle, goto bail; } + /* We're guaranteed that we should have space, so we + * can't possibly have hit the trailer...right? */ + mlog_bug_on_msg(ocfs2_skip_dir_trailer(dir, de, offset, size), + "Hit dir trailer trying to insert %.*s " + "(namelen %d) into directory %llu. " + "offset is %lu, trailer offset is %d\n", + namelen, name, namelen, + (unsigned long long)parent_fe_bh->b_blocknr, + offset, ocfs2_dir_trailer_blk_off(dir->i_sb)); + if (ocfs2_dirent_would_fit(de, rec_len)) { dir->i_mtime = dir->i_ctime = CURRENT_TIME; retval = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh); @@ -584,8 +733,14 @@ int __ocfs2_add_entry(handle_t *handle, goto bail; } - status = ocfs2_journal_access(handle, dir, insert_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + if (insert_bh == parent_fe_bh) + status = ocfs2_journal_access_di(handle, dir, + insert_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + else + status = ocfs2_journal_access_db(handle, dir, + insert_bh, + OCFS2_JOURNAL_ACCESS_WRITE); /* By now the buffer is marked for journaling */ offset += le16_to_cpu(de->rec_len); if (le64_to_cpu(de->inode)) { @@ -611,6 +766,7 @@ int __ocfs2_add_entry(handle_t *handle, retval = 0; goto bail; } + offset += le16_to_cpu(de->rec_len); de = (struct ocfs2_dir_entry *) ((char *) de + le16_to_cpu(de->rec_len)); } @@ -636,7 +792,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode, struct ocfs2_inline_data *data; struct ocfs2_dir_entry *de; - ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); + ret = ocfs2_read_inode_block(inode, &di_bh); if (ret) { mlog(ML_ERROR, "Unable to read inode block for dir %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -724,7 +880,6 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, int i, stored; struct buffer_head * bh, * tmp; struct ocfs2_dir_entry * de; - int err; struct super_block * sb = inode->i_sb; unsigned int ra_sectors = 16; @@ -735,12 +890,8 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, while (!error && !stored && *f_pos < i_size_read(inode)) { blk = (*f_pos) >> sb->s_blocksize_bits; - bh = ocfs2_bread(inode, blk, &err, 0); - if (!bh) { - mlog(ML_ERROR, - "directory #%llu contains a hole at offset %lld\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, - *f_pos); + if (ocfs2_read_dir_block(inode, blk, &bh, 0)) { + /* Skip the corrupt dirblock and keep trying */ *f_pos += sb->s_blocksize - offset; continue; } @@ -754,8 +905,10 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, || (((last_ra_blk - blk) << 9) <= (ra_sectors / 2))) { for (i = ra_sectors >> (sb->s_blocksize_bits - 9); i > 0; i--) { - tmp = ocfs2_bread(inode, ++blk, &err, 1); - brelse(tmp); + tmp = NULL; + if (!ocfs2_read_dir_block(inode, ++blk, &tmp, + OCFS2_BH_READAHEAD)) + brelse(tmp); } last_ra_blk = blk; ra_sectors = 8; @@ -828,6 +981,7 @@ revalidate: } offset = 0; brelse(bh); + bh = NULL; } stored = 0; @@ -1050,9 +1204,15 @@ int ocfs2_empty_dir(struct inode *inode) return !priv.seen_other; } -static void ocfs2_fill_initial_dirents(struct inode *inode, - struct inode *parent, - char *start, unsigned int size) +/* + * Fills "." and ".." dirents in a new directory block. Returns dirent for + * "..", which might be used during creation of a directory with a trailing + * header. It is otherwise safe to ignore the return code. + */ +static struct ocfs2_dir_entry *ocfs2_fill_initial_dirents(struct inode *inode, + struct inode *parent, + char *start, + unsigned int size) { struct ocfs2_dir_entry *de = (struct ocfs2_dir_entry *)start; @@ -1069,6 +1229,8 @@ static void ocfs2_fill_initial_dirents(struct inode *inode, de->name_len = 2; strcpy(de->name, ".."); ocfs2_set_de_type(de, S_IFDIR); + + return de; } /* @@ -1086,8 +1248,8 @@ static int ocfs2_fill_new_dir_id(struct ocfs2_super *osb, struct ocfs2_inline_data *data = &di->id2.i_data; unsigned int size = le16_to_cpu(data->id_count); - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -1121,10 +1283,15 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, struct ocfs2_alloc_context *data_ac) { int status; + unsigned int size = osb->sb->s_blocksize; struct buffer_head *new_bh = NULL; + struct ocfs2_dir_entry *de; mlog_entry_void(); + if (ocfs2_supports_dir_trailer(osb)) + size = ocfs2_dir_trailer_blk_off(parent->i_sb); + status = ocfs2_do_extend_dir(osb->sb, handle, inode, fe_bh, data_ac, NULL, &new_bh); if (status < 0) { @@ -1134,16 +1301,17 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, ocfs2_set_new_buffer_uptodate(inode, new_bh); - status = ocfs2_journal_access(handle, inode, new_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + status = ocfs2_journal_access_db(handle, inode, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; } memset(new_bh->b_data, 0, osb->sb->s_blocksize); - ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, - osb->sb->s_blocksize); + de = ocfs2_fill_initial_dirents(inode, parent, new_bh->b_data, size); + if (ocfs2_supports_dir_trailer(osb)) + ocfs2_init_dir_trailer(inode, new_bh); status = ocfs2_journal_dirty(handle, new_bh); if (status < 0) { @@ -1184,13 +1352,27 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb, data_ac); } +/* + * Expand rec_len of the rightmost dirent in a directory block so that it + * contains the end of our valid space for dirents. We do this during + * expansion from an inline directory to one with extents. The first dir block + * in that case is taken from the inline data portion of the inode block. + * + * We add the dir trailer if this filesystem wants it. + */ static void ocfs2_expand_last_dirent(char *start, unsigned int old_size, - unsigned int new_size) + struct super_block *sb) { struct ocfs2_dir_entry *de; struct ocfs2_dir_entry *prev_de; char *de_buf, *limit; - unsigned int bytes = new_size - old_size; + unsigned int new_size = sb->s_blocksize; + unsigned int bytes; + + if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) + new_size = ocfs2_dir_trailer_blk_off(sb); + + bytes = new_size - old_size; limit = start + old_size; de_buf = start; @@ -1216,9 +1398,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, unsigned int blocks_wanted, struct buffer_head **first_block_bh) { - int ret, credits = OCFS2_INLINE_TO_EXTENTS_CREDITS; u32 alloc, bit_off, len; struct super_block *sb = dir->i_sb; + int ret, credits = ocfs2_inline_to_extents_credits(sb); u64 blkno, bytes = blocks_wanted << sb->s_blocksize_bits; struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); struct ocfs2_inode_info *oi = OCFS2_I(dir); @@ -1227,6 +1409,7 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; handle_t *handle; struct ocfs2_extent_tree et; + int did_quota = 0; ocfs2_init_dinode_extent_tree(&et, dir, di_bh); @@ -1264,6 +1447,12 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, goto out_sem; } + if (vfs_dq_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, alloc))) { + ret = -EDQUOT; + goto out_commit; + } + did_quota = 1; /* * Try to claim as many clusters as the bitmap can give though * if we only get one now, that's enough to continue. The rest @@ -1290,8 +1479,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, ocfs2_set_new_buffer_uptodate(dir, dirdata_bh); - ret = ocfs2_journal_access(handle, dir, dirdata_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + ret = ocfs2_journal_access_db(handle, dir, dirdata_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); goto out_commit; @@ -1300,8 +1489,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, memcpy(dirdata_bh->b_data, di->id2.i_data.id_data, i_size_read(dir)); memset(dirdata_bh->b_data + i_size_read(dir), 0, sb->s_blocksize - i_size_read(dir)); - ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), - sb->s_blocksize); + ocfs2_expand_last_dirent(dirdata_bh->b_data, i_size_read(dir), sb); + if (ocfs2_supports_dir_trailer(osb)) + ocfs2_init_dir_trailer(dir, dirdata_bh); ret = ocfs2_journal_dirty(handle, dirdata_bh); if (ret) { @@ -1317,8 +1507,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * We let the later dirent insert modify c/mtime - to the user * the data hasn't changed. */ - ret = ocfs2_journal_access(handle, dir, di_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + ret = ocfs2_journal_access_di(handle, dir, di_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); goto out_commit; @@ -1386,6 +1576,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, dirdata_bh = NULL; out_commit: + if (ret < 0 && did_quota) + vfs_dq_free_space_nodirty(dir, + ocfs2_clusters_to_bytes(osb->sb, 2)); ocfs2_commit_trans(osb, handle); out_sem: @@ -1410,7 +1603,7 @@ static int ocfs2_do_extend_dir(struct super_block *sb, struct buffer_head **new_bh) { int status; - int extend; + int extend, did_quota = 0; u64 p_blkno, v_blkno; spin_lock(&OCFS2_I(dir)->ip_lock); @@ -1420,6 +1613,13 @@ static int ocfs2_do_extend_dir(struct super_block *sb, if (extend) { u32 offset = OCFS2_I(dir)->ip_clusters; + if (vfs_dq_alloc_space_nodirty(dir, + ocfs2_clusters_to_bytes(sb, 1))) { + status = -EDQUOT; + goto bail; + } + did_quota = 1; + status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, 1, 0, parent_fe_bh, handle, data_ac, meta_ac, NULL); @@ -1445,6 +1645,8 @@ static int ocfs2_do_extend_dir(struct super_block *sb, } status = 0; bail: + if (did_quota && status < 0) + vfs_dq_free_space_nodirty(dir, ocfs2_clusters_to_bytes(sb, 1)); mlog_exit(status); return status; } @@ -1569,16 +1771,22 @@ do_extend: ocfs2_set_new_buffer_uptodate(dir, new_bh); - status = ocfs2_journal_access(handle, dir, new_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + status = ocfs2_journal_access_db(handle, dir, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; } memset(new_bh->b_data, 0, sb->s_blocksize); + de = (struct ocfs2_dir_entry *) new_bh->b_data; de->inode = 0; - de->rec_len = cpu_to_le16(sb->s_blocksize); + if (ocfs2_dir_has_trailer(dir)) { + de->rec_len = cpu_to_le16(ocfs2_dir_trailer_blk_off(sb)); + ocfs2_init_dir_trailer(dir, new_bh); + } else { + de->rec_len = cpu_to_le16(sb->s_blocksize); + } status = ocfs2_journal_dirty(handle, new_bh); if (status < 0) { mlog_errno(status); @@ -1620,11 +1828,21 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, unsigned int *blocks_wanted) { int ret; + struct super_block *sb = dir->i_sb; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_dir_entry *de, *last_de = NULL; char *de_buf, *limit; unsigned long offset = 0; - unsigned int rec_len, new_rec_len; + unsigned int rec_len, new_rec_len, free_space = dir->i_sb->s_blocksize; + + /* + * This calculates how many free bytes we'd have in block zero, should + * this function force expansion to an extent tree. + */ + if (ocfs2_supports_dir_trailer(OCFS2_SB(sb))) + free_space = ocfs2_dir_trailer_blk_off(sb) - i_size_read(dir); + else + free_space = dir->i_sb->s_blocksize - i_size_read(dir); de_buf = di->id2.i_data.id_data; limit = de_buf + i_size_read(dir); @@ -1641,6 +1859,11 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, ret = -EEXIST; goto out; } + /* + * No need to check for a trailing dirent record here as + * they're not used for inline dirs. + */ + if (ocfs2_dirent_would_fit(de, rec_len)) { /* Ok, we found a spot. Return this bh and let * the caller actually fill it in. */ @@ -1661,7 +1884,7 @@ static int ocfs2_find_dir_space_id(struct inode *dir, struct buffer_head *di_bh, * dirent can be found. */ *blocks_wanted = 1; - new_rec_len = le16_to_cpu(last_de->rec_len) + (dir->i_sb->s_blocksize - i_size_read(dir)); + new_rec_len = le16_to_cpu(last_de->rec_len) + free_space; if (new_rec_len < (rec_len + OCFS2_DIR_REC_LEN(last_de->name_len))) *blocks_wanted = 2; @@ -1679,9 +1902,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, struct ocfs2_dir_entry *de; struct super_block *sb = dir->i_sb; int status; + int blocksize = dir->i_sb->s_blocksize; - bh = ocfs2_bread(dir, 0, &status, 0); - if (!bh) { + status = ocfs2_read_dir_block(dir, 0, &bh, 0); + if (status) { mlog_errno(status); goto bail; } @@ -1702,11 +1926,10 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, status = -ENOSPC; goto bail; } - bh = ocfs2_bread(dir, - offset >> sb->s_blocksize_bits, - &status, - 0); - if (!bh) { + status = ocfs2_read_dir_block(dir, + offset >> sb->s_blocksize_bits, + &bh, 0); + if (status) { mlog_errno(status); goto bail; } @@ -1721,6 +1944,11 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, status = -EEXIST; goto bail; } + + if (ocfs2_skip_dir_trailer(dir, de, offset % blocksize, + blocksize)) + goto next; + if (ocfs2_dirent_would_fit(de, rec_len)) { /* Ok, we found a spot. Return this bh and let * the caller actually fill it in. */ @@ -1729,6 +1957,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, status = 0; goto bail; } +next: offset += le16_to_cpu(de->rec_len); de = (struct ocfs2_dir_entry *)((char *) de + le16_to_cpu(de->rec_len)); } diff --git a/fs/ocfs2/dir.h b/fs/ocfs2/dir.h index ce48b9080d8..c511e2e18e9 100644 --- a/fs/ocfs2/dir.h +++ b/fs/ocfs2/dir.h @@ -83,4 +83,6 @@ int ocfs2_fill_new_dir(struct ocfs2_super *osb, struct buffer_head *fe_bh, struct ocfs2_alloc_context *data_ac); +struct ocfs2_dir_block_trailer *ocfs2_dir_trailer_from_size(int blocksize, + void *data); #endif /* OCFS2_DIR_H */ diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 644bee55d8b..d07ddbe4b28 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -275,6 +275,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, struct list_head *iter, *head=NULL; u64 cookie; u32 flags; + u8 node; if (!dlm_grab(dlm)) { dlm_error(DLM_REJECTED); @@ -286,18 +287,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, name = past->name; locklen = past->namelen; - cookie = be64_to_cpu(past->cookie); + cookie = past->cookie; flags = be32_to_cpu(past->flags); + node = past->node_idx; if (locklen > DLM_LOCKID_NAME_MAX) { ret = DLM_IVBUFLEN; - mlog(ML_ERROR, "Invalid name length in proxy ast handler!\n"); + mlog(ML_ERROR, "Invalid name length (%d) in proxy ast " + "handler!\n", locklen); goto leave; } if ((flags & (LKM_PUT_LVB|LKM_GET_LVB)) == (LKM_PUT_LVB|LKM_GET_LVB)) { - mlog(ML_ERROR, "both PUT and GET lvb specified\n"); + mlog(ML_ERROR, "Both PUT and GET lvb specified, (0x%x)\n", + flags); ret = DLM_BADARGS; goto leave; } @@ -310,22 +314,21 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, if (past->type != DLM_AST && past->type != DLM_BAST) { mlog(ML_ERROR, "Unknown ast type! %d, cookie=%u:%llu" - "name=%.*s\n", past->type, - dlm_get_lock_cookie_node(cookie), - dlm_get_lock_cookie_seq(cookie), - locklen, name); + "name=%.*s, node=%u\n", past->type, + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + locklen, name, node); ret = DLM_IVLOCKID; goto leave; } res = dlm_lookup_lockres(dlm, name, locklen); if (!res) { - mlog(0, "got %sast for unknown lockres! " - "cookie=%u:%llu, name=%.*s, namelen=%u\n", - past->type == DLM_AST ? "" : "b", - dlm_get_lock_cookie_node(cookie), - dlm_get_lock_cookie_seq(cookie), - locklen, name, locklen); + mlog(0, "Got %sast for unknown lockres! cookie=%u:%llu, " + "name=%.*s, node=%u\n", (past->type == DLM_AST ? "" : "b"), + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + locklen, name, node); ret = DLM_IVLOCKID; goto leave; } @@ -337,12 +340,12 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { - mlog(0, "responding with DLM_RECOVERING!\n"); + mlog(0, "Responding with DLM_RECOVERING!\n"); ret = DLM_RECOVERING; goto unlock_out; } if (res->state & DLM_LOCK_RES_MIGRATING) { - mlog(0, "responding with DLM_MIGRATING!\n"); + mlog(0, "Responding with DLM_MIGRATING!\n"); ret = DLM_MIGRATING; goto unlock_out; } @@ -351,7 +354,7 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, lock = NULL; list_for_each(iter, head) { lock = list_entry (iter, struct dlm_lock, list); - if (be64_to_cpu(lock->ml.cookie) == cookie) + if (lock->ml.cookie == cookie) goto do_ast; } @@ -363,15 +366,15 @@ int dlm_proxy_ast_handler(struct o2net_msg *msg, u32 len, void *data, list_for_each(iter, head) { lock = list_entry (iter, struct dlm_lock, list); - if (be64_to_cpu(lock->ml.cookie) == cookie) + if (lock->ml.cookie == cookie) goto do_ast; } - mlog(0, "got %sast for unknown lock! cookie=%u:%llu, " - "name=%.*s, namelen=%u\n", past->type == DLM_AST ? "" : "b", - dlm_get_lock_cookie_node(cookie), - dlm_get_lock_cookie_seq(cookie), - locklen, name, locklen); + mlog(0, "Got %sast for unknown lock! cookie=%u:%llu, name=%.*s, " + "node=%u\n", past->type == DLM_AST ? "" : "b", + dlm_get_lock_cookie_node(be64_to_cpu(cookie)), + dlm_get_lock_cookie_seq(be64_to_cpu(cookie)), + locklen, name, node); ret = DLM_NORMAL; unlock_out: @@ -383,8 +386,8 @@ do_ast: if (past->type == DLM_AST) { /* do not alter lock refcount. switching lists. */ list_move_tail(&lock->list, &res->granted); - mlog(0, "ast: adding to granted list... type=%d, " - "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); + mlog(0, "ast: Adding to granted list... type=%d, " + "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); if (lock->ml.convert_type != LKM_IVMODE) { lock->ml.type = lock->ml.convert_type; lock->ml.convert_type = LKM_IVMODE; @@ -408,7 +411,6 @@ do_ast: dlm_do_local_bast(dlm, res, lock, past->blocked_type); leave: - if (res) dlm_lockres_put(res); diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index d5a86fb81a4..bb53714813a 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -140,6 +140,7 @@ struct dlm_ctxt unsigned int purge_count; spinlock_t spinlock; spinlock_t ast_lock; + spinlock_t track_lock; char *name; u8 node_num; u32 key; @@ -316,6 +317,8 @@ struct dlm_lock_resource * put on a list for the dlm thread to run. */ unsigned long last_used; + struct dlm_ctxt *dlm; + unsigned migration_pending:1; atomic_t asts_reserved; spinlock_t spinlock; diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 1b81dcba175..b32f60a5acf 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -630,43 +630,38 @@ static void *lockres_seq_start(struct seq_file *m, loff_t *pos) { struct debug_lockres *dl = m->private; struct dlm_ctxt *dlm = dl->dl_ctxt; + struct dlm_lock_resource *oldres = dl->dl_res; struct dlm_lock_resource *res = NULL; + struct list_head *track_list; - spin_lock(&dlm->spinlock); + spin_lock(&dlm->track_lock); + if (oldres) + track_list = &oldres->tracking; + else + track_list = &dlm->tracking_list; - if (dl->dl_res) { - list_for_each_entry(res, &dl->dl_res->tracking, tracking) { - if (dl->dl_res) { - dlm_lockres_put(dl->dl_res); - dl->dl_res = NULL; - } - if (&res->tracking == &dlm->tracking_list) { - mlog(0, "End of list found, %p\n", res); - dl = NULL; - break; - } + list_for_each_entry(res, track_list, tracking) { + if (&res->tracking == &dlm->tracking_list) + res = NULL; + else dlm_lockres_get(res); - dl->dl_res = res; - break; - } - } else { - if (!list_empty(&dlm->tracking_list)) { - list_for_each_entry(res, &dlm->tracking_list, tracking) - break; - dlm_lockres_get(res); - dl->dl_res = res; - } else - dl = NULL; + break; } + spin_unlock(&dlm->track_lock); - if (dl) { - spin_lock(&dl->dl_res->spinlock); - dump_lockres(dl->dl_res, dl->dl_buf, dl->dl_len - 1); - spin_unlock(&dl->dl_res->spinlock); - } + if (oldres) + dlm_lockres_put(oldres); - spin_unlock(&dlm->spinlock); + dl->dl_res = res; + + if (res) { + spin_lock(&res->spinlock); + dump_lockres(res, dl->dl_buf, dl->dl_len - 1); + spin_unlock(&res->spinlock); + } else + dl = NULL; + /* passed to seq_show */ return dl; } diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 63f8125824e..d8d578f4561 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -1550,6 +1550,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, spin_lock_init(&dlm->spinlock); spin_lock_init(&dlm->master_lock); spin_lock_init(&dlm->ast_lock); + spin_lock_init(&dlm->track_lock); INIT_LIST_HEAD(&dlm->list); INIT_LIST_HEAD(&dlm->dirty_list); INIT_LIST_HEAD(&dlm->reco.resources); diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 6f7a77d5402..1c9efb406a9 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -341,7 +341,6 @@ static struct inode *dlmfs_get_root_inode(struct super_block *sb) inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_blocks = 0; inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inc_nlink(inode); @@ -367,7 +366,6 @@ static struct inode *dlmfs_get_inode(struct inode *parent, inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_blocks = 0; inode->i_mapping->backing_dev_info = &dlmfs_backing_dev_info; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 44f87caf368..54e182a27ca 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -505,8 +505,10 @@ void dlm_change_lockres_owner(struct dlm_ctxt *dlm, static void dlm_lockres_release(struct kref *kref) { struct dlm_lock_resource *res; + struct dlm_ctxt *dlm; res = container_of(kref, struct dlm_lock_resource, refs); + dlm = res->dlm; /* This should not happen -- all lockres' have a name * associated with them at init time. */ @@ -515,6 +517,7 @@ static void dlm_lockres_release(struct kref *kref) mlog(0, "destroying lockres %.*s\n", res->lockname.len, res->lockname.name); + spin_lock(&dlm->track_lock); if (!list_empty(&res->tracking)) list_del_init(&res->tracking); else { @@ -522,6 +525,9 @@ static void dlm_lockres_release(struct kref *kref) res->lockname.len, res->lockname.name); dlm_print_one_lock_resource(res); } + spin_unlock(&dlm->track_lock); + + dlm_put(dlm); if (!hlist_unhashed(&res->hash_node) || !list_empty(&res->granted) || @@ -595,6 +601,10 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, res->migration_pending = 0; res->inflight_locks = 0; + /* put in dlm_lockres_release */ + dlm_grab(dlm); + res->dlm = dlm; + kref_init(&res->refs); /* just for consistency */ @@ -722,14 +732,21 @@ lookup: if (tmpres) { int dropping_ref = 0; + spin_unlock(&dlm->spinlock); + spin_lock(&tmpres->spinlock); + /* We wait for the other thread that is mastering the resource */ + if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { + __dlm_wait_on_lockres(tmpres); + BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); + } + if (tmpres->owner == dlm->node_num) { BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); dlm_lockres_grab_inflight_ref(dlm, tmpres); } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) dropping_ref = 1; spin_unlock(&tmpres->spinlock); - spin_unlock(&dlm->spinlock); /* wait until done messaging the master, drop our ref to allow * the lockres to be purged, start over. */ @@ -2949,7 +2966,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, struct dlm_node_iter *iter) { struct dlm_migrate_request migrate; - int ret, status = 0; + int ret, skip, status = 0; int nodenum; memset(&migrate, 0, sizeof(migrate)); @@ -2966,12 +2983,27 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, nodenum == new_master) continue; + /* We could race exit domain. If exited, skip. */ + spin_lock(&dlm->spinlock); + skip = (!test_bit(nodenum, dlm->domain_map)); + spin_unlock(&dlm->spinlock); + if (skip) { + clear_bit(nodenum, iter->node_map); + continue; + } + ret = o2net_send_message(DLM_MIGRATE_REQUEST_MSG, dlm->key, &migrate, sizeof(migrate), nodenum, &status); - if (ret < 0) - mlog_errno(ret); - else if (status < 0) { + if (ret < 0) { + mlog(0, "migrate_request returned %d!\n", ret); + if (!dlm_is_host_down(ret)) { + mlog(ML_ERROR, "unhandled error=%d!\n", ret); + BUG(); + } + clear_bit(nodenum, iter->node_map); + ret = 0; + } else if (status < 0) { mlog(0, "migrate request (node %u) returned %d!\n", nodenum, status); ret = status; diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 4060bb328bc..d1295203029 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -181,7 +181,8 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); /* This ensures that clear refmap is sent after the set */ - __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); + __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG | + DLM_LOCK_RES_MIGRATING)); spin_unlock(&res->spinlock); /* clear our bit from the master's refmap, ignore errors */ diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 6e6cc0a2e5f..b0c4cadd4c4 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -32,6 +32,7 @@ #include <linux/debugfs.h> #include <linux/seq_file.h> #include <linux/time.h> +#include <linux/quotaops.h> #define MLOG_MASK_PREFIX ML_DLM_GLUE #include <cluster/masklog.h> @@ -51,6 +52,7 @@ #include "slot_map.h" #include "super.h" #include "uptodate.h" +#include "quota.h" #include "buffer_head_io.h" @@ -68,6 +70,7 @@ struct ocfs2_mask_waiter { static struct ocfs2_super *ocfs2_get_dentry_osb(struct ocfs2_lock_res *lockres); static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres); static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres); +static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres); /* * Return value from ->downconvert_worker functions. @@ -102,6 +105,7 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, static void ocfs2_dentry_post_unlock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); +static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres); #define mlog_meta_lvb(__level, __lockres) ocfs2_dump_meta_lvb_info(__level, __PRETTY_FUNCTION__, __LINE__, __lockres) @@ -111,8 +115,7 @@ static void ocfs2_dump_meta_lvb_info(u64 level, unsigned int line, struct ocfs2_lock_res *lockres) { - struct ocfs2_meta_lvb *lvb = - (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); + struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); mlog(level, "LVB information for %s (called from %s:%u):\n", lockres->l_name, function, line); @@ -258,6 +261,12 @@ static struct ocfs2_lock_res_ops ocfs2_flock_lops = { .flags = 0, }; +static struct ocfs2_lock_res_ops ocfs2_qinfo_lops = { + .set_lvb = ocfs2_set_qinfo_lvb, + .get_osb = ocfs2_get_qinfo_osb, + .flags = LOCK_TYPE_REQUIRES_REFRESH | LOCK_TYPE_USES_LVB, +}; + static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres) { return lockres->l_type == OCFS2_LOCK_TYPE_META || @@ -279,6 +288,13 @@ static inline struct ocfs2_dentry_lock *ocfs2_lock_res_dl(struct ocfs2_lock_res return (struct ocfs2_dentry_lock *)lockres->l_priv; } +static inline struct ocfs2_mem_dqinfo *ocfs2_lock_res_qinfo(struct ocfs2_lock_res *lockres) +{ + BUG_ON(lockres->l_type != OCFS2_LOCK_TYPE_QINFO); + + return (struct ocfs2_mem_dqinfo *)lockres->l_priv; +} + static inline struct ocfs2_super *ocfs2_get_lockres_osb(struct ocfs2_lock_res *lockres) { if (lockres->l_ops->get_osb) @@ -507,6 +523,13 @@ static struct ocfs2_super *ocfs2_get_inode_osb(struct ocfs2_lock_res *lockres) return OCFS2_SB(inode->i_sb); } +static struct ocfs2_super *ocfs2_get_qinfo_osb(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_mem_dqinfo *info = lockres->l_priv; + + return OCFS2_SB(info->dqi_gi.dqi_sb); +} + static struct ocfs2_super *ocfs2_get_file_osb(struct ocfs2_lock_res *lockres) { struct ocfs2_file_private *fp = lockres->l_priv; @@ -609,6 +632,17 @@ void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, lockres->l_flags |= OCFS2_LOCK_NOCACHE; } +void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_mem_dqinfo *info) +{ + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_QINFO, info->dqi_gi.dqi_type, + 0, lockres->l_name); + ocfs2_lock_res_init_common(OCFS2_SB(info->dqi_gi.dqi_sb), lockres, + OCFS2_LOCK_TYPE_QINFO, &ocfs2_qinfo_lops, + info); +} + void ocfs2_lock_res_free(struct ocfs2_lock_res *res) { mlog_entry_void(); @@ -1290,7 +1324,7 @@ again: goto out; } - mlog(0, "lock %s, successfull return from ocfs2_dlm_lock\n", + mlog(0, "lock %s, successful return from ocfs2_dlm_lock\n", lockres->l_name); /* At this point we've gone inside the dlm and need to @@ -1829,7 +1863,7 @@ static void __ocfs2_stuff_meta_lvb(struct inode *inode) mlog_entry_void(); - lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); /* * Invalidate the LVB of a deleted inode - this way other @@ -1881,7 +1915,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) mlog_meta_lvb(0, lockres); - lvb = (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); /* We're safe here without the lockres lock... */ spin_lock(&oi->ip_lock); @@ -1916,8 +1950,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode) static inline int ocfs2_meta_lvb_is_trustable(struct inode *inode, struct ocfs2_lock_res *lockres) { - struct ocfs2_meta_lvb *lvb = - (struct ocfs2_meta_lvb *)ocfs2_dlm_lvb(&lockres->l_lksb); + struct ocfs2_meta_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); if (lvb->lvb_version == OCFS2_LVB_VERSION && be32_to_cpu(lvb->lvb_igeneration) == inode->i_generation) @@ -2024,7 +2057,7 @@ static int ocfs2_inode_lock_update(struct inode *inode, } else { /* Boo, we have to go to disk. */ /* read bh, cast, ocfs2_refresh_inode */ - status = ocfs2_read_block(inode, oi->ip_blkno, bh); + status = ocfs2_read_inode_block(inode, bh); if (status < 0) { mlog_errno(status); goto bail_refresh; @@ -2032,18 +2065,14 @@ static int ocfs2_inode_lock_update(struct inode *inode, fe = (struct ocfs2_dinode *) (*bh)->b_data; /* This is a good chance to make sure we're not - * locking an invalid object. + * locking an invalid object. ocfs2_read_inode_block() + * already checked that the inode block is sane. * * We bug on a stale inode here because we checked * above whether it was wiped from disk. The wiping * node provides a guarantee that we receive that * message and can mark the inode before dropping any * locks associated with it. */ - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); - status = -EIO; - goto bail_refresh; - } mlog_bug_on_msg(inode->i_generation != le32_to_cpu(fe->i_generation), "Invalid dinode %llu disk generation: %u " @@ -2085,7 +2114,7 @@ static int ocfs2_assign_bh(struct inode *inode, return 0; } - status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh); + status = ocfs2_read_inode_block(inode, ret_bh); if (status < 0) mlog_errno(status); @@ -2922,7 +2951,7 @@ static int ocfs2_drop_lock(struct ocfs2_super *osb, ocfs2_dlm_dump_lksb(&lockres->l_lksb); BUG(); } - mlog(0, "lock %s, successfull return from ocfs2_dlm_unlock\n", + mlog(0, "lock %s, successful return from ocfs2_dlm_unlock\n", lockres->l_name); ocfs2_wait_on_busy_lock(lockres); @@ -3449,6 +3478,117 @@ static int ocfs2_dentry_convert_worker(struct ocfs2_lock_res *lockres, return UNBLOCK_CONTINUE_POST; } +static void ocfs2_set_qinfo_lvb(struct ocfs2_lock_res *lockres) +{ + struct ocfs2_qinfo_lvb *lvb; + struct ocfs2_mem_dqinfo *oinfo = ocfs2_lock_res_qinfo(lockres); + struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb, + oinfo->dqi_gi.dqi_type); + + mlog_entry_void(); + + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + lvb->lvb_version = OCFS2_QINFO_LVB_VERSION; + lvb->lvb_bgrace = cpu_to_be32(info->dqi_bgrace); + lvb->lvb_igrace = cpu_to_be32(info->dqi_igrace); + lvb->lvb_syncms = cpu_to_be32(oinfo->dqi_syncms); + lvb->lvb_blocks = cpu_to_be32(oinfo->dqi_gi.dqi_blocks); + lvb->lvb_free_blk = cpu_to_be32(oinfo->dqi_gi.dqi_free_blk); + lvb->lvb_free_entry = cpu_to_be32(oinfo->dqi_gi.dqi_free_entry); + + mlog_exit_void(); +} + +void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock; + struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb); + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + + mlog_entry_void(); + if (!ocfs2_is_hard_readonly(osb) && !ocfs2_mount_local(osb)) + ocfs2_cluster_unlock(osb, lockres, level); + mlog_exit_void(); +} + +static int ocfs2_refresh_qinfo(struct ocfs2_mem_dqinfo *oinfo) +{ + struct mem_dqinfo *info = sb_dqinfo(oinfo->dqi_gi.dqi_sb, + oinfo->dqi_gi.dqi_type); + struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock; + struct ocfs2_qinfo_lvb *lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + struct buffer_head *bh = NULL; + struct ocfs2_global_disk_dqinfo *gdinfo; + int status = 0; + + if (lvb->lvb_version == OCFS2_QINFO_LVB_VERSION) { + info->dqi_bgrace = be32_to_cpu(lvb->lvb_bgrace); + info->dqi_igrace = be32_to_cpu(lvb->lvb_igrace); + oinfo->dqi_syncms = be32_to_cpu(lvb->lvb_syncms); + oinfo->dqi_gi.dqi_blocks = be32_to_cpu(lvb->lvb_blocks); + oinfo->dqi_gi.dqi_free_blk = be32_to_cpu(lvb->lvb_free_blk); + oinfo->dqi_gi.dqi_free_entry = + be32_to_cpu(lvb->lvb_free_entry); + } else { + status = ocfs2_read_quota_block(oinfo->dqi_gqinode, 0, &bh); + if (status) { + mlog_errno(status); + goto bail; + } + gdinfo = (struct ocfs2_global_disk_dqinfo *) + (bh->b_data + OCFS2_GLOBAL_INFO_OFF); + info->dqi_bgrace = le32_to_cpu(gdinfo->dqi_bgrace); + info->dqi_igrace = le32_to_cpu(gdinfo->dqi_igrace); + oinfo->dqi_syncms = le32_to_cpu(gdinfo->dqi_syncms); + oinfo->dqi_gi.dqi_blocks = le32_to_cpu(gdinfo->dqi_blocks); + oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(gdinfo->dqi_free_blk); + oinfo->dqi_gi.dqi_free_entry = + le32_to_cpu(gdinfo->dqi_free_entry); + brelse(bh); + ocfs2_track_lock_refresh(lockres); + } + +bail: + return status; +} + +/* Lock quota info, this function expects at least shared lock on the quota file + * so that we can safely refresh quota info from disk. */ +int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + struct ocfs2_lock_res *lockres = &oinfo->dqi_gqlock; + struct ocfs2_super *osb = OCFS2_SB(oinfo->dqi_gi.dqi_sb); + int level = ex ? DLM_LOCK_EX : DLM_LOCK_PR; + int status = 0; + + mlog_entry_void(); + + /* On RO devices, locking really isn't needed... */ + if (ocfs2_is_hard_readonly(osb)) { + if (ex) + status = -EROFS; + goto bail; + } + if (ocfs2_mount_local(osb)) + goto bail; + + status = ocfs2_cluster_lock(osb, lockres, level, 0, 0); + if (status < 0) { + mlog_errno(status); + goto bail; + } + if (!ocfs2_should_refresh_lock_res(lockres)) + goto bail; + /* OK, we have the lock but we need to refresh the quota info */ + status = ocfs2_refresh_qinfo(oinfo); + if (status) + ocfs2_qinfo_unlock(oinfo, ex); + ocfs2_complete_lock_res_refresh(lockres, status); +bail: + mlog_exit(status); + return status; +} + /* * This is the filesystem locking protocol. It provides the lock handling * hooks for the underlying DLM. It has a maximum version number. diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 2bb01f09c1b..3f8d9986b8e 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -49,6 +49,19 @@ struct ocfs2_meta_lvb { __be32 lvb_reserved2; }; +#define OCFS2_QINFO_LVB_VERSION 1 + +struct ocfs2_qinfo_lvb { + __u8 lvb_version; + __u8 lvb_reserved[3]; + __be32 lvb_bgrace; + __be32 lvb_igrace; + __be32 lvb_syncms; + __be32 lvb_blocks; + __be32 lvb_free_blk; + __be32 lvb_free_entry; +}; + /* ocfs2_inode_lock_full() 'arg_flags' flags */ /* don't wait on recovery. */ #define OCFS2_META_LOCK_RECOVERY (0x01) @@ -69,6 +82,9 @@ void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl, struct ocfs2_file_private; void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres, struct ocfs2_file_private *fp); +struct ocfs2_mem_dqinfo; +void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres, + struct ocfs2_mem_dqinfo *info); void ocfs2_lock_res_free(struct ocfs2_lock_res *res); int ocfs2_create_new_inode_locks(struct inode *inode); int ocfs2_drop_inode_locks(struct inode *inode); @@ -103,6 +119,9 @@ int ocfs2_dentry_lock(struct dentry *dentry, int ex); void ocfs2_dentry_unlock(struct dentry *dentry, int ex); int ocfs2_file_lock(struct file *file, int ex, int trylock); void ocfs2_file_unlock(struct file *file); +int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex); +void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex); + void ocfs2_mark_lockres_freeing(struct ocfs2_lock_res *lockres); void ocfs2_simple_drop_lockres(struct ocfs2_super *osb, diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index 2baedac5823..f2bb1a04d25 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -293,7 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; - ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh); + ret = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh); if (ret) { mlog_errno(ret); goto out; @@ -302,12 +302,6 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, eb = (struct ocfs2_extent_block *) eb_bh->b_data; el = &eb->h_list; - if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { - ret = -EROFS; - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); - goto out; - } - if (el->l_tree_depth) { ocfs2_error(inode->i_sb, "Inode %lu has non zero tree depth in " @@ -381,23 +375,16 @@ static int ocfs2_figure_hole_clusters(struct inode *inode, if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) goto no_more_extents; - ret = ocfs2_read_block(inode, - le64_to_cpu(eb->h_next_leaf_blk), - &next_eb_bh); + ret = ocfs2_read_extent_block(inode, + le64_to_cpu(eb->h_next_leaf_blk), + &next_eb_bh); if (ret) { mlog_errno(ret); goto out; } - next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data; - - if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) { - ret = -EROFS; - OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb); - goto out; - } + next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data; el = &next_eb->h_list; - i = ocfs2_search_for_hole_index(el, v_cluster); } @@ -630,7 +617,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, if (ret == 0) goto out; - ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); + ret = ocfs2_read_inode_block(inode, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -819,3 +806,74 @@ out: return ret; } + +int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) +{ + int rc = 0; + u64 p_block, p_count; + int i, count, done = 0; + + mlog_entry("(inode = %p, v_block = %llu, nr = %d, bhs = %p, " + "flags = %x, validate = %p)\n", + inode, (unsigned long long)v_block, nr, bhs, flags, + validate); + + if (((v_block + nr - 1) << inode->i_sb->s_blocksize_bits) >= + i_size_read(inode)) { + BUG_ON(!(flags & OCFS2_BH_READAHEAD)); + goto out; + } + + while (done < nr) { + down_read(&OCFS2_I(inode)->ip_alloc_sem); + rc = ocfs2_extent_map_get_blocks(inode, v_block + done, + &p_block, &p_count, NULL); + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (rc) { + mlog_errno(rc); + break; + } + + if (!p_block) { + rc = -EIO; + mlog(ML_ERROR, + "Inode #%llu contains a hole at offset %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)(v_block + done) << + inode->i_sb->s_blocksize_bits); + break; + } + + count = nr - done; + if (p_count < count) + count = p_count; + + /* + * If the caller passed us bhs, they should have come + * from a previous readahead call to this function. Thus, + * they should have the right b_blocknr. + */ + for (i = 0; i < count; i++) { + if (!bhs[done + i]) + continue; + BUG_ON(bhs[done + i]->b_blocknr != (p_block + i)); + } + + rc = ocfs2_read_blocks(inode, p_block, count, bhs + done, + flags, validate); + if (rc) { + mlog_errno(rc); + break; + } + done += count; + } + +out: + mlog_exit(rc); + return rc; +} + + diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index 1c4aa8b06f3..b7dd9731b46 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -57,4 +57,28 @@ int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, u32 *num_clusters, struct ocfs2_extent_list *el); +int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, + struct buffer_head *bhs[], int flags, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)); +static inline int ocfs2_read_virt_block(struct inode *inode, u64 v_block, + struct buffer_head **bh, + int (*validate)(struct super_block *sb, + struct buffer_head *bh)) +{ + int status = 0; + + if (bh == NULL) { + printk("ocfs2: bh == NULL\n"); + status = -EINVAL; + goto bail; + } + + status = ocfs2_read_virt_blocks(inode, v_block, 1, bh, 0, validate); + +bail: + return status; +} + + #endif /* _EXTENT_MAP_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index e2570a3bc2b..a5887df2cd8 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -35,6 +35,7 @@ #include <linux/mount.h> #include <linux/writeback.h> #include <linux/falloc.h> +#include <linux/quotaops.h> #define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> @@ -56,6 +57,8 @@ #include "suballoc.h" #include "super.h" #include "xattr.h" +#include "acl.h" +#include "quota.h" #include "buffer_head_io.h" @@ -253,8 +256,8 @@ int ocfs2_update_inode_atime(struct inode *inode, goto out; } - ret = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; @@ -303,9 +306,9 @@ bail: return status; } -static int ocfs2_simple_size_update(struct inode *inode, - struct buffer_head *di_bh, - u64 new_i_size) +int ocfs2_simple_size_update(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size) { int ret; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); @@ -350,8 +353,8 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb, goto out; } - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_commit; @@ -401,12 +404,9 @@ static int ocfs2_truncate_file(struct inode *inode, (unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)new_i_size); + /* We trust di_bh because it comes from ocfs2_inode_lock(), which + * already validated it */ fe = (struct ocfs2_dinode *) di_bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); - status = -EIO; - goto bail; - } mlog_bug_on_msg(le64_to_cpu(fe->i_size) != i_size_read(inode), "Inode %llu, inode i_size = %lld != di " @@ -536,6 +536,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, enum ocfs2_alloc_restarted why; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_extent_tree et; + int did_quota = 0; mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); @@ -545,18 +546,12 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, */ BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); - status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); + status = ocfs2_read_inode_block(inode, &bh); if (status < 0) { mlog_errno(status); goto leave; } - fe = (struct ocfs2_dinode *) bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); - status = -EIO; - goto leave; - } restart_all: BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); @@ -585,11 +580,18 @@ restart_all: } restarted_transaction: + if (vfs_dq_alloc_space_nodirty(inode, ocfs2_clusters_to_bytes(osb->sb, + clusters_to_add))) { + status = -EDQUOT; + goto leave; + } + did_quota = 1; + /* reserve a write to the file entry early on - that we if we * run out of credits in the allocation path, we can still * update i_size. */ - status = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; @@ -622,6 +624,10 @@ restarted_transaction: spin_lock(&OCFS2_I(inode)->ip_lock); clusters_to_add -= (OCFS2_I(inode)->ip_clusters - prev_clusters); spin_unlock(&OCFS2_I(inode)->ip_lock); + /* Release unused quota reservation */ + vfs_dq_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); + did_quota = 0; if (why != RESTART_NONE && clusters_to_add) { if (why == RESTART_META) { @@ -654,6 +660,9 @@ restarted_transaction: OCFS2_I(inode)->ip_clusters, (long long)i_size_read(inode)); leave: + if (status < 0 && did_quota) + vfs_dq_free_space(inode, + ocfs2_clusters_to_bytes(osb->sb, clusters_to_add)); if (handle) { ocfs2_commit_trans(osb, handle); handle = NULL; @@ -885,6 +894,9 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) struct ocfs2_super *osb = OCFS2_SB(sb); struct buffer_head *bh = NULL; handle_t *handle = NULL; + int locked[MAXQUOTAS] = {0, 0}; + int credits, qtype; + struct ocfs2_mem_dqinfo *oinfo; mlog_entry("(0x%p, '%.*s')\n", dentry, dentry->d_name.len, dentry->d_name.name); @@ -955,11 +967,47 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) } } - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto bail_unlock; + if ((attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) || + (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) { + credits = OCFS2_INODE_UPDATE_CREDITS; + if (attr->ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid + && OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { + oinfo = sb_dqinfo(sb, USRQUOTA)->dqi_priv; + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto bail_unlock; + credits += ocfs2_calc_qinit_credits(sb, USRQUOTA) + + ocfs2_calc_qdel_credits(sb, USRQUOTA); + locked[USRQUOTA] = 1; + } + if (attr->ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid + && OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { + oinfo = sb_dqinfo(sb, GRPQUOTA)->dqi_priv; + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto bail_unlock; + credits += ocfs2_calc_qinit_credits(sb, GRPQUOTA) + + ocfs2_calc_qdel_credits(sb, GRPQUOTA); + locked[GRPQUOTA] = 1; + } + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } + status = vfs_dq_transfer(inode, attr) ? -EDQUOT : 0; + if (status < 0) + goto bail_commit; + } else { + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto bail_unlock; + } } /* @@ -982,6 +1030,12 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: + for (qtype = 0; qtype < MAXQUOTAS; qtype++) { + if (!locked[qtype]) + continue; + oinfo = sb_dqinfo(sb, qtype)->dqi_priv; + ocfs2_unlock_global_qf(oinfo, 1); + } ocfs2_inode_unlock(inode, 1); bail_unlock_rw: if (size_change) @@ -989,6 +1043,12 @@ bail_unlock_rw: bail: brelse(bh); + if (!status && attr->ia_valid & ATTR_MODE) { + status = ocfs2_acl_chmod(inode); + if (status < 0) + mlog_errno(status); + } + mlog_exit(status); return status; } @@ -1035,7 +1095,7 @@ int ocfs2_permission(struct inode *inode, int mask) goto out; } - ret = generic_permission(inode, mask, NULL); + ret = generic_permission(inode, mask, ocfs2_check_acl); ocfs2_inode_unlock(inode, 0); out: @@ -1061,8 +1121,8 @@ static int __ocfs2_write_remove_suid(struct inode *inode, goto out; } - ret = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out_trans; @@ -1128,9 +1188,8 @@ static int ocfs2_write_remove_suid(struct inode *inode) { int ret; struct buffer_head *bh = NULL; - struct ocfs2_inode_info *oi = OCFS2_I(inode); - ret = ocfs2_read_block(inode, oi->ip_blkno, &bh); + ret = ocfs2_read_inode_block(inode, &bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -1156,8 +1215,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode, struct buffer_head *di_bh = NULL; if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, - &di_bh); + ret = ocfs2_read_inode_block(inode, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -1226,83 +1284,6 @@ out: return ret; } -static int __ocfs2_remove_inode_range(struct inode *inode, - struct buffer_head *di_bh, - u32 cpos, u32 phys_cpos, u32 len, - struct ocfs2_cached_dealloc_ctxt *dealloc) -{ - int ret; - u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct inode *tl_inode = osb->osb_tl_inode; - handle_t *handle; - struct ocfs2_alloc_context *meta_ac = NULL; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - struct ocfs2_extent_tree et; - - ocfs2_init_dinode_extent_tree(&et, inode, di_bh); - - ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); - if (ret) { - mlog_errno(ret); - return ret; - } - - mutex_lock(&tl_inode->i_mutex); - - if (ocfs2_truncate_log_needs_flush(osb)) { - ret = __ocfs2_flush_truncate_log(osb); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - } - - handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out; - } - - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } - - ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, - dealloc); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - - OCFS2_I(inode)->ip_clusters -= len; - di->i_clusters = cpu_to_le32(OCFS2_I(inode)->ip_clusters); - - ret = ocfs2_journal_dirty(handle, di_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - - ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); - if (ret) - mlog_errno(ret); - -out_commit: - ocfs2_commit_trans(osb, handle); -out: - mutex_unlock(&tl_inode->i_mutex); - - if (meta_ac) - ocfs2_free_alloc_context(meta_ac); - - return ret; -} - /* * Truncate a byte range, avoiding pages within partial clusters. This * preserves those pages for the zeroing code to write to. @@ -1402,7 +1383,9 @@ static int ocfs2_remove_inode_range(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_cached_dealloc_ctxt dealloc; struct address_space *mapping = inode->i_mapping; + struct ocfs2_extent_tree et; + ocfs2_init_dinode_extent_tree(&et, inode, di_bh); ocfs2_init_dealloc_ctxt(&dealloc); if (byte_len == 0) @@ -1458,9 +1441,9 @@ static int ocfs2_remove_inode_range(struct inode *inode, /* Only do work for non-holes */ if (phys_cpos != 0) { - ret = __ocfs2_remove_inode_range(inode, di_bh, cpos, - phys_cpos, alloc_size, - &dealloc); + ret = ocfs2_remove_btree_range(inode, &et, cpos, + phys_cpos, alloc_size, + &dealloc); if (ret) { mlog_errno(ret); goto out; @@ -1622,7 +1605,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, struct ocfs2_space_resv *sr) { struct inode *inode = file->f_path.dentry->d_inode; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); if ((cmd == OCFS2_IOC_RESVSP || cmd == OCFS2_IOC_RESVSP64) && !ocfs2_writes_unwritten_extents(osb)) diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index e92382cbca5..172f9fbc9fc 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -51,6 +51,9 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, enum ocfs2_alloc_restarted *reason_ret); +int ocfs2_simple_size_update(struct inode *inode, + struct buffer_head *di_bh, + u64 new_i_size); int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 7aa00d51187..229e707bc05 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -28,6 +28,7 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/pagemap.h> +#include <linux/quotaops.h> #include <asm/byteorder.h> @@ -37,6 +38,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "blockcheck.h" #include "dlmglue.h" #include "extent_map.h" #include "file.h" @@ -214,12 +216,11 @@ static int ocfs2_init_locked_inode(struct inode *inode, void *opaque) return 0; } -int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, - int create_ino) +void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, + int create_ino) { struct super_block *sb; struct ocfs2_super *osb; - int status = -EINVAL; int use_plocks = 1; mlog_entry("(0x%p, size:%llu)\n", inode, @@ -232,25 +233,17 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) use_plocks = 0; - /* this means that read_inode cannot create a superblock inode - * today. change if needed. */ - if (!OCFS2_IS_VALID_DINODE(fe) || - !(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { - mlog(0, "Invalid dinode: i_ino=%lu, i_blkno=%llu, " - "signature = %.*s, flags = 0x%x\n", - inode->i_ino, - (unsigned long long)le64_to_cpu(fe->i_blkno), 7, - fe->i_signature, le32_to_cpu(fe->i_flags)); - goto bail; - } + /* + * These have all been checked by ocfs2_read_inode_block() or set + * by ocfs2_mknod_locked(), so a failure is a code bug. + */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); /* This means that read_inode + cannot create a superblock + inode today. change if + that is needed. */ + BUG_ON(!(fe->i_flags & cpu_to_le32(OCFS2_VALID_FL))); + BUG_ON(le32_to_cpu(fe->i_fs_generation) != osb->fs_generation); - if (le32_to_cpu(fe->i_fs_generation) != osb->fs_generation) { - mlog(ML_ERROR, "file entry generation does not match " - "superblock! osb->fs_generation=%x, " - "fe->i_fs_generation=%x\n", - osb->fs_generation, le32_to_cpu(fe->i_fs_generation)); - goto bail; - } OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr); @@ -284,14 +277,18 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, inode->i_nlink = le16_to_cpu(fe->i_links_count); - if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) + if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL)) { OCFS2_I(inode)->ip_flags |= OCFS2_INODE_SYSTEM_FILE; + inode->i_flags |= S_NOQUOTA; + } if (fe->i_flags & cpu_to_le32(OCFS2_LOCAL_ALLOC_FL)) { OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; mlog(0, "local alloc inode: i_ino=%lu\n", inode->i_ino); } else if (fe->i_flags & cpu_to_le32(OCFS2_BITMAP_FL)) { OCFS2_I(inode)->ip_flags |= OCFS2_INODE_BITMAP; + } else if (fe->i_flags & cpu_to_le32(OCFS2_QUOTA_FL)) { + inode->i_flags |= S_NOQUOTA; } else if (fe->i_flags & cpu_to_le32(OCFS2_SUPER_BLOCK_FL)) { mlog(0, "superblock inode: i_ino=%lu\n", inode->i_ino); /* we can't actually hit this as read_inode can't @@ -354,10 +351,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, ocfs2_set_inode_flags(inode); - status = 0; -bail: - mlog_exit(status); - return status; + mlog_exit_void(); } static int ocfs2_read_locked_inode(struct inode *inode, @@ -460,11 +454,14 @@ static int ocfs2_read_locked_inode(struct inode *inode, } } - if (can_lock) - status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh, - OCFS2_BH_IGNORE_CACHE); - else + if (can_lock) { + status = ocfs2_read_inode_block_full(inode, &bh, + OCFS2_BH_IGNORE_CACHE); + } else { status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); + if (!status) + status = ocfs2_validate_inode_block(osb->sb, bh); + } if (status < 0) { mlog_errno(status); goto bail; @@ -472,12 +469,6 @@ static int ocfs2_read_locked_inode(struct inode *inode, status = -EINVAL; fe = (struct ocfs2_dinode *) bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - mlog(0, "Invalid dinode #%llu: signature = %.*s\n", - (unsigned long long)args->fi_blkno, 7, - fe->i_signature); - goto bail; - } /* * This is a code bug. Right now the caller needs to @@ -491,10 +482,9 @@ static int ocfs2_read_locked_inode(struct inode *inode, if (S_ISCHR(le16_to_cpu(fe->i_mode)) || S_ISBLK(le16_to_cpu(fe->i_mode))) - inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); + inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev)); - if (ocfs2_populate_inode(inode, fe, 0) < 0) - goto bail; + ocfs2_populate_inode(inode, fe, 0); BUG_ON(args->fi_blkno != le64_to_cpu(fe->i_blkno)); @@ -547,8 +537,8 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, goto out; } - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out; @@ -615,7 +605,8 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail; } - handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS); + handle = ocfs2_start_trans(osb, OCFS2_DELETE_INODE_CREDITS + + ocfs2_quota_trans_credits(inode->i_sb)); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -630,8 +621,8 @@ static int ocfs2_remove_inode(struct inode *inode, } /* set the inodes dtime */ - status = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail_commit; @@ -647,6 +638,7 @@ static int ocfs2_remove_inode(struct inode *inode, } ocfs2_remove_from_cache(inode, di_bh); + vfs_dq_free_inode(inode); status = ocfs2_free_dinode(handle, inode_alloc_inode, inode_alloc_bh, di); @@ -929,7 +921,10 @@ void ocfs2_delete_inode(struct inode *inode) mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); - if (is_bad_inode(inode)) { + /* When we fail in read_inode() we mark inode as bad. The second test + * catches the case when inode allocation fails before allocating + * a block for inode. */ + if (is_bad_inode(inode) || !OCFS2_I(inode)->ip_blkno) { mlog(0, "Skipping delete of bad inode\n"); goto bail; } @@ -1195,8 +1190,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle, mlog_entry("(inode %llu)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; @@ -1264,3 +1259,89 @@ void ocfs2_refresh_inode(struct inode *inode, spin_unlock(&OCFS2_I(inode)->ip_lock); } + +int ocfs2_validate_inode_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)bh->b_data; + + mlog(0, "Validating dinode %llu\n", + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &di->i_check); + if (rc) { + mlog(ML_ERROR, "Checksum failed for dinode %llu\n", + (unsigned long long)bh->b_blocknr); + goto bail; + } + + /* + * Errors after here are fatal. + */ + + rc = -EINVAL; + + if (!OCFS2_IS_VALID_DINODE(di)) { + ocfs2_error(sb, "Invalid dinode #%llu: signature = %.*s\n", + (unsigned long long)bh->b_blocknr, 7, + di->i_signature); + goto bail; + } + + if (le64_to_cpu(di->i_blkno) != bh->b_blocknr) { + ocfs2_error(sb, "Invalid dinode #%llu: i_blkno is %llu\n", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(di->i_blkno)); + goto bail; + } + + if (!(di->i_flags & cpu_to_le32(OCFS2_VALID_FL))) { + ocfs2_error(sb, + "Invalid dinode #%llu: OCFS2_VALID_FL not set\n", + (unsigned long long)bh->b_blocknr); + goto bail; + } + + if (le32_to_cpu(di->i_fs_generation) != + OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Invalid dinode #%llu: fs_generation is %u\n", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(di->i_fs_generation)); + goto bail; + } + + rc = 0; + +bail: + return rc; +} + +int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, + int flags) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, &tmp, + flags, ocfs2_validate_inode_block); + + /* If ocfs2_read_blocks() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + +int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh) +{ + return ocfs2_read_inode_block_full(inode, bh, 0); +} diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 2f37af9bcc4..eb3c302b38d 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -128,8 +128,8 @@ struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, unsigned flags, int sysfile_type); int ocfs2_inode_init_private(struct inode *inode); int ocfs2_inode_revalidate(struct dentry *dentry); -int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, - int create_ino); +void ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, + int create_ino); void ocfs2_read_inode(struct inode *inode); void ocfs2_read_inode2(struct inode *inode, void *opaque); ssize_t ocfs2_rw_direct(int rw, struct file *filp, char *buf, @@ -142,6 +142,8 @@ int ocfs2_mark_inode_dirty(handle_t *handle, struct buffer_head *bh); int ocfs2_aio_read(struct file *file, struct kiocb *req, struct iocb *iocb); int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb); +struct buffer_head *ocfs2_bread(struct inode *inode, + int block, int *err, int reada); void ocfs2_set_inode_flags(struct inode *inode); void ocfs2_get_inode_flags(struct ocfs2_inode_info *oi); @@ -153,4 +155,16 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode) return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); } +/* Validate that a bh contains a valid inode */ +int ocfs2_validate_inode_block(struct super_block *sb, + struct buffer_head *bh); +/* + * Read an inode block into *bh. If *bh is NULL, a bh will be allocated. + * This is a cached read. The inode will be validated with + * ocfs2_validate_inode_block(). + */ +int ocfs2_read_inode_block(struct inode *inode, struct buffer_head **bh); +/* The same, but can be passed OCFS2_BH_* flags */ +int ocfs2_read_inode_block_full(struct inode *inode, struct buffer_head **bh, + int flags); #endif /* OCFS2_INODE_H */ diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 99fe9d584f3..57d7d25a2b9 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -35,6 +35,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "blockcheck.h" #include "dir.h" #include "dlmglue.h" #include "extent_map.h" @@ -45,6 +46,7 @@ #include "slot_map.h" #include "super.h" #include "sysfile.h" +#include "quota.h" #include "buffer_head_io.h" @@ -52,10 +54,10 @@ DEFINE_SPINLOCK(trans_inc_lock); static int ocfs2_force_read_journal(struct inode *inode); static int ocfs2_recover_node(struct ocfs2_super *osb, - int node_num); + int node_num, int slot_num); static int __ocfs2_recovery_thread(void *arg); static int ocfs2_commit_cache(struct ocfs2_super *osb); -static int ocfs2_wait_on_mount(struct ocfs2_super *osb); +static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota); static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, int dirty, int replayed); static int ocfs2_trylock_journal(struct ocfs2_super *osb, @@ -64,6 +66,17 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, int slot); static int ocfs2_commit_thread(void *arg); +static inline int ocfs2_wait_on_mount(struct ocfs2_super *osb) +{ + return __ocfs2_wait_on_mount(osb, 0); +} + +static inline int ocfs2_wait_on_quotas(struct ocfs2_super *osb) +{ + return __ocfs2_wait_on_mount(osb, 1); +} + + /* * The recovery_list is a simple linked list of node numbers to recover. @@ -256,11 +269,9 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) BUG_ON(osb->journal->j_state == OCFS2_JOURNAL_FREE); BUG_ON(max_buffs <= 0); - /* JBD might support this, but our journalling code doesn't yet. */ - if (journal_current_handle()) { - mlog(ML_ERROR, "Recursive transaction attempted!\n"); - BUG(); - } + /* Nested transaction? Just return the handle... */ + if (journal_current_handle()) + return jbd2_journal_start(journal, max_buffs); down_read(&osb->journal->j_trans_barrier); @@ -285,16 +296,18 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) int ocfs2_commit_trans(struct ocfs2_super *osb, handle_t *handle) { - int ret; + int ret, nested; struct ocfs2_journal *journal = osb->journal; BUG_ON(!handle); + nested = handle->h_ref > 1; ret = jbd2_journal_stop(handle); if (ret < 0) mlog_errno(ret); - up_read(&journal->j_trans_barrier); + if (!nested) + up_read(&journal->j_trans_barrier); return ret; } @@ -357,10 +370,137 @@ bail: return status; } -int ocfs2_journal_access(handle_t *handle, - struct inode *inode, - struct buffer_head *bh, - int type) +struct ocfs2_triggers { + struct jbd2_buffer_trigger_type ot_triggers; + int ot_offset; +}; + +static inline struct ocfs2_triggers *to_ocfs2_trigger(struct jbd2_buffer_trigger_type *triggers) +{ + return container_of(triggers, struct ocfs2_triggers, ot_triggers); +} + +static void ocfs2_commit_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct ocfs2_triggers *ot = to_ocfs2_trigger(triggers); + + /* + * We aren't guaranteed to have the superblock here, so we + * must unconditionally compute the ecc data. + * __ocfs2_journal_access() will only set the triggers if + * metaecc is enabled. + */ + ocfs2_block_check_compute(data, size, data + ot->ot_offset); +} + +/* + * Quota blocks have their own trigger because the struct ocfs2_block_check + * offset depends on the blocksize. + */ +static void ocfs2_dq_commit_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct ocfs2_disk_dqtrailer *dqt = + ocfs2_block_dqtrailer(size, data); + + /* + * We aren't guaranteed to have the superblock here, so we + * must unconditionally compute the ecc data. + * __ocfs2_journal_access() will only set the triggers if + * metaecc is enabled. + */ + ocfs2_block_check_compute(data, size, &dqt->dq_check); +} + +/* + * Directory blocks also have their own trigger because the + * struct ocfs2_block_check offset depends on the blocksize. + */ +static void ocfs2_db_commit_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh, + void *data, size_t size) +{ + struct ocfs2_dir_block_trailer *trailer = + ocfs2_dir_trailer_from_size(size, data); + + /* + * We aren't guaranteed to have the superblock here, so we + * must unconditionally compute the ecc data. + * __ocfs2_journal_access() will only set the triggers if + * metaecc is enabled. + */ + ocfs2_block_check_compute(data, size, &trailer->db_check); +} + +static void ocfs2_abort_trigger(struct jbd2_buffer_trigger_type *triggers, + struct buffer_head *bh) +{ + mlog(ML_ERROR, + "ocfs2_abort_trigger called by JBD2. bh = 0x%lx, " + "bh->b_blocknr = %llu\n", + (unsigned long)bh, + (unsigned long long)bh->b_blocknr); + + /* We aren't guaranteed to have the superblock here - but if we + * don't, it'll just crash. */ + ocfs2_error(bh->b_assoc_map->host->i_sb, + "JBD2 has aborted our journal, ocfs2 cannot continue\n"); +} + +static struct ocfs2_triggers di_triggers = { + .ot_triggers = { + .t_commit = ocfs2_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_dinode, i_check), +}; + +static struct ocfs2_triggers eb_triggers = { + .ot_triggers = { + .t_commit = ocfs2_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_extent_block, h_check), +}; + +static struct ocfs2_triggers gd_triggers = { + .ot_triggers = { + .t_commit = ocfs2_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_group_desc, bg_check), +}; + +static struct ocfs2_triggers db_triggers = { + .ot_triggers = { + .t_commit = ocfs2_db_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, +}; + +static struct ocfs2_triggers xb_triggers = { + .ot_triggers = { + .t_commit = ocfs2_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, + .ot_offset = offsetof(struct ocfs2_xattr_block, xb_check), +}; + +static struct ocfs2_triggers dq_triggers = { + .ot_triggers = { + .t_commit = ocfs2_dq_commit_trigger, + .t_abort = ocfs2_abort_trigger, + }, +}; + +static int __ocfs2_journal_access(handle_t *handle, + struct inode *inode, + struct buffer_head *bh, + struct ocfs2_triggers *triggers, + int type) { int status; @@ -406,6 +546,8 @@ int ocfs2_journal_access(handle_t *handle, status = -EINVAL; mlog(ML_ERROR, "Uknown access type!\n"); } + if (!status && ocfs2_meta_ecc(OCFS2_SB(inode->i_sb)) && triggers) + jbd2_journal_set_triggers(bh, &triggers->ot_triggers); mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); if (status < 0) @@ -416,6 +558,54 @@ int ocfs2_journal_access(handle_t *handle, return status; } +int ocfs2_journal_access_di(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, inode, bh, &di_triggers, + type); +} + +int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, inode, bh, &eb_triggers, + type); +} + +int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, inode, bh, &gd_triggers, + type); +} + +int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, inode, bh, &db_triggers, + type); +} + +int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, inode, bh, &xb_triggers, + type); +} + +int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, inode, bh, &dq_triggers, + type); +} + +int ocfs2_journal_access(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type) +{ + return __ocfs2_journal_access(handle, inode, bh, NULL, type); +} + int ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh) { @@ -434,20 +624,6 @@ int ocfs2_journal_dirty(handle_t *handle, return status; } -#ifdef CONFIG_OCFS2_COMPAT_JBD -int ocfs2_journal_dirty_data(handle_t *handle, - struct buffer_head *bh) -{ - int err = journal_dirty_data(handle, bh); - if (err) - mlog_errno(err); - /* TODO: When we can handle it, abort the handle and go RO on - * error here. */ - - return err; -} -#endif - #define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) void ocfs2_set_journal_params(struct ocfs2_super *osb) @@ -587,17 +763,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, mlog_entry_void(); fe = (struct ocfs2_dinode *)bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - /* This is called from startup/shutdown which will - * handle the errors in a specific manner, so no need - * to call ocfs2_error() here. */ - mlog(ML_ERROR, "Journal dinode %llu has invalid " - "signature: %.*s", - (unsigned long long)le64_to_cpu(fe->i_blkno), 7, - fe->i_signature); - status = -EIO; - goto out; - } + + /* The journal bh on the osb always comes from ocfs2_journal_init() + * and was validated there inside ocfs2_inode_lock_full(). It's a + * code bug if we mess it up. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); flags = le32_to_cpu(fe->id1.journal1.ij_flags); if (dirty) @@ -609,11 +779,11 @@ static int ocfs2_journal_toggle_dirty(struct ocfs2_super *osb, if (replayed) ocfs2_bump_recovery_generation(fe); + ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); status = ocfs2_write_block(osb, bh, journal->j_inode); if (status < 0) mlog_errno(status); -out: mlog_exit(status); return status; } @@ -878,6 +1048,7 @@ struct ocfs2_la_recovery_item { int lri_slot; struct ocfs2_dinode *lri_la_dinode; struct ocfs2_dinode *lri_tl_dinode; + struct ocfs2_quota_recovery *lri_qrec; }; /* Does the second half of the recovery process. By this point, the @@ -898,6 +1069,7 @@ void ocfs2_complete_recovery(struct work_struct *work) struct ocfs2_super *osb = journal->j_osb; struct ocfs2_dinode *la_dinode, *tl_dinode; struct ocfs2_la_recovery_item *item, *n; + struct ocfs2_quota_recovery *qrec; LIST_HEAD(tmp_la_list); mlog_entry_void(); @@ -913,6 +1085,8 @@ void ocfs2_complete_recovery(struct work_struct *work) mlog(0, "Complete recovery for slot %d\n", item->lri_slot); + ocfs2_wait_on_quotas(osb); + la_dinode = item->lri_la_dinode; if (la_dinode) { mlog(0, "Clean up local alloc %llu\n", @@ -943,6 +1117,16 @@ void ocfs2_complete_recovery(struct work_struct *work) if (ret < 0) mlog_errno(ret); + qrec = item->lri_qrec; + if (qrec) { + mlog(0, "Recovering quota files"); + ret = ocfs2_finish_quota_recovery(osb, qrec, + item->lri_slot); + if (ret < 0) + mlog_errno(ret); + /* Recovery info is already freed now */ + } + kfree(item); } @@ -956,7 +1140,8 @@ void ocfs2_complete_recovery(struct work_struct *work) static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, int slot_num, struct ocfs2_dinode *la_dinode, - struct ocfs2_dinode *tl_dinode) + struct ocfs2_dinode *tl_dinode, + struct ocfs2_quota_recovery *qrec) { struct ocfs2_la_recovery_item *item; @@ -971,6 +1156,9 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, if (tl_dinode) kfree(tl_dinode); + if (qrec) + ocfs2_free_quota_recovery(qrec); + mlog_errno(-ENOMEM); return; } @@ -979,6 +1167,7 @@ static void ocfs2_queue_recovery_completion(struct ocfs2_journal *journal, item->lri_la_dinode = la_dinode; item->lri_slot = slot_num; item->lri_tl_dinode = tl_dinode; + item->lri_qrec = qrec; spin_lock(&journal->j_lock); list_add_tail(&item->lri_list, &journal->j_la_cleanups); @@ -998,6 +1187,7 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) ocfs2_queue_recovery_completion(journal, osb->slot_num, osb->local_alloc_copy, + NULL, NULL); ocfs2_schedule_truncate_log_flush(osb, 0); @@ -1006,11 +1196,26 @@ void ocfs2_complete_mount_recovery(struct ocfs2_super *osb) } } +void ocfs2_complete_quota_recovery(struct ocfs2_super *osb) +{ + if (osb->quota_rec) { + ocfs2_queue_recovery_completion(osb->journal, + osb->slot_num, + NULL, + NULL, + osb->quota_rec); + osb->quota_rec = NULL; + } +} + static int __ocfs2_recovery_thread(void *arg) { - int status, node_num; + int status, node_num, slot_num; struct ocfs2_super *osb = arg; struct ocfs2_recovery_map *rm = osb->recovery_map; + int *rm_quota = NULL; + int rm_quota_used = 0, i; + struct ocfs2_quota_recovery *qrec; mlog_entry_void(); @@ -1019,6 +1224,11 @@ static int __ocfs2_recovery_thread(void *arg) goto bail; } + rm_quota = kzalloc(osb->max_slots * sizeof(int), GFP_NOFS); + if (!rm_quota) { + status = -ENOMEM; + goto bail; + } restart: status = ocfs2_super_lock(osb, 1); if (status < 0) { @@ -1032,8 +1242,28 @@ restart: * clear it until ocfs2_recover_node() has succeeded. */ node_num = rm->rm_entries[0]; spin_unlock(&osb->osb_lock); - - status = ocfs2_recover_node(osb, node_num); + mlog(0, "checking node %d\n", node_num); + slot_num = ocfs2_node_num_to_slot(osb, node_num); + if (slot_num == -ENOENT) { + status = 0; + mlog(0, "no slot for this node, so no recovery" + "required.\n"); + goto skip_recovery; + } + mlog(0, "node %d was using slot %d\n", node_num, slot_num); + + /* It is a bit subtle with quota recovery. We cannot do it + * immediately because we have to obtain cluster locks from + * quota files and we also don't want to just skip it because + * then quota usage would be out of sync until some node takes + * the slot. So we remember which nodes need quota recovery + * and when everything else is done, we recover quotas. */ + for (i = 0; i < rm_quota_used && rm_quota[i] != slot_num; i++); + if (i == rm_quota_used) + rm_quota[rm_quota_used++] = slot_num; + + status = ocfs2_recover_node(osb, node_num, slot_num); +skip_recovery: if (!status) { ocfs2_recovery_map_clear(osb, node_num); } else { @@ -1055,13 +1285,27 @@ restart: if (status < 0) mlog_errno(status); + /* Now it is right time to recover quotas... We have to do this under + * superblock lock so that noone can start using the slot (and crash) + * before we recover it */ + for (i = 0; i < rm_quota_used; i++) { + qrec = ocfs2_begin_quota_recovery(osb, rm_quota[i]); + if (IS_ERR(qrec)) { + status = PTR_ERR(qrec); + mlog_errno(status); + continue; + } + ocfs2_queue_recovery_completion(osb->journal, rm_quota[i], + NULL, NULL, qrec); + } + ocfs2_super_unlock(osb, 1); /* We always run recovery on our own orphan dir - the dead * node(s) may have disallowd a previos inode delete. Re-processing * is therefore required. */ ocfs2_queue_recovery_completion(osb->journal, osb->slot_num, NULL, - NULL); + NULL, NULL); bail: mutex_lock(&osb->recovery_lock); @@ -1076,6 +1320,9 @@ bail: mutex_unlock(&osb->recovery_lock); + if (rm_quota) + kfree(rm_quota); + mlog_exit(status); /* no one is callint kthread_stop() for us so the kthread() api * requires that we call do_exit(). And it isn't exported, but @@ -1135,8 +1382,7 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb, } SET_INODE_JOURNAL(inode); - status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh, - OCFS2_BH_IGNORE_CACHE); + status = ocfs2_read_inode_block_full(inode, bh, OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; @@ -1268,6 +1514,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, osb->slot_recovery_generations[slot_num] = ocfs2_get_recovery_generation(fe); + ocfs2_compute_meta_ecc(osb->sb, bh->b_data, &fe->i_check); status = ocfs2_write_block(osb, bh, inode); if (status < 0) mlog_errno(status); @@ -1304,31 +1551,19 @@ done: * far less concerning. */ static int ocfs2_recover_node(struct ocfs2_super *osb, - int node_num) + int node_num, int slot_num) { int status = 0; - int slot_num; struct ocfs2_dinode *la_copy = NULL; struct ocfs2_dinode *tl_copy = NULL; - mlog_entry("(node_num=%d, osb->node_num = %d)\n", - node_num, osb->node_num); - - mlog(0, "checking node %d\n", node_num); + mlog_entry("(node_num=%d, slot_num=%d, osb->node_num = %d)\n", + node_num, slot_num, osb->node_num); /* Should not ever be called to recover ourselves -- in that * case we should've called ocfs2_journal_load instead. */ BUG_ON(osb->node_num == node_num); - slot_num = ocfs2_node_num_to_slot(osb, node_num); - if (slot_num == -ENOENT) { - status = 0; - mlog(0, "no slot for this node, so no recovery required.\n"); - goto done; - } - - mlog(0, "node %d was using slot %d\n", node_num, slot_num); - status = ocfs2_replay_journal(osb, node_num, slot_num); if (status < 0) { if (status == -EBUSY) { @@ -1364,7 +1599,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb, /* This will kfree the memory pointed to by la_copy and tl_copy */ ocfs2_queue_recovery_completion(osb->journal, slot_num, la_copy, - tl_copy); + tl_copy, NULL); status = 0; done: @@ -1659,13 +1894,14 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, return ret; } -static int ocfs2_wait_on_mount(struct ocfs2_super *osb) +static int __ocfs2_wait_on_mount(struct ocfs2_super *osb, int quota) { /* This check is good because ocfs2 will wait on our recovery * thread before changing it to something other than MOUNTED * or DISABLED. */ wait_event(osb->osb_mount_event, - atomic_read(&osb->vol_state) == VOLUME_MOUNTED || + (!quota && atomic_read(&osb->vol_state) == VOLUME_MOUNTED) || + atomic_read(&osb->vol_state) == VOLUME_MOUNTED_QUOTAS || atomic_read(&osb->vol_state) == VOLUME_DISABLED); /* If there's an error on mount, then we may never get to the diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index d4d14e9a3ce..3c3532e1307 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -27,12 +27,7 @@ #define OCFS2_JOURNAL_H #include <linux/fs.h> -#ifndef CONFIG_OCFS2_COMPAT_JBD -# include <linux/jbd2.h> -#else -# include <linux/jbd.h> -# include "ocfs2_jbd_compat.h" -#endif +#include <linux/jbd2.h> enum ocfs2_journal_state { OCFS2_JOURNAL_FREE = 0, @@ -173,6 +168,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num); int ocfs2_mark_dead_nodes(struct ocfs2_super *osb); void ocfs2_complete_mount_recovery(struct ocfs2_super *osb); +void ocfs2_complete_quota_recovery(struct ocfs2_super *osb); static inline void ocfs2_start_checkpoint(struct ocfs2_super *osb) { @@ -216,9 +212,12 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode) * ocfs2_extend_trans - Extend a handle by nblocks credits. This may * commit the handle to disk in the process, but will * not release any locks taken during the transaction. - * ocfs2_journal_access - Notify the handle that we want to journal this + * ocfs2_journal_access* - Notify the handle that we want to journal this * buffer. Will have to call ocfs2_journal_dirty once * we've actually dirtied it. Type is one of . or . + * Always call the specific flavor of + * ocfs2_journal_access_*() unless you intend to + * manage the checksum by hand. * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before * the current handle commits. @@ -248,10 +247,29 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks); #define OCFS2_JOURNAL_ACCESS_WRITE 1 #define OCFS2_JOURNAL_ACCESS_UNDO 2 -int ocfs2_journal_access(handle_t *handle, - struct inode *inode, - struct buffer_head *bh, - int type); + +/* ocfs2_inode */ +int ocfs2_journal_access_di(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); +/* ocfs2_extent_block */ +int ocfs2_journal_access_eb(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); +/* ocfs2_group_desc */ +int ocfs2_journal_access_gd(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); +/* ocfs2_xattr_block */ +int ocfs2_journal_access_xb(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); +/* quota blocks */ +int ocfs2_journal_access_dq(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); +/* dirblock */ +int ocfs2_journal_access_db(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); +/* Anything that has no ecc */ +int ocfs2_journal_access(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); + /* * A word about the journal_access/journal_dirty "dance". It is * entirely legal to journal_access a buffer more than once (as long @@ -273,10 +291,6 @@ int ocfs2_journal_access(handle_t *handle, */ int ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh); -#ifdef CONFIG_OCFS2_COMPAT_JBD -int ocfs2_journal_dirty_data(handle_t *handle, - struct buffer_head *bh); -#endif /* * Credit Macros: @@ -293,6 +307,37 @@ int ocfs2_journal_dirty_data(handle_t *handle, /* extended attribute block update */ #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 +/* global quotafile inode update, data block */ +#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) + +/* + * The two writes below can accidentally see global info dirty due + * to set_info() quotactl so make them prepared for the writes. + */ +/* quota data block, global info */ +/* Write to local quota file */ +#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1) + +/* global quota data block, local quota data block, global quota inode, + * global quota info */ +#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3) + +static inline int ocfs2_quota_trans_credits(struct super_block *sb) +{ + int credits = 0; + + if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) + credits += OCFS2_QWRITE_CREDITS; + if (OCFS2_HAS_RO_COMPAT_FEATURE(sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) + credits += OCFS2_QWRITE_CREDITS; + return credits; +} + +/* Number of credits needed for removing quota structure from file */ +int ocfs2_calc_qdel_credits(struct super_block *sb, int type); +/* Number of credits needed for initialization of new quota structure */ +int ocfs2_calc_qinit_credits(struct super_block *sb, int type); + /* group extend. inode update and last group update. */ #define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) @@ -303,8 +348,11 @@ int ocfs2_journal_dirty_data(handle_t *handle, * prev. group desc. if we relink. */ #define OCFS2_SUBALLOC_ALLOC (3) -#define OCFS2_INLINE_TO_EXTENTS_CREDITS (OCFS2_SUBALLOC_ALLOC \ - + OCFS2_INODE_UPDATE_CREDITS) +static inline int ocfs2_inline_to_extents_credits(struct super_block *sb) +{ + return OCFS2_SUBALLOC_ALLOC + OCFS2_INODE_UPDATE_CREDITS + + ocfs2_quota_trans_credits(sb); +} /* dinode + group descriptor update. We don't relink on free yet. */ #define OCFS2_SUBALLOC_FREE (2) @@ -313,16 +361,23 @@ int ocfs2_journal_dirty_data(handle_t *handle, #define OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC (OCFS2_SUBALLOC_FREE \ + OCFS2_TRUNCATE_LOG_UPDATE) -#define OCFS2_REMOVE_EXTENT_CREDITS (OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS) +static inline int ocfs2_remove_extent_credits(struct super_block *sb) +{ + return OCFS2_TRUNCATE_LOG_UPDATE + OCFS2_INODE_UPDATE_CREDITS + + ocfs2_quota_trans_credits(sb); +} /* data block for new dir/symlink, 2 for bitmap updates (bitmap fe + * bitmap block for the new bit) */ #define OCFS2_DIR_LINK_ADDITIONAL_CREDITS (1 + 2) /* parent fe, parent block, new file entry, inode alloc fe, inode alloc - * group descriptor + mkdir/symlink blocks */ -#define OCFS2_MKNOD_CREDITS (3 + OCFS2_SUBALLOC_ALLOC \ - + OCFS2_DIR_LINK_ADDITIONAL_CREDITS) + * group descriptor + mkdir/symlink blocks + quota update */ +static inline int ocfs2_mknod_credits(struct super_block *sb) +{ + return 3 + OCFS2_SUBALLOC_ALLOC + OCFS2_DIR_LINK_ADDITIONAL_CREDITS + + ocfs2_quota_trans_credits(sb); +} /* local alloc metadata change + main bitmap updates */ #define OCFS2_WINDOW_MOVE_CREDITS (OCFS2_INODE_UPDATE_CREDITS \ @@ -332,13 +387,21 @@ int ocfs2_journal_dirty_data(handle_t *handle, * for the dinode, one for the new block. */ #define OCFS2_SIMPLE_DIR_EXTEND_CREDITS (2) -/* file update (nlink, etc) + directory mtime/ctime + dir entry block */ -#define OCFS2_LINK_CREDITS (2*OCFS2_INODE_UPDATE_CREDITS + 1) +/* file update (nlink, etc) + directory mtime/ctime + dir entry block + quota + * update on dir */ +static inline int ocfs2_link_credits(struct super_block *sb) +{ + return 2*OCFS2_INODE_UPDATE_CREDITS + 1 + + ocfs2_quota_trans_credits(sb); +} /* inode + dir inode (if we unlink a dir), + dir entry block + orphan * dir inode link */ -#define OCFS2_UNLINK_CREDITS (2 * OCFS2_INODE_UPDATE_CREDITS + 1 \ - + OCFS2_LINK_CREDITS) +static inline int ocfs2_unlink_credits(struct super_block *sb) +{ + /* The quota update from ocfs2_link_credits is unused here... */ + return 2 * OCFS2_INODE_UPDATE_CREDITS + 1 + ocfs2_link_credits(sb); +} /* dinode + orphan dir dinode + inode alloc dinode + orphan dir entry + * inode alloc group descriptor */ @@ -347,8 +410,10 @@ int ocfs2_journal_dirty_data(handle_t *handle, /* dinode update, old dir dinode update, new dir dinode update, old * dir dir entry, new dir dir entry, dir entry update for renaming * directory + target unlink */ -#define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \ - + OCFS2_UNLINK_CREDITS) +static inline int ocfs2_rename_credits(struct super_block *sb) +{ + return 3 * OCFS2_INODE_UPDATE_CREDITS + 3 + ocfs2_unlink_credits(sb); +} /* global bitmap dinode, group desc., relinked group, * suballocator dinode, group desc., relinked group, @@ -386,18 +451,19 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb, * credit for the dinode there. */ extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); - return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks; + return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks + + ocfs2_quota_trans_credits(sb); } static inline int ocfs2_calc_symlink_credits(struct super_block *sb) { - int blocks = OCFS2_MKNOD_CREDITS; + int blocks = ocfs2_mknod_credits(sb); /* links can be longer than one block so we may update many * within our single allocated extent. */ blocks += ocfs2_clusters_to_blocks(sb, 1); - return blocks; + return blocks + ocfs2_quota_trans_credits(sb); } static inline int ocfs2_calc_group_alloc_credits(struct super_block *sb, @@ -434,6 +500,8 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, /* update to the truncate log. */ credits += OCFS2_TRUNCATE_LOG_UPDATE; + credits += ocfs2_quota_trans_credits(sb); + return credits; } diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 687b28713c3..ec70cdbe77f 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -36,6 +36,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "blockcheck.h" #include "dlmglue.h" #include "inode.h" #include "journal.h" @@ -248,8 +249,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) goto bail; } - status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, - &alloc_bh, OCFS2_BH_IGNORE_CACHE); + status = ocfs2_read_inode_block_full(inode, &alloc_bh, + OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; @@ -382,8 +383,8 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) } memcpy(alloc_copy, alloc, bh->b_size); - status = ocfs2_journal_access(handle, local_alloc_inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, local_alloc_inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_commit; @@ -459,8 +460,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, mutex_lock(&inode->i_mutex); - status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, - &alloc_bh, OCFS2_BH_IGNORE_CACHE); + status = ocfs2_read_inode_block_full(inode, &alloc_bh, + OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; @@ -476,6 +477,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, alloc = (struct ocfs2_dinode *) alloc_bh->b_data; ocfs2_clear_local_alloc(alloc); + ocfs2_compute_meta_ecc(osb->sb, alloc_bh->b_data, &alloc->i_check); status = ocfs2_write_block(osb, alloc_bh, inode); if (status < 0) mlog_errno(status); @@ -762,9 +764,9 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, * delete bits from it! */ *num_bits = bits_wanted; - status = ocfs2_journal_access(handle, local_alloc_inode, - osb->local_alloc_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, local_alloc_inode, + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -1240,9 +1242,9 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, } memcpy(alloc_copy, alloc, osb->local_alloc_bh->b_size); - status = ocfs2_journal_access(handle, local_alloc_inode, - osb->local_alloc_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, local_alloc_inode, + osb->local_alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 2545e7402ef..084aba86c3b 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -40,6 +40,7 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/highmem.h> +#include <linux/quotaops.h> #define MLOG_MASK_PREFIX ML_NAMEI #include <cluster/masklog.h> @@ -61,17 +62,18 @@ #include "sysfile.h" #include "uptodate.h" #include "xattr.h" +#include "acl.h" #include "buffer_head_io.h" static int ocfs2_mknod_locked(struct ocfs2_super *osb, struct inode *dir, - struct dentry *dentry, int mode, + struct inode *inode, + struct dentry *dentry, dev_t dev, struct buffer_head **new_fe_bh, struct buffer_head *parent_fe_bh, handle_t *handle, - struct inode **ret_inode, struct ocfs2_alloc_context *inode_ac); static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, @@ -186,6 +188,35 @@ bail: return ret; } +static struct inode *ocfs2_get_init_inode(struct inode *dir, int mode) +{ + struct inode *inode; + + inode = new_inode(dir->i_sb); + if (!inode) { + mlog(ML_ERROR, "new_inode failed!\n"); + return NULL; + } + + /* populate as many fields early on as possible - many of + * these are used by the support functions here and in + * callers. */ + if (S_ISDIR(mode)) + inode->i_nlink = 2; + else + inode->i_nlink = 1; + inode->i_uid = current_fsuid(); + if (dir->i_mode & S_ISGID) { + inode->i_gid = dir->i_gid; + if (S_ISDIR(mode)) + mode |= S_ISGID; + } else + inode->i_gid = current_fsgid(); + inode->i_mode = mode; + vfs_dq_init(inode); + return inode; +} + static int ocfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, @@ -201,6 +232,13 @@ static int ocfs2_mknod(struct inode *dir, struct inode *inode = NULL; struct ocfs2_alloc_context *inode_ac = NULL; struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *xattr_ac = NULL; + int want_clusters = 0; + int xattr_credits = 0; + struct ocfs2_security_xattr_info si = { + .enable = 1, + }; + int did_quota_inode = 0; mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, (unsigned long)dev, dentry->d_name.len, @@ -250,17 +288,46 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - /* Reserve a cluster if creating an extent based directory. */ - if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) { - status = ocfs2_reserve_clusters(osb, 1, &data_ac); - if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); + inode = ocfs2_get_init_inode(dir, mode); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + + /* get security xattr */ + status = ocfs2_init_security_get(inode, dir, &si); + if (status) { + if (status == -EOPNOTSUPP) + si.enable = 0; + else { + mlog_errno(status); goto leave; } } - handle = ocfs2_start_trans(osb, OCFS2_MKNOD_CREDITS); + /* calculate meta data/clusters for setting security and acl xattr */ + status = ocfs2_calc_xattr_init(dir, parent_fe_bh, mode, + &si, &want_clusters, + &xattr_credits, &xattr_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + /* Reserve a cluster if creating an extent based directory. */ + if (S_ISDIR(mode) && !ocfs2_supports_inline_data(osb)) + want_clusters += 1; + + status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + handle = ocfs2_start_trans(osb, ocfs2_mknod_credits(osb->sb) + + xattr_credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); handle = NULL; @@ -268,10 +335,19 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } + /* We don't use standard VFS wrapper because we don't want vfs_dq_init + * to be called. */ + if (sb_any_quota_active(osb->sb) && + osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { + status = -EDQUOT; + goto leave; + } + did_quota_inode = 1; + /* do the real work now. */ - status = ocfs2_mknod_locked(osb, dir, dentry, mode, dev, + status = ocfs2_mknod_locked(osb, dir, inode, dentry, dev, &new_fe_bh, parent_fe_bh, handle, - &inode, inode_ac); + inode_ac); if (status < 0) { mlog_errno(status); goto leave; @@ -285,8 +361,8 @@ static int ocfs2_mknod(struct inode *dir, goto leave; } - status = ocfs2_journal_access(handle, dir, parent_fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, dir, parent_fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; @@ -300,6 +376,22 @@ static int ocfs2_mknod(struct inode *dir, inc_nlink(dir); } + status = ocfs2_init_acl(handle, inode, dir, new_fe_bh, parent_fe_bh, + xattr_ac, data_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + if (si.enable) { + status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, + xattr_ac, data_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + } + status = ocfs2_add_entry(handle, dentry, inode, OCFS2_I(inode)->ip_blkno, parent_fe_bh, de_bh); @@ -320,6 +412,8 @@ static int ocfs2_mknod(struct inode *dir, d_instantiate(dentry, inode); status = 0; leave: + if (status < 0 && did_quota_inode) + vfs_dq_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); @@ -331,9 +425,13 @@ leave: brelse(new_fe_bh); brelse(de_bh); brelse(parent_fe_bh); + kfree(si.name); + kfree(si.value); - if ((status < 0) && inode) + if ((status < 0) && inode) { + clear_nlink(inode); iput(inode); + } if (inode_ac) ocfs2_free_alloc_context(inode_ac); @@ -341,6 +439,9 @@ leave: if (data_ac) ocfs2_free_alloc_context(data_ac); + if (xattr_ac) + ocfs2_free_alloc_context(xattr_ac); + mlog_exit(status); return status; @@ -348,12 +449,12 @@ leave: static int ocfs2_mknod_locked(struct ocfs2_super *osb, struct inode *dir, - struct dentry *dentry, int mode, + struct inode *inode, + struct dentry *dentry, dev_t dev, struct buffer_head **new_fe_bh, struct buffer_head *parent_fe_bh, handle_t *handle, - struct inode **ret_inode, struct ocfs2_alloc_context *inode_ac) { int status = 0; @@ -361,14 +462,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, struct ocfs2_extent_list *fel; u64 fe_blkno = 0; u16 suballoc_bit; - struct inode *inode = NULL; - mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, mode, - (unsigned long)dev, dentry->d_name.len, + mlog_entry("(0x%p, 0x%p, %d, %lu, '%.*s')\n", dir, dentry, + inode->i_mode, (unsigned long)dev, dentry->d_name.len, dentry->d_name.name); *new_fe_bh = NULL; - *ret_inode = NULL; status = ocfs2_claim_new_inode(osb, handle, inode_ac, &suballoc_bit, &fe_blkno); @@ -377,23 +476,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, goto leave; } - inode = new_inode(dir->i_sb); - if (!inode) { - status = -ENOMEM; - mlog(ML_ERROR, "new_inode failed!\n"); - goto leave; - } - /* populate as many fields early on as possible - many of * these are used by the support functions here and in * callers. */ inode->i_ino = ino_from_blkno(osb->sb, fe_blkno); OCFS2_I(inode)->ip_blkno = fe_blkno; - if (S_ISDIR(mode)) - inode->i_nlink = 2; - else - inode->i_nlink = 1; - inode->i_mode = mode; spin_lock(&osb->osb_lock); inode->i_generation = osb->s_next_generation++; spin_unlock(&osb->osb_lock); @@ -406,8 +493,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, } ocfs2_set_new_buffer_uptodate(inode, *new_fe_bh); - status = ocfs2_journal_access(handle, inode, *new_fe_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + status = ocfs2_journal_access_di(handle, inode, *new_fe_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto leave; @@ -421,17 +508,11 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, fe->i_blkno = cpu_to_le64(fe_blkno); fe->i_suballoc_bit = cpu_to_le16(suballoc_bit); fe->i_suballoc_slot = cpu_to_le16(inode_ac->ac_alloc_slot); - fe->i_uid = cpu_to_le32(current_fsuid()); - if (dir->i_mode & S_ISGID) { - fe->i_gid = cpu_to_le32(dir->i_gid); - if (S_ISDIR(mode)) - mode |= S_ISGID; - } else - fe->i_gid = cpu_to_le32(current_fsgid()); - fe->i_mode = cpu_to_le16(mode); - if (S_ISCHR(mode) || S_ISBLK(mode)) + fe->i_uid = cpu_to_le32(inode->i_uid); + fe->i_gid = cpu_to_le32(inode->i_gid); + fe->i_mode = cpu_to_le16(inode->i_mode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) fe->id1.dev1.i_rdev = cpu_to_le64(huge_encode_dev(dev)); - fe->i_links_count = cpu_to_le16(inode->i_nlink); fe->i_last_eb_blk = 0; @@ -446,7 +527,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, /* * If supported, directories start with inline data. */ - if (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) { + if (S_ISDIR(inode->i_mode) && ocfs2_supports_inline_data(osb)) { u16 feat = le16_to_cpu(fe->i_dyn_features); fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); @@ -465,15 +546,7 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, goto leave; } - if (ocfs2_populate_inode(inode, fe, 1) < 0) { - mlog(ML_ERROR, "populate inode failed! bh->b_blocknr=%llu, " - "i_blkno=%llu, i_ino=%lu\n", - (unsigned long long)(*new_fe_bh)->b_blocknr, - (unsigned long long)le64_to_cpu(fe->i_blkno), - inode->i_ino); - BUG(); - } - + ocfs2_populate_inode(inode, fe, 1); ocfs2_inode_set_new(osb, inode); if (!ocfs2_mount_local(osb)) { status = ocfs2_create_new_inode_locks(inode); @@ -484,17 +557,12 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, status = 0; /* error in ocfs2_create_new_inode_locks is not * critical */ - *ret_inode = inode; leave: if (status < 0) { if (*new_fe_bh) { brelse(*new_fe_bh); *new_fe_bh = NULL; } - if (inode) { - clear_nlink(inode); - iput(inode); - } } mlog_exit(status); @@ -588,7 +656,7 @@ static int ocfs2_link(struct dentry *old_dentry, goto out_unlock_inode; } - handle = ocfs2_start_trans(osb, OCFS2_LINK_CREDITS); + handle = ocfs2_start_trans(osb, ocfs2_link_credits(osb->sb)); if (IS_ERR(handle)) { err = PTR_ERR(handle); handle = NULL; @@ -596,8 +664,8 @@ static int ocfs2_link(struct dentry *old_dentry, goto out_unlock_inode; } - err = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + err = ocfs2_journal_access_di(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (err < 0) { mlog_errno(err); goto out_commit; @@ -775,7 +843,7 @@ static int ocfs2_unlink(struct inode *dir, } } - handle = ocfs2_start_trans(osb, OCFS2_UNLINK_CREDITS); + handle = ocfs2_start_trans(osb, ocfs2_unlink_credits(osb->sb)); if (IS_ERR(handle)) { status = PTR_ERR(handle); handle = NULL; @@ -783,8 +851,8 @@ static int ocfs2_unlink(struct inode *dir, goto leave; } - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; @@ -1181,7 +1249,7 @@ static int ocfs2_rename(struct inode *old_dir, } } - handle = ocfs2_start_trans(osb, OCFS2_RENAME_CREDITS); + handle = ocfs2_start_trans(osb, ocfs2_rename_credits(osb->sb)); if (IS_ERR(handle)) { status = PTR_ERR(handle); handle = NULL; @@ -1197,8 +1265,8 @@ static int ocfs2_rename(struct inode *old_dir, goto bail; } } - status = ocfs2_journal_access(handle, new_inode, newfe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, new_inode, newfe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -1244,8 +1312,8 @@ static int ocfs2_rename(struct inode *old_dir, old_inode->i_ctime = CURRENT_TIME; mark_inode_dirty(old_inode); - status = ocfs2_journal_access(handle, old_inode, old_inode_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, old_inode, old_inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status >= 0) { old_di = (struct ocfs2_dinode *) old_inode_bh->b_data; @@ -1321,9 +1389,9 @@ static int ocfs2_rename(struct inode *old_dir, (int)old_dir_nlink, old_dir->i_nlink); } else { struct ocfs2_dinode *fe; - status = ocfs2_journal_access(handle, old_dir, - old_dir_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, old_dir, + old_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); fe = (struct ocfs2_dinode *) old_dir_bh->b_data; fe->i_links_count = cpu_to_le16(old_dir->i_nlink); status = ocfs2_journal_dirty(handle, old_dir_bh); @@ -1496,6 +1564,13 @@ static int ocfs2_symlink(struct inode *dir, handle_t *handle = NULL; struct ocfs2_alloc_context *inode_ac = NULL; struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *xattr_ac = NULL; + int want_clusters = 0; + int xattr_credits = 0; + struct ocfs2_security_xattr_info si = { + .enable = 1, + }; + int did_quota = 0, did_quota_inode = 0; mlog_entry("(0x%p, 0x%p, symname='%s' actual='%.*s')\n", dir, dentry, symname, dentry->d_name.len, dentry->d_name.name); @@ -1542,17 +1617,46 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } - /* don't reserve bitmap space for fast symlinks. */ - if (l > ocfs2_fast_symlink_chars(sb)) { - status = ocfs2_reserve_clusters(osb, 1, &data_ac); + inode = ocfs2_get_init_inode(dir, S_IFLNK | S_IRWXUGO); + if (!inode) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + /* get security xattr */ + status = ocfs2_init_security_get(inode, dir, &si); + if (status) { + if (status == -EOPNOTSUPP) + si.enable = 0; + else { + mlog_errno(status); + goto bail; + } + } + + /* calculate meta data/clusters for setting security xattr */ + if (si.enable) { + status = ocfs2_calc_security_init(dir, &si, &want_clusters, + &xattr_credits, &xattr_ac); if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); + mlog_errno(status); goto bail; } } - handle = ocfs2_start_trans(osb, credits); + /* don't reserve bitmap space for fast symlinks. */ + if (l > ocfs2_fast_symlink_chars(sb)) + want_clusters += 1; + + status = ocfs2_reserve_clusters(osb, want_clusters, &data_ac); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto bail; + } + + handle = ocfs2_start_trans(osb, credits + xattr_credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); handle = NULL; @@ -1560,10 +1664,18 @@ static int ocfs2_symlink(struct inode *dir, goto bail; } - status = ocfs2_mknod_locked(osb, dir, dentry, - S_IFLNK | S_IRWXUGO, 0, - &new_fe_bh, parent_fe_bh, handle, - &inode, inode_ac); + /* We don't use standard VFS wrapper because we don't want vfs_dq_init + * to be called. */ + if (sb_any_quota_active(osb->sb) && + osb->sb->dq_op->alloc_inode(inode, 1) == NO_QUOTA) { + status = -EDQUOT; + goto bail; + } + did_quota_inode = 1; + + status = ocfs2_mknod_locked(osb, dir, inode, dentry, + 0, &new_fe_bh, parent_fe_bh, handle, + inode_ac); if (status < 0) { mlog_errno(status); goto bail; @@ -1576,6 +1688,12 @@ static int ocfs2_symlink(struct inode *dir, u32 offset = 0; inode->i_op = &ocfs2_symlink_inode_operations; + if (vfs_dq_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1))) { + status = -EDQUOT; + goto bail; + } + did_quota = 1; status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, new_fe_bh, handle, data_ac, NULL, @@ -1614,6 +1732,15 @@ static int ocfs2_symlink(struct inode *dir, } } + if (si.enable) { + status = ocfs2_init_security_set(handle, inode, new_fe_bh, &si, + xattr_ac, data_ac); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + status = ocfs2_add_entry(handle, dentry, inode, le64_to_cpu(fe->i_blkno), parent_fe_bh, de_bh); @@ -1632,6 +1759,11 @@ static int ocfs2_symlink(struct inode *dir, dentry->d_op = &ocfs2_dentry_ops; d_instantiate(dentry, inode); bail: + if (status < 0 && did_quota) + vfs_dq_free_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, 1)); + if (status < 0 && did_quota_inode) + vfs_dq_free_inode(inode); if (handle) ocfs2_commit_trans(osb, handle); @@ -1640,12 +1772,18 @@ bail: brelse(new_fe_bh); brelse(parent_fe_bh); brelse(de_bh); + kfree(si.name); + kfree(si.value); if (inode_ac) ocfs2_free_alloc_context(inode_ac); if (data_ac) ocfs2_free_alloc_context(data_ac); - if ((status < 0) && inode) + if (xattr_ac) + ocfs2_free_alloc_context(xattr_ac); + if ((status < 0) && inode) { + clear_nlink(inode); iput(inode); + } mlog_exit(status); @@ -1754,16 +1892,14 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); - status = ocfs2_read_block(orphan_dir_inode, - OCFS2_I(orphan_dir_inode)->ip_blkno, - &orphan_dir_bh); + status = ocfs2_read_inode_block(orphan_dir_inode, &orphan_dir_bh); if (status < 0) { mlog_errno(status); goto leave; } - status = ocfs2_journal_access(handle, orphan_dir_inode, orphan_dir_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, orphan_dir_inode, orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; @@ -1850,8 +1986,8 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, goto leave; } - status = ocfs2_journal_access(handle,orphan_dir_inode, orphan_dir_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle,orphan_dir_inode, orphan_dir_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 3fed9e3d899..ad5c24a29ed 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -161,6 +161,7 @@ enum ocfs2_vol_state { VOLUME_INIT = 0, VOLUME_MOUNTED, + VOLUME_MOUNTED_QUOTAS, VOLUME_DISMOUNTED, VOLUME_DISABLED }; @@ -195,6 +196,9 @@ enum ocfs2_mount_options OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ + OCFS2_MOUNT_POSIX_ACL = 1 << 8, /* POSIX access control lists */ + OCFS2_MOUNT_USRQUOTA = 1 << 9, /* We support user quotas */ + OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -205,6 +209,7 @@ enum ocfs2_mount_options struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; +struct ocfs2_quota_recovery; struct ocfs2_super { struct task_struct *commit_task; @@ -286,10 +291,11 @@ struct ocfs2_super char *local_alloc_debug_buf; #endif - /* Next two fields are for local node slot recovery during + /* Next three fields are for local node slot recovery during * mount. */ int dirty; struct ocfs2_dinode *local_alloc_copy; + struct ocfs2_quota_recovery *quota_rec; struct ocfs2_alloc_stats alloc_stats; char dev_str[20]; /* "major,minor" of the device */ @@ -333,6 +339,10 @@ struct ocfs2_super #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) +/* Useful typedef for passing around journal access functions */ +typedef int (*ocfs2_journal_access_func)(handle_t *handle, struct inode *inode, + struct buffer_head *bh, int type); + static inline int ocfs2_should_order_data(struct inode *inode) { if (!S_ISREG(inode->i_mode)) @@ -376,6 +386,13 @@ static inline int ocfs2_supports_xattr(struct ocfs2_super *osb) return 0; } +static inline int ocfs2_meta_ecc(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_META_ECC) + return 1; + return 0; +} + /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. this also * means that any future flags osb_flags must be protected by spinlock @@ -443,39 +460,19 @@ static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb) #define OCFS2_IS_VALID_DINODE(ptr) \ (!strcmp((ptr)->i_signature, OCFS2_INODE_SIGNATURE)) -#define OCFS2_RO_ON_INVALID_DINODE(__sb, __di) do { \ - typeof(__di) ____di = (__di); \ - ocfs2_error((__sb), \ - "Dinode # %llu has bad signature %.*s", \ - (unsigned long long)le64_to_cpu((____di)->i_blkno), 7, \ - (____di)->i_signature); \ -} while (0) - #define OCFS2_IS_VALID_EXTENT_BLOCK(ptr) \ (!strcmp((ptr)->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE)) -#define OCFS2_RO_ON_INVALID_EXTENT_BLOCK(__sb, __eb) do { \ - typeof(__eb) ____eb = (__eb); \ - ocfs2_error((__sb), \ - "Extent Block # %llu has bad signature %.*s", \ - (unsigned long long)le64_to_cpu((____eb)->h_blkno), 7, \ - (____eb)->h_signature); \ -} while (0) - #define OCFS2_IS_VALID_GROUP_DESC(ptr) \ (!strcmp((ptr)->bg_signature, OCFS2_GROUP_DESC_SIGNATURE)) -#define OCFS2_RO_ON_INVALID_GROUP_DESC(__sb, __gd) do { \ - typeof(__gd) ____gd = (__gd); \ - ocfs2_error((__sb), \ - "Group Descriptor # %llu has bad signature %.*s", \ - (unsigned long long)le64_to_cpu((____gd)->bg_blkno), 7, \ - (____gd)->bg_signature); \ -} while (0) #define OCFS2_IS_VALID_XATTR_BLOCK(ptr) \ (!strcmp((ptr)->xb_signature, OCFS2_XATTR_BLOCK_SIGNATURE)) +#define OCFS2_IS_VALID_DIR_TRAILER(ptr) \ + (!strcmp((ptr)->db_signature, OCFS2_DIR_TRAILER_SIGNATURE)) + static inline unsigned long ino_from_blkno(struct super_block *sb, u64 blkno) { @@ -632,5 +629,6 @@ static inline s16 ocfs2_get_inode_steal_slot(struct ocfs2_super *osb) #define ocfs2_clear_bit ext2_clear_bit #define ocfs2_test_bit ext2_test_bit #define ocfs2_find_next_zero_bit ext2_find_next_zero_bit +#define ocfs2_find_next_bit ext2_find_next_bit #endif /* OCFS2_H */ diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 5e0c0d0aef7..c7ae45aaa36 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -65,6 +65,7 @@ #define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" #define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" #define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" +#define OCFS2_DIR_TRAILER_SIGNATURE "DIRTRL1" /* Compatibility flags */ #define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ @@ -93,8 +94,11 @@ | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ - | OCFS2_FEATURE_INCOMPAT_XATTR) -#define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN + | OCFS2_FEATURE_INCOMPAT_XATTR \ + | OCFS2_FEATURE_INCOMPAT_META_ECC) +#define OCFS2_FEATURE_RO_COMPAT_SUPP (OCFS2_FEATURE_RO_COMPAT_UNWRITTEN \ + | OCFS2_FEATURE_RO_COMPAT_USRQUOTA \ + | OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) /* * Heartbeat-only devices are missing journals and other files. The @@ -147,6 +151,9 @@ /* Support for extended attributes */ #define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 +/* Metadata checksum and error correction */ +#define OCFS2_FEATURE_INCOMPAT_META_ECC 0x0800 + /* * backup superblock flag is used to indicate that this volume * has backup superblocks. @@ -163,6 +170,12 @@ */ #define OCFS2_FEATURE_RO_COMPAT_UNWRITTEN 0x0001 +/* + * Maintain quota information for this filesystem + */ +#define OCFS2_FEATURE_RO_COMPAT_USRQUOTA 0x0002 +#define OCFS2_FEATURE_RO_COMPAT_GRPQUOTA 0x0004 + /* The byte offset of the first backup block will be 1G. * The following will be 4G, 16G, 64G, 256G and 1T. */ @@ -192,6 +205,7 @@ #define OCFS2_HEARTBEAT_FL (0x00000200) /* Heartbeat area */ #define OCFS2_CHAIN_FL (0x00000400) /* Chain allocator */ #define OCFS2_DEALLOC_FL (0x00000800) /* Truncate log */ +#define OCFS2_QUOTA_FL (0x00001000) /* Quota file */ /* * Flags on ocfs2_dinode.i_dyn_features @@ -329,13 +343,17 @@ enum { #define OCFS2_FIRST_ONLINE_SYSTEM_INODE SLOT_MAP_SYSTEM_INODE HEARTBEAT_SYSTEM_INODE, GLOBAL_BITMAP_SYSTEM_INODE, -#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GLOBAL_BITMAP_SYSTEM_INODE + USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE, +#define OCFS2_LAST_GLOBAL_SYSTEM_INODE GROUP_QUOTA_SYSTEM_INODE ORPHAN_DIR_SYSTEM_INODE, EXTENT_ALLOC_SYSTEM_INODE, INODE_ALLOC_SYSTEM_INODE, JOURNAL_SYSTEM_INODE, LOCAL_ALLOC_SYSTEM_INODE, TRUNCATE_LOG_SYSTEM_INODE, + LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE, NUM_SYSTEM_INODES }; @@ -349,6 +367,8 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { [SLOT_MAP_SYSTEM_INODE] = { "slot_map", 0, S_IFREG | 0644 }, [HEARTBEAT_SYSTEM_INODE] = { "heartbeat", OCFS2_HEARTBEAT_FL, S_IFREG | 0644 }, [GLOBAL_BITMAP_SYSTEM_INODE] = { "global_bitmap", 0, S_IFREG | 0644 }, + [USER_QUOTA_SYSTEM_INODE] = { "aquota.user", OCFS2_QUOTA_FL, S_IFREG | 0644 }, + [GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group", OCFS2_QUOTA_FL, S_IFREG | 0644 }, /* Slot-specific system inodes (one copy per slot) */ [ORPHAN_DIR_SYSTEM_INODE] = { "orphan_dir:%04d", 0, S_IFDIR | 0755 }, @@ -356,7 +376,9 @@ static struct ocfs2_system_inode_info ocfs2_system_inodes[NUM_SYSTEM_INODES] = { [INODE_ALLOC_SYSTEM_INODE] = { "inode_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_CHAIN_FL, S_IFREG | 0644 }, [JOURNAL_SYSTEM_INODE] = { "journal:%04d", OCFS2_JOURNAL_FL, S_IFREG | 0644 }, [LOCAL_ALLOC_SYSTEM_INODE] = { "local_alloc:%04d", OCFS2_BITMAP_FL | OCFS2_LOCAL_ALLOC_FL, S_IFREG | 0644 }, - [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 } + [TRUNCATE_LOG_SYSTEM_INODE] = { "truncate_log:%04d", OCFS2_DEALLOC_FL, S_IFREG | 0644 }, + [LOCAL_USER_QUOTA_SYSTEM_INODE] = { "aquota.user:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 }, + [LOCAL_GROUP_QUOTA_SYSTEM_INODE] = { "aquota.group:%04d", OCFS2_QUOTA_FL, S_IFREG | 0644 }, }; /* Parameter passed from mount.ocfs2 to module */ @@ -410,6 +432,22 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = { #define OCFS2_RAW_SB(dinode) (&((dinode)->id2.i_super)) /* + * Block checking structure. This is used in metadata to validate the + * contents. If OCFS2_FEATURE_INCOMPAT_META_ECC is not set, it is all + * zeros. + */ +struct ocfs2_block_check { +/*00*/ __le32 bc_crc32e; /* 802.3 Ethernet II CRC32 */ + __le16 bc_ecc; /* Single-error-correction parity vector. + This is a simple Hamming code dependant + on the blocksize. OCFS2's maximum + blocksize, 4K, requires 16 parity bits, + so we fit in __le16. */ + __le16 bc_reserved1; +/*08*/ +}; + +/* * On disk extent record for OCFS2 * It describes a range of clusters on disk. * @@ -496,7 +534,7 @@ struct ocfs2_truncate_log { struct ocfs2_extent_block { /*00*/ __u8 h_signature[8]; /* Signature for verification */ - __le64 h_reserved1; + struct ocfs2_block_check h_check; /* Error checking */ /*10*/ __le16 h_suballoc_slot; /* Slot suballocator this extent_header belongs to */ __le16 h_suballoc_bit; /* Bit offset in suballocator @@ -666,7 +704,8 @@ struct ocfs2_dinode { was set in i_flags */ __le16 i_dyn_features; __le64 i_xattr_loc; -/*80*/ __le64 i_reserved2[7]; +/*80*/ struct ocfs2_block_check i_check; /* Error checking */ +/*88*/ __le64 i_reserved2[6]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ @@ -715,6 +754,34 @@ struct ocfs2_dir_entry { } __attribute__ ((packed)); /* + * Per-block record for the unindexed directory btree. This is carefully + * crafted so that the rec_len and name_len records of an ocfs2_dir_entry are + * mirrored. That way, the directory manipulation code needs a minimal amount + * of update. + * + * NOTE: Keep this structure aligned to a multiple of 4 bytes. + */ +struct ocfs2_dir_block_trailer { +/*00*/ __le64 db_compat_inode; /* Always zero. Was inode */ + + __le16 db_compat_rec_len; /* Backwards compatible with + * ocfs2_dir_entry. */ + __u8 db_compat_name_len; /* Always zero. Was name_len */ + __u8 db_reserved0; + __le16 db_reserved1; + __le16 db_free_rec_len; /* Size of largest empty hole + * in this block. (unused) */ +/*10*/ __u8 db_signature[8]; /* Signature for verification */ + __le64 db_reserved2; + __le64 db_free_next; /* Next block in list (unused) */ +/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */ + __le64 db_parent_dinode; /* dinode which owns me, in + blocks */ +/*30*/ struct ocfs2_block_check db_check; /* Error checking */ +/*40*/ +}; + +/* * On disk allocator group structure for OCFS2 */ struct ocfs2_group_desc @@ -733,7 +800,8 @@ struct ocfs2_group_desc /*20*/ __le64 bg_parent_dinode; /* dinode which owns me, in blocks */ __le64 bg_blkno; /* Offset on disk, in blocks */ -/*30*/ __le64 bg_reserved2[2]; +/*30*/ struct ocfs2_block_check bg_check; /* Error checking */ + __le64 bg_reserved2; /*40*/ __u8 bg_bitmap[0]; }; @@ -776,7 +844,12 @@ struct ocfs2_xattr_header { in this extent record, only valid in the first bucket. */ - __le64 xh_csum; + struct ocfs2_block_check xh_check; /* Error checking + (Note, this is only + used for xattr + buckets. A block uses + xb_check and sets + this field to zero.) */ struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ }; @@ -827,7 +900,7 @@ struct ocfs2_xattr_block { block group */ __le32 xb_fs_generation; /* Must match super block */ /*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */ - __le64 xb_csum; + struct ocfs2_block_check xb_check; /* Error checking */ /*20*/ __le16 xb_flags; /* Indicates whether this block contains real xattr or a xattr tree. */ __le16 xb_reserved0; @@ -868,6 +941,128 @@ static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe) return xe->xe_type & OCFS2_XATTR_TYPE_MASK; } +/* + * On disk structures for global quota file + */ + +/* Magic numbers and known versions for global quota files */ +#define OCFS2_GLOBAL_QMAGICS {\ + 0x0cf52470, /* USRQUOTA */ \ + 0x0cf52471 /* GRPQUOTA */ \ +} + +#define OCFS2_GLOBAL_QVERSIONS {\ + 0, \ + 0, \ +} + + +/* Each block of each quota file has a certain fixed number of bytes reserved + * for OCFS2 internal use at its end. OCFS2 can use it for things like + * checksums, etc. */ +#define OCFS2_QBLK_RESERVED_SPACE 8 + +/* Generic header of all quota files */ +struct ocfs2_disk_dqheader { + __le32 dqh_magic; /* Magic number identifying file */ + __le32 dqh_version; /* Quota format version */ +}; + +#define OCFS2_GLOBAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader)) + +/* Information header of global quota file (immediately follows the generic + * header) */ +struct ocfs2_global_disk_dqinfo { +/*00*/ __le32 dqi_bgrace; /* Grace time for space softlimit excess */ + __le32 dqi_igrace; /* Grace time for inode softlimit excess */ + __le32 dqi_syncms; /* Time after which we sync local changes to + * global quota file */ + __le32 dqi_blocks; /* Number of blocks in quota file */ +/*10*/ __le32 dqi_free_blk; /* First free block in quota file */ + __le32 dqi_free_entry; /* First block with free dquot entry in quota + * file */ +}; + +/* Structure with global user / group information. We reserve some space + * for future use. */ +struct ocfs2_global_disk_dqblk { +/*00*/ __le32 dqb_id; /* ID the structure belongs to */ + __le32 dqb_use_count; /* Number of nodes having reference to this structure */ + __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */ +/*10*/ __le64 dqb_isoftlimit; /* preferred inode limit */ + __le64 dqb_curinodes; /* current # allocated inodes */ +/*20*/ __le64 dqb_bhardlimit; /* absolute limit on disk space */ + __le64 dqb_bsoftlimit; /* preferred limit on disk space */ +/*30*/ __le64 dqb_curspace; /* current space occupied */ + __le64 dqb_btime; /* time limit for excessive disk use */ +/*40*/ __le64 dqb_itime; /* time limit for excessive inode use */ + __le64 dqb_pad1; +/*50*/ __le64 dqb_pad2; +}; + +/* + * On-disk structures for local quota file + */ + +/* Magic numbers and known versions for local quota files */ +#define OCFS2_LOCAL_QMAGICS {\ + 0x0cf524c0, /* USRQUOTA */ \ + 0x0cf524c1 /* GRPQUOTA */ \ +} + +#define OCFS2_LOCAL_QVERSIONS {\ + 0, \ + 0, \ +} + +/* Quota flags in dqinfo header */ +#define OLQF_CLEAN 0x0001 /* Quota file is empty (this should be after\ + * quota has been cleanly turned off) */ + +#define OCFS2_LOCAL_INFO_OFF (sizeof(struct ocfs2_disk_dqheader)) + +/* Information header of local quota file (immediately follows the generic + * header) */ +struct ocfs2_local_disk_dqinfo { + __le32 dqi_flags; /* Flags for quota file */ + __le32 dqi_chunks; /* Number of chunks of quota structures + * with a bitmap */ + __le32 dqi_blocks; /* Number of blocks allocated for quota file */ +}; + +/* Header of one chunk of a quota file */ +struct ocfs2_local_disk_chunk { + __le32 dqc_free; /* Number of free entries in the bitmap */ + u8 dqc_bitmap[0]; /* Bitmap of entries in the corresponding + * chunk of quota file */ +}; + +/* One entry in local quota file */ +struct ocfs2_local_disk_dqblk { +/*00*/ __le64 dqb_id; /* id this quota applies to */ + __le64 dqb_spacemod; /* Change in the amount of used space */ +/*10*/ __le64 dqb_inodemod; /* Change in the amount of used inodes */ +}; + + +/* + * The quota trailer lives at the end of each quota block. + */ + +struct ocfs2_disk_dqtrailer { +/*00*/ struct ocfs2_block_check dq_check; /* Error checking */ +/*08*/ /* Cannot be larger than OCFS2_QBLK_RESERVED_SPACE */ +}; + +static inline struct ocfs2_disk_dqtrailer *ocfs2_block_dqtrailer(int blocksize, + void *buf) +{ + char *ptr = buf; + ptr += blocksize - OCFS2_QBLK_RESERVED_SPACE; + + return (struct ocfs2_disk_dqtrailer *)ptr; +} + #ifdef __KERNEL__ static inline int ocfs2_fast_symlink_chars(struct super_block *sb) { diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h deleted file mode 100644 index b91c78f8f55..00000000000 --- a/fs/ocfs2/ocfs2_jbd_compat.h +++ /dev/null @@ -1,82 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * ocfs2_jbd_compat.h - * - * Compatibility defines for JBD. - * - * Copyright (C) 2008 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License version 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ - -#ifndef OCFS2_JBD_COMPAT_H -#define OCFS2_JBD_COMPAT_H - -#ifndef CONFIG_OCFS2_COMPAT_JBD -# error Should not have been included -#endif - -struct jbd2_inode { - unsigned int dummy; -}; - -#define JBD2_BARRIER JFS_BARRIER -#define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE - -#define jbd2_journal_ack_err journal_ack_err -#define jbd2_journal_clear_err journal_clear_err -#define jbd2_journal_destroy journal_destroy -#define jbd2_journal_dirty_metadata journal_dirty_metadata -#define jbd2_journal_errno journal_errno -#define jbd2_journal_extend journal_extend -#define jbd2_journal_flush journal_flush -#define jbd2_journal_force_commit journal_force_commit -#define jbd2_journal_get_write_access journal_get_write_access -#define jbd2_journal_get_undo_access journal_get_undo_access -#define jbd2_journal_init_inode journal_init_inode -#define jbd2_journal_invalidatepage journal_invalidatepage -#define jbd2_journal_load journal_load -#define jbd2_journal_lock_updates journal_lock_updates -#define jbd2_journal_restart journal_restart -#define jbd2_journal_start journal_start -#define jbd2_journal_start_commit journal_start_commit -#define jbd2_journal_stop journal_stop -#define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers -#define jbd2_journal_unlock_updates journal_unlock_updates -#define jbd2_journal_wipe journal_wipe -#define jbd2_log_wait_commit log_wait_commit - -static inline int jbd2_journal_file_inode(handle_t *handle, - struct jbd2_inode *inode) -{ - return 0; -} - -static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, - loff_t new_size) -{ - return 0; -} - -static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, - struct inode *inode) -{ - return; -} - -static inline void jbd2_journal_release_jbd_inode(journal_t *journal, - struct jbd2_inode *jinode) -{ - return; -} - - -#endif /* OCFS2_JBD_COMPAT_H */ diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index 82c200f7a8f..eb6f50c9cec 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -46,6 +46,7 @@ enum ocfs2_lock_type { OCFS2_LOCK_TYPE_DENTRY, OCFS2_LOCK_TYPE_OPEN, OCFS2_LOCK_TYPE_FLOCK, + OCFS2_LOCK_TYPE_QINFO, OCFS2_NUM_LOCK_TYPES }; @@ -77,6 +78,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_FLOCK: c = 'F'; break; + case OCFS2_LOCK_TYPE_QINFO: + c = 'Q'; + break; default: c = '\0'; } @@ -95,6 +99,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_DENTRY] = "Dentry", [OCFS2_LOCK_TYPE_OPEN] = "Open", [OCFS2_LOCK_TYPE_FLOCK] = "Flock", + [OCFS2_LOCK_TYPE_QINFO] = "Quota", }; static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h new file mode 100644 index 00000000000..7365e2e0870 --- /dev/null +++ b/fs/ocfs2/quota.h @@ -0,0 +1,119 @@ +/* + * quota.h for OCFS2 + * + * On disk quota structures for local and global quota file, in-memory + * structures. + * + */ + +#ifndef _OCFS2_QUOTA_H +#define _OCFS2_QUOTA_H + +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/quota.h> +#include <linux/list.h> +#include <linux/dqblk_qtree.h> + +#include "ocfs2.h" + +/* Common stuff */ +/* id number of quota format */ +#define QFMT_OCFS2 3 + +/* + * In-memory structures + */ +struct ocfs2_dquot { + struct dquot dq_dquot; /* Generic VFS dquot */ + loff_t dq_local_off; /* Offset in the local quota file */ + struct ocfs2_quota_chunk *dq_chunk; /* Chunk dquot is in */ + unsigned int dq_use_count; /* Number of nodes having reference to this entry in global quota file */ + s64 dq_origspace; /* Last globally synced space usage */ + s64 dq_originodes; /* Last globally synced inode usage */ +}; + +/* Description of one chunk to recover in memory */ +struct ocfs2_recovery_chunk { + struct list_head rc_list; /* List of chunks */ + int rc_chunk; /* Chunk number */ + unsigned long *rc_bitmap; /* Bitmap of entries to recover */ +}; + +struct ocfs2_quota_recovery { + struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */ +}; + +/* In-memory structure with quota header information */ +struct ocfs2_mem_dqinfo { + unsigned int dqi_type; /* Quota type this structure describes */ + unsigned int dqi_chunks; /* Number of chunks in local quota file */ + unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ + unsigned int dqi_syncms; /* How often should we sync with other nodes */ + unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */ + struct list_head dqi_chunk; /* List of chunks */ + struct inode *dqi_gqinode; /* Global quota file inode */ + struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ + struct buffer_head *dqi_gqi_bh; /* Buffer head with global quota file inode - set only if inode lock is obtained */ + int dqi_gqi_count; /* Number of holders of dqi_gqi_bh */ + struct buffer_head *dqi_lqi_bh; /* Buffer head with local quota file inode */ + struct buffer_head *dqi_ibh; /* Buffer with information header */ + struct qtree_mem_dqinfo dqi_gi; /* Info about global file */ + struct delayed_work dqi_sync_work; /* Work for syncing dquots */ + struct ocfs2_quota_recovery *dqi_rec; /* Pointer to recovery + * information, in case we + * enable quotas on file + * needing it */ +}; + +static inline struct ocfs2_dquot *OCFS2_DQUOT(struct dquot *dquot) +{ + return container_of(dquot, struct ocfs2_dquot, dq_dquot); +} + +struct ocfs2_quota_chunk { + struct list_head qc_chunk; /* List of quotafile chunks */ + int qc_num; /* Number of quota chunk */ + struct buffer_head *qc_headerbh; /* Buffer head with chunk header */ +}; + +extern struct kmem_cache *ocfs2_dquot_cachep; +extern struct kmem_cache *ocfs2_qf_chunk_cachep; + +extern struct qtree_fmt_operations ocfs2_global_ops; + +struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( + struct ocfs2_super *osb, int slot_num); +int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, + struct ocfs2_quota_recovery *rec, + int slot_num); +void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec); +ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off); +ssize_t ocfs2_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off); +int ocfs2_global_read_info(struct super_block *sb, int type); +int ocfs2_global_write_info(struct super_block *sb, int type); +int ocfs2_global_read_dquot(struct dquot *dquot); +int __ocfs2_sync_dquot(struct dquot *dquot, int freeing); +static inline int ocfs2_sync_dquot(struct dquot *dquot) +{ + return __ocfs2_sync_dquot(dquot, 0); +} +static inline int ocfs2_global_release_dquot(struct dquot *dquot) +{ + return __ocfs2_sync_dquot(dquot, 1); +} + +int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); +void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex); +int ocfs2_read_quota_block(struct inode *inode, u64 v_block, + struct buffer_head **bh); + +extern struct dquot_operations ocfs2_quota_operations; +extern struct quota_format_type ocfs2_quota_format; + +int ocfs2_quota_setup(void); +void ocfs2_quota_shutdown(void); + +#endif /* _OCFS2_QUOTA_H */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c new file mode 100644 index 00000000000..6aff8f2d3e4 --- /dev/null +++ b/fs/ocfs2/quota_global.c @@ -0,0 +1,1025 @@ +/* + * Implementation of operations over global quota file + */ +#include <linux/spinlock.h> +#include <linux/fs.h> +#include <linux/quota.h> +#include <linux/quotaops.h> +#include <linux/dqblk_qtree.h> +#include <linux/jiffies.h> +#include <linux/writeback.h> +#include <linux/workqueue.h> + +#define MLOG_MASK_PREFIX ML_QUOTA +#include <cluster/masklog.h> + +#include "ocfs2_fs.h" +#include "ocfs2.h" +#include "alloc.h" +#include "blockcheck.h" +#include "inode.h" +#include "journal.h" +#include "file.h" +#include "sysfile.h" +#include "dlmglue.h" +#include "uptodate.h" +#include "quota.h" + +static struct workqueue_struct *ocfs2_quota_wq = NULL; + +static void qsync_work_fn(struct work_struct *work); + +static void ocfs2_global_disk2memdqb(struct dquot *dquot, void *dp) +{ + struct ocfs2_global_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + + /* Update from disk only entries not set by the admin */ + if (!test_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags)) { + m->dqb_ihardlimit = le64_to_cpu(d->dqb_ihardlimit); + m->dqb_isoftlimit = le64_to_cpu(d->dqb_isoftlimit); + } + if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags)) + m->dqb_curinodes = le64_to_cpu(d->dqb_curinodes); + if (!test_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags)) { + m->dqb_bhardlimit = le64_to_cpu(d->dqb_bhardlimit); + m->dqb_bsoftlimit = le64_to_cpu(d->dqb_bsoftlimit); + } + if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags)) + m->dqb_curspace = le64_to_cpu(d->dqb_curspace); + if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags)) + m->dqb_btime = le64_to_cpu(d->dqb_btime); + if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags)) + m->dqb_itime = le64_to_cpu(d->dqb_itime); + OCFS2_DQUOT(dquot)->dq_use_count = le32_to_cpu(d->dqb_use_count); +} + +static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot) +{ + struct ocfs2_global_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + + d->dqb_id = cpu_to_le32(dquot->dq_id); + d->dqb_use_count = cpu_to_le32(OCFS2_DQUOT(dquot)->dq_use_count); + d->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); + d->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); + d->dqb_curinodes = cpu_to_le64(m->dqb_curinodes); + d->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit); + d->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit); + d->dqb_curspace = cpu_to_le64(m->dqb_curspace); + d->dqb_btime = cpu_to_le64(m->dqb_btime); + d->dqb_itime = cpu_to_le64(m->dqb_itime); +} + +static int ocfs2_global_is_id(void *dp, struct dquot *dquot) +{ + struct ocfs2_global_disk_dqblk *d = dp; + struct ocfs2_mem_dqinfo *oinfo = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + + if (qtree_entry_unused(&oinfo->dqi_gi, dp)) + return 0; + return le32_to_cpu(d->dqb_id) == dquot->dq_id; +} + +struct qtree_fmt_operations ocfs2_global_ops = { + .mem2disk_dqblk = ocfs2_global_mem2diskdqb, + .disk2mem_dqblk = ocfs2_global_disk2memdqb, + .is_id = ocfs2_global_is_id, +}; + +static int ocfs2_validate_quota_block(struct super_block *sb, + struct buffer_head *bh) +{ + struct ocfs2_disk_dqtrailer *dqt = + ocfs2_block_dqtrailer(sb->s_blocksize, bh->b_data); + + mlog(0, "Validating quota block %llu\n", + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + return ocfs2_validate_meta_ecc(sb, bh->b_data, &dqt->dq_check); +} + +int ocfs2_read_quota_block(struct inode *inode, u64 v_block, + struct buffer_head **bh) +{ + int rc = 0; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, + ocfs2_validate_quota_block); + if (rc) + mlog_errno(rc); + + /* If ocfs2_read_virt_blocks() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + +static int ocfs2_get_quota_block(struct inode *inode, int block, + struct buffer_head **bh) +{ + u64 pblock, pcount; + int err; + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + err = ocfs2_extent_map_get_blocks(inode, block, &pblock, &pcount, NULL); + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (err) { + mlog_errno(err); + return err; + } + *bh = sb_getblk(inode->i_sb, pblock); + if (!*bh) { + err = -EIO; + mlog_errno(err); + } + return err;; +} + +/* Read data from global quotafile - avoid pagecache and such because we cannot + * afford acquiring the locks... We use quota cluster lock to serialize + * operations. Caller is responsible for acquiring it. */ +ssize_t ocfs2_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + struct inode *gqinode = oinfo->dqi_gqinode; + loff_t i_size = i_size_read(gqinode); + int offset = off & (sb->s_blocksize - 1); + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0; + struct buffer_head *bh; + size_t toread, tocopy; + + if (off > i_size) + return 0; + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = min_t(size_t, (sb->s_blocksize - offset), toread); + bh = NULL; + err = ocfs2_read_quota_block(gqinode, blk, &bh); + if (err) { + mlog_errno(err); + return err; + } + memcpy(data, bh->b_data + offset, tocopy); + brelse(bh); + offset = 0; + toread -= tocopy; + data += tocopy; + blk++; + } + return len; +} + +/* Write to quotafile (we know the transaction is already started and has + * enough credits) */ +ssize_t ocfs2_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct inode *gqinode = oinfo->dqi_gqinode; + int offset = off & (sb->s_blocksize - 1); + sector_t blk = off >> sb->s_blocksize_bits; + int err = 0, new = 0, ja_type; + struct buffer_head *bh = NULL; + handle_t *handle = journal_current_handle(); + + if (!handle) { + mlog(ML_ERROR, "Quota write (off=%llu, len=%llu) cancelled " + "because transaction was not started.\n", + (unsigned long long)off, (unsigned long long)len); + return -EIO; + } + if (len > sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset) { + WARN_ON(1); + len = sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE - offset; + } + + mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA); + if (gqinode->i_size < off + len) { + down_write(&OCFS2_I(gqinode)->ip_alloc_sem); + err = ocfs2_extend_no_holes(gqinode, off + len, off); + up_write(&OCFS2_I(gqinode)->ip_alloc_sem); + if (err < 0) + goto out; + err = ocfs2_simple_size_update(gqinode, + oinfo->dqi_gqi_bh, + off + len); + if (err < 0) + goto out; + new = 1; + } + /* Not rewriting whole block? */ + if ((offset || len < sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) && + !new) { + err = ocfs2_read_quota_block(gqinode, blk, &bh); + ja_type = OCFS2_JOURNAL_ACCESS_WRITE; + } else { + err = ocfs2_get_quota_block(gqinode, blk, &bh); + ja_type = OCFS2_JOURNAL_ACCESS_CREATE; + } + if (err) { + mlog_errno(err); + return err; + } + lock_buffer(bh); + if (new) + memset(bh->b_data, 0, sb->s_blocksize); + memcpy(bh->b_data + offset, data, len); + flush_dcache_page(bh->b_page); + set_buffer_uptodate(bh); + unlock_buffer(bh); + ocfs2_set_buffer_uptodate(gqinode, bh); + err = ocfs2_journal_access_dq(handle, gqinode, bh, ja_type); + if (err < 0) { + brelse(bh); + goto out; + } + err = ocfs2_journal_dirty(handle, bh); + brelse(bh); + if (err < 0) + goto out; +out: + if (err) { + mutex_unlock(&gqinode->i_mutex); + mlog_errno(err); + return err; + } + gqinode->i_version++; + ocfs2_mark_inode_dirty(handle, gqinode, oinfo->dqi_gqi_bh); + mutex_unlock(&gqinode->i_mutex); + return len; +} + +int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + int status; + struct buffer_head *bh = NULL; + + status = ocfs2_inode_lock(oinfo->dqi_gqinode, &bh, ex); + if (status < 0) + return status; + spin_lock(&dq_data_lock); + if (!oinfo->dqi_gqi_count++) + oinfo->dqi_gqi_bh = bh; + else + WARN_ON(bh != oinfo->dqi_gqi_bh); + spin_unlock(&dq_data_lock); + return 0; +} + +void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) +{ + ocfs2_inode_unlock(oinfo->dqi_gqinode, ex); + brelse(oinfo->dqi_gqi_bh); + spin_lock(&dq_data_lock); + if (!--oinfo->dqi_gqi_count) + oinfo->dqi_gqi_bh = NULL; + spin_unlock(&dq_data_lock); +} + +/* Read information header from global quota file */ +int ocfs2_global_read_info(struct super_block *sb, int type) +{ + struct inode *gqinode = NULL; + unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE }; + struct ocfs2_global_disk_dqinfo dinfo; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + int status; + + mlog_entry_void(); + + /* Read global header */ + gqinode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], + OCFS2_INVALID_SLOT); + if (!gqinode) { + mlog(ML_ERROR, "failed to get global quota inode (type=%d)\n", + type); + status = -EINVAL; + goto out_err; + } + oinfo->dqi_gi.dqi_sb = sb; + oinfo->dqi_gi.dqi_type = type; + ocfs2_qinfo_lock_res_init(&oinfo->dqi_gqlock, oinfo); + oinfo->dqi_gi.dqi_entry_size = sizeof(struct ocfs2_global_disk_dqblk); + oinfo->dqi_gi.dqi_ops = &ocfs2_global_ops; + oinfo->dqi_gqi_bh = NULL; + oinfo->dqi_gqi_count = 0; + oinfo->dqi_gqinode = gqinode; + status = ocfs2_lock_global_qf(oinfo, 0); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + status = sb->s_op->quota_read(sb, type, (char *)&dinfo, + sizeof(struct ocfs2_global_disk_dqinfo), + OCFS2_GLOBAL_INFO_OFF); + ocfs2_unlock_global_qf(oinfo, 0); + if (status != sizeof(struct ocfs2_global_disk_dqinfo)) { + mlog(ML_ERROR, "Cannot read global quota info (%d).\n", + status); + if (status >= 0) + status = -EIO; + mlog_errno(status); + goto out_err; + } + info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); + info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); + oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms); + oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms); + oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); + oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); + oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); + oinfo->dqi_gi.dqi_blocksize_bits = sb->s_blocksize_bits; + oinfo->dqi_gi.dqi_usable_bs = sb->s_blocksize - + OCFS2_QBLK_RESERVED_SPACE; + oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); + INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); + queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, + oinfo->dqi_syncjiff); + +out_err: + mlog_exit(status); + return status; +} + +/* Write information to global quota file. Expects exlusive lock on quota + * file inode and quota info */ +static int __ocfs2_global_write_info(struct super_block *sb, int type) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_global_disk_dqinfo dinfo; + ssize_t size; + + spin_lock(&dq_data_lock); + info->dqi_flags &= ~DQF_INFO_DIRTY; + dinfo.dqi_bgrace = cpu_to_le32(info->dqi_bgrace); + dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); + spin_unlock(&dq_data_lock); + dinfo.dqi_syncms = cpu_to_le32(oinfo->dqi_syncms); + dinfo.dqi_blocks = cpu_to_le32(oinfo->dqi_gi.dqi_blocks); + dinfo.dqi_free_blk = cpu_to_le32(oinfo->dqi_gi.dqi_free_blk); + dinfo.dqi_free_entry = cpu_to_le32(oinfo->dqi_gi.dqi_free_entry); + size = sb->s_op->quota_write(sb, type, (char *)&dinfo, + sizeof(struct ocfs2_global_disk_dqinfo), + OCFS2_GLOBAL_INFO_OFF); + if (size != sizeof(struct ocfs2_global_disk_dqinfo)) { + mlog(ML_ERROR, "Cannot write global quota info structure\n"); + if (size >= 0) + size = -EIO; + return size; + } + return 0; +} + +int ocfs2_global_write_info(struct super_block *sb, int type) +{ + int err; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + + err = ocfs2_qinfo_lock(info, 1); + if (err < 0) + return err; + err = __ocfs2_global_write_info(sb, type); + ocfs2_qinfo_unlock(info, 1); + return err; +} + +/* Read in information from global quota file and acquire a reference to it. + * dquot_acquire() has already started the transaction and locked quota file */ +int ocfs2_global_read_dquot(struct dquot *dquot) +{ + int err, err2, ex = 0; + struct ocfs2_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + + err = ocfs2_qinfo_lock(info, 0); + if (err < 0) + goto out; + err = qtree_read_dquot(&info->dqi_gi, dquot); + if (err < 0) + goto out_qlock; + OCFS2_DQUOT(dquot)->dq_use_count++; + OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; + OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; + if (!dquot->dq_off) { /* No real quota entry? */ + /* Upgrade to exclusive lock for allocation */ + err = ocfs2_qinfo_lock(info, 1); + if (err < 0) + goto out_qlock; + ex = 1; + } + err = qtree_write_dquot(&info->dqi_gi, dquot); + if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) { + err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type); + if (!err) + err = err2; + } +out_qlock: + if (ex) + ocfs2_qinfo_unlock(info, 1); + ocfs2_qinfo_unlock(info, 0); +out: + if (err < 0) + mlog_errno(err); + return err; +} + +/* Sync local information about quota modifications with global quota file. + * Caller must have started the transaction and obtained exclusive lock for + * global quota file inode */ +int __ocfs2_sync_dquot(struct dquot *dquot, int freeing) +{ + int err, err2; + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_type; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_global_disk_dqblk dqblk; + s64 spacechange, inodechange; + time_t olditime, oldbtime; + + err = sb->s_op->quota_read(sb, type, (char *)&dqblk, + sizeof(struct ocfs2_global_disk_dqblk), + dquot->dq_off); + if (err != sizeof(struct ocfs2_global_disk_dqblk)) { + if (err >= 0) { + mlog(ML_ERROR, "Short read from global quota file " + "(%u read)\n", err); + err = -EIO; + } + goto out; + } + + /* Update space and inode usage. Get also other information from + * global quota file so that we don't overwrite any changes there. + * We are */ + spin_lock(&dq_data_lock); + spacechange = dquot->dq_dqb.dqb_curspace - + OCFS2_DQUOT(dquot)->dq_origspace; + inodechange = dquot->dq_dqb.dqb_curinodes - + OCFS2_DQUOT(dquot)->dq_originodes; + olditime = dquot->dq_dqb.dqb_itime; + oldbtime = dquot->dq_dqb.dqb_btime; + ocfs2_global_disk2memdqb(dquot, &dqblk); + mlog(0, "Syncing global dquot %u space %lld+%lld, inodes %lld+%lld\n", + dquot->dq_id, dquot->dq_dqb.dqb_curspace, (long long)spacechange, + dquot->dq_dqb.dqb_curinodes, (long long)inodechange); + if (!test_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags)) + dquot->dq_dqb.dqb_curspace += spacechange; + if (!test_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags)) + dquot->dq_dqb.dqb_curinodes += inodechange; + /* Set properly space grace time... */ + if (dquot->dq_dqb.dqb_bsoftlimit && + dquot->dq_dqb.dqb_curspace > dquot->dq_dqb.dqb_bsoftlimit) { + if (!test_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags) && + oldbtime > 0) { + if (dquot->dq_dqb.dqb_btime > 0) + dquot->dq_dqb.dqb_btime = + min(dquot->dq_dqb.dqb_btime, oldbtime); + else + dquot->dq_dqb.dqb_btime = oldbtime; + } + } else { + dquot->dq_dqb.dqb_btime = 0; + clear_bit(DQ_BLKS_B, &dquot->dq_flags); + } + /* Set properly inode grace time... */ + if (dquot->dq_dqb.dqb_isoftlimit && + dquot->dq_dqb.dqb_curinodes > dquot->dq_dqb.dqb_isoftlimit) { + if (!test_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags) && + olditime > 0) { + if (dquot->dq_dqb.dqb_itime > 0) + dquot->dq_dqb.dqb_itime = + min(dquot->dq_dqb.dqb_itime, olditime); + else + dquot->dq_dqb.dqb_itime = olditime; + } + } else { + dquot->dq_dqb.dqb_itime = 0; + clear_bit(DQ_INODES_B, &dquot->dq_flags); + } + /* All information is properly updated, clear the flags */ + __clear_bit(DQ_LASTSET_B + QIF_SPACE_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_INODES_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_BLIMITS_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_ILIMITS_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_BTIME_B, &dquot->dq_flags); + __clear_bit(DQ_LASTSET_B + QIF_ITIME_B, &dquot->dq_flags); + OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; + OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; + spin_unlock(&dq_data_lock); + err = ocfs2_qinfo_lock(info, freeing); + if (err < 0) { + mlog(ML_ERROR, "Failed to lock quota info, loosing quota write" + " (type=%d, id=%u)\n", dquot->dq_type, + (unsigned)dquot->dq_id); + goto out; + } + if (freeing) + OCFS2_DQUOT(dquot)->dq_use_count--; + err = qtree_write_dquot(&info->dqi_gi, dquot); + if (err < 0) + goto out_qlock; + if (freeing && !OCFS2_DQUOT(dquot)->dq_use_count) { + err = qtree_release_dquot(&info->dqi_gi, dquot); + if (info_dirty(sb_dqinfo(sb, type))) { + err2 = __ocfs2_global_write_info(sb, type); + if (!err) + err = err2; + } + } +out_qlock: + ocfs2_qinfo_unlock(info, freeing); +out: + if (err < 0) + mlog_errno(err); + return err; +} + +/* + * Functions for periodic syncing of dquots with global file + */ +static int ocfs2_sync_dquot_helper(struct dquot *dquot, unsigned long type) +{ + handle_t *handle; + struct super_block *sb = dquot->dq_sb; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_super *osb = OCFS2_SB(sb); + int status = 0; + + mlog_entry("id=%u qtype=%u type=%lu device=%s\n", dquot->dq_id, + dquot->dq_type, type, sb->s_id); + if (type != dquot->dq_type) + goto out; + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + + handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + status = ocfs2_sync_dquot(dquot); + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + if (status < 0) + mlog_errno(status); + /* We have to write local structure as well... */ + dquot_mark_dquot_dirty(dquot); + status = dquot_commit(dquot); + if (status < 0) + mlog_errno(status); + ocfs2_commit_trans(osb, handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + mlog_exit(status); + return status; +} + +static void qsync_work_fn(struct work_struct *work) +{ + struct ocfs2_mem_dqinfo *oinfo = container_of(work, + struct ocfs2_mem_dqinfo, + dqi_sync_work.work); + struct super_block *sb = oinfo->dqi_gqinode->i_sb; + + dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); + queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, + oinfo->dqi_syncjiff); +} + +/* + * Wrappers for generic quota functions + */ + +static int ocfs2_write_dquot(struct dquot *dquot) +{ + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); + int status = 0; + + mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); + + handle = ocfs2_start_trans(osb, OCFS2_QWRITE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + status = dquot_commit(dquot); + ocfs2_commit_trans(osb, handle); +out: + mlog_exit(status); + return status; +} + +int ocfs2_calc_qdel_credits(struct super_block *sb, int type) +{ + struct ocfs2_mem_dqinfo *oinfo; + int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; + + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) + return 0; + + oinfo = sb_dqinfo(sb, type)->dqi_priv; + /* We modify tree, leaf block, global info, local chunk header, + * global and local inode */ + return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 + + 2 * OCFS2_INODE_UPDATE_CREDITS; +} + +static int ocfs2_release_dquot(struct dquot *dquot) +{ + handle_t *handle; + struct ocfs2_mem_dqinfo *oinfo = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); + int status = 0; + + mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); + + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + handle = ocfs2_start_trans(osb, + ocfs2_calc_qdel_credits(dquot->dq_sb, dquot->dq_type)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + status = dquot_release(dquot); + ocfs2_commit_trans(osb, handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + mlog_exit(status); + return status; +} + +int ocfs2_calc_qinit_credits(struct super_block *sb, int type) +{ + struct ocfs2_mem_dqinfo *oinfo; + int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; + struct ocfs2_dinode *lfe, *gfe; + + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) + return 0; + + oinfo = sb_dqinfo(sb, type)->dqi_priv; + gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data; + lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data; + /* We can extend local file + global file. In local file we + * can modify info, chunk header block and dquot block. In + * global file we can modify info, tree and leaf block */ + return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) + + ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) + + 3 + oinfo->dqi_gi.dqi_qtree_depth + 2; +} + +static int ocfs2_acquire_dquot(struct dquot *dquot) +{ + handle_t *handle; + struct ocfs2_mem_dqinfo *oinfo = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); + int status = 0; + + mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); + /* We need an exclusive lock, because we're going to update use count + * and instantiate possibly new dquot structure */ + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + handle = ocfs2_start_trans(osb, + ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + status = dquot_acquire(dquot); + ocfs2_commit_trans(osb, handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + mlog_exit(status); + return status; +} + +static int ocfs2_mark_dquot_dirty(struct dquot *dquot) +{ + unsigned long mask = (1 << (DQ_LASTSET_B + QIF_ILIMITS_B)) | + (1 << (DQ_LASTSET_B + QIF_BLIMITS_B)) | + (1 << (DQ_LASTSET_B + QIF_INODES_B)) | + (1 << (DQ_LASTSET_B + QIF_SPACE_B)) | + (1 << (DQ_LASTSET_B + QIF_BTIME_B)) | + (1 << (DQ_LASTSET_B + QIF_ITIME_B)); + int sync = 0; + int status; + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_type; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(sb); + + mlog_entry("id=%u, type=%d", dquot->dq_id, type); + dquot_mark_dquot_dirty(dquot); + + /* In case user set some limits, sync dquot immediately to global + * quota file so that information propagates quicker */ + spin_lock(&dq_data_lock); + if (dquot->dq_flags & mask) + sync = 1; + spin_unlock(&dq_data_lock); + if (!sync) { + status = ocfs2_write_dquot(dquot); + goto out; + } + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + handle = ocfs2_start_trans(osb, OCFS2_QSYNC_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + status = ocfs2_sync_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + /* Now write updated local dquot structure */ + status = dquot_commit(dquot); +out_trans: + ocfs2_commit_trans(osb, handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + mlog_exit(status); + return status; +} + +/* This should happen only after set_dqinfo(). */ +static int ocfs2_write_info(struct super_block *sb, int type) +{ + handle_t *handle; + int status = 0; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + + mlog_entry_void(); + + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + handle = ocfs2_start_trans(OCFS2_SB(sb), OCFS2_QINFO_WRITE_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + status = dquot_commit_info(sb, type); + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out_ilock: + ocfs2_unlock_global_qf(oinfo, 1); +out: + mlog_exit(status); + return status; +} + +/* This is difficult. We have to lock quota inode and start transaction + * in this function but we don't want to take the penalty of exlusive + * quota file lock when we are just going to use cached structures. So + * we just take read lock check whether we have dquot cached and if so, + * we don't have to take the write lock... */ +static int ocfs2_dquot_initialize(struct inode *inode, int type) +{ + handle_t *handle = NULL; + int status = 0; + struct super_block *sb = inode->i_sb; + struct ocfs2_mem_dqinfo *oinfo; + int exclusive = 0; + int cnt; + qid_t id; + + mlog_entry_void(); + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + oinfo = sb_dqinfo(sb, cnt)->dqi_priv; + status = ocfs2_lock_global_qf(oinfo, 0); + if (status < 0) + goto out; + /* This is just a performance optimization not a reliable test. + * Since we hold an inode lock, noone can actually release + * the structure until we are finished with initialization. */ + if (inode->i_dquot[cnt] != NODQUOT) { + ocfs2_unlock_global_qf(oinfo, 0); + continue; + } + /* When we have inode lock, we know that no dquot_release() can + * run and thus we can safely check whether we need to + * read+modify global file to get quota information or whether + * our node already has it. */ + if (cnt == USRQUOTA) + id = inode->i_uid; + else if (cnt == GRPQUOTA) + id = inode->i_gid; + else + BUG(); + /* Obtain exclusion from quota off... */ + down_write(&sb_dqopt(sb)->dqptr_sem); + exclusive = !dquot_is_cached(sb, id, cnt); + up_write(&sb_dqopt(sb)->dqptr_sem); + if (exclusive) { + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) { + exclusive = 0; + mlog_errno(status); + goto out_ilock; + } + handle = ocfs2_start_trans(OCFS2_SB(sb), + ocfs2_calc_qinit_credits(sb, cnt)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_ilock; + } + } + dquot_initialize(inode, cnt); + if (exclusive) { + ocfs2_commit_trans(OCFS2_SB(sb), handle); + ocfs2_unlock_global_qf(oinfo, 1); + } + ocfs2_unlock_global_qf(oinfo, 0); + } + mlog_exit(0); + return 0; +out_ilock: + if (exclusive) + ocfs2_unlock_global_qf(oinfo, 1); + ocfs2_unlock_global_qf(oinfo, 0); +out: + mlog_exit(status); + return status; +} + +static int ocfs2_dquot_drop_slow(struct inode *inode) +{ + int status = 0; + int cnt; + int got_lock[MAXQUOTAS] = {0, 0}; + handle_t *handle; + struct super_block *sb = inode->i_sb; + struct ocfs2_mem_dqinfo *oinfo; + + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!sb_has_quota_active(sb, cnt)) + continue; + oinfo = sb_dqinfo(sb, cnt)->dqi_priv; + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + got_lock[cnt] = 1; + } + handle = ocfs2_start_trans(OCFS2_SB(sb), + ocfs2_calc_qinit_credits(sb, USRQUOTA) + + ocfs2_calc_qinit_credits(sb, GRPQUOTA)); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + dquot_drop(inode); + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out: + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (got_lock[cnt]) { + oinfo = sb_dqinfo(sb, cnt)->dqi_priv; + ocfs2_unlock_global_qf(oinfo, 1); + } + return status; +} + +/* See the comment before ocfs2_dquot_initialize. */ +static int ocfs2_dquot_drop(struct inode *inode) +{ + int status = 0; + struct super_block *sb = inode->i_sb; + struct ocfs2_mem_dqinfo *oinfo; + int exclusive = 0; + int cnt; + int got_lock[MAXQUOTAS] = {0, 0}; + + mlog_entry_void(); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (!sb_has_quota_active(sb, cnt)) + continue; + oinfo = sb_dqinfo(sb, cnt)->dqi_priv; + status = ocfs2_lock_global_qf(oinfo, 0); + if (status < 0) + goto out; + got_lock[cnt] = 1; + } + /* Lock against anyone releasing references so that when when we check + * we know we are not going to be last ones to release dquot */ + down_write(&sb_dqopt(sb)->dqptr_sem); + /* Urgh, this is a terrible hack :( */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (inode->i_dquot[cnt] != NODQUOT && + atomic_read(&inode->i_dquot[cnt]->dq_count) > 1) { + exclusive = 1; + break; + } + } + if (!exclusive) + dquot_drop_locked(inode); + up_write(&sb_dqopt(sb)->dqptr_sem); +out: + for (cnt = 0; cnt < MAXQUOTAS; cnt++) + if (got_lock[cnt]) { + oinfo = sb_dqinfo(sb, cnt)->dqi_priv; + ocfs2_unlock_global_qf(oinfo, 0); + } + /* In case we bailed out because we had to do expensive locking + * do it now... */ + if (exclusive) + status = ocfs2_dquot_drop_slow(inode); + mlog_exit(status); + return status; +} + +static struct dquot *ocfs2_alloc_dquot(struct super_block *sb, int type) +{ + struct ocfs2_dquot *dquot = + kmem_cache_zalloc(ocfs2_dquot_cachep, GFP_NOFS); + + if (!dquot) + return NULL; + return &dquot->dq_dquot; +} + +static void ocfs2_destroy_dquot(struct dquot *dquot) +{ + kmem_cache_free(ocfs2_dquot_cachep, dquot); +} + +struct dquot_operations ocfs2_quota_operations = { + .initialize = ocfs2_dquot_initialize, + .drop = ocfs2_dquot_drop, + .alloc_space = dquot_alloc_space, + .alloc_inode = dquot_alloc_inode, + .free_space = dquot_free_space, + .free_inode = dquot_free_inode, + .transfer = dquot_transfer, + .write_dquot = ocfs2_write_dquot, + .acquire_dquot = ocfs2_acquire_dquot, + .release_dquot = ocfs2_release_dquot, + .mark_dirty = ocfs2_mark_dquot_dirty, + .write_info = ocfs2_write_info, + .alloc_dquot = ocfs2_alloc_dquot, + .destroy_dquot = ocfs2_destroy_dquot, +}; + +int ocfs2_quota_setup(void) +{ + ocfs2_quota_wq = create_workqueue("o2quot"); + if (!ocfs2_quota_wq) + return -ENOMEM; + return 0; +} + +void ocfs2_quota_shutdown(void) +{ + if (ocfs2_quota_wq) { + flush_workqueue(ocfs2_quota_wq); + destroy_workqueue(ocfs2_quota_wq); + ocfs2_quota_wq = NULL; + } +} diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c new file mode 100644 index 00000000000..07deec5e972 --- /dev/null +++ b/fs/ocfs2/quota_local.c @@ -0,0 +1,1253 @@ +/* + * Implementation of operations over local quota file + */ + +#include <linux/fs.h> +#include <linux/quota.h> +#include <linux/quotaops.h> +#include <linux/module.h> + +#define MLOG_MASK_PREFIX ML_QUOTA +#include <cluster/masklog.h> + +#include "ocfs2_fs.h" +#include "ocfs2.h" +#include "inode.h" +#include "alloc.h" +#include "file.h" +#include "buffer_head_io.h" +#include "journal.h" +#include "sysfile.h" +#include "dlmglue.h" +#include "quota.h" + +/* Number of local quota structures per block */ +static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) +{ + return ((sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE) / + sizeof(struct ocfs2_local_disk_dqblk)); +} + +/* Number of blocks with entries in one chunk */ +static inline unsigned int ol_chunk_blocks(struct super_block *sb) +{ + return ((sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - + OCFS2_QBLK_RESERVED_SPACE) << 3) / + ol_quota_entries_per_block(sb); +} + +/* Number of entries in a chunk bitmap */ +static unsigned int ol_chunk_entries(struct super_block *sb) +{ + return ol_chunk_blocks(sb) * ol_quota_entries_per_block(sb); +} + +/* Offset of the chunk in quota file */ +static unsigned int ol_quota_chunk_block(struct super_block *sb, int c) +{ + /* 1 block for local quota file info, 1 block per chunk for chunk info */ + return 1 + (ol_chunk_blocks(sb) + 1) * c; +} + +static unsigned int ol_dqblk_block(struct super_block *sb, int c, int off) +{ + int epb = ol_quota_entries_per_block(sb); + + return ol_quota_chunk_block(sb, c) + 1 + off / epb; +} + +static unsigned int ol_dqblk_block_off(struct super_block *sb, int c, int off) +{ + int epb = ol_quota_entries_per_block(sb); + + return (off % epb) * sizeof(struct ocfs2_local_disk_dqblk); +} + +/* Offset of the dquot structure in the quota file */ +static loff_t ol_dqblk_off(struct super_block *sb, int c, int off) +{ + return (ol_dqblk_block(sb, c, off) << sb->s_blocksize_bits) + + ol_dqblk_block_off(sb, c, off); +} + +/* Compute block number from given offset */ +static inline unsigned int ol_dqblk_file_block(struct super_block *sb, loff_t off) +{ + return off >> sb->s_blocksize_bits; +} + +static inline unsigned int ol_dqblk_block_offset(struct super_block *sb, loff_t off) +{ + return off & ((1 << sb->s_blocksize_bits) - 1); +} + +/* Compute offset in the chunk of a structure with the given offset */ +static int ol_dqblk_chunk_off(struct super_block *sb, int c, loff_t off) +{ + int epb = ol_quota_entries_per_block(sb); + + return ((off >> sb->s_blocksize_bits) - + ol_quota_chunk_block(sb, c) - 1) * epb + + ((unsigned int)(off & ((1 << sb->s_blocksize_bits) - 1))) / + sizeof(struct ocfs2_local_disk_dqblk); +} + +/* Write bufferhead into the fs */ +static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh, + void (*modify)(struct buffer_head *, void *), void *private) +{ + struct super_block *sb = inode->i_sb; + handle_t *handle; + int status; + + handle = ocfs2_start_trans(OCFS2_SB(sb), 1); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + return status; + } + status = ocfs2_journal_access_dq(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + ocfs2_commit_trans(OCFS2_SB(sb), handle); + return status; + } + lock_buffer(bh); + modify(bh, private); + unlock_buffer(bh); + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + ocfs2_commit_trans(OCFS2_SB(sb), handle); + return status; + } + status = ocfs2_commit_trans(OCFS2_SB(sb), handle); + if (status < 0) { + mlog_errno(status); + return status; + } + return 0; +} + +/* Check whether we understand format of quota files */ +static int ocfs2_local_check_quota_file(struct super_block *sb, int type) +{ + unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS; + unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS; + unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS; + unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS; + unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE }; + struct buffer_head *bh = NULL; + struct inode *linode = sb_dqopt(sb)->files[type]; + struct inode *ginode = NULL; + struct ocfs2_disk_dqheader *dqhead; + int status, ret = 0; + + /* First check whether we understand local quota file */ + status = ocfs2_read_quota_block(linode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file header (type=%d)\n", + type); + goto out_err; + } + dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data); + if (le32_to_cpu(dqhead->dqh_magic) != lmagics[type]) { + mlog(ML_ERROR, "quota file magic does not match (%u != %u)," + " type=%d\n", le32_to_cpu(dqhead->dqh_magic), + lmagics[type], type); + goto out_err; + } + if (le32_to_cpu(dqhead->dqh_version) != lversions[type]) { + mlog(ML_ERROR, "quota file version does not match (%u != %u)," + " type=%d\n", le32_to_cpu(dqhead->dqh_version), + lversions[type], type); + goto out_err; + } + brelse(bh); + bh = NULL; + + /* Next check whether we understand global quota file */ + ginode = ocfs2_get_system_file_inode(OCFS2_SB(sb), ino[type], + OCFS2_INVALID_SLOT); + if (!ginode) { + mlog(ML_ERROR, "cannot get global quota file inode " + "(type=%d)\n", type); + goto out_err; + } + /* Since the header is read only, we don't care about locking */ + status = ocfs2_read_quota_block(ginode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read global quota file header " + "(type=%d)\n", type); + goto out_err; + } + dqhead = (struct ocfs2_disk_dqheader *)(bh->b_data); + if (le32_to_cpu(dqhead->dqh_magic) != gmagics[type]) { + mlog(ML_ERROR, "global quota file magic does not match " + "(%u != %u), type=%d\n", + le32_to_cpu(dqhead->dqh_magic), gmagics[type], type); + goto out_err; + } + if (le32_to_cpu(dqhead->dqh_version) != gversions[type]) { + mlog(ML_ERROR, "global quota file version does not match " + "(%u != %u), type=%d\n", + le32_to_cpu(dqhead->dqh_version), gversions[type], + type); + goto out_err; + } + + ret = 1; +out_err: + brelse(bh); + iput(ginode); + return ret; +} + +/* Release given list of quota file chunks */ +static void ocfs2_release_local_quota_bitmaps(struct list_head *head) +{ + struct ocfs2_quota_chunk *pos, *next; + + list_for_each_entry_safe(pos, next, head, qc_chunk) { + list_del(&pos->qc_chunk); + brelse(pos->qc_headerbh); + kmem_cache_free(ocfs2_qf_chunk_cachep, pos); + } +} + +/* Load quota bitmaps into memory */ +static int ocfs2_load_local_quota_bitmaps(struct inode *inode, + struct ocfs2_local_disk_dqinfo *ldinfo, + struct list_head *head) +{ + struct ocfs2_quota_chunk *newchunk; + int i, status; + + INIT_LIST_HEAD(head); + for (i = 0; i < le32_to_cpu(ldinfo->dqi_chunks); i++) { + newchunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS); + if (!newchunk) { + ocfs2_release_local_quota_bitmaps(head); + return -ENOMEM; + } + newchunk->qc_num = i; + newchunk->qc_headerbh = NULL; + status = ocfs2_read_quota_block(inode, + ol_quota_chunk_block(inode->i_sb, i), + &newchunk->qc_headerbh); + if (status) { + mlog_errno(status); + kmem_cache_free(ocfs2_qf_chunk_cachep, newchunk); + ocfs2_release_local_quota_bitmaps(head); + return status; + } + list_add_tail(&newchunk->qc_chunk, head); + } + return 0; +} + +static void olq_update_info(struct buffer_head *bh, void *private) +{ + struct mem_dqinfo *info = private; + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_local_disk_dqinfo *ldinfo; + + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + spin_lock(&dq_data_lock); + ldinfo->dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); + ldinfo->dqi_chunks = cpu_to_le32(oinfo->dqi_chunks); + ldinfo->dqi_blocks = cpu_to_le32(oinfo->dqi_blocks); + spin_unlock(&dq_data_lock); +} + +static int ocfs2_add_recovery_chunk(struct super_block *sb, + struct ocfs2_local_disk_chunk *dchunk, + int chunk, + struct list_head *head) +{ + struct ocfs2_recovery_chunk *rc; + + rc = kmalloc(sizeof(struct ocfs2_recovery_chunk), GFP_NOFS); + if (!rc) + return -ENOMEM; + rc->rc_chunk = chunk; + rc->rc_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); + if (!rc->rc_bitmap) { + kfree(rc); + return -ENOMEM; + } + memcpy(rc->rc_bitmap, dchunk->dqc_bitmap, + (ol_chunk_entries(sb) + 7) >> 3); + list_add_tail(&rc->rc_list, head); + return 0; +} + +static void free_recovery_list(struct list_head *head) +{ + struct ocfs2_recovery_chunk *next; + struct ocfs2_recovery_chunk *rchunk; + + list_for_each_entry_safe(rchunk, next, head, rc_list) { + list_del(&rchunk->rc_list); + kfree(rchunk->rc_bitmap); + kfree(rchunk); + } +} + +void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + free_recovery_list(&(rec->r_list[type])); + kfree(rec); +} + +/* Load entries in our quota file we have to recover*/ +static int ocfs2_recovery_load_quota(struct inode *lqinode, + struct ocfs2_local_disk_dqinfo *ldinfo, + int type, + struct list_head *head) +{ + struct super_block *sb = lqinode->i_sb; + struct buffer_head *hbh; + struct ocfs2_local_disk_chunk *dchunk; + int i, chunks = le32_to_cpu(ldinfo->dqi_chunks); + int status = 0; + + for (i = 0; i < chunks; i++) { + hbh = NULL; + status = ocfs2_read_quota_block(lqinode, + ol_quota_chunk_block(sb, i), + &hbh); + if (status) { + mlog_errno(status); + break; + } + dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; + if (le32_to_cpu(dchunk->dqc_free) < ol_chunk_entries(sb)) + status = ocfs2_add_recovery_chunk(sb, dchunk, i, head); + brelse(hbh); + if (status < 0) + break; + } + if (status < 0) + free_recovery_list(head); + return status; +} + +static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void) +{ + int type; + struct ocfs2_quota_recovery *rec; + + rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS); + if (!rec) + return NULL; + for (type = 0; type < MAXQUOTAS; type++) + INIT_LIST_HEAD(&(rec->r_list[type])); + return rec; +} + +/* Load information we need for quota recovery into memory */ +struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( + struct ocfs2_super *osb, + int slot_num) +{ + unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + struct super_block *sb = osb->sb; + struct ocfs2_local_disk_dqinfo *ldinfo; + struct inode *lqinode; + struct buffer_head *bh; + int type; + int status = 0; + struct ocfs2_quota_recovery *rec; + + mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num); + rec = ocfs2_alloc_quota_recovery(); + if (!rec) + return ERR_PTR(-ENOMEM); + /* First init... */ + + for (type = 0; type < MAXQUOTAS; type++) { + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + continue; + /* At this point, journal of the slot is already replayed so + * we can trust metadata and data of the quota file */ + lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num); + if (!lqinode) { + status = -ENOENT; + goto out; + } + status = ocfs2_inode_lock_full(lqinode, NULL, 1, + OCFS2_META_LOCK_RECOVERY); + if (status < 0) { + mlog_errno(status); + goto out_put; + } + /* Now read local header */ + bh = NULL; + status = ocfs2_read_quota_block(lqinode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file info header " + "(slot=%d type=%d)\n", slot_num, type); + goto out_lock; + } + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + status = ocfs2_recovery_load_quota(lqinode, ldinfo, type, + &rec->r_list[type]); + brelse(bh); +out_lock: + ocfs2_inode_unlock(lqinode, 1); +out_put: + iput(lqinode); + if (status < 0) + break; + } +out: + if (status < 0) { + ocfs2_free_quota_recovery(rec); + rec = ERR_PTR(status); + } + return rec; +} + +/* Sync changes in local quota file into global quota file and + * reinitialize local quota file. + * The function expects local quota file to be already locked and + * dqonoff_mutex locked. */ +static int ocfs2_recover_local_quota_file(struct inode *lqinode, + int type, + struct ocfs2_quota_recovery *rec) +{ + struct super_block *sb = lqinode->i_sb; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_local_disk_chunk *dchunk; + struct ocfs2_local_disk_dqblk *dqblk; + struct dquot *dquot; + handle_t *handle; + struct buffer_head *hbh = NULL, *qbh = NULL; + int status = 0; + int bit, chunk; + struct ocfs2_recovery_chunk *rchunk, *next; + qsize_t spacechange, inodechange; + + mlog_entry("ino=%lu type=%u", (unsigned long)lqinode->i_ino, type); + + status = ocfs2_lock_global_qf(oinfo, 1); + if (status < 0) + goto out; + + list_for_each_entry_safe(rchunk, next, &(rec->r_list[type]), rc_list) { + chunk = rchunk->rc_chunk; + hbh = NULL; + status = ocfs2_read_quota_block(lqinode, + ol_quota_chunk_block(sb, chunk), + &hbh); + if (status) { + mlog_errno(status); + break; + } + dchunk = (struct ocfs2_local_disk_chunk *)hbh->b_data; + for_each_bit(bit, rchunk->rc_bitmap, ol_chunk_entries(sb)) { + qbh = NULL; + status = ocfs2_read_quota_block(lqinode, + ol_dqblk_block(sb, chunk, bit), + &qbh); + if (status) { + mlog_errno(status); + break; + } + dqblk = (struct ocfs2_local_disk_dqblk *)(qbh->b_data + + ol_dqblk_block_off(sb, chunk, bit)); + dquot = dqget(sb, le64_to_cpu(dqblk->dqb_id), type); + if (!dquot) { + status = -EIO; + mlog(ML_ERROR, "Failed to get quota structure " + "for id %u, type %d. Cannot finish quota " + "file recovery.\n", + (unsigned)le64_to_cpu(dqblk->dqb_id), + type); + goto out_put_bh; + } + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_QSYNC_CREDITS); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_put_dquot; + } + mutex_lock(&sb_dqopt(sb)->dqio_mutex); + spin_lock(&dq_data_lock); + /* Add usage from quota entry into quota changes + * of our node. Auxiliary variables are important + * due to signedness */ + spacechange = le64_to_cpu(dqblk->dqb_spacemod); + inodechange = le64_to_cpu(dqblk->dqb_inodemod); + dquot->dq_dqb.dqb_curspace += spacechange; + dquot->dq_dqb.dqb_curinodes += inodechange; + spin_unlock(&dq_data_lock); + /* We want to drop reference held by the crashed + * node. Since we have our own reference we know + * global structure actually won't be freed. */ + status = ocfs2_global_release_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + /* Release local quota file entry */ + status = ocfs2_journal_access_dq(handle, lqinode, + qbh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_commit; + } + lock_buffer(qbh); + WARN_ON(!ocfs2_test_bit(bit, dchunk->dqc_bitmap)); + ocfs2_clear_bit(bit, dchunk->dqc_bitmap); + le32_add_cpu(&dchunk->dqc_free, 1); + unlock_buffer(qbh); + status = ocfs2_journal_dirty(handle, qbh); + if (status < 0) + mlog_errno(status); +out_commit: + mutex_unlock(&sb_dqopt(sb)->dqio_mutex); + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out_put_dquot: + dqput(dquot); +out_put_bh: + brelse(qbh); + if (status < 0) + break; + } + brelse(hbh); + list_del(&rchunk->rc_list); + kfree(rchunk->rc_bitmap); + kfree(rchunk); + if (status < 0) + break; + } + ocfs2_unlock_global_qf(oinfo, 1); +out: + if (status < 0) + free_recovery_list(&(rec->r_list[type])); + mlog_exit(status); + return status; +} + +/* Recover local quota files for given node different from us */ +int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, + struct ocfs2_quota_recovery *rec, + int slot_num) +{ + unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + struct super_block *sb = osb->sb; + struct ocfs2_local_disk_dqinfo *ldinfo; + struct buffer_head *bh; + handle_t *handle; + int type; + int status = 0; + struct inode *lqinode; + unsigned int flags; + + mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num); + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + for (type = 0; type < MAXQUOTAS; type++) { + if (list_empty(&(rec->r_list[type]))) + continue; + mlog(0, "Recovering quota in slot %d\n", slot_num); + lqinode = ocfs2_get_system_file_inode(osb, ino[type], slot_num); + if (!lqinode) { + status = -ENOENT; + goto out; + } + status = ocfs2_inode_lock_full(lqinode, NULL, 1, + OCFS2_META_LOCK_NOQUEUE); + /* Someone else is holding the lock? Then he must be + * doing the recovery. Just skip the file... */ + if (status == -EAGAIN) { + mlog(ML_NOTICE, "skipping quota recovery for slot %d " + "because quota file is locked.\n", slot_num); + status = 0; + goto out_put; + } else if (status < 0) { + mlog_errno(status); + goto out_put; + } + /* Now read local header */ + bh = NULL; + status = ocfs2_read_quota_block(lqinode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file info header " + "(slot=%d type=%d)\n", slot_num, type); + goto out_lock; + } + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + /* Is recovery still needed? */ + flags = le32_to_cpu(ldinfo->dqi_flags); + if (!(flags & OLQF_CLEAN)) + status = ocfs2_recover_local_quota_file(lqinode, + type, + rec); + /* We don't want to mark file as clean when it is actually + * active */ + if (slot_num == osb->slot_num) + goto out_bh; + /* Mark quota file as clean if we are recovering quota file of + * some other node. */ + handle = ocfs2_start_trans(osb, 1); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out_bh; + } + status = ocfs2_journal_access_dq(handle, lqinode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(bh); + ldinfo->dqi_flags = cpu_to_le32(flags | OLQF_CLEAN); + unlock_buffer(bh); + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) + mlog_errno(status); +out_trans: + ocfs2_commit_trans(osb, handle); +out_bh: + brelse(bh); +out_lock: + ocfs2_inode_unlock(lqinode, 1); +out_put: + iput(lqinode); + if (status < 0) + break; + } +out: + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + kfree(rec); + return status; +} + +/* Read information header from quota file */ +static int ocfs2_local_read_info(struct super_block *sb, int type) +{ + struct ocfs2_local_disk_dqinfo *ldinfo; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + int status; + struct buffer_head *bh = NULL; + struct ocfs2_quota_recovery *rec; + int locked = 0; + + info->dqi_maxblimit = 0x7fffffffffffffffLL; + info->dqi_maxilimit = 0x7fffffffffffffffLL; + oinfo = kmalloc(sizeof(struct ocfs2_mem_dqinfo), GFP_NOFS); + if (!oinfo) { + mlog(ML_ERROR, "failed to allocate memory for ocfs2 quota" + " info."); + goto out_err; + } + info->dqi_priv = oinfo; + oinfo->dqi_type = type; + INIT_LIST_HEAD(&oinfo->dqi_chunk); + oinfo->dqi_rec = NULL; + oinfo->dqi_lqi_bh = NULL; + oinfo->dqi_ibh = NULL; + + status = ocfs2_global_read_info(sb, type); + if (status < 0) + goto out_err; + + status = ocfs2_inode_lock(lqinode, &oinfo->dqi_lqi_bh, 1); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + locked = 1; + + /* Now read local header */ + status = ocfs2_read_quota_block(lqinode, 0, &bh); + if (status) { + mlog_errno(status); + mlog(ML_ERROR, "failed to read quota file info header " + "(type=%d)\n", type); + goto out_err; + } + ldinfo = (struct ocfs2_local_disk_dqinfo *)(bh->b_data + + OCFS2_LOCAL_INFO_OFF); + info->dqi_flags = le32_to_cpu(ldinfo->dqi_flags); + oinfo->dqi_chunks = le32_to_cpu(ldinfo->dqi_chunks); + oinfo->dqi_blocks = le32_to_cpu(ldinfo->dqi_blocks); + oinfo->dqi_ibh = bh; + + /* We crashed when using local quota file? */ + if (!(info->dqi_flags & OLQF_CLEAN)) { + rec = OCFS2_SB(sb)->quota_rec; + if (!rec) { + rec = ocfs2_alloc_quota_recovery(); + if (!rec) { + status = -ENOMEM; + mlog_errno(status); + goto out_err; + } + OCFS2_SB(sb)->quota_rec = rec; + } + + status = ocfs2_recovery_load_quota(lqinode, ldinfo, type, + &rec->r_list[type]); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + } + + status = ocfs2_load_local_quota_bitmaps(lqinode, + ldinfo, + &oinfo->dqi_chunk); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + + /* Now mark quota file as used */ + info->dqi_flags &= ~OLQF_CLEAN; + status = ocfs2_modify_bh(lqinode, bh, olq_update_info, info); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + + return 0; +out_err: + if (oinfo) { + iput(oinfo->dqi_gqinode); + ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); + ocfs2_lock_res_free(&oinfo->dqi_gqlock); + brelse(oinfo->dqi_lqi_bh); + if (locked) + ocfs2_inode_unlock(lqinode, 1); + ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); + kfree(oinfo); + } + brelse(bh); + return -1; +} + +/* Write local info to quota file */ +static int ocfs2_local_write_info(struct super_block *sb, int type) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct buffer_head *bh = ((struct ocfs2_mem_dqinfo *)info->dqi_priv) + ->dqi_ibh; + int status; + + status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], bh, olq_update_info, + info); + if (status < 0) { + mlog_errno(status); + return -1; + } + + return 0; +} + +/* Release info from memory */ +static int ocfs2_local_free_info(struct super_block *sb, int type) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_quota_chunk *chunk; + struct ocfs2_local_disk_chunk *dchunk; + int mark_clean = 1, len; + int status; + + /* At this point we know there are no more dquots and thus + * even if there's some sync in the pdflush queue, it won't + * find any dquots and return without doing anything */ + cancel_delayed_work_sync(&oinfo->dqi_sync_work); + iput(oinfo->dqi_gqinode); + ocfs2_simple_drop_lockres(OCFS2_SB(sb), &oinfo->dqi_gqlock); + ocfs2_lock_res_free(&oinfo->dqi_gqlock); + list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) { + dchunk = (struct ocfs2_local_disk_chunk *) + (chunk->qc_headerbh->b_data); + if (chunk->qc_num < oinfo->dqi_chunks - 1) { + len = ol_chunk_entries(sb); + } else { + len = (oinfo->dqi_blocks - + ol_quota_chunk_block(sb, chunk->qc_num) - 1) + * ol_quota_entries_per_block(sb); + } + /* Not all entries free? Bug! */ + if (le32_to_cpu(dchunk->dqc_free) != len) { + mlog(ML_ERROR, "releasing quota file with used " + "entries (type=%d)\n", type); + mark_clean = 0; + } + } + ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); + + /* dqonoff_mutex protects us against racing with recovery thread... */ + if (oinfo->dqi_rec) { + ocfs2_free_quota_recovery(oinfo->dqi_rec); + mark_clean = 0; + } + + if (!mark_clean) + goto out; + + /* Mark local file as clean */ + info->dqi_flags |= OLQF_CLEAN; + status = ocfs2_modify_bh(sb_dqopt(sb)->files[type], + oinfo->dqi_ibh, + olq_update_info, + info); + if (status < 0) { + mlog_errno(status); + goto out; + } + +out: + ocfs2_inode_unlock(sb_dqopt(sb)->files[type], 1); + brelse(oinfo->dqi_ibh); + brelse(oinfo->dqi_lqi_bh); + kfree(oinfo); + return 0; +} + +static void olq_set_dquot(struct buffer_head *bh, void *private) +{ + struct ocfs2_dquot *od = private; + struct ocfs2_local_disk_dqblk *dqblk; + struct super_block *sb = od->dq_dquot.dq_sb; + + dqblk = (struct ocfs2_local_disk_dqblk *)(bh->b_data + + ol_dqblk_block_offset(sb, od->dq_local_off)); + + dqblk->dqb_id = cpu_to_le64(od->dq_dquot.dq_id); + spin_lock(&dq_data_lock); + dqblk->dqb_spacemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curspace - + od->dq_origspace); + dqblk->dqb_inodemod = cpu_to_le64(od->dq_dquot.dq_dqb.dqb_curinodes - + od->dq_originodes); + spin_unlock(&dq_data_lock); + mlog(0, "Writing local dquot %u space %lld inodes %lld\n", + od->dq_dquot.dq_id, (long long)le64_to_cpu(dqblk->dqb_spacemod), + (long long)le64_to_cpu(dqblk->dqb_inodemod)); +} + +/* Write dquot to local quota file */ +static int ocfs2_local_write_dquot(struct dquot *dquot) +{ + struct super_block *sb = dquot->dq_sb; + struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); + struct buffer_head *bh = NULL; + int status; + + status = ocfs2_read_quota_block(sb_dqopt(sb)->files[dquot->dq_type], + ol_dqblk_file_block(sb, od->dq_local_off), + &bh); + if (status) { + mlog_errno(status); + goto out; + } + status = ocfs2_modify_bh(sb_dqopt(sb)->files[dquot->dq_type], bh, + olq_set_dquot, od); + if (status < 0) { + mlog_errno(status); + goto out; + } +out: + brelse(bh); + return status; +} + +/* Find free entry in local quota file */ +static struct ocfs2_quota_chunk *ocfs2_find_free_entry(struct super_block *sb, + int type, + int *offset) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_quota_chunk *chunk; + struct ocfs2_local_disk_chunk *dchunk; + int found = 0, len; + + list_for_each_entry(chunk, &oinfo->dqi_chunk, qc_chunk) { + dchunk = (struct ocfs2_local_disk_chunk *) + chunk->qc_headerbh->b_data; + if (le32_to_cpu(dchunk->dqc_free) > 0) { + found = 1; + break; + } + } + if (!found) + return NULL; + + if (chunk->qc_num < oinfo->dqi_chunks - 1) { + len = ol_chunk_entries(sb); + } else { + len = (oinfo->dqi_blocks - + ol_quota_chunk_block(sb, chunk->qc_num) - 1) + * ol_quota_entries_per_block(sb); + } + + found = ocfs2_find_next_zero_bit(dchunk->dqc_bitmap, len, 0); + /* We failed? */ + if (found == len) { + mlog(ML_ERROR, "Did not find empty entry in chunk %d with %u" + " entries free (type=%d)\n", chunk->qc_num, + le32_to_cpu(dchunk->dqc_free), type); + return ERR_PTR(-EIO); + } + *offset = found; + return chunk; +} + +/* Add new chunk to the local quota file */ +static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( + struct super_block *sb, + int type, + int *offset) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + struct ocfs2_quota_chunk *chunk = NULL; + struct ocfs2_local_disk_chunk *dchunk; + int status; + handle_t *handle; + struct buffer_head *bh = NULL; + u64 p_blkno; + + /* We are protected by dqio_sem so no locking needed */ + status = ocfs2_extend_no_holes(lqinode, + lqinode->i_size + 2 * sb->s_blocksize, + lqinode->i_size); + if (status < 0) { + mlog_errno(status); + goto out; + } + status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, + lqinode->i_size + 2 * sb->s_blocksize); + if (status < 0) { + mlog_errno(status); + goto out; + } + + chunk = kmem_cache_alloc(ocfs2_qf_chunk_cachep, GFP_NOFS); + if (!chunk) { + status = -ENOMEM; + mlog_errno(status); + goto out; + } + + down_read(&OCFS2_I(lqinode)->ip_alloc_sem); + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, + &p_blkno, NULL, NULL); + up_read(&OCFS2_I(lqinode)->ip_alloc_sem); + if (status < 0) { + mlog_errno(status); + goto out; + } + bh = sb_getblk(sb, p_blkno); + if (!bh) { + status = -ENOMEM; + mlog_errno(status); + goto out; + } + dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; + + handle = ocfs2_start_trans(OCFS2_SB(sb), 2); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + + status = ocfs2_journal_access_dq(handle, lqinode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(bh); + dchunk->dqc_free = cpu_to_le32(ol_quota_entries_per_block(sb)); + memset(dchunk->dqc_bitmap, 0, + sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - + OCFS2_QBLK_RESERVED_SPACE); + set_buffer_uptodate(bh); + unlock_buffer(bh); + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + + oinfo->dqi_blocks += 2; + oinfo->dqi_chunks++; + status = ocfs2_local_write_info(sb, type); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + status = ocfs2_commit_trans(OCFS2_SB(sb), handle); + if (status < 0) { + mlog_errno(status); + goto out; + } + + list_add_tail(&chunk->qc_chunk, &oinfo->dqi_chunk); + chunk->qc_num = list_entry(chunk->qc_chunk.prev, + struct ocfs2_quota_chunk, + qc_chunk)->qc_num + 1; + chunk->qc_headerbh = bh; + *offset = 0; + return chunk; +out_trans: + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out: + brelse(bh); + kmem_cache_free(ocfs2_qf_chunk_cachep, chunk); + return ERR_PTR(status); +} + +/* Find free entry in local quota file */ +static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( + struct super_block *sb, + int type, + int *offset) +{ + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; + struct ocfs2_quota_chunk *chunk; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + struct ocfs2_local_disk_chunk *dchunk; + int epb = ol_quota_entries_per_block(sb); + unsigned int chunk_blocks; + int status; + handle_t *handle; + + if (list_empty(&oinfo->dqi_chunk)) + return ocfs2_local_quota_add_chunk(sb, type, offset); + /* Is the last chunk full? */ + chunk = list_entry(oinfo->dqi_chunk.prev, + struct ocfs2_quota_chunk, qc_chunk); + chunk_blocks = oinfo->dqi_blocks - + ol_quota_chunk_block(sb, chunk->qc_num) - 1; + if (ol_chunk_blocks(sb) == chunk_blocks) + return ocfs2_local_quota_add_chunk(sb, type, offset); + + /* We are protected by dqio_sem so no locking needed */ + status = ocfs2_extend_no_holes(lqinode, + lqinode->i_size + sb->s_blocksize, + lqinode->i_size); + if (status < 0) { + mlog_errno(status); + goto out; + } + status = ocfs2_simple_size_update(lqinode, oinfo->dqi_lqi_bh, + lqinode->i_size + sb->s_blocksize); + if (status < 0) { + mlog_errno(status); + goto out; + } + handle = ocfs2_start_trans(OCFS2_SB(sb), 2); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + + dchunk = (struct ocfs2_local_disk_chunk *)chunk->qc_headerbh->b_data; + lock_buffer(chunk->qc_headerbh); + le32_add_cpu(&dchunk->dqc_free, ol_quota_entries_per_block(sb)); + unlock_buffer(chunk->qc_headerbh); + status = ocfs2_journal_dirty(handle, chunk->qc_headerbh); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + oinfo->dqi_blocks++; + status = ocfs2_local_write_info(sb, type); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + + status = ocfs2_commit_trans(OCFS2_SB(sb), handle); + if (status < 0) { + mlog_errno(status); + goto out; + } + *offset = chunk_blocks * epb; + return chunk; +out_trans: + ocfs2_commit_trans(OCFS2_SB(sb), handle); +out: + return ERR_PTR(status); +} + +static void olq_alloc_dquot(struct buffer_head *bh, void *private) +{ + int *offset = private; + struct ocfs2_local_disk_chunk *dchunk; + + dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; + ocfs2_set_bit(*offset, dchunk->dqc_bitmap); + le32_add_cpu(&dchunk->dqc_free, -1); +} + +/* Create dquot in the local file for given id */ +static int ocfs2_create_local_dquot(struct dquot *dquot) +{ + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_type; + struct inode *lqinode = sb_dqopt(sb)->files[type]; + struct ocfs2_quota_chunk *chunk; + struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); + int offset; + int status; + + chunk = ocfs2_find_free_entry(sb, type, &offset); + if (!chunk) { + chunk = ocfs2_extend_local_quota_file(sb, type, &offset); + if (IS_ERR(chunk)) + return PTR_ERR(chunk); + } else if (IS_ERR(chunk)) { + return PTR_ERR(chunk); + } + od->dq_local_off = ol_dqblk_off(sb, chunk->qc_num, offset); + od->dq_chunk = chunk; + + /* Initialize dquot structure on disk */ + status = ocfs2_local_write_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out; + } + + /* Mark structure as allocated */ + status = ocfs2_modify_bh(lqinode, chunk->qc_headerbh, olq_alloc_dquot, + &offset); + if (status < 0) { + mlog_errno(status); + goto out; + } +out: + return status; +} + +/* Create entry in local file for dquot, load data from the global file */ +static int ocfs2_local_read_dquot(struct dquot *dquot) +{ + int status; + + mlog_entry("id=%u, type=%d\n", dquot->dq_id, dquot->dq_type); + + status = ocfs2_global_read_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + + /* Now create entry in the local quota file */ + status = ocfs2_create_local_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out_err; + } + mlog_exit(0); + return 0; +out_err: + mlog_exit(status); + return status; +} + +/* Release dquot structure from local quota file. ocfs2_release_dquot() has + * already started a transaction and obtained exclusive lock for global + * quota file. */ +static int ocfs2_local_release_dquot(struct dquot *dquot) +{ + int status; + int type = dquot->dq_type; + struct ocfs2_dquot *od = OCFS2_DQUOT(dquot); + struct super_block *sb = dquot->dq_sb; + struct ocfs2_local_disk_chunk *dchunk; + int offset; + handle_t *handle = journal_current_handle(); + + BUG_ON(!handle); + /* First write all local changes to global file */ + status = ocfs2_global_release_dquot(dquot); + if (status < 0) { + mlog_errno(status); + goto out; + } + + status = ocfs2_journal_access_dq(handle, sb_dqopt(sb)->files[type], + od->dq_chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto out; + } + offset = ol_dqblk_chunk_off(sb, od->dq_chunk->qc_num, + od->dq_local_off); + dchunk = (struct ocfs2_local_disk_chunk *) + (od->dq_chunk->qc_headerbh->b_data); + /* Mark structure as freed */ + lock_buffer(od->dq_chunk->qc_headerbh); + ocfs2_clear_bit(offset, dchunk->dqc_bitmap); + le32_add_cpu(&dchunk->dqc_free, 1); + unlock_buffer(od->dq_chunk->qc_headerbh); + status = ocfs2_journal_dirty(handle, od->dq_chunk->qc_headerbh); + if (status < 0) { + mlog_errno(status); + goto out; + } + status = 0; +out: + /* Clear the read bit so that next time someone uses this + * dquot he reads fresh info from disk and allocates local + * dquot structure */ + clear_bit(DQ_READ_B, &dquot->dq_flags); + return status; +} + +static struct quota_format_ops ocfs2_format_ops = { + .check_quota_file = ocfs2_local_check_quota_file, + .read_file_info = ocfs2_local_read_info, + .write_file_info = ocfs2_global_write_info, + .free_file_info = ocfs2_local_free_info, + .read_dqblk = ocfs2_local_read_dquot, + .commit_dqblk = ocfs2_local_write_dquot, + .release_dqblk = ocfs2_local_release_dquot, +}; + +struct quota_format_type ocfs2_quota_format = { + .qf_fmt_id = QFMT_OCFS2, + .qf_ops = &ocfs2_format_ops, + .qf_owner = THIS_MODULE +}; diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index ffd48db229a..424adaa5f90 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -106,8 +106,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, mlog_entry("(new_clusters=%d, first_new_cluster = %u)\n", new_clusters, first_new_cluster); - ret = ocfs2_journal_access(handle, bm_inode, group_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_gd(handle, bm_inode, group_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out; @@ -141,8 +141,8 @@ static int ocfs2_update_last_group_and_inode(handle_t *handle, } /* update the inode accordingly. */ - ret = ocfs2_journal_access(handle, bm_inode, bm_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, bm_inode, bm_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out_rollback; @@ -314,6 +314,10 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) fe = (struct ocfs2_dinode *)main_bm_bh->b_data; + /* main_bm_bh is validated by inode read inside ocfs2_inode_lock(), + * so any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + if (le16_to_cpu(fe->id2.i_chain.cl_cpg) != ocfs2_group_bitmap_size(osb->sb) * 8) { mlog(ML_ERROR, "The disk is too old and small. " @@ -322,30 +326,18 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) goto out_unlock; } - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(main_bm_inode->i_sb, fe); - ret = -EIO; - goto out_unlock; - } - first_new_cluster = le32_to_cpu(fe->i_clusters); lgd_blkno = ocfs2_which_cluster_group(main_bm_inode, first_new_cluster - 1); - ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh); + ret = ocfs2_read_group_descriptor(main_bm_inode, fe, lgd_blkno, + &group_bh); if (ret < 0) { mlog_errno(ret); goto out_unlock; } - group = (struct ocfs2_group_desc *)group_bh->b_data; - ret = ocfs2_check_group_descriptor(inode->i_sb, fe, group); - if (ret) { - mlog_errno(ret); - goto out_unlock; - } - cl_bpc = le16_to_cpu(fe->id2.i_chain.cl_bpc); if (le16_to_cpu(group->bg_bits) / cl_bpc + new_clusters > le16_to_cpu(fe->id2.i_chain.cl_cpg)) { @@ -398,41 +390,16 @@ static int ocfs2_check_new_group(struct inode *inode, struct buffer_head *group_bh) { int ret; - struct ocfs2_group_desc *gd; + struct ocfs2_group_desc *gd = + (struct ocfs2_group_desc *)group_bh->b_data; u16 cl_bpc = le16_to_cpu(di->id2.i_chain.cl_bpc); - unsigned int max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * - le16_to_cpu(di->id2.i_chain.cl_bpc); - - gd = (struct ocfs2_group_desc *)group_bh->b_data; + ret = ocfs2_check_group_descriptor(inode->i_sb, di, group_bh); + if (ret) + goto out; - ret = -EIO; - if (!OCFS2_IS_VALID_GROUP_DESC(gd)) - mlog(ML_ERROR, "Group descriptor # %llu isn't valid.\n", - (unsigned long long)le64_to_cpu(gd->bg_blkno)); - else if (di->i_blkno != gd->bg_parent_dinode) - mlog(ML_ERROR, "Group descriptor # %llu has bad parent " - "pointer (%llu, expected %llu)\n", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), - (unsigned long long)le64_to_cpu(di->i_blkno)); - else if (le16_to_cpu(gd->bg_bits) > max_bits) - mlog(ML_ERROR, "Group descriptor # %llu has bit count of %u\n", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_bits)); - else if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) - mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but " - "claims that %u are free\n", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_bits), - le16_to_cpu(gd->bg_free_bits_count)); - else if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) - mlog(ML_ERROR, "Group descriptor # %llu has bit count %u but " - "max bitmap bits of %u\n", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_bits), - 8 * le16_to_cpu(gd->bg_size)); - else if (le16_to_cpu(gd->bg_chain) != input->chain) + ret = -EINVAL; + if (le16_to_cpu(gd->bg_chain) != input->chain) mlog(ML_ERROR, "Group descriptor # %llu has bad chain %u " "while input has %u set.\n", (unsigned long long)le64_to_cpu(gd->bg_blkno), @@ -451,6 +418,7 @@ static int ocfs2_check_new_group(struct inode *inode, else ret = 0; +out: return ret; } @@ -568,8 +536,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) cl = &fe->id2.i_chain; cr = &cl->cl_recs[input->chain]; - ret = ocfs2_journal_access(handle, main_bm_inode, group_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_gd(handle, main_bm_inode, group_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out_commit; @@ -584,8 +552,8 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out_commit; } - ret = ocfs2_journal_access(handle, main_bm_inode, main_bm_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, main_bm_inode, main_bm_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out_commit; diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index bdda2d8f850..40661e7824e 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -151,7 +151,7 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb) * this is not true, the read of -1 (UINT64_MAX) will fail. */ ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh, - OCFS2_BH_IGNORE_CACHE); + OCFS2_BH_IGNORE_CACHE, NULL); if (ret == 0) { spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); @@ -405,7 +405,7 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, bh = NULL; /* Acquire a fresh bh */ status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh, - OCFS2_BH_IGNORE_CACHE); + OCFS2_BH_IGNORE_CACHE, NULL); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index c5ff18b46b5..a69628603e1 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -35,6 +35,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "blockcheck.h" #include "dlmglue.h" #include "inode.h" #include "journal.h" @@ -145,62 +146,183 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) return (u32)le16_to_cpu(cl->cl_cpg) * (u32)le16_to_cpu(cl->cl_bpc); } -/* somewhat more expensive than our other checks, so use sparingly. */ -int ocfs2_check_group_descriptor(struct super_block *sb, - struct ocfs2_dinode *di, - struct ocfs2_group_desc *gd) +#define do_error(fmt, ...) \ + do{ \ + if (clean_error) \ + mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ + else \ + ocfs2_error(sb, fmt, ##__VA_ARGS__); \ + } while (0) + +static int ocfs2_validate_gd_self(struct super_block *sb, + struct buffer_head *bh, + int clean_error) { - unsigned int max_bits; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(sb, gd); - return -EIO; + do_error("Group descriptor #%llu has bad signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + gd->bg_signature); + return -EINVAL; } + if (le64_to_cpu(gd->bg_blkno) != bh->b_blocknr) { + do_error("Group descriptor #%llu has an invalid bg_blkno " + "of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(gd->bg_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(gd->bg_generation) != OCFS2_SB(sb)->fs_generation) { + do_error("Group descriptor #%llu has an invalid " + "fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(gd->bg_generation)); + return -EINVAL; + } + + if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { + do_error("Group descriptor #%llu has bit count %u but " + "claims that %u are free", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_bits), + le16_to_cpu(gd->bg_free_bits_count)); + return -EINVAL; + } + + if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { + do_error("Group descriptor #%llu has bit count %u but " + "max bitmap bits of %u", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_bits), + 8 * le16_to_cpu(gd->bg_size)); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_validate_gd_parent(struct super_block *sb, + struct ocfs2_dinode *di, + struct buffer_head *bh, + int clean_error) +{ + unsigned int max_bits; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + if (di->i_blkno != gd->bg_parent_dinode) { - ocfs2_error(sb, "Group descriptor # %llu has bad parent " - "pointer (%llu, expected %llu)", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), - (unsigned long long)le64_to_cpu(di->i_blkno)); - return -EIO; + do_error("Group descriptor #%llu has bad parent " + "pointer (%llu, expected %llu)", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(gd->bg_parent_dinode), + (unsigned long long)le64_to_cpu(di->i_blkno)); + return -EINVAL; } max_bits = le16_to_cpu(di->id2.i_chain.cl_cpg) * le16_to_cpu(di->id2.i_chain.cl_bpc); if (le16_to_cpu(gd->bg_bits) > max_bits) { - ocfs2_error(sb, "Group descriptor # %llu has bit count of %u", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_bits)); - return -EIO; + do_error("Group descriptor #%llu has bit count of %u", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_bits)); + return -EINVAL; } if (le16_to_cpu(gd->bg_chain) >= le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { - ocfs2_error(sb, "Group descriptor # %llu has bad chain %u", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_chain)); - return -EIO; + do_error("Group descriptor #%llu has bad chain %u", + (unsigned long long)bh->b_blocknr, + le16_to_cpu(gd->bg_chain)); + return -EINVAL; } - if (le16_to_cpu(gd->bg_free_bits_count) > le16_to_cpu(gd->bg_bits)) { - ocfs2_error(sb, "Group descriptor # %llu has bit count %u but " - "claims that %u are free", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_bits), - le16_to_cpu(gd->bg_free_bits_count)); - return -EIO; - } + return 0; +} - if (le16_to_cpu(gd->bg_bits) > (8 * le16_to_cpu(gd->bg_size))) { - ocfs2_error(sb, "Group descriptor # %llu has bit count %u but " - "max bitmap bits of %u", - (unsigned long long)le64_to_cpu(gd->bg_blkno), - le16_to_cpu(gd->bg_bits), - 8 * le16_to_cpu(gd->bg_size)); - return -EIO; +#undef do_error + +/* + * This version only prints errors. It does not fail the filesystem, and + * exists only for resize. + */ +int ocfs2_check_group_descriptor(struct super_block *sb, + struct ocfs2_dinode *di, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); + if (rc) { + mlog(ML_ERROR, + "Checksum failed for group descriptor %llu\n", + (unsigned long long)bh->b_blocknr); + } else + rc = ocfs2_validate_gd_self(sb, bh, 1); + if (!rc) + rc = ocfs2_validate_gd_parent(sb, di, bh, 1); + + return rc; +} + +static int ocfs2_validate_group_descriptor(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; + + mlog(0, "Validating group descriptor %llu\n", + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &gd->bg_check); + if (rc) + return rc; + + /* + * Errors after here are fatal. + */ + + return ocfs2_validate_gd_self(sb, bh, 0); +} + +int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, + u64 gd_blkno, struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(inode, gd_blkno, &tmp, + ocfs2_validate_group_descriptor); + if (rc) + goto out; + + rc = ocfs2_validate_gd_parent(inode->i_sb, di, tmp, 0); + if (rc) { + brelse(tmp); + goto out; } - return 0; + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!*bh) + *bh = tmp; + +out: + return rc; } static int ocfs2_block_group_fill(handle_t *handle, @@ -225,10 +347,10 @@ static int ocfs2_block_group_fill(handle_t *handle, goto bail; } - status = ocfs2_journal_access(handle, - alloc_inode, - bg_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + status = ocfs2_journal_access_gd(handle, + alloc_inode, + bg_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto bail; @@ -358,8 +480,8 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, bg = (struct ocfs2_group_desc *) bg_bh->b_data; - status = ocfs2_journal_access(handle, alloc_inode, - bh, OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, alloc_inode, + bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -441,11 +563,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, ac->ac_alloc_slot = slot; fe = (struct ocfs2_dinode *) bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); - status = -EIO; - goto bail; - } + + /* The bh was validated by the inode read inside + * ocfs2_inode_lock(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + if (!(fe->i_flags & cpu_to_le32(OCFS2_CHAIN_FL))) { ocfs2_error(alloc_inode->i_sb, "Invalid chain allocator %llu", (unsigned long long)le64_to_cpu(fe->i_blkno)); @@ -790,10 +912,9 @@ static int ocfs2_block_group_find_clear_bits(struct ocfs2_super *osb, int offset, start, found, status = 0; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; - if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(osb->sb, bg); - return -EIO; - } + /* Callers got this descriptor from + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); found = start = best_offset = best_size = 0; bitmap = bg->bg_bitmap; @@ -858,11 +979,9 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, mlog_entry_void(); - if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); - status = -EIO; - goto bail; - } + /* All callers get the descriptor via + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); BUG_ON(le16_to_cpu(bg->bg_free_bits_count) < num_bits); mlog(0, "block_group_set_bits: off = %u, num = %u\n", bit_off, @@ -871,10 +990,10 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, if (ocfs2_is_cluster_bitmap(alloc_inode)) journal_type = OCFS2_JOURNAL_ACCESS_UNDO; - status = ocfs2_journal_access(handle, - alloc_inode, - group_bh, - journal_type); + status = ocfs2_journal_access_gd(handle, + alloc_inode, + group_bh, + journal_type); if (status < 0) { mlog_errno(status); goto bail; @@ -931,21 +1050,10 @@ static int ocfs2_relink_block_group(handle_t *handle, struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) bg_bh->b_data; struct ocfs2_group_desc *prev_bg = (struct ocfs2_group_desc *) prev_bg_bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); - status = -EIO; - goto out; - } - if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); - status = -EIO; - goto out; - } - if (!OCFS2_IS_VALID_GROUP_DESC(prev_bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, prev_bg); - status = -EIO; - goto out; - } + /* The caller got these descriptors from + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(prev_bg)); mlog(0, "Suballoc %llu, chain %u, move group %llu to top, prev = %llu\n", (unsigned long long)le64_to_cpu(fe->i_blkno), chain, @@ -956,8 +1064,8 @@ static int ocfs2_relink_block_group(handle_t *handle, bg_ptr = le64_to_cpu(bg->bg_next_group); prev_bg_ptr = le64_to_cpu(prev_bg->bg_next_group); - status = ocfs2_journal_access(handle, alloc_inode, prev_bg_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_gd(handle, alloc_inode, prev_bg_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_rollback; @@ -971,8 +1079,8 @@ static int ocfs2_relink_block_group(handle_t *handle, goto out_rollback; } - status = ocfs2_journal_access(handle, alloc_inode, bg_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_gd(handle, alloc_inode, bg_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_rollback; @@ -986,8 +1094,8 @@ static int ocfs2_relink_block_group(handle_t *handle, goto out_rollback; } - status = ocfs2_journal_access(handle, alloc_inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, alloc_inode, fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto out_rollback; @@ -1008,7 +1116,7 @@ out_rollback: bg->bg_next_group = cpu_to_le64(bg_ptr); prev_bg->bg_next_group = cpu_to_le64(prev_bg_ptr); } -out: + mlog_exit(status); return status; } @@ -1138,8 +1246,8 @@ static int ocfs2_alloc_dinode_update_counts(struct inode *inode, struct ocfs2_dinode *di = (struct ocfs2_dinode *) di_bh->b_data; struct ocfs2_chain_list *cl = (struct ocfs2_chain_list *) &di->id2.i_chain; - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto out; @@ -1170,21 +1278,17 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, u16 found; struct buffer_head *group_bh = NULL; struct ocfs2_group_desc *gd; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)ac->ac_bh->b_data; struct inode *alloc_inode = ac->ac_inode; - ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh); + ret = ocfs2_read_group_descriptor(alloc_inode, di, gd_blkno, + &group_bh); if (ret < 0) { mlog_errno(ret); return ret; } gd = (struct ocfs2_group_desc *) group_bh->b_data; - if (!OCFS2_IS_VALID_GROUP_DESC(gd)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, gd); - ret = -EIO; - goto out; - } - ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, ac->ac_max_block, bit_off, &found); if (ret < 0) { @@ -1241,19 +1345,14 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, bits_wanted, chain, (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); - status = ocfs2_read_block(alloc_inode, - le64_to_cpu(cl->cl_recs[chain].c_blkno), - &group_bh); + status = ocfs2_read_group_descriptor(alloc_inode, fe, + le64_to_cpu(cl->cl_recs[chain].c_blkno), + &group_bh); if (status < 0) { mlog_errno(status); goto bail; } bg = (struct ocfs2_group_desc *) group_bh->b_data; - status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg); - if (status) { - mlog_errno(status); - goto bail; - } status = -ENOSPC; /* for now, the chain search is a bit simplistic. We just use @@ -1271,18 +1370,13 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, next_group = le64_to_cpu(bg->bg_next_group); prev_group_bh = group_bh; group_bh = NULL; - status = ocfs2_read_block(alloc_inode, - next_group, &group_bh); + status = ocfs2_read_group_descriptor(alloc_inode, fe, + next_group, &group_bh); if (status < 0) { mlog_errno(status); goto bail; } bg = (struct ocfs2_group_desc *) group_bh->b_data; - status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, bg); - if (status) { - mlog_errno(status); - goto bail; - } } if (status < 0) { if (status != -ENOSPC) @@ -1324,10 +1418,10 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, /* Ok, claim our bits now: set the info on dinode, chainlist * and then the group */ - status = ocfs2_journal_access(handle, - alloc_inode, - ac->ac_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, + alloc_inode, + ac->ac_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; @@ -1392,11 +1486,11 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, BUG_ON(!ac->ac_bh); fe = (struct ocfs2_dinode *) ac->ac_bh->b_data; - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(osb->sb, fe); - status = -EIO; - goto bail; - } + + /* The bh was validated by the inode read during + * ocfs2_reserve_suballoc_bits(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); + if (le32_to_cpu(fe->id1.bitmap1.i_used) >= le32_to_cpu(fe->id1.bitmap1.i_total)) { ocfs2_error(osb->sb, "Chain allocator dinode %llu has %u used " @@ -1725,19 +1819,17 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle, mlog_entry_void(); - if (!OCFS2_IS_VALID_GROUP_DESC(bg)) { - OCFS2_RO_ON_INVALID_GROUP_DESC(alloc_inode->i_sb, bg); - status = -EIO; - goto bail; - } + /* The caller got this descriptor from + * ocfs2_read_group_descriptor(). Any corruption is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_GROUP_DESC(bg)); mlog(0, "off = %u, num = %u\n", bit_off, num_bits); if (ocfs2_is_cluster_bitmap(alloc_inode)) journal_type = OCFS2_JOURNAL_ACCESS_UNDO; - status = ocfs2_journal_access(handle, alloc_inode, group_bh, - journal_type); + status = ocfs2_journal_access_gd(handle, alloc_inode, group_bh, + journal_type); if (status < 0) { mlog_errno(status); goto bail; @@ -1782,29 +1874,26 @@ int ocfs2_free_suballoc_bits(handle_t *handle, mlog_entry_void(); - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(alloc_inode->i_sb, fe); - status = -EIO; - goto bail; - } + /* The alloc_bh comes from ocfs2_free_dinode() or + * ocfs2_free_clusters(). The callers have all locked the + * allocator and gotten alloc_bh from the lock call. This + * validates the dinode buffer. Any corruption that has happended + * is a code bug. */ + BUG_ON(!OCFS2_IS_VALID_DINODE(fe)); BUG_ON((count + start_bit) > ocfs2_bits_per_group(cl)); mlog(0, "%llu: freeing %u bits from group %llu, starting at %u\n", (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, (unsigned long long)bg_blkno, start_bit); - status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh); + status = ocfs2_read_group_descriptor(alloc_inode, fe, bg_blkno, + &group_bh); if (status < 0) { mlog_errno(status); goto bail; } - group = (struct ocfs2_group_desc *) group_bh->b_data; - status = ocfs2_check_group_descriptor(alloc_inode->i_sb, fe, group); - if (status) { - mlog_errno(status); - goto bail; - } + BUG_ON((count + start_bit) > le16_to_cpu(group->bg_bits)); status = ocfs2_block_group_clear_bits(handle, alloc_inode, @@ -1815,8 +1904,8 @@ int ocfs2_free_suballoc_bits(handle_t *handle, goto bail; } - status = ocfs2_journal_access(handle, alloc_inode, alloc_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = ocfs2_journal_access_di(handle, alloc_inode, alloc_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 4df159d8f45..e3c13c77f9e 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -164,10 +164,24 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac); * and return that block offset. */ u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster); -/* somewhat more expensive than our other checks, so use sparingly. */ +/* + * By default, ocfs2_read_group_descriptor() calls ocfs2_error() when it + * finds a problem. A caller that wants to check a group descriptor + * without going readonly should read the block with ocfs2_read_block[s]() + * and then checking it with this function. This is only resize, really. + * Everyone else should be using ocfs2_read_group_descriptor(). + */ int ocfs2_check_group_descriptor(struct super_block *sb, struct ocfs2_dinode *di, - struct ocfs2_group_desc *gd); + struct buffer_head *bh); +/* + * Read a group descriptor block into *bh. If *bh is NULL, a bh will be + * allocated. This is a cached read. The descriptor will be validated with + * ocfs2_validate_group_descriptor(). + */ +int ocfs2_read_group_descriptor(struct inode *inode, struct ocfs2_dinode *di, + u64 gd_blkno, struct buffer_head **bh); + int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, u32 clusters_to_add, u32 extents_to_split, struct ocfs2_alloc_context **data_ac, diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 304b63ac78c..43ed11345b5 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -41,6 +41,7 @@ #include <linux/debugfs.h> #include <linux/mount.h> #include <linux/seq_file.h> +#include <linux/quotaops.h> #define MLOG_MASK_PREFIX ML_SUPER #include <cluster/masklog.h> @@ -51,6 +52,7 @@ #include "ocfs1_fs_compat.h" #include "alloc.h" +#include "blockcheck.h" #include "dlmglue.h" #include "export.h" #include "extent_map.h" @@ -65,10 +67,13 @@ #include "uptodate.h" #include "ver.h" #include "xattr.h" +#include "quota.h" #include "buffer_head_io.h" static struct kmem_cache *ocfs2_inode_cachep = NULL; +struct kmem_cache *ocfs2_dquot_cachep; +struct kmem_cache *ocfs2_qf_chunk_cachep; /* OCFS2 needs to schedule several differnt types of work which * require cluster locking, disk I/O, recovery waits, etc. Since these @@ -124,6 +129,9 @@ static int ocfs2_get_sector(struct super_block *sb, static void ocfs2_write_super(struct super_block *sb); static struct inode *ocfs2_alloc_inode(struct super_block *sb); static void ocfs2_destroy_inode(struct inode *inode); +static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend); +static int ocfs2_enable_quotas(struct ocfs2_super *osb); +static void ocfs2_disable_quotas(struct ocfs2_super *osb); static const struct super_operations ocfs2_sops = { .statfs = ocfs2_statfs, @@ -137,6 +145,8 @@ static const struct super_operations ocfs2_sops = { .put_super = ocfs2_put_super, .remount_fs = ocfs2_remount, .show_options = ocfs2_show_options, + .quota_read = ocfs2_quota_read, + .quota_write = ocfs2_quota_write, }; enum { @@ -158,6 +168,10 @@ enum { Opt_user_xattr, Opt_nouser_xattr, Opt_inode64, + Opt_acl, + Opt_noacl, + Opt_usrquota, + Opt_grpquota, Opt_err, }; @@ -180,6 +194,10 @@ static const match_table_t tokens = { {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, {Opt_inode64, "inode64"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, {Opt_err, NULL} }; @@ -221,6 +239,19 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait) return 0; } +static int ocfs2_need_system_inode(struct ocfs2_super *osb, int ino) +{ + if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_USRQUOTA) + && (ino == USER_QUOTA_SYSTEM_INODE + || ino == LOCAL_USER_QUOTA_SYSTEM_INODE)) + return 0; + if (!OCFS2_HAS_RO_COMPAT_FEATURE(osb->sb, OCFS2_FEATURE_RO_COMPAT_GRPQUOTA) + && (ino == GROUP_QUOTA_SYSTEM_INODE + || ino == LOCAL_GROUP_QUOTA_SYSTEM_INODE)) + return 0; + return 1; +} + static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) { struct inode *new = NULL; @@ -247,6 +278,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) for (i = OCFS2_FIRST_ONLINE_SYSTEM_INODE; i <= OCFS2_LAST_GLOBAL_SYSTEM_INODE; i++) { + if (!ocfs2_need_system_inode(osb, i)) + continue; new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); if (!new) { ocfs2_release_system_inodes(osb); @@ -277,6 +310,8 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) for (i = OCFS2_LAST_GLOBAL_SYSTEM_INODE + 1; i < NUM_SYSTEM_INODES; i++) { + if (!ocfs2_need_system_inode(osb, i)) + continue; new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); if (!new) { ocfs2_release_system_inodes(osb); @@ -426,6 +461,12 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) /* We're going to/from readonly mode. */ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { + /* Disable quota accounting before remounting RO */ + if (*flags & MS_RDONLY) { + ret = ocfs2_susp_quotas(osb, 0); + if (ret < 0) + goto out; + } /* Lock here so the check of HARD_RO and the potential * setting of SOFT_RO is atomic. */ spin_lock(&osb->osb_lock); @@ -461,11 +502,28 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) } unlock_osb: spin_unlock(&osb->osb_lock); + /* Enable quota accounting after remounting RW */ + if (!ret && !(*flags & MS_RDONLY)) { + if (sb_any_quota_suspended(sb)) + ret = ocfs2_susp_quotas(osb, 1); + else + ret = ocfs2_enable_quotas(osb); + if (ret < 0) { + /* Return back changes... */ + spin_lock(&osb->osb_lock); + sb->s_flags |= MS_RDONLY; + osb->osb_flags |= OCFS2_OSB_SOFT_RO; + spin_unlock(&osb->osb_lock); + goto out; + } + } } if (!ret) { /* Only save off the new mount options in case of a successful * remount. */ + if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)) + parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; @@ -619,6 +677,131 @@ static int ocfs2_verify_userspace_stack(struct ocfs2_super *osb, return 0; } +static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) +{ + int type; + struct super_block *sb = osb->sb; + unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + int status = 0; + + for (type = 0; type < MAXQUOTAS; type++) { + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + continue; + if (unsuspend) + status = vfs_quota_enable( + sb_dqopt(sb)->files[type], + type, QFMT_OCFS2, + DQUOT_SUSPENDED); + else + status = vfs_quota_disable(sb, type, + DQUOT_SUSPENDED); + if (status < 0) + break; + } + if (status < 0) + mlog(ML_ERROR, "Failed to suspend/unsuspend quotas on " + "remount (error = %d).\n", status); + return status; +} + +static int ocfs2_enable_quotas(struct ocfs2_super *osb) +{ + struct inode *inode[MAXQUOTAS] = { NULL, NULL }; + struct super_block *sb = osb->sb; + unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + int status; + int type; + + sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE; + for (type = 0; type < MAXQUOTAS; type++) { + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + continue; + inode[type] = ocfs2_get_system_file_inode(osb, ino[type], + osb->slot_num); + if (!inode[type]) { + status = -ENOENT; + goto out_quota_off; + } + status = vfs_quota_enable(inode[type], type, QFMT_OCFS2, + DQUOT_USAGE_ENABLED); + if (status < 0) + goto out_quota_off; + } + + for (type = 0; type < MAXQUOTAS; type++) + iput(inode[type]); + return 0; +out_quota_off: + ocfs2_disable_quotas(osb); + for (type = 0; type < MAXQUOTAS; type++) + iput(inode[type]); + mlog_errno(status); + return status; +} + +static void ocfs2_disable_quotas(struct ocfs2_super *osb) +{ + int type; + struct inode *inode; + struct super_block *sb = osb->sb; + + /* We mostly ignore errors in this function because there's not much + * we can do when we see them */ + for (type = 0; type < MAXQUOTAS; type++) { + if (!sb_has_quota_loaded(sb, type)) + continue; + inode = igrab(sb->s_dquot.files[type]); + /* Turn off quotas. This will remove all dquot structures from + * memory and so they will be automatically synced to global + * quota files */ + vfs_quota_disable(sb, type, DQUOT_USAGE_ENABLED | + DQUOT_LIMITS_ENABLED); + if (!inode) + continue; + iput(inode); + } +} + +/* Handle quota on quotactl */ +static int ocfs2_quota_on(struct super_block *sb, int type, int format_id, + char *path, int remount) +{ + unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + + if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) + return -EINVAL; + + if (remount) + return 0; /* Just ignore it has been handled in + * ocfs2_remount() */ + return vfs_quota_enable(sb_dqopt(sb)->files[type], type, + format_id, DQUOT_LIMITS_ENABLED); +} + +/* Handle quota off quotactl */ +static int ocfs2_quota_off(struct super_block *sb, int type, int remount) +{ + if (remount) + return 0; /* Ignore now and handle later in + * ocfs2_remount() */ + return vfs_quota_disable(sb, type, DQUOT_LIMITS_ENABLED); +} + +static struct quotactl_ops ocfs2_quotactl_ops = { + .quota_on = ocfs2_quota_on, + .quota_off = ocfs2_quota_off, + .quota_sync = vfs_quota_sync, + .get_info = vfs_get_dqinfo, + .set_info = vfs_set_dqinfo, + .get_dqblk = vfs_get_dqblk, + .set_dqblk = vfs_set_dqblk, +}; + static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) { struct dentry *root; @@ -651,12 +834,32 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) } brelse(bh); bh = NULL; + + if (!(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR)) + parsed_options.mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; + osb->s_mount_opt = parsed_options.mount_opt; osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; osb->osb_commit_interval = parsed_options.commit_interval; osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); osb->local_alloc_bits = osb->local_alloc_default_bits; + if (osb->s_mount_opt & OCFS2_MOUNT_USRQUOTA && + !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { + status = -EINVAL; + mlog(ML_ERROR, "User quotas were requested, but this " + "filesystem does not have the feature enabled.\n"); + goto read_super_error; + } + if (osb->s_mount_opt & OCFS2_MOUNT_GRPQUOTA && + !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { + status = -EINVAL; + mlog(ML_ERROR, "Group quotas were requested, but this " + "filesystem does not have the feature enabled.\n"); + goto read_super_error; + } status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) @@ -664,6 +867,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) sb->s_magic = OCFS2_SUPER_MAGIC; + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); + /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, * heartbeat=none */ if (bdev_read_only(sb->s_bdev)) { @@ -758,6 +964,28 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) atomic_set(&osb->vol_state, VOLUME_MOUNTED); wake_up(&osb->osb_mount_event); + /* Now we can initialize quotas because we can afford to wait + * for cluster locks recovery now. That also means that truncation + * log recovery can happen but that waits for proper quota setup */ + if (!(sb->s_flags & MS_RDONLY)) { + status = ocfs2_enable_quotas(osb); + if (status < 0) { + /* We have to err-out specially here because + * s_root is already set */ + mlog_errno(status); + atomic_set(&osb->vol_state, VOLUME_DISABLED); + wake_up(&osb->osb_mount_event); + mlog_exit(status); + return status; + } + } + + ocfs2_complete_quota_recovery(osb); + + /* Now we wake up again for processes waiting for quotas */ + atomic_set(&osb->vol_state, VOLUME_MOUNTED_QUOTAS); + wake_up(&osb->osb_mount_event); + mlog_exit(status); return status; @@ -945,6 +1173,41 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_inode64: mopt->mount_opt |= OCFS2_MOUNT_INODE64; break; + case Opt_usrquota: + /* We check only on remount, otherwise features + * aren't yet initialized. */ + if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_USRQUOTA)) { + mlog(ML_ERROR, "User quota requested but " + "filesystem feature is not set\n"); + status = 0; + goto bail; + } + mopt->mount_opt |= OCFS2_MOUNT_USRQUOTA; + break; + case Opt_grpquota: + if (is_remount && !OCFS2_HAS_RO_COMPAT_FEATURE(sb, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA)) { + mlog(ML_ERROR, "Group quota requested but " + "filesystem feature is not set\n"); + status = 0; + goto bail; + } + mopt->mount_opt |= OCFS2_MOUNT_GRPQUOTA; + break; +#ifdef CONFIG_OCFS2_FS_POSIX_ACL + case Opt_acl: + mopt->mount_opt |= OCFS2_MOUNT_POSIX_ACL; + break; + case Opt_noacl: + mopt->mount_opt &= ~OCFS2_MOUNT_POSIX_ACL; + break; +#else + case Opt_acl: + case Opt_noacl: + printk(KERN_INFO "ocfs2 (no)acl options not supported\n"); + break; +#endif default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -1008,6 +1271,10 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (osb->osb_cluster_stack[0]) seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, osb->osb_cluster_stack); + if (opts & OCFS2_MOUNT_USRQUOTA) + seq_printf(s, ",usrquota"); + if (opts & OCFS2_MOUNT_GRPQUOTA) + seq_printf(s, ",grpquota"); if (opts & OCFS2_MOUNT_NOUSERXATTR) seq_printf(s, ",nouser_xattr"); @@ -1017,6 +1284,13 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) if (opts & OCFS2_MOUNT_INODE64) seq_printf(s, ",inode64"); +#ifdef CONFIG_OCFS2_FS_POSIX_ACL + if (opts & OCFS2_MOUNT_POSIX_ACL) + seq_printf(s, ",acl"); + else + seq_printf(s, ",noacl"); +#endif + return 0; } @@ -1052,10 +1326,16 @@ static int __init ocfs2_init(void) mlog(ML_ERROR, "Unable to create ocfs2 debugfs root.\n"); } + status = ocfs2_quota_setup(); + if (status) + goto leave; + ocfs2_set_locking_protocol(); + status = register_quota_format(&ocfs2_quota_format); leave: if (status < 0) { + ocfs2_quota_shutdown(); ocfs2_free_mem_caches(); exit_ocfs2_uptodate_cache(); } @@ -1072,11 +1352,15 @@ static void __exit ocfs2_exit(void) { mlog_entry_void(); + ocfs2_quota_shutdown(); + if (ocfs2_wq) { flush_workqueue(ocfs2_wq); destroy_workqueue(ocfs2_wq); } + unregister_quota_format(&ocfs2_quota_format); + debugfs_remove(ocfs2_debugfs_root); ocfs2_free_mem_caches(); @@ -1192,8 +1476,27 @@ static int ocfs2_initialize_mem_caches(void) (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| SLAB_MEM_SPREAD), ocfs2_inode_init_once); - if (!ocfs2_inode_cachep) + ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache", + sizeof(struct ocfs2_dquot), + 0, + (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD), + NULL); + ocfs2_qf_chunk_cachep = kmem_cache_create("ocfs2_qf_chunk_cache", + sizeof(struct ocfs2_quota_chunk), + 0, + (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD), + NULL); + if (!ocfs2_inode_cachep || !ocfs2_dquot_cachep || + !ocfs2_qf_chunk_cachep) { + if (ocfs2_inode_cachep) + kmem_cache_destroy(ocfs2_inode_cachep); + if (ocfs2_dquot_cachep) + kmem_cache_destroy(ocfs2_dquot_cachep); + if (ocfs2_qf_chunk_cachep) + kmem_cache_destroy(ocfs2_qf_chunk_cachep); return -ENOMEM; + } return 0; } @@ -1202,8 +1505,15 @@ static void ocfs2_free_mem_caches(void) { if (ocfs2_inode_cachep) kmem_cache_destroy(ocfs2_inode_cachep); - ocfs2_inode_cachep = NULL; + + if (ocfs2_dquot_cachep) + kmem_cache_destroy(ocfs2_dquot_cachep); + ocfs2_dquot_cachep = NULL; + + if (ocfs2_qf_chunk_cachep) + kmem_cache_destroy(ocfs2_qf_chunk_cachep); + ocfs2_qf_chunk_cachep = NULL; } static int ocfs2_get_sector(struct super_block *sb, @@ -1303,6 +1613,8 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) osb = OCFS2_SB(sb); BUG_ON(!osb); + ocfs2_disable_quotas(osb); + ocfs2_shutdown_local_alloc(osb); ocfs2_truncate_log_shutdown(osb); @@ -1413,6 +1725,8 @@ static int ocfs2_initialize_super(struct super_block *sb, sb->s_fs_info = osb; sb->s_op = &ocfs2_sops; sb->s_export_op = &ocfs2_export_ops; + sb->s_qcop = &ocfs2_quotactl_ops; + sb->dq_op = &ocfs2_quota_operations; sb->s_xattr = ocfs2_xattr_handlers; sb->s_time_gran = 1; sb->s_flags |= MS_NOATIME; @@ -1676,6 +1990,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { + /* We have to do a raw check of the feature here */ + if (le32_to_cpu(di->id2.i_super.s_feature_incompat) & + OCFS2_FEATURE_INCOMPAT_META_ECC) { + status = ocfs2_block_check_validate(bh->b_data, + bh->b_size, + &di->i_check); + if (status) + goto out; + } status = -EINVAL; if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { mlog(ML_ERROR, "found superblock with incorrect block " @@ -1717,6 +2040,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, } } +out: mlog_exit(status); return status; } diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index cbd03dfdc7b..ed0a0cfd68d 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -84,7 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode, mlog_entry_void(); - status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh); + status = ocfs2_read_inode_block(inode, bh); if (status < 0) { mlog_errno(status); link = ERR_PTR(status); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 74d7367ade1..e1d638af6ac 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -35,12 +35,14 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/string.h> +#include <linux/security.h> #define MLOG_MASK_PREFIX ML_XATTR #include <cluster/masklog.h> #include "ocfs2.h" #include "alloc.h" +#include "blockcheck.h" #include "dlmglue.h" #include "file.h" #include "symlink.h" @@ -61,12 +63,32 @@ struct ocfs2_xattr_def_value_root { }; struct ocfs2_xattr_bucket { - struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET]; - struct ocfs2_xattr_header *xh; + /* The inode these xattrs are associated with */ + struct inode *bu_inode; + + /* The actual buffers that make up the bucket */ + struct buffer_head *bu_bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET]; + + /* How many blocks make up one bucket for this filesystem */ + int bu_blocks; +}; + +struct ocfs2_xattr_set_ctxt { + handle_t *handle; + struct ocfs2_alloc_context *meta_ac; + struct ocfs2_alloc_context *data_ac; + struct ocfs2_cached_dealloc_ctxt dealloc; }; #define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) #define OCFS2_XATTR_INLINE_SIZE 80 +#define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \ + - sizeof(struct ocfs2_xattr_header) \ + - sizeof(__u32)) +#define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \ + - sizeof(struct ocfs2_xattr_block) \ + - sizeof(struct ocfs2_xattr_header) \ + - sizeof(__u32)) static struct ocfs2_xattr_def_value_root def_xv = { .xv.xr_list.l_count = cpu_to_le16(1), @@ -74,13 +96,25 @@ static struct ocfs2_xattr_def_value_root def_xv = { struct xattr_handler *ocfs2_xattr_handlers[] = { &ocfs2_xattr_user_handler, +#ifdef CONFIG_OCFS2_FS_POSIX_ACL + &ocfs2_xattr_acl_access_handler, + &ocfs2_xattr_acl_default_handler, +#endif &ocfs2_xattr_trusted_handler, + &ocfs2_xattr_security_handler, NULL }; static struct xattr_handler *ocfs2_xattr_handler_map[OCFS2_XATTR_MAX] = { [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, +#ifdef CONFIG_OCFS2_FS_POSIX_ACL + [OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS] + = &ocfs2_xattr_acl_access_handler, + [OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT] + = &ocfs2_xattr_acl_default_handler, +#endif [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, + [OCFS2_XATTR_INDEX_SECURITY] = &ocfs2_xattr_security_handler, }; struct ocfs2_xattr_info { @@ -98,7 +132,7 @@ struct ocfs2_xattr_search { */ struct buffer_head *xattr_bh; struct ocfs2_xattr_header *header; - struct ocfs2_xattr_bucket bucket; + struct ocfs2_xattr_bucket *bucket; void *base; void *end; struct ocfs2_xattr_entry *here; @@ -127,14 +161,20 @@ static int ocfs2_xattr_tree_list_index_block(struct inode *inode, size_t buffer_size); static int ocfs2_xattr_create_index_block(struct inode *inode, - struct ocfs2_xattr_search *xs); + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt); static int ocfs2_xattr_set_entry_index_block(struct inode *inode, struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs); + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt); static int ocfs2_delete_xattr_index_block(struct inode *inode, struct buffer_head *xb_bh); +static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, + u64 src_blk, u64 last_blk, u64 to_blk, + unsigned int start_bucket, + u32 *first_hash); static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) { @@ -154,6 +194,216 @@ static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb) return len / sizeof(struct ocfs2_xattr_entry); } +#define bucket_blkno(_b) ((_b)->bu_bhs[0]->b_blocknr) +#define bucket_block(_b, _n) ((_b)->bu_bhs[(_n)]->b_data) +#define bucket_xh(_b) ((struct ocfs2_xattr_header *)bucket_block((_b), 0)) + +static struct ocfs2_xattr_bucket *ocfs2_xattr_bucket_new(struct inode *inode) +{ + struct ocfs2_xattr_bucket *bucket; + int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + BUG_ON(blks > OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET); + + bucket = kzalloc(sizeof(struct ocfs2_xattr_bucket), GFP_NOFS); + if (bucket) { + bucket->bu_inode = inode; + bucket->bu_blocks = blks; + } + + return bucket; +} + +static void ocfs2_xattr_bucket_relse(struct ocfs2_xattr_bucket *bucket) +{ + int i; + + for (i = 0; i < bucket->bu_blocks; i++) { + brelse(bucket->bu_bhs[i]); + bucket->bu_bhs[i] = NULL; + } +} + +static void ocfs2_xattr_bucket_free(struct ocfs2_xattr_bucket *bucket) +{ + if (bucket) { + ocfs2_xattr_bucket_relse(bucket); + bucket->bu_inode = NULL; + kfree(bucket); + } +} + +/* + * A bucket that has never been written to disk doesn't need to be + * read. We just need the buffer_heads. Don't call this for + * buckets that are already on disk. ocfs2_read_xattr_bucket() initializes + * them fully. + */ +static int ocfs2_init_xattr_bucket(struct ocfs2_xattr_bucket *bucket, + u64 xb_blkno) +{ + int i, rc = 0; + + for (i = 0; i < bucket->bu_blocks; i++) { + bucket->bu_bhs[i] = sb_getblk(bucket->bu_inode->i_sb, + xb_blkno + i); + if (!bucket->bu_bhs[i]) { + rc = -EIO; + mlog_errno(rc); + break; + } + + if (!ocfs2_buffer_uptodate(bucket->bu_inode, + bucket->bu_bhs[i])) + ocfs2_set_new_buffer_uptodate(bucket->bu_inode, + bucket->bu_bhs[i]); + } + + if (rc) + ocfs2_xattr_bucket_relse(bucket); + return rc; +} + +/* Read the xattr bucket at xb_blkno */ +static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket, + u64 xb_blkno) +{ + int rc; + + rc = ocfs2_read_blocks(bucket->bu_inode, xb_blkno, + bucket->bu_blocks, bucket->bu_bhs, 0, + NULL); + if (!rc) { + rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb, + bucket->bu_bhs, + bucket->bu_blocks, + &bucket_xh(bucket)->xh_check); + if (rc) + mlog_errno(rc); + } + + if (rc) + ocfs2_xattr_bucket_relse(bucket); + return rc; +} + +static int ocfs2_xattr_bucket_journal_access(handle_t *handle, + struct ocfs2_xattr_bucket *bucket, + int type) +{ + int i, rc = 0; + + for (i = 0; i < bucket->bu_blocks; i++) { + rc = ocfs2_journal_access(handle, bucket->bu_inode, + bucket->bu_bhs[i], type); + if (rc) { + mlog_errno(rc); + break; + } + } + + return rc; +} + +static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle, + struct ocfs2_xattr_bucket *bucket) +{ + int i; + + ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb, + bucket->bu_bhs, bucket->bu_blocks, + &bucket_xh(bucket)->xh_check); + + for (i = 0; i < bucket->bu_blocks; i++) + ocfs2_journal_dirty(handle, bucket->bu_bhs[i]); +} + +static void ocfs2_xattr_bucket_copy_data(struct ocfs2_xattr_bucket *dest, + struct ocfs2_xattr_bucket *src) +{ + int i; + int blocksize = src->bu_inode->i_sb->s_blocksize; + + BUG_ON(dest->bu_blocks != src->bu_blocks); + BUG_ON(dest->bu_inode != src->bu_inode); + + for (i = 0; i < src->bu_blocks; i++) { + memcpy(bucket_block(dest, i), bucket_block(src, i), + blocksize); + } +} + +static int ocfs2_validate_xattr_block(struct super_block *sb, + struct buffer_head *bh) +{ + int rc; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)bh->b_data; + + mlog(0, "Validating xattr block %llu\n", + (unsigned long long)bh->b_blocknr); + + BUG_ON(!buffer_uptodate(bh)); + + /* + * If the ecc fails, we return the error but otherwise + * leave the filesystem running. We know any error is + * local to this block. + */ + rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &xb->xb_check); + if (rc) + return rc; + + /* + * Errors after here are fatal + */ + + if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { + ocfs2_error(sb, + "Extended attribute block #%llu has bad " + "signature %.*s", + (unsigned long long)bh->b_blocknr, 7, + xb->xb_signature); + return -EINVAL; + } + + if (le64_to_cpu(xb->xb_blkno) != bh->b_blocknr) { + ocfs2_error(sb, + "Extended attribute block #%llu has an " + "invalid xb_blkno of %llu", + (unsigned long long)bh->b_blocknr, + (unsigned long long)le64_to_cpu(xb->xb_blkno)); + return -EINVAL; + } + + if (le32_to_cpu(xb->xb_fs_generation) != OCFS2_SB(sb)->fs_generation) { + ocfs2_error(sb, + "Extended attribute block #%llu has an invalid " + "xb_fs_generation of #%u", + (unsigned long long)bh->b_blocknr, + le32_to_cpu(xb->xb_fs_generation)); + return -EINVAL; + } + + return 0; +} + +static int ocfs2_read_xattr_block(struct inode *inode, u64 xb_blkno, + struct buffer_head **bh) +{ + int rc; + struct buffer_head *tmp = *bh; + + rc = ocfs2_read_block(inode, xb_blkno, &tmp, + ocfs2_validate_xattr_block); + + /* If ocfs2_read_block() got us a new bh, pass it up. */ + if (!rc && !*bh) + *bh = tmp; + + return rc; +} + static inline const char *ocfs2_xattr_prefix(int name_index) { struct xattr_handler *handler = NULL; @@ -200,54 +450,163 @@ static void ocfs2_xattr_hash_entry(struct inode *inode, return; } +static int ocfs2_xattr_entry_real_size(int name_len, size_t value_len) +{ + int size = 0; + + if (value_len <= OCFS2_XATTR_INLINE_SIZE) + size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(value_len); + else + size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + size += sizeof(struct ocfs2_xattr_entry); + + return size; +} + +int ocfs2_calc_security_init(struct inode *dir, + struct ocfs2_security_xattr_info *si, + int *want_clusters, + int *xattr_credits, + struct ocfs2_alloc_context **xattr_ac) +{ + int ret = 0; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + int s_size = ocfs2_xattr_entry_real_size(strlen(si->name), + si->value_len); + + /* + * The max space of security xattr taken inline is + * 256(name) + 80(value) + 16(entry) = 352 bytes, + * So reserve one metadata block for it is ok. + */ + if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || + s_size > OCFS2_XATTR_FREE_IN_IBODY) { + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + } + + /* reserve clusters for xattr value which will be set in B tree*/ + if (si->value_len > OCFS2_XATTR_INLINE_SIZE) { + int new_clusters = ocfs2_clusters_for_bytes(dir->i_sb, + si->value_len); + + *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, + new_clusters); + *want_clusters += new_clusters; + } + return ret; +} + +int ocfs2_calc_xattr_init(struct inode *dir, + struct buffer_head *dir_bh, + int mode, + struct ocfs2_security_xattr_info *si, + int *want_clusters, + int *xattr_credits, + struct ocfs2_alloc_context **xattr_ac) +{ + int ret = 0; + struct ocfs2_super *osb = OCFS2_SB(dir->i_sb); + int s_size = 0, a_size = 0, acl_len = 0, new_clusters; + + if (si->enable) + s_size = ocfs2_xattr_entry_real_size(strlen(si->name), + si->value_len); + + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + acl_len = ocfs2_xattr_get_nolock(dir, dir_bh, + OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, + "", NULL, 0); + if (acl_len > 0) { + a_size = ocfs2_xattr_entry_real_size(0, acl_len); + if (S_ISDIR(mode)) + a_size <<= 1; + } else if (acl_len != 0 && acl_len != -ENODATA) { + mlog_errno(ret); + return ret; + } + } + + if (!(s_size + a_size)) + return ret; + + /* + * The max space of security xattr taken inline is + * 256(name) + 80(value) + 16(entry) = 352 bytes, + * The max space of acl xattr taken inline is + * 80(value) + 16(entry) * 2(if directory) = 192 bytes, + * when blocksize = 512, may reserve one more cluser for + * xattr bucket, otherwise reserve one metadata block + * for them is ok. + */ + if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || + (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + *xattr_credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + } + + if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE && + (s_size + a_size) > OCFS2_XATTR_FREE_IN_BLOCK(dir)) { + *want_clusters += 1; + *xattr_credits += ocfs2_blocks_per_xattr_bucket(dir->i_sb); + } + + /* + * reserve credits and clusters for xattrs which has large value + * and have to be set outside + */ + if (si->enable && si->value_len > OCFS2_XATTR_INLINE_SIZE) { + new_clusters = ocfs2_clusters_for_bytes(dir->i_sb, + si->value_len); + *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, + new_clusters); + *want_clusters += new_clusters; + } + if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL && + acl_len > OCFS2_XATTR_INLINE_SIZE) { + /* for directory, it has DEFAULT and ACCESS two types of acls */ + new_clusters = (S_ISDIR(mode) ? 2 : 1) * + ocfs2_clusters_for_bytes(dir->i_sb, acl_len); + *xattr_credits += ocfs2_clusters_to_blocks(dir->i_sb, + new_clusters); + *want_clusters += new_clusters; + } + + return ret; +} + static int ocfs2_xattr_extend_allocation(struct inode *inode, u32 clusters_to_add, - struct buffer_head *xattr_bh, - struct ocfs2_xattr_value_root *xv) + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_set_ctxt *ctxt) { int status = 0; - int restart_func = 0; - int credits = 0; - handle_t *handle = NULL; - struct ocfs2_alloc_context *data_ac = NULL; - struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle = ctxt->handle; enum ocfs2_alloc_restarted why; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters); + u32 prev_clusters, logical_start = le32_to_cpu(vb->vb_xv->xr_clusters); struct ocfs2_extent_tree et; mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add); - ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv); - -restart_all: - - status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, - &data_ac, &meta_ac); - if (status) { - mlog_errno(status); - goto leave; - } - - credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el, - clusters_to_add); - handle = ocfs2_start_trans(osb, credits); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - handle = NULL; - mlog_errno(status); - goto leave; - } + ocfs2_init_xattr_value_extent_tree(&et, inode, vb); -restarted_transaction: - status = ocfs2_journal_access(handle, inode, xattr_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + status = vb->vb_access(handle, inode, vb->vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto leave; } - prev_clusters = le32_to_cpu(xv->xr_clusters); + prev_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); status = ocfs2_add_clusters_in_btree(osb, inode, &logical_start, @@ -255,157 +614,84 @@ restarted_transaction: 0, &et, handle, - data_ac, - meta_ac, + ctxt->data_ac, + ctxt->meta_ac, &why); - if ((status < 0) && (status != -EAGAIN)) { - if (status != -ENOSPC) - mlog_errno(status); + if (status < 0) { + mlog_errno(status); goto leave; } - status = ocfs2_journal_dirty(handle, xattr_bh); + status = ocfs2_journal_dirty(handle, vb->vb_bh); if (status < 0) { mlog_errno(status); goto leave; } - clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters; + clusters_to_add -= le32_to_cpu(vb->vb_xv->xr_clusters) - prev_clusters; - if (why != RESTART_NONE && clusters_to_add) { - if (why == RESTART_META) { - mlog(0, "restarting function.\n"); - restart_func = 1; - } else { - BUG_ON(why != RESTART_TRANS); - - mlog(0, "restarting transaction.\n"); - /* TODO: This can be more intelligent. */ - credits = ocfs2_calc_extend_credits(osb->sb, - et.et_root_el, - clusters_to_add); - status = ocfs2_extend_trans(handle, credits); - if (status < 0) { - /* handle still has to be committed at - * this point. */ - status = -ENOMEM; - mlog_errno(status); - goto leave; - } - goto restarted_transaction; - } - } + /* + * We should have already allocated enough space before the transaction, + * so no need to restart. + */ + BUG_ON(why != RESTART_NONE || clusters_to_add); leave: - if (handle) { - ocfs2_commit_trans(osb, handle); - handle = NULL; - } - if (data_ac) { - ocfs2_free_alloc_context(data_ac); - data_ac = NULL; - } - if (meta_ac) { - ocfs2_free_alloc_context(meta_ac); - meta_ac = NULL; - } - if ((!status) && restart_func) { - restart_func = 0; - goto restart_all; - } return status; } static int __ocfs2_remove_xattr_range(struct inode *inode, - struct buffer_head *root_bh, - struct ocfs2_xattr_value_root *xv, + struct ocfs2_xattr_value_buf *vb, u32 cpos, u32 phys_cpos, u32 len, - struct ocfs2_cached_dealloc_ctxt *dealloc) + struct ocfs2_xattr_set_ctxt *ctxt) { int ret; u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct inode *tl_inode = osb->osb_tl_inode; - handle_t *handle; - struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle = ctxt->handle; struct ocfs2_extent_tree et; - ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv); + ocfs2_init_xattr_value_extent_tree(&et, inode, vb); - ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); + ret = vb->vb_access(handle, inode, vb->vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - return ret; - } - - mutex_lock(&tl_inode->i_mutex); - - if (ocfs2_truncate_log_needs_flush(osb)) { - ret = __ocfs2_flush_truncate_log(osb); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - } - - handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - - ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, - dealloc); + ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, ctxt->meta_ac, + &ctxt->dealloc); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } - le32_add_cpu(&xv->xr_clusters, -len); + le32_add_cpu(&vb->vb_xv->xr_clusters, -len); - ret = ocfs2_journal_dirty(handle, root_bh); + ret = ocfs2_journal_dirty(handle, vb->vb_bh); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } - ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); + ret = ocfs2_cache_cluster_dealloc(&ctxt->dealloc, phys_blkno, len); if (ret) mlog_errno(ret); -out_commit: - ocfs2_commit_trans(osb, handle); out: - mutex_unlock(&tl_inode->i_mutex); - - if (meta_ac) - ocfs2_free_alloc_context(meta_ac); - return ret; } static int ocfs2_xattr_shrink_size(struct inode *inode, u32 old_clusters, u32 new_clusters, - struct buffer_head *root_bh, - struct ocfs2_xattr_value_root *xv) + struct ocfs2_xattr_value_buf *vb, + struct ocfs2_xattr_set_ctxt *ctxt) { int ret = 0; u32 trunc_len, cpos, phys_cpos, alloc_size; u64 block; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_cached_dealloc_ctxt dealloc; - - ocfs2_init_dealloc_ctxt(&dealloc); if (old_clusters <= new_clusters) return 0; @@ -414,7 +700,8 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, trunc_len = old_clusters - new_clusters; while (trunc_len) { ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, - &alloc_size, &xv->xr_list); + &alloc_size, + &vb->vb_xv->xr_list); if (ret) { mlog_errno(ret); goto out; @@ -423,9 +710,9 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, if (alloc_size > trunc_len) alloc_size = trunc_len; - ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos, + ret = __ocfs2_remove_xattr_range(inode, vb, cpos, phys_cpos, alloc_size, - &dealloc); + ctxt); if (ret) { mlog_errno(ret); goto out; @@ -439,20 +726,17 @@ static int ocfs2_xattr_shrink_size(struct inode *inode, } out: - ocfs2_schedule_truncate_log_flush(osb, 1); - ocfs2_run_deallocs(osb, &dealloc); - return ret; } static int ocfs2_xattr_value_truncate(struct inode *inode, - struct buffer_head *root_bh, - struct ocfs2_xattr_value_root *xv, - int len) + struct ocfs2_xattr_value_buf *vb, + int len, + struct ocfs2_xattr_set_ctxt *ctxt) { int ret; u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len); - u32 old_clusters = le32_to_cpu(xv->xr_clusters); + u32 old_clusters = le32_to_cpu(vb->vb_xv->xr_clusters); if (new_clusters == old_clusters) return 0; @@ -460,11 +744,11 @@ static int ocfs2_xattr_value_truncate(struct inode *inode, if (new_clusters > old_clusters) ret = ocfs2_xattr_extend_allocation(inode, new_clusters - old_clusters, - root_bh, xv); + vb, ctxt); else ret = ocfs2_xattr_shrink_size(inode, old_clusters, new_clusters, - root_bh, xv); + vb, ctxt); return ret; } @@ -554,18 +838,14 @@ static int ocfs2_xattr_block_list(struct inode *inode, if (!di->i_xattr_loc) return ret; - ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); + ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); if (ret < 0) { mlog_errno(ret); return ret; } xb = (struct ocfs2_xattr_block *)blk_bh->b_data; - if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { - ret = -EIO; - goto cleanup; - } - if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; ret = ocfs2_xattr_list_entries(inode, header, @@ -575,7 +855,7 @@ static int ocfs2_xattr_block_list(struct inode *inode, ret = ocfs2_xattr_tree_list_index_block(inode, xt, buffer, buffer_size); } -cleanup: + brelse(blk_bh); return ret; @@ -685,7 +965,7 @@ static int ocfs2_xattr_get_value_outside(struct inode *inode, blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); /* Copy ocfs2_xattr_value */ for (i = 0; i < num_clusters * bpc; i++, blkno++) { - ret = ocfs2_read_block(inode, blkno, &bh); + ret = ocfs2_read_block(inode, blkno, &bh, NULL); if (ret) { mlog_errno(ret); goto out; @@ -769,7 +1049,12 @@ static int ocfs2_xattr_block_get(struct inode *inode, size_t size; int ret = -ENODATA, name_offset, name_len, block_off, i; - memset(&xs->bucket, 0, sizeof(xs->bucket)); + xs->bucket = ocfs2_xattr_bucket_new(inode); + if (!xs->bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto cleanup; + } ret = ocfs2_xattr_block_find(inode, name_index, name, xs); if (ret) { @@ -795,11 +1080,11 @@ static int ocfs2_xattr_block_get(struct inode *inode, if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { ret = ocfs2_xattr_bucket_get_name_value(inode, - xs->bucket.xh, + bucket_xh(xs->bucket), i, &block_off, &name_offset); - xs->base = xs->bucket.bhs[block_off]->b_data; + xs->base = bucket_block(xs->bucket, block_off); } if (ocfs2_xattr_is_local(xs->here)) { memcpy(buffer, (void *)xs->base + @@ -817,21 +1102,15 @@ static int ocfs2_xattr_block_get(struct inode *inode, } ret = size; cleanup: - for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++) - brelse(xs->bucket.bhs[i]); - memset(&xs->bucket, 0, sizeof(xs->bucket)); + ocfs2_xattr_bucket_free(xs->bucket); brelse(xs->xattr_bh); xs->xattr_bh = NULL; return ret; } -/* ocfs2_xattr_get() - * - * Copy an extended attribute into the buffer provided. - * Buffer is NULL to compute the size of buffer required. - */ -static int ocfs2_xattr_get(struct inode *inode, +int ocfs2_xattr_get_nolock(struct inode *inode, + struct buffer_head *di_bh, int name_index, const char *name, void *buffer, @@ -839,7 +1118,6 @@ static int ocfs2_xattr_get(struct inode *inode, { int ret; struct ocfs2_dinode *di = NULL; - struct buffer_head *di_bh = NULL; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_xattr_search xis = { .not_found = -ENODATA, @@ -854,11 +1132,6 @@ static int ocfs2_xattr_get(struct inode *inode, if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) ret = -ENODATA; - ret = ocfs2_inode_lock(inode, &di_bh, 0); - if (ret < 0) { - mlog_errno(ret); - return ret; - } xis.inode_bh = xbs.inode_bh = di_bh; di = (struct ocfs2_dinode *)di_bh->b_data; @@ -869,6 +1142,32 @@ static int ocfs2_xattr_get(struct inode *inode, ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, buffer_size, &xbs); up_read(&oi->ip_xattr_sem); + + return ret; +} + +/* ocfs2_xattr_get() + * + * Copy an extended attribute into the buffer provided. + * Buffer is NULL to compute the size of buffer required. + */ +static int ocfs2_xattr_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size) +{ + int ret; + struct buffer_head *di_bh = NULL; + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + ret = ocfs2_xattr_get_nolock(inode, di_bh, name_index, + name, buffer, buffer_size); + ocfs2_inode_unlock(inode, 0); brelse(di_bh); @@ -877,44 +1176,36 @@ static int ocfs2_xattr_get(struct inode *inode, } static int __ocfs2_xattr_set_value_outside(struct inode *inode, + handle_t *handle, struct ocfs2_xattr_value_root *xv, const void *value, int value_len) { - int ret = 0, i, cp_len, credits; + int ret = 0, i, cp_len; u16 blocksize = inode->i_sb->s_blocksize; u32 p_cluster, num_clusters; u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); u64 blkno; struct buffer_head *bh = NULL; - handle_t *handle; BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); - credits = clusters * bpc; - handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out; - } - while (cpos < clusters) { ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, &num_clusters, &xv->xr_list); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); for (i = 0; i < num_clusters * bpc; i++, blkno++) { - ret = ocfs2_read_block(inode, blkno, &bh); + ret = ocfs2_read_block(inode, blkno, &bh, NULL); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } ret = ocfs2_journal_access(handle, @@ -923,7 +1214,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode, OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); - goto out_commit; + goto out; } cp_len = value_len > blocksize ? blocksize : value_len; @@ -937,7 +1228,7 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode, ret = ocfs2_journal_dirty(handle, bh); if (ret < 0) { mlog_errno(ret); - goto out_commit; + goto out; } brelse(bh); bh = NULL; @@ -951,8 +1242,6 @@ static int __ocfs2_xattr_set_value_outside(struct inode *inode, } cpos += num_clusters; } -out_commit: - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: brelse(bh); @@ -960,28 +1249,22 @@ out: } static int ocfs2_xattr_cleanup(struct inode *inode, + handle_t *handle, struct ocfs2_xattr_info *xi, struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_value_buf *vb, size_t offs) { - handle_t *handle = NULL; int ret = 0; size_t name_len = strlen(xi->name); void *val = xs->base + offs; size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; - handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), - OCFS2_XATTR_BLOCK_UPDATE_CREDITS); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out; - } - ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = vb->vb_access(handle, inode, vb->vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } /* Decrease xattr count */ le16_add_cpu(&xs->header->xh_count, -1); @@ -989,35 +1272,27 @@ static int ocfs2_xattr_cleanup(struct inode *inode, memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry)); memset(val, 0, size); - ret = ocfs2_journal_dirty(handle, xs->xattr_bh); + ret = ocfs2_journal_dirty(handle, vb->vb_bh); if (ret < 0) mlog_errno(ret); -out_commit: - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: return ret; } static int ocfs2_xattr_update_entry(struct inode *inode, + handle_t *handle, struct ocfs2_xattr_info *xi, struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_value_buf *vb, size_t offs) { - handle_t *handle = NULL; - int ret = 0; + int ret; - handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), - OCFS2_XATTR_BLOCK_UPDATE_CREDITS); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out; - } - ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = vb->vb_access(handle, inode, vb->vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } xs->here->xe_name_offset = cpu_to_le16(offs); @@ -1028,11 +1303,9 @@ static int ocfs2_xattr_update_entry(struct inode *inode, ocfs2_xattr_set_local(xs->here, 0); ocfs2_xattr_hash_entry(inode, xs->header, xs->here); - ret = ocfs2_journal_dirty(handle, xs->xattr_bh); + ret = ocfs2_journal_dirty(handle, vb->vb_bh); if (ret < 0) mlog_errno(ret); -out_commit: - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: return ret; } @@ -1045,6 +1318,8 @@ out: static int ocfs2_xattr_set_value_outside(struct inode *inode, struct ocfs2_xattr_info *xi, struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt, + struct ocfs2_xattr_value_buf *vb, size_t offs) { size_t name_len = strlen(xi->name); @@ -1062,20 +1337,20 @@ static int ocfs2_xattr_set_value_outside(struct inode *inode, xv->xr_list.l_tree_depth = 0; xv->xr_list.l_count = cpu_to_le16(1); xv->xr_list.l_next_free_rec = 0; + vb->vb_xv = xv; - ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv, - xi->value_len); + ret = ocfs2_xattr_value_truncate(inode, vb, xi->value_len, ctxt); if (ret < 0) { mlog_errno(ret); return ret; } - ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value, - xi->value_len); + ret = ocfs2_xattr_update_entry(inode, ctxt->handle, xi, xs, vb, offs); if (ret < 0) { mlog_errno(ret); return ret; } - ret = ocfs2_xattr_update_entry(inode, xi, xs, offs); + ret = __ocfs2_xattr_set_value_outside(inode, ctxt->handle, vb->vb_xv, + xi->value, xi->value_len); if (ret < 0) mlog_errno(ret); @@ -1195,6 +1470,7 @@ static void ocfs2_xattr_set_entry_local(struct inode *inode, static int ocfs2_xattr_set_entry(struct inode *inode, struct ocfs2_xattr_info *xi, struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt, int flag) { struct ocfs2_xattr_entry *last; @@ -1202,7 +1478,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); size_t size_l = 0; - handle_t *handle = NULL; + handle_t *handle = ctxt->handle; int free, i, ret; struct ocfs2_xattr_info xi_l = { .name_index = xi->name_index, @@ -1210,6 +1486,16 @@ static int ocfs2_xattr_set_entry(struct inode *inode, .value = xi->value, .value_len = xi->value_len, }; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = xs->xattr_bh, + .vb_access = ocfs2_journal_access_di, + }; + + if (!(flag & OCFS2_INLINE_XATTR_FL)) { + BUG_ON(xs->xattr_bh == xs->inode_bh); + vb.vb_access = ocfs2_journal_access_xb; + } else + BUG_ON(xs->xattr_bh != xs->inode_bh); /* Compute min_offs, last and free space. */ last = xs->header->xh_entries; @@ -1265,15 +1551,14 @@ static int ocfs2_xattr_set_entry(struct inode *inode, if (ocfs2_xattr_is_local(xs->here) && size == size_l) { /* Replace existing local xattr with tree root */ ret = ocfs2_xattr_set_value_outside(inode, xi, xs, - offs); + ctxt, &vb, offs); if (ret < 0) mlog_errno(ret); goto out; } else if (!ocfs2_xattr_is_local(xs->here)) { /* For existing xattr which has value outside */ - struct ocfs2_xattr_value_root *xv = NULL; - xv = (struct ocfs2_xattr_value_root *)(val + - OCFS2_XATTR_SIZE(name_len)); + vb.vb_xv = (struct ocfs2_xattr_value_root *) + (val + OCFS2_XATTR_SIZE(name_len)); if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { /* @@ -1282,27 +1567,30 @@ static int ocfs2_xattr_set_entry(struct inode *inode, * then set new value with set_value_outside(). */ ret = ocfs2_xattr_value_truncate(inode, - xs->xattr_bh, - xv, - xi->value_len); + &vb, + xi->value_len, + ctxt); if (ret < 0) { mlog_errno(ret); goto out; } - ret = __ocfs2_xattr_set_value_outside(inode, - xv, - xi->value, - xi->value_len); + ret = ocfs2_xattr_update_entry(inode, + handle, + xi, + xs, + &vb, + offs); if (ret < 0) { mlog_errno(ret); goto out; } - ret = ocfs2_xattr_update_entry(inode, - xi, - xs, - offs); + ret = __ocfs2_xattr_set_value_outside(inode, + handle, + vb.vb_xv, + xi->value, + xi->value_len); if (ret < 0) mlog_errno(ret); goto out; @@ -1312,44 +1600,28 @@ static int ocfs2_xattr_set_entry(struct inode *inode, * just trucate old value to zero. */ ret = ocfs2_xattr_value_truncate(inode, - xs->xattr_bh, - xv, - 0); + &vb, + 0, + ctxt); if (ret < 0) mlog_errno(ret); } } } - handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), - OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out; - } - - ret = ocfs2_journal_access(handle, inode, xs->inode_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } if (!(flag & OCFS2_INLINE_XATTR_FL)) { - /* set extended attribute in external block. */ - ret = ocfs2_extend_trans(handle, - OCFS2_INODE_UPDATE_CREDITS + - OCFS2_XATTR_BLOCK_UPDATE_CREDITS); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = vb.vb_access(handle, inode, vb.vb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } } @@ -1363,7 +1635,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, ret = ocfs2_journal_dirty(handle, xs->xattr_bh); if (ret < 0) { mlog_errno(ret); - goto out_commit; + goto out; } } @@ -1391,25 +1663,19 @@ static int ocfs2_xattr_set_entry(struct inode *inode, oi->ip_dyn_features |= flag; di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); spin_unlock(&oi->ip_lock); - /* Update inode ctime */ - inode->i_ctime = CURRENT_TIME; - di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); ret = ocfs2_journal_dirty(handle, xs->inode_bh); if (ret < 0) mlog_errno(ret); -out_commit: - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); - if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { /* * Set value outside in B tree. * This is the second step for value size > INLINE_SIZE. */ size_t offs = le16_to_cpu(xs->here->xe_name_offset); - ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs); + ret = ocfs2_xattr_set_value_outside(inode, xi, xs, ctxt, + &vb, offs); if (ret < 0) { int ret2; @@ -1418,41 +1684,56 @@ out_commit: * If set value outside failed, we have to clean * the junk tree root we have already set in local. */ - ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs); + ret2 = ocfs2_xattr_cleanup(inode, ctxt->handle, + xi, xs, &vb, offs); if (ret2 < 0) mlog_errno(ret2); } } out: return ret; - } static int ocfs2_remove_value_outside(struct inode*inode, - struct buffer_head *bh, + struct ocfs2_xattr_value_buf *vb, struct ocfs2_xattr_header *header) { int ret = 0, i; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; + + ocfs2_init_dealloc_ctxt(&ctxt.dealloc); + + ctxt.handle = ocfs2_start_trans(osb, + ocfs2_remove_extent_credits(osb->sb)); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + goto out; + } for (i = 0; i < le16_to_cpu(header->xh_count); i++) { struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; if (!ocfs2_xattr_is_local(entry)) { - struct ocfs2_xattr_value_root *xv; void *val; val = (void *)header + le16_to_cpu(entry->xe_name_offset); - xv = (struct ocfs2_xattr_value_root *) + vb->vb_xv = (struct ocfs2_xattr_value_root *) (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); - ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0); + ret = ocfs2_xattr_value_truncate(inode, vb, 0, &ctxt); if (ret < 0) { mlog_errno(ret); - return ret; + break; } } } + ocfs2_commit_trans(osb, ctxt.handle); + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &ctxt.dealloc); +out: return ret; } @@ -1463,12 +1744,16 @@ static int ocfs2_xattr_ibody_remove(struct inode *inode, struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_xattr_header *header; int ret; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = di_bh, + .vb_access = ocfs2_journal_access_di, + }; header = (struct ocfs2_xattr_header *) ((void *)di + inode->i_sb->s_blocksize - le16_to_cpu(di->i_xattr_inline_size)); - ret = ocfs2_remove_value_outside(inode, di_bh, header); + ret = ocfs2_remove_value_outside(inode, &vb, header); return ret; } @@ -1478,11 +1763,15 @@ static int ocfs2_xattr_block_remove(struct inode *inode, { struct ocfs2_xattr_block *xb; int ret = 0; + struct ocfs2_xattr_value_buf vb = { + .vb_bh = blk_bh, + .vb_access = ocfs2_journal_access_xb, + }; xb = (struct ocfs2_xattr_block *)blk_bh->b_data; if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); - ret = ocfs2_remove_value_outside(inode, blk_bh, header); + ret = ocfs2_remove_value_outside(inode, &vb, header); } else ret = ocfs2_delete_xattr_index_block(inode, blk_bh); @@ -1502,24 +1791,19 @@ static int ocfs2_xattr_free_block(struct inode *inode, u64 blk, bg_blkno; u16 bit; - ret = ocfs2_read_block(inode, block, &blk_bh); + ret = ocfs2_read_xattr_block(inode, block, &blk_bh); if (ret < 0) { mlog_errno(ret); goto out; } - xb = (struct ocfs2_xattr_block *)blk_bh->b_data; - if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { - ret = -EIO; - goto out; - } - ret = ocfs2_xattr_block_remove(inode, blk_bh); if (ret < 0) { mlog_errno(ret); goto out; } + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; blk = le64_to_cpu(xb->xb_blkno); bit = le16_to_cpu(xb->xb_suballoc_bit); bg_blkno = ocfs2_which_suballoc_group(blk, bit); @@ -1606,8 +1890,8 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, di_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; @@ -1714,7 +1998,8 @@ static int ocfs2_xattr_ibody_find(struct inode *inode, */ static int ocfs2_xattr_ibody_set(struct inode *inode, struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs) + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) { struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; @@ -1731,7 +2016,7 @@ static int ocfs2_xattr_ibody_set(struct inode *inode, } } - ret = ocfs2_xattr_set_entry(inode, xi, xs, + ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); out: up_write(&oi->ip_alloc_sem); @@ -1758,19 +2043,15 @@ static int ocfs2_xattr_block_find(struct inode *inode, if (!di->i_xattr_loc) return ret; - ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); + ret = ocfs2_read_xattr_block(inode, le64_to_cpu(di->i_xattr_loc), + &blk_bh); if (ret < 0) { mlog_errno(ret); return ret; } - xb = (struct ocfs2_xattr_block *)blk_bh->b_data; - if (!OCFS2_IS_VALID_XATTR_BLOCK(xb)) { - ret = -EIO; - goto cleanup; - } - xs->xattr_bh = blk_bh; + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { xs->header = &xb->xb_attrs.xb_header; @@ -1804,13 +2085,13 @@ cleanup: */ static int ocfs2_xattr_block_set(struct inode *inode, struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs) + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) { struct buffer_head *new_bh = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; - struct ocfs2_alloc_context *meta_ac = NULL; - handle_t *handle = NULL; + handle_t *handle = ctxt->handle; struct ocfs2_xattr_block *xblk = NULL; u16 suballoc_bit_start; u32 num_got; @@ -1818,45 +2099,29 @@ static int ocfs2_xattr_block_set(struct inode *inode, int ret; if (!xs->xattr_bh) { - /* - * Alloc one external block for extended attribute - * outside of inode. - */ - ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + ret = ocfs2_journal_access_di(handle, inode, xs->inode_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (ret < 0) { mlog_errno(ret); - goto out; - } - handle = ocfs2_start_trans(osb, - OCFS2_XATTR_BLOCK_CREATE_CREDITS); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out; - } - ret = ocfs2_journal_access(handle, inode, xs->inode_bh, - OCFS2_JOURNAL_ACCESS_CREATE); - if (ret < 0) { - mlog_errno(ret); - goto out_commit; + goto end; } - ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, + ret = ocfs2_claim_metadata(osb, handle, ctxt->meta_ac, 1, &suballoc_bit_start, &num_got, &first_blkno); if (ret < 0) { mlog_errno(ret); - goto out_commit; + goto end; } new_bh = sb_getblk(inode->i_sb, first_blkno); ocfs2_set_new_buffer_uptodate(inode, new_bh); - ret = ocfs2_journal_access(handle, inode, new_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + ret = ocfs2_journal_access_xb(handle, inode, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); if (ret < 0) { mlog_errno(ret); - goto out_commit; + goto end; } /* Initialize ocfs2_xattr_block */ @@ -1874,44 +2139,555 @@ static int ocfs2_xattr_block_set(struct inode *inode, xs->end = (void *)xblk + inode->i_sb->s_blocksize; xs->here = xs->header->xh_entries; - ret = ocfs2_journal_dirty(handle, new_bh); if (ret < 0) { mlog_errno(ret); - goto out_commit; + goto end; } di->i_xattr_loc = cpu_to_le64(first_blkno); - ret = ocfs2_journal_dirty(handle, xs->inode_bh); - if (ret < 0) - mlog_errno(ret); -out_commit: - ocfs2_commit_trans(osb, handle); -out: - if (meta_ac) - ocfs2_free_alloc_context(meta_ac); - if (ret < 0) - return ret; + ocfs2_journal_dirty(handle, xs->inode_bh); } else xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { /* Set extended attribute into external block */ - ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL); + ret = ocfs2_xattr_set_entry(inode, xi, xs, ctxt, + OCFS2_HAS_XATTR_FL); if (!ret || ret != -ENOSPC) goto end; - ret = ocfs2_xattr_create_index_block(inode, xs); + ret = ocfs2_xattr_create_index_block(inode, xs, ctxt); if (ret) goto end; } - ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs); + ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs, ctxt); end: return ret; } +/* Check whether the new xattr can be inserted into the inode. */ +static int ocfs2_xattr_can_be_in_inode(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + u64 value_size; + struct ocfs2_xattr_entry *last; + int free, i; + size_t min_offs = xs->end - xs->base; + + if (!xs->header) + return 0; + + last = xs->header->xh_entries; + + for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) { + size_t offs = le16_to_cpu(last->xe_name_offset); + if (offs < min_offs) + min_offs = offs; + last += 1; + } + + free = min_offs - ((void *)last - xs->base) - sizeof(__u32); + if (free < 0) + return 0; + + BUG_ON(!xs->not_found); + + if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) + value_size = OCFS2_XATTR_ROOT_SIZE; + else + value_size = OCFS2_XATTR_SIZE(xi->value_len); + + if (free >= sizeof(struct ocfs2_xattr_entry) + + OCFS2_XATTR_SIZE(strlen(xi->name)) + value_size) + return 1; + + return 0; +} + +static int ocfs2_calc_xattr_set_need(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + int *clusters_need, + int *meta_need, + int *credits_need) +{ + int ret = 0, old_in_xb = 0; + int clusters_add = 0, meta_add = 0, credits = 0; + struct buffer_head *bh = NULL; + struct ocfs2_xattr_block *xb = NULL; + struct ocfs2_xattr_entry *xe = NULL; + struct ocfs2_xattr_value_root *xv = NULL; + char *base = NULL; + int name_offset, name_len = 0; + u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, + xi->value_len); + u64 value_size; + + /* + * Calculate the clusters we need to write. + * No matter whether we replace an old one or add a new one, + * we need this for writing. + */ + if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) + credits += new_clusters * + ocfs2_clusters_to_blocks(inode->i_sb, 1); + + if (xis->not_found && xbs->not_found) { + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + clusters_add += new_clusters; + credits += ocfs2_calc_extend_credits(inode->i_sb, + &def_xv.xv.xr_list, + new_clusters); + } + + goto meta_guess; + } + + if (!xis->not_found) { + xe = xis->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + base = xis->base; + credits += OCFS2_INODE_UPDATE_CREDITS; + } else { + int i, block_off = 0; + xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data; + xe = xbs->here; + name_offset = le16_to_cpu(xe->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + i = xbs->here - xbs->header->xh_entries; + old_in_xb = 1; + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + ret = ocfs2_xattr_bucket_get_name_value(inode, + bucket_xh(xbs->bucket), + i, &block_off, + &name_offset); + base = bucket_block(xbs->bucket, block_off); + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + } else { + base = xbs->base; + credits += OCFS2_XATTR_BLOCK_UPDATE_CREDITS; + } + } + + /* + * delete a xattr doesn't need metadata and cluster allocation. + * so just calculate the credits and return. + * + * The credits for removing the value tree will be extended + * by ocfs2_remove_extent itself. + */ + if (!xi->value) { + if (!ocfs2_xattr_is_local(xe)) + credits += ocfs2_remove_extent_credits(inode->i_sb); + + goto out; + } + + /* do cluster allocation guess first. */ + value_size = le64_to_cpu(xe->xe_value_size); + + if (old_in_xb) { + /* + * In xattr set, we always try to set the xe in inode first, + * so if it can be inserted into inode successfully, the old + * one will be removed from the xattr block, and this xattr + * will be inserted into inode as a new xattr in inode. + */ + if (ocfs2_xattr_can_be_in_inode(inode, xi, xis)) { + clusters_add += new_clusters; + credits += ocfs2_remove_extent_credits(inode->i_sb) + + OCFS2_INODE_UPDATE_CREDITS; + if (!ocfs2_xattr_is_local(xe)) + credits += ocfs2_calc_extend_credits( + inode->i_sb, + &def_xv.xv.xr_list, + new_clusters); + goto out; + } + } + + if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + /* the new values will be stored outside. */ + u32 old_clusters = 0; + + if (!ocfs2_xattr_is_local(xe)) { + old_clusters = ocfs2_clusters_for_bytes(inode->i_sb, + value_size); + xv = (struct ocfs2_xattr_value_root *) + (base + name_offset + name_len); + value_size = OCFS2_XATTR_ROOT_SIZE; + } else + xv = &def_xv.xv; + + if (old_clusters >= new_clusters) { + credits += ocfs2_remove_extent_credits(inode->i_sb); + goto out; + } else { + meta_add += ocfs2_extend_meta_needed(&xv->xr_list); + clusters_add += new_clusters - old_clusters; + credits += ocfs2_calc_extend_credits(inode->i_sb, + &xv->xr_list, + new_clusters - + old_clusters); + if (value_size >= OCFS2_XATTR_ROOT_SIZE) + goto out; + } + } else { + /* + * Now the new value will be stored inside. So if the new + * value is smaller than the size of value root or the old + * value, we don't need any allocation, otherwise we have + * to guess metadata allocation. + */ + if ((ocfs2_xattr_is_local(xe) && value_size >= xi->value_len) || + (!ocfs2_xattr_is_local(xe) && + OCFS2_XATTR_ROOT_SIZE >= xi->value_len)) + goto out; + } + +meta_guess: + /* calculate metadata allocation. */ + if (di->i_xattr_loc) { + if (!xbs->xattr_bh) { + ret = ocfs2_read_xattr_block(inode, + le64_to_cpu(di->i_xattr_loc), + &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + xb = (struct ocfs2_xattr_block *)bh->b_data; + } else + xb = (struct ocfs2_xattr_block *)xbs->xattr_bh->b_data; + + /* + * If there is already an xattr tree, good, we can calculate + * like other b-trees. Otherwise we may have the chance of + * create a tree, the credit calculation is borrowed from + * ocfs2_calc_extend_credits with root_el = NULL. And the + * new tree will be cluster based, so no meta is needed. + */ + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + struct ocfs2_extent_list *el = + &xb->xb_attrs.xb_root.xt_list; + meta_add += ocfs2_extend_meta_needed(el); + credits += ocfs2_calc_extend_credits(inode->i_sb, + el, 1); + } else + credits += OCFS2_SUBALLOC_ALLOC + 1; + + /* + * This cluster will be used either for new bucket or for + * new xattr block. + * If the cluster size is the same as the bucket size, one + * more is needed since we may need to extend the bucket + * also. + */ + clusters_add += 1; + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + if (OCFS2_XATTR_BUCKET_SIZE == + OCFS2_SB(inode->i_sb)->s_clustersize) { + credits += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + clusters_add += 1; + } + } else { + meta_add += 1; + credits += OCFS2_XATTR_BLOCK_CREATE_CREDITS; + } +out: + if (clusters_need) + *clusters_need = clusters_add; + if (meta_need) + *meta_need = meta_add; + if (credits_need) + *credits_need = credits; + brelse(bh); + return ret; +} + +static int ocfs2_init_xattr_set_ctxt(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_xattr_set_ctxt *ctxt, + int *credits) +{ + int clusters_add, meta_add, ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + memset(ctxt, 0, sizeof(struct ocfs2_xattr_set_ctxt)); + + ocfs2_init_dealloc_ctxt(&ctxt->dealloc); + + ret = ocfs2_calc_xattr_set_need(inode, di, xi, xis, xbs, + &clusters_add, &meta_add, credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + mlog(0, "Set xattr %s, reserve meta blocks = %d, clusters = %d, " + "credits = %d\n", xi->name, meta_add, clusters_add, *credits); + + if (meta_add) { + ret = ocfs2_reserve_new_metadata_blocks(osb, meta_add, + &ctxt->meta_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (clusters_add) { + ret = ocfs2_reserve_clusters(osb, clusters_add, &ctxt->data_ac); + if (ret) + mlog_errno(ret); + } +out: + if (ret) { + if (ctxt->meta_ac) { + ocfs2_free_alloc_context(ctxt->meta_ac); + ctxt->meta_ac = NULL; + } + + /* + * We cannot have an error and a non null ctxt->data_ac. + */ + } + + return ret; +} + +static int __ocfs2_xattr_set_handle(struct inode *inode, + struct ocfs2_dinode *di, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xis, + struct ocfs2_xattr_search *xbs, + struct ocfs2_xattr_set_ctxt *ctxt) +{ + int ret = 0, credits, old_found; + + if (!xi->value) { + /* Remove existing extended attribute */ + if (!xis->not_found) + ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); + else if (!xbs->not_found) + ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); + } else { + /* We always try to set extended attribute into inode first*/ + ret = ocfs2_xattr_ibody_set(inode, xi, xis, ctxt); + if (!ret && !xbs->not_found) { + /* + * If succeed and that extended attribute existing in + * external block, then we will remove it. + */ + xi->value = NULL; + xi->value_len = 0; + + old_found = xis->not_found; + xis->not_found = -ENODATA; + ret = ocfs2_calc_xattr_set_need(inode, + di, + xi, + xis, + xbs, + NULL, + NULL, + &credits); + xis->not_found = old_found; + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extend_trans(ctxt->handle, credits + + ctxt->handle->h_buffer_credits); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); + } else if (ret == -ENOSPC) { + if (di->i_xattr_loc && !xbs->xattr_bh) { + ret = ocfs2_xattr_block_find(inode, + xi->name_index, + xi->name, xbs); + if (ret) + goto out; + + old_found = xis->not_found; + xis->not_found = -ENODATA; + ret = ocfs2_calc_xattr_set_need(inode, + di, + xi, + xis, + xbs, + NULL, + NULL, + &credits); + xis->not_found = old_found; + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extend_trans(ctxt->handle, credits + + ctxt->handle->h_buffer_credits); + if (ret) { + mlog_errno(ret); + goto out; + } + } + /* + * If no space in inode, we will set extended attribute + * into external block. + */ + ret = ocfs2_xattr_block_set(inode, xi, xbs, ctxt); + if (ret) + goto out; + if (!xis->not_found) { + /* + * If succeed and that extended attribute + * existing in inode, we will remove it. + */ + xi->value = NULL; + xi->value_len = 0; + xbs->not_found = -ENODATA; + ret = ocfs2_calc_xattr_set_need(inode, + di, + xi, + xis, + xbs, + NULL, + NULL, + &credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extend_trans(ctxt->handle, credits + + ctxt->handle->h_buffer_credits); + if (ret) { + mlog_errno(ret); + goto out; + } + ret = ocfs2_xattr_ibody_set(inode, xi, + xis, ctxt); + } + } + } + + if (!ret) { + /* Update inode ctime. */ + ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + inode->i_ctime = CURRENT_TIME; + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + ocfs2_journal_dirty(ctxt->handle, xis->inode_bh); + } +out: + return ret; +} + +/* + * This function only called duing creating inode + * for init security/acl xattrs of the new inode. + * All transanction credits have been reserved in mknod. + */ +int ocfs2_xattr_set_handle(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + int name_index, + const char *name, + const void *value, + size_t value_len, + int flags, + struct ocfs2_alloc_context *meta_ac, + struct ocfs2_alloc_context *data_ac) +{ + struct ocfs2_dinode *di; + int ret; + + struct ocfs2_xattr_info xi = { + .name_index = name_index, + .name = name, + .value = value, + .value_len = value_len, + }; + + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + + struct ocfs2_xattr_set_ctxt ctxt = { + .handle = handle, + .meta_ac = meta_ac, + .data_ac = data_ac, + }; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + /* + * In extreme situation, may need xattr bucket when + * block size is too small. And we have already reserved + * the credits for bucket in mknod. + */ + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) { + xbs.bucket = ocfs2_xattr_bucket_new(inode); + if (!xbs.bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + } + + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&OCFS2_I(inode)->ip_xattr_sem); + + ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis); + if (ret) + goto cleanup; + if (xis.not_found) { + ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs); + if (ret) + goto cleanup; + } + + ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); + +cleanup: + up_write(&OCFS2_I(inode)->ip_xattr_sem); + brelse(xbs.xattr_bh); + ocfs2_xattr_bucket_free(xbs.bucket); + + return ret; +} + /* * ocfs2_xattr_set() * @@ -1928,8 +2704,10 @@ int ocfs2_xattr_set(struct inode *inode, { struct buffer_head *di_bh = NULL; struct ocfs2_dinode *di; - int ret; - u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + struct ocfs2_xattr_set_ctxt ctxt = { NULL, NULL, }; struct ocfs2_xattr_info xi = { .name_index = name_index, @@ -1949,10 +2727,20 @@ int ocfs2_xattr_set(struct inode *inode, if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) return -EOPNOTSUPP; + /* + * Only xbs will be used on indexed trees. xis doesn't need a + * bucket. + */ + xbs.bucket = ocfs2_xattr_bucket_new(inode); + if (!xbs.bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } + ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret < 0) { mlog_errno(ret); - return ret; + goto cleanup_nolock; } xis.inode_bh = xbs.inode_bh = di_bh; di = (struct ocfs2_dinode *)di_bh->b_data; @@ -1984,55 +2772,53 @@ int ocfs2_xattr_set(struct inode *inode, goto cleanup; } - if (!value) { - /* Remove existing extended attribute */ - if (!xis.not_found) - ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); - else if (!xbs.not_found) - ret = ocfs2_xattr_block_set(inode, &xi, &xbs); - } else { - /* We always try to set extended attribute into inode first*/ - ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); - if (!ret && !xbs.not_found) { - /* - * If succeed and that extended attribute existing in - * external block, then we will remove it. - */ - xi.value = NULL; - xi.value_len = 0; - ret = ocfs2_xattr_block_set(inode, &xi, &xbs); - } else if (ret == -ENOSPC) { - if (di->i_xattr_loc && !xbs.xattr_bh) { - ret = ocfs2_xattr_block_find(inode, name_index, - name, &xbs); - if (ret) - goto cleanup; - } - /* - * If no space in inode, we will set extended attribute - * into external block. - */ - ret = ocfs2_xattr_block_set(inode, &xi, &xbs); - if (ret) - goto cleanup; - if (!xis.not_found) { - /* - * If succeed and that extended attribute - * existing in inode, we will remove it. - */ - xi.value = NULL; - xi.value_len = 0; - ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); - } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mutex_unlock(&tl_inode->i_mutex); + mlog_errno(ret); + goto cleanup; } } + mutex_unlock(&tl_inode->i_mutex); + + ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, + &xbs, &ctxt, &credits); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + + /* we need to update inode's ctime field, so add credit for it. */ + credits += OCFS2_INODE_UPDATE_CREDITS; + ctxt.handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + goto cleanup; + } + + ret = __ocfs2_xattr_set_handle(inode, di, &xi, &xis, &xbs, &ctxt); + + ocfs2_commit_trans(osb, ctxt.handle); + + if (ctxt.data_ac) + ocfs2_free_alloc_context(ctxt.data_ac); + if (ctxt.meta_ac) + ocfs2_free_alloc_context(ctxt.meta_ac); + if (ocfs2_dealloc_has_cluster(&ctxt.dealloc)) + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &ctxt.dealloc); cleanup: up_write(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock(inode, 1); +cleanup_nolock: brelse(di_bh); brelse(xbs.xattr_bh); - for (i = 0; i < blk_per_bucket; i++) - brelse(xbs.bucket.bhs[i]); + ocfs2_xattr_bucket_free(xbs.bucket); return ret; } @@ -2107,7 +2893,7 @@ typedef int (xattr_bucket_func)(struct inode *inode, void *para); static int ocfs2_find_xe_in_bucket(struct inode *inode, - struct buffer_head *header_bh, + struct ocfs2_xattr_bucket *bucket, int name_index, const char *name, u32 name_hash, @@ -2115,11 +2901,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode, int *found) { int i, ret = 0, cmp = 1, block_off, new_offset; - struct ocfs2_xattr_header *xh = - (struct ocfs2_xattr_header *)header_bh->b_data; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); size_t name_len = strlen(name); struct ocfs2_xattr_entry *xe = NULL; - struct buffer_head *name_bh = NULL; char *xe_name; /* @@ -2150,19 +2934,9 @@ static int ocfs2_find_xe_in_bucket(struct inode *inode, break; } - ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off, - &name_bh); - if (ret) { - mlog_errno(ret); - break; - } - xe_name = name_bh->b_data + new_offset; - cmp = memcmp(name, xe_name, name_len); - brelse(name_bh); - name_bh = NULL; - - if (cmp == 0) { + xe_name = bucket_block(bucket, block_off) + new_offset; + if (!memcmp(name, xe_name, name_len)) { *xe_index = i; *found = 1; ret = 0; @@ -2192,39 +2966,42 @@ static int ocfs2_xattr_bucket_find(struct inode *inode, struct ocfs2_xattr_search *xs) { int ret, found = 0; - struct buffer_head *bh = NULL; - struct buffer_head *lower_bh = NULL; struct ocfs2_xattr_header *xh = NULL; struct ocfs2_xattr_entry *xe = NULL; u16 index = 0; u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); int low_bucket = 0, bucket, high_bucket; + struct ocfs2_xattr_bucket *search; u32 last_hash; - u64 blkno; + u64 blkno, lower_blkno = 0; - ret = ocfs2_read_block(inode, p_blkno, &bh); + search = ocfs2_xattr_bucket_new(inode); + if (!search) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_xattr_bucket(search, p_blkno); if (ret) { mlog_errno(ret); goto out; } - xh = (struct ocfs2_xattr_header *)bh->b_data; + xh = bucket_xh(search); high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1; - while (low_bucket <= high_bucket) { - brelse(bh); - bh = NULL; - bucket = (low_bucket + high_bucket) / 2; + ocfs2_xattr_bucket_relse(search); + bucket = (low_bucket + high_bucket) / 2; blkno = p_blkno + bucket * blk_per_bucket; - - ret = ocfs2_read_block(inode, blkno, &bh); + ret = ocfs2_read_xattr_bucket(search, blkno); if (ret) { mlog_errno(ret); goto out; } - xh = (struct ocfs2_xattr_header *)bh->b_data; + xh = bucket_xh(search); xe = &xh->xh_entries[0]; if (name_hash < le32_to_cpu(xe->xe_name_hash)) { high_bucket = bucket - 1; @@ -2241,10 +3018,8 @@ static int ocfs2_xattr_bucket_find(struct inode *inode, last_hash = le32_to_cpu(xe->xe_name_hash); - /* record lower_bh which may be the insert place. */ - brelse(lower_bh); - lower_bh = bh; - bh = NULL; + /* record lower_blkno which may be the insert place. */ + lower_blkno = blkno; if (name_hash > le32_to_cpu(xe->xe_name_hash)) { low_bucket = bucket + 1; @@ -2252,7 +3027,7 @@ static int ocfs2_xattr_bucket_find(struct inode *inode, } /* the searched xattr should reside in this bucket if exists. */ - ret = ocfs2_find_xe_in_bucket(inode, lower_bh, + ret = ocfs2_find_xe_in_bucket(inode, search, name_index, name, name_hash, &index, &found); if (ret) { @@ -2267,46 +3042,29 @@ static int ocfs2_xattr_bucket_find(struct inode *inode, * When the xattr's hash value is in the gap of 2 buckets, we will * always set it to the previous bucket. */ - if (!lower_bh) { - /* - * We can't find any bucket whose first name_hash is less - * than the find name_hash. - */ - BUG_ON(bh->b_blocknr != p_blkno); - lower_bh = bh; - bh = NULL; + if (!lower_blkno) + lower_blkno = p_blkno; + + /* This should be in cache - we just read it during the search */ + ret = ocfs2_read_xattr_bucket(xs->bucket, lower_blkno); + if (ret) { + mlog_errno(ret); + goto out; } - xs->bucket.bhs[0] = lower_bh; - xs->bucket.xh = (struct ocfs2_xattr_header *) - xs->bucket.bhs[0]->b_data; - lower_bh = NULL; - xs->header = xs->bucket.xh; - xs->base = xs->bucket.bhs[0]->b_data; + xs->header = bucket_xh(xs->bucket); + xs->base = bucket_block(xs->bucket, 0); xs->end = xs->base + inode->i_sb->s_blocksize; if (found) { - /* - * If we have found the xattr enty, read all the blocks in - * this bucket. - */ - ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1, - blk_per_bucket - 1, &xs->bucket.bhs[1], - 0); - if (ret) { - mlog_errno(ret); - goto out; - } - xs->here = &xs->header->xh_entries[index]; mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name, - (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index); + (unsigned long long)bucket_blkno(xs->bucket), index); } else ret = -ENODATA; out: - brelse(bh); - brelse(lower_bh); + ocfs2_xattr_bucket_free(search); return ret; } @@ -2357,53 +3115,50 @@ static int ocfs2_iterate_xattr_buckets(struct inode *inode, xattr_bucket_func *func, void *para) { - int i, j, ret = 0; - int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int i, ret = 0; u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); u32 num_buckets = clusters * bpc; - struct ocfs2_xattr_bucket bucket; + struct ocfs2_xattr_bucket *bucket; - memset(&bucket, 0, sizeof(bucket)); + bucket = ocfs2_xattr_bucket_new(inode); + if (!bucket) { + mlog_errno(-ENOMEM); + return -ENOMEM; + } mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n", clusters, (unsigned long long)blkno); - for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) { - ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, - bucket.bhs, 0); + for (i = 0; i < num_buckets; i++, blkno += bucket->bu_blocks) { + ret = ocfs2_read_xattr_bucket(bucket, blkno); if (ret) { mlog_errno(ret); - goto out; + break; } - bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data; /* * The real bucket num in this series of blocks is stored * in the 1st bucket. */ if (i == 0) - num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets); + num_buckets = le16_to_cpu(bucket_xh(bucket)->xh_num_buckets); mlog(0, "iterating xattr bucket %llu, first hash %u\n", (unsigned long long)blkno, - le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash)); + le32_to_cpu(bucket_xh(bucket)->xh_entries[0].xe_name_hash)); if (func) { - ret = func(inode, &bucket, para); - if (ret) { + ret = func(inode, bucket, para); + if (ret) mlog_errno(ret); - break; - } + /* Fall through to bucket_relse() */ } - for (j = 0; j < blk_per_bucket; j++) - brelse(bucket.bhs[j]); - memset(&bucket, 0, sizeof(bucket)); + ocfs2_xattr_bucket_relse(bucket); + if (ret) + break; } -out: - for (j = 0; j < blk_per_bucket; j++) - brelse(bucket.bhs[j]); - + ocfs2_xattr_bucket_free(bucket); return ret; } @@ -2441,21 +3196,21 @@ static int ocfs2_list_xattr_bucket(struct inode *inode, int i, block_off, new_offset; const char *prefix, *name; - for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) { - struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i]; + for (i = 0 ; i < le16_to_cpu(bucket_xh(bucket)->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &bucket_xh(bucket)->xh_entries[i]; type = ocfs2_xattr_get_type(entry); prefix = ocfs2_xattr_prefix(type); if (prefix) { ret = ocfs2_xattr_bucket_get_name_value(inode, - bucket->xh, + bucket_xh(bucket), i, &block_off, &new_offset); if (ret) break; - name = (const char *)bucket->bhs[block_off]->b_data + + name = (const char *)bucket_block(bucket, block_off) + new_offset; ret = ocfs2_xattr_list_entry(xl->buffer, xl->buffer_size, @@ -2540,32 +3295,34 @@ static void swap_xe(void *a, void *b, int size) /* * When the ocfs2_xattr_block is filled up, new bucket will be created * and all the xattr entries will be moved to the new bucket. + * The header goes at the start of the bucket, and the names+values are + * filled from the end. This is why *target starts as the last buffer. * Note: we need to sort the entries since they are not saved in order * in the ocfs2_xattr_block. */ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, struct buffer_head *xb_bh, - struct buffer_head *xh_bh, - struct buffer_head *data_bh) + struct ocfs2_xattr_bucket *bucket) { int i, blocksize = inode->i_sb->s_blocksize; + int blks = ocfs2_blocks_per_xattr_bucket(inode->i_sb); u16 offset, size, off_change; struct ocfs2_xattr_entry *xe; struct ocfs2_xattr_block *xb = (struct ocfs2_xattr_block *)xb_bh->b_data; struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header; - struct ocfs2_xattr_header *xh = - (struct ocfs2_xattr_header *)xh_bh->b_data; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); u16 count = le16_to_cpu(xb_xh->xh_count); - char *target = xh_bh->b_data, *src = xb_bh->b_data; + char *src = xb_bh->b_data; + char *target = bucket_block(bucket, blks - 1); mlog(0, "cp xattr from block %llu to bucket %llu\n", (unsigned long long)xb_bh->b_blocknr, - (unsigned long long)xh_bh->b_blocknr); + (unsigned long long)bucket_blkno(bucket)); + + for (i = 0; i < blks; i++) + memset(bucket_block(bucket, i), 0, blocksize); - memset(xh_bh->b_data, 0, blocksize); - if (data_bh) - memset(data_bh->b_data, 0, blocksize); /* * Since the xe_name_offset is based on ocfs2_xattr_header, * there is a offset change corresponding to the change of @@ -2577,8 +3334,6 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, size = blocksize - offset; /* copy all the names and values. */ - if (data_bh) - target = data_bh->b_data; memcpy(target + offset, src + offset, size); /* Init new header now. */ @@ -2588,7 +3343,7 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size); /* copy all the entries. */ - target = xh_bh->b_data; + target = bucket_block(bucket, 0); offset = offsetof(struct ocfs2_xattr_header, xh_entries); size = count * sizeof(struct ocfs2_xattr_entry); memcpy(target + offset, (char *)xb_xh + offset, size); @@ -2614,73 +3369,47 @@ static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, * While if the entry is in index b-tree, "bucket" indicates the * real place of the xattr. */ -static int ocfs2_xattr_update_xattr_search(struct inode *inode, - struct ocfs2_xattr_search *xs, - struct buffer_head *old_bh, - struct buffer_head *new_bh) +static void ocfs2_xattr_update_xattr_search(struct inode *inode, + struct ocfs2_xattr_search *xs, + struct buffer_head *old_bh) { - int ret = 0; char *buf = old_bh->b_data; struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf; struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header; - int i, blocksize = inode->i_sb->s_blocksize; - u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); - - xs->bucket.bhs[0] = new_bh; - get_bh(new_bh); - xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data; - xs->header = xs->bucket.xh; + int i; - xs->base = new_bh->b_data; + xs->header = bucket_xh(xs->bucket); + xs->base = bucket_block(xs->bucket, 0); xs->end = xs->base + inode->i_sb->s_blocksize; - if (!xs->not_found) { - if (OCFS2_XATTR_BUCKET_SIZE != blocksize) { - ret = ocfs2_read_blocks(inode, - xs->bucket.bhs[0]->b_blocknr + 1, - blk_per_bucket - 1, &xs->bucket.bhs[1], - 0); - if (ret) { - mlog_errno(ret); - return ret; - } - - } - i = xs->here - old_xh->xh_entries; - xs->here = &xs->header->xh_entries[i]; - } + if (xs->not_found) + return; - return ret; + i = xs->here - old_xh->xh_entries; + xs->here = &xs->header->xh_entries[i]; } static int ocfs2_xattr_create_index_block(struct inode *inode, - struct ocfs2_xattr_search *xs) + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) { - int ret, credits = OCFS2_SUBALLOC_ALLOC; + int ret; u32 bit_off, len; u64 blkno; - handle_t *handle; + handle_t *handle = ctxt->handle; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_inode_info *oi = OCFS2_I(inode); - struct ocfs2_alloc_context *data_ac; - struct buffer_head *xh_bh = NULL, *data_bh = NULL; struct buffer_head *xb_bh = xs->xattr_bh; struct ocfs2_xattr_block *xb = (struct ocfs2_xattr_block *)xb_bh->b_data; struct ocfs2_xattr_tree_root *xr; u16 xb_flags = le16_to_cpu(xb->xb_flags); - u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb); mlog(0, "create xattr index block for %llu\n", (unsigned long long)xb_bh->b_blocknr); BUG_ON(xb_flags & OCFS2_XATTR_INDEXED); - - ret = ocfs2_reserve_clusters(osb, 1, &data_ac); - if (ret) { - mlog_errno(ret); - goto out; - } + BUG_ON(!xs->bucket); /* * XXX: @@ -2689,29 +3418,18 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, */ down_write(&oi->ip_alloc_sem); - /* - * 3 more credits, one for xattr block update, one for the 1st block - * of the new xattr bucket and one for the value/data. - */ - credits += 3; - handle = ocfs2_start_trans(osb, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out_sem; - } - - ret = ocfs2_journal_access(handle, inode, xb_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_xb(handle, inode, xb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } - ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); + ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, + 1, 1, &bit_off, &len); if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } /* @@ -2724,51 +3442,23 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, mlog(0, "allocate 1 cluster from %llu to xattr block\n", (unsigned long long)blkno); - xh_bh = sb_getblk(inode->i_sb, blkno); - if (!xh_bh) { - ret = -EIO; + ret = ocfs2_init_xattr_bucket(xs->bucket, blkno); + if (ret) { mlog_errno(ret); - goto out_commit; + goto out; } - ocfs2_set_new_buffer_uptodate(inode, xh_bh); - - ret = ocfs2_journal_access(handle, inode, xh_bh, - OCFS2_JOURNAL_ACCESS_CREATE); + ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket, + OCFS2_JOURNAL_ACCESS_CREATE); if (ret) { mlog_errno(ret); - goto out_commit; - } - - if (bpb > 1) { - data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1); - if (!data_bh) { - ret = -EIO; - mlog_errno(ret); - goto out_commit; - } - - ocfs2_set_new_buffer_uptodate(inode, data_bh); - - ret = ocfs2_journal_access(handle, inode, data_bh, - OCFS2_JOURNAL_ACCESS_CREATE); - if (ret) { - mlog_errno(ret); - goto out_commit; - } + goto out; } - ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh); + ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xs->bucket); + ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket); - ocfs2_journal_dirty(handle, xh_bh); - if (data_bh) - ocfs2_journal_dirty(handle, data_bh); - - ret = ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; - } + ocfs2_xattr_update_xattr_search(inode, xs, xb_bh); /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */ memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - @@ -2787,24 +3477,10 @@ static int ocfs2_xattr_create_index_block(struct inode *inode, xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED); - ret = ocfs2_journal_dirty(handle, xb_bh); - if (ret) { - mlog_errno(ret); - goto out_commit; - } - -out_commit: - ocfs2_commit_trans(osb, handle); - -out_sem: - up_write(&oi->ip_alloc_sem); + ocfs2_journal_dirty(handle, xb_bh); out: - if (data_ac) - ocfs2_free_alloc_context(data_ac); - - brelse(xh_bh); - brelse(data_bh); + up_write(&oi->ip_alloc_sem); return ret; } @@ -2829,29 +3505,18 @@ static int cmp_xe_offset(const void *a, const void *b) * so that we can spare some space for insertion. */ static int ocfs2_defrag_xattr_bucket(struct inode *inode, + handle_t *handle, struct ocfs2_xattr_bucket *bucket) { int ret, i; size_t end, offset, len, value_len; struct ocfs2_xattr_header *xh; char *entries, *buf, *bucket_buf = NULL; - u64 blkno = bucket->bhs[0]->b_blocknr; - u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u64 blkno = bucket_blkno(bucket); u16 xh_free_start; size_t blocksize = inode->i_sb->s_blocksize; - handle_t *handle; - struct buffer_head **bhs; struct ocfs2_xattr_entry *xe; - bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, - GFP_NOFS); - if (!bhs) - return -ENOMEM; - - ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0); - if (ret) - goto out; - /* * In order to make the operation more efficient and generic, * we copy all the blocks into a contiguous memory and do the @@ -2865,26 +3530,16 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode, } buf = bucket_buf; - for (i = 0; i < blk_per_bucket; i++, buf += blocksize) - memcpy(buf, bhs[i]->b_data, blocksize); + for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize) + memcpy(buf, bucket_block(bucket, i), blocksize); - handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; + ret = ocfs2_xattr_bucket_journal_access(handle, bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { mlog_errno(ret); goto out; } - for (i = 0; i < blk_per_bucket; i++) { - ret = ocfs2_journal_access(handle, inode, bhs[i], - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto commit; - } - } - xh = (struct ocfs2_xattr_header *)bucket_buf; entries = (char *)xh->xh_entries; xh_free_start = le16_to_cpu(xh->xh_free_start); @@ -2940,7 +3595,7 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode, "bucket %llu\n", (unsigned long long)blkno); if (xh_free_start == end) - goto commit; + goto out; memset(bucket_buf + xh_free_start, 0, end - xh_free_start); xh->xh_free_start = cpu_to_le16(end); @@ -2951,169 +3606,94 @@ static int ocfs2_defrag_xattr_bucket(struct inode *inode, cmp_xe, swap_xe); buf = bucket_buf; - for (i = 0; i < blk_per_bucket; i++, buf += blocksize) { - memcpy(bhs[i]->b_data, buf, blocksize); - ocfs2_journal_dirty(handle, bhs[i]); - } + for (i = 0; i < bucket->bu_blocks; i++, buf += blocksize) + memcpy(bucket_block(bucket, i), buf, blocksize); + ocfs2_xattr_bucket_journal_dirty(handle, bucket); -commit: - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: - - if (bhs) { - for (i = 0; i < blk_per_bucket; i++) - brelse(bhs[i]); - } - kfree(bhs); - kfree(bucket_buf); return ret; } /* - * Move half nums of the xattr bucket in the previous cluster to this new - * cluster. We only touch the last cluster of the previous extend record. + * prev_blkno points to the start of an existing extent. new_blkno + * points to a newly allocated extent. Because we know each of our + * clusters contains more than bucket, we can easily split one cluster + * at a bucket boundary. So we take the last cluster of the existing + * extent and split it down the middle. We move the last half of the + * buckets in the last cluster of the existing extent over to the new + * extent. + * + * first_bh is the buffer at prev_blkno so we can update the existing + * extent's bucket count. header_bh is the bucket were we were hoping + * to insert our xattr. If the bucket move places the target in the new + * extent, we'll update first_bh and header_bh after modifying the old + * extent. * - * first_bh is the first buffer_head of a series of bucket in the same - * extent rec and header_bh is the header of one bucket in this cluster. - * They will be updated if we move the data header_bh contains to the new - * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster. + * first_hash will be set as the 1st xe's name_hash in the new extent. */ static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode, handle_t *handle, - struct buffer_head **first_bh, - struct buffer_head **header_bh, + struct ocfs2_xattr_bucket *first, + struct ocfs2_xattr_bucket *target, u64 new_blkno, - u64 prev_blkno, u32 num_clusters, u32 *first_hash) { - int i, ret, credits; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); - int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); - int blocksize = inode->i_sb->s_blocksize; - struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL; - struct ocfs2_xattr_header *new_xh; - struct ocfs2_xattr_header *xh = - (struct ocfs2_xattr_header *)((*first_bh)->b_data); - - BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets); - BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize); - - prev_bh = *first_bh; - get_bh(prev_bh); - xh = (struct ocfs2_xattr_header *)prev_bh->b_data; + int ret; + struct super_block *sb = inode->i_sb; + int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(sb); + int num_buckets = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(sb)); + int to_move = num_buckets / 2; + u64 src_blkno; + u64 last_cluster_blkno = bucket_blkno(first) + + ((num_clusters - 1) * ocfs2_clusters_to_blocks(sb, 1)); - prev_blkno += (num_clusters - 1) * bpc + bpc / 2; + BUG_ON(le16_to_cpu(bucket_xh(first)->xh_num_buckets) < num_buckets); + BUG_ON(OCFS2_XATTR_BUCKET_SIZE == OCFS2_SB(sb)->s_clustersize); mlog(0, "move half of xattrs in cluster %llu to %llu\n", - (unsigned long long)prev_blkno, (unsigned long long)new_blkno); + (unsigned long long)last_cluster_blkno, (unsigned long long)new_blkno); - /* - * We need to update the 1st half of the new cluster and - * 1 more for the update of the 1st bucket of the previous - * extent record. - */ - credits = bpc / 2 + 1; - ret = ocfs2_extend_trans(handle, credits); + ret = ocfs2_mv_xattr_buckets(inode, handle, bucket_blkno(first), + last_cluster_blkno, new_blkno, + to_move, first_hash); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, prev_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } + /* This is the first bucket that got moved */ + src_blkno = last_cluster_blkno + (to_move * blks_per_bucket); - for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) { - old_bh = new_bh = NULL; - new_bh = sb_getblk(inode->i_sb, new_blkno); - if (!new_bh) { - ret = -EIO; - mlog_errno(ret); - goto out; - } + /* + * If the target bucket was part of the moved buckets, we need to + * update first and target. + */ + if (bucket_blkno(target) >= src_blkno) { + /* Find the block for the new target bucket */ + src_blkno = new_blkno + + (bucket_blkno(target) - src_blkno); - ocfs2_set_new_buffer_uptodate(inode, new_bh); + ocfs2_xattr_bucket_relse(first); + ocfs2_xattr_bucket_relse(target); - ret = ocfs2_journal_access(handle, inode, new_bh, - OCFS2_JOURNAL_ACCESS_CREATE); - if (ret < 0) { + /* + * These shouldn't fail - the buffers are in the + * journal from ocfs2_cp_xattr_bucket(). + */ + ret = ocfs2_read_xattr_bucket(first, new_blkno); + if (ret) { mlog_errno(ret); - brelse(new_bh); goto out; } - - ret = ocfs2_read_block(inode, prev_blkno, &old_bh); - if (ret < 0) { + ret = ocfs2_read_xattr_bucket(target, src_blkno); + if (ret) mlog_errno(ret); - brelse(new_bh); - goto out; - } - memcpy(new_bh->b_data, old_bh->b_data, blocksize); - - if (i == 0) { - new_xh = (struct ocfs2_xattr_header *)new_bh->b_data; - new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2); - - if (first_hash) - *first_hash = le32_to_cpu( - new_xh->xh_entries[0].xe_name_hash); - new_first_bh = new_bh; - get_bh(new_first_bh); - } - - ocfs2_journal_dirty(handle, new_bh); - - if (*header_bh == old_bh) { - brelse(*header_bh); - *header_bh = new_bh; - get_bh(*header_bh); - - brelse(*first_bh); - *first_bh = new_first_bh; - get_bh(*first_bh); - } - brelse(new_bh); - brelse(old_bh); } - le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2)); - - ocfs2_journal_dirty(handle, prev_bh); out: - brelse(prev_bh); - brelse(new_first_bh); - return ret; -} - -static int ocfs2_read_xattr_bucket(struct inode *inode, - u64 blkno, - struct buffer_head **bhs, - int new) -{ - int ret = 0; - u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); - - if (!new) - return ocfs2_read_blocks(inode, blkno, - blk_per_bucket, bhs, 0); - - for (i = 0; i < blk_per_bucket; i++) { - bhs[i] = sb_getblk(inode->i_sb, blkno + i); - if (bhs[i] == NULL) { - ret = -EIO; - mlog_errno(ret); - break; - } - ocfs2_set_new_buffer_uptodate(inode, bhs[i]); - } - return ret; } @@ -3178,8 +3758,7 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, { int ret, i; int count, start, len, name_value_len = 0, xe_len, name_offset = 0; - u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); - struct buffer_head **s_bhs, **t_bhs = NULL; + struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; struct ocfs2_xattr_header *xh; struct ocfs2_xattr_entry *xe; int blocksize = inode->i_sb->s_blocksize; @@ -3187,47 +3766,52 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, mlog(0, "move some of xattrs from bucket %llu to %llu\n", (unsigned long long)blk, (unsigned long long)new_blk); - s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); - if (!s_bhs) - return -ENOMEM; - - ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0); - if (ret) { + s_bucket = ocfs2_xattr_bucket_new(inode); + t_bucket = ocfs2_xattr_bucket_new(inode); + if (!s_bucket || !t_bucket) { + ret = -ENOMEM; mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, s_bhs[0], - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_read_xattr_bucket(s_bucket, blk); if (ret) { mlog_errno(ret); goto out; } - t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); - if (!t_bhs) { - ret = -ENOMEM; + ret = ocfs2_xattr_bucket_journal_access(handle, s_bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); goto out; } - ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head); + /* + * Even if !new_bucket_head, we're overwriting t_bucket. Thus, + * there's no need to read it. + */ + ret = ocfs2_init_xattr_bucket(t_bucket, new_blk); if (ret) { mlog_errno(ret); goto out; } - for (i = 0; i < blk_per_bucket; i++) { - ret = ocfs2_journal_access(handle, inode, t_bhs[i], - new_bucket_head ? - OCFS2_JOURNAL_ACCESS_CREATE : - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) { - mlog_errno(ret); - goto out; - } + /* + * Hey, if we're overwriting t_bucket, what difference does + * ACCESS_CREATE vs ACCESS_WRITE make? See the comment in the + * same part of ocfs2_cp_xattr_bucket(). + */ + ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket, + new_bucket_head ? + OCFS2_JOURNAL_ACCESS_CREATE : + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; } - xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; + xh = bucket_xh(s_bucket); count = le16_to_cpu(xh->xh_count); start = ocfs2_xattr_find_divide_pos(xh); @@ -3239,10 +3823,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, * The hash value is set as one larger than * that of the last entry in the previous bucket. */ - for (i = 0; i < blk_per_bucket; i++) - memset(t_bhs[i]->b_data, 0, blocksize); + for (i = 0; i < t_bucket->bu_blocks; i++) + memset(bucket_block(t_bucket, i), 0, blocksize); - xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; + xh = bucket_xh(t_bucket); xh->xh_free_start = cpu_to_le16(blocksize); xh->xh_entries[0].xe_name_hash = xe->xe_name_hash; le32_add_cpu(&xh->xh_entries[0].xe_name_hash, 1); @@ -3251,11 +3835,10 @@ static int ocfs2_divide_xattr_bucket(struct inode *inode, } /* copy the whole bucket to the new first. */ - for (i = 0; i < blk_per_bucket; i++) - memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize); + ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket); /* update the new bucket. */ - xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; + xh = bucket_xh(t_bucket); /* * Calculate the total name/value len and xh_free_start for @@ -3319,11 +3902,7 @@ set_num_buckets: else xh->xh_num_buckets = 0; - for (i = 0; i < blk_per_bucket; i++) { - ocfs2_journal_dirty(handle, t_bhs[i]); - if (ret) - mlog_errno(ret); - } + ocfs2_xattr_bucket_journal_dirty(handle, t_bucket); /* store the first_hash of the new bucket. */ if (first_hash) @@ -3337,29 +3916,18 @@ set_num_buckets: if (start == count) goto out; - xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; + xh = bucket_xh(s_bucket); memset(&xh->xh_entries[start], 0, sizeof(struct ocfs2_xattr_entry) * (count - start)); xh->xh_count = cpu_to_le16(start); xh->xh_free_start = cpu_to_le16(name_offset); xh->xh_name_value_len = cpu_to_le16(name_value_len); - ocfs2_journal_dirty(handle, s_bhs[0]); - if (ret) - mlog_errno(ret); + ocfs2_xattr_bucket_journal_dirty(handle, s_bucket); out: - if (s_bhs) { - for (i = 0; i < blk_per_bucket; i++) - brelse(s_bhs[i]); - } - kfree(s_bhs); - - if (t_bhs) { - for (i = 0; i < blk_per_bucket; i++) - brelse(t_bhs[i]); - } - kfree(t_bhs); + ocfs2_xattr_bucket_free(s_bucket); + ocfs2_xattr_bucket_free(t_bucket); return ret; } @@ -3376,10 +3944,8 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode, u64 t_blkno, int t_is_new) { - int ret, i; - int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); - int blocksize = inode->i_sb->s_blocksize; - struct buffer_head **s_bhs, **t_bhs = NULL; + int ret; + struct ocfs2_xattr_bucket *s_bucket = NULL, *t_bucket = NULL; BUG_ON(s_blkno == t_blkno); @@ -3387,92 +3953,115 @@ static int ocfs2_cp_xattr_bucket(struct inode *inode, (unsigned long long)s_blkno, (unsigned long long)t_blkno, t_is_new); - s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, - GFP_NOFS); - if (!s_bhs) - return -ENOMEM; + s_bucket = ocfs2_xattr_bucket_new(inode); + t_bucket = ocfs2_xattr_bucket_new(inode); + if (!s_bucket || !t_bucket) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } - ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0); + ret = ocfs2_read_xattr_bucket(s_bucket, s_blkno); if (ret) goto out; - t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, - GFP_NOFS); - if (!t_bhs) { - ret = -ENOMEM; + /* + * Even if !t_is_new, we're overwriting t_bucket. Thus, + * there's no need to read it. + */ + ret = ocfs2_init_xattr_bucket(t_bucket, t_blkno); + if (ret) goto out; - } - ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new); + /* + * Hey, if we're overwriting t_bucket, what difference does + * ACCESS_CREATE vs ACCESS_WRITE make? Well, if we allocated a new + * cluster to fill, we came here from + * ocfs2_mv_xattr_buckets(), and it is really new - + * ACCESS_CREATE is required. But we also might have moved data + * out of t_bucket before extending back into it. + * ocfs2_add_new_xattr_bucket() can do this - its call to + * ocfs2_add_new_xattr_cluster() may have created a new extent + * and copied out the end of the old extent. Then it re-extends + * the old extent back to create space for new xattrs. That's + * how we get here, and the bucket isn't really new. + */ + ret = ocfs2_xattr_bucket_journal_access(handle, t_bucket, + t_is_new ? + OCFS2_JOURNAL_ACCESS_CREATE : + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) goto out; - for (i = 0; i < blk_per_bucket; i++) { - ret = ocfs2_journal_access(handle, inode, t_bhs[i], - t_is_new ? - OCFS2_JOURNAL_ACCESS_CREATE : - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret) - goto out; - } - - for (i = 0; i < blk_per_bucket; i++) { - memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize); - ocfs2_journal_dirty(handle, t_bhs[i]); - } + ocfs2_xattr_bucket_copy_data(t_bucket, s_bucket); + ocfs2_xattr_bucket_journal_dirty(handle, t_bucket); out: - if (s_bhs) { - for (i = 0; i < blk_per_bucket; i++) - brelse(s_bhs[i]); - } - kfree(s_bhs); - - if (t_bhs) { - for (i = 0; i < blk_per_bucket; i++) - brelse(t_bhs[i]); - } - kfree(t_bhs); + ocfs2_xattr_bucket_free(t_bucket); + ocfs2_xattr_bucket_free(s_bucket); return ret; } /* - * Copy one xattr cluster from src_blk to to_blk. - * The to_blk will become the first bucket header of the cluster, so its - * xh_num_buckets will be initialized as the bucket num in the cluster. + * src_blk points to the start of an existing extent. last_blk points to + * last cluster in that extent. to_blk points to a newly allocated + * extent. We copy the buckets from the cluster at last_blk to the new + * extent. If start_bucket is non-zero, we skip that many buckets before + * we start copying. The new extent's xh_num_buckets gets set to the + * number of buckets we copied. The old extent's xh_num_buckets shrinks + * by the same amount. */ -static int ocfs2_cp_xattr_cluster(struct inode *inode, - handle_t *handle, - struct buffer_head *first_bh, - u64 src_blk, - u64 to_blk, +static int ocfs2_mv_xattr_buckets(struct inode *inode, handle_t *handle, + u64 src_blk, u64 last_blk, u64 to_blk, + unsigned int start_bucket, u32 *first_hash) { int i, ret, credits; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + int blks_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); - struct buffer_head *bh = NULL; - struct ocfs2_xattr_header *xh; - u64 to_blk_start = to_blk; + struct ocfs2_xattr_bucket *old_first, *new_first; + + mlog(0, "mv xattrs from cluster %llu to %llu\n", + (unsigned long long)last_blk, (unsigned long long)to_blk); + + BUG_ON(start_bucket >= num_buckets); + if (start_bucket) { + num_buckets -= start_bucket; + last_blk += (start_bucket * blks_per_bucket); + } + + /* The first bucket of the original extent */ + old_first = ocfs2_xattr_bucket_new(inode); + /* The first bucket of the new extent */ + new_first = ocfs2_xattr_bucket_new(inode); + if (!old_first || !new_first) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } - mlog(0, "cp xattrs from cluster %llu to %llu\n", - (unsigned long long)src_blk, (unsigned long long)to_blk); + ret = ocfs2_read_xattr_bucket(old_first, src_blk); + if (ret) { + mlog_errno(ret); + goto out; + } /* - * We need to update the new cluster and 1 more for the update of - * the 1st bucket of the previous extent rec. + * We need to update the first bucket of the old extent and all + * the buckets going to the new extent. */ - credits = bpc + 1; + credits = ((num_buckets + 1) * blks_per_bucket) + + handle->h_buffer_credits; ret = ocfs2_extend_trans(handle, credits); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, first_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_xattr_bucket_journal_access(handle, old_first, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; @@ -3480,45 +4069,45 @@ static int ocfs2_cp_xattr_cluster(struct inode *inode, for (i = 0; i < num_buckets; i++) { ret = ocfs2_cp_xattr_bucket(inode, handle, - src_blk, to_blk, 1); + last_blk + (i * blks_per_bucket), + to_blk + (i * blks_per_bucket), + 1); if (ret) { mlog_errno(ret); goto out; } - - src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb); - to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb); } - /* update the old bucket header. */ - xh = (struct ocfs2_xattr_header *)first_bh->b_data; - le16_add_cpu(&xh->xh_num_buckets, -num_buckets); - - ocfs2_journal_dirty(handle, first_bh); - - /* update the new bucket header. */ - ret = ocfs2_read_block(inode, to_blk_start, &bh); - if (ret < 0) { + /* + * Get the new bucket ready before we dirty anything + * (This actually shouldn't fail, because we already dirtied + * it once in ocfs2_cp_xattr_bucket()). + */ + ret = ocfs2_read_xattr_bucket(new_first, to_blk); + if (ret) { mlog_errno(ret); goto out; } - - ret = ocfs2_journal_access(handle, inode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_xattr_bucket_journal_access(handle, new_first, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } - xh = (struct ocfs2_xattr_header *)bh->b_data; - xh->xh_num_buckets = cpu_to_le16(num_buckets); + /* Now update the headers */ + le16_add_cpu(&bucket_xh(old_first)->xh_num_buckets, -num_buckets); + ocfs2_xattr_bucket_journal_dirty(handle, old_first); - ocfs2_journal_dirty(handle, bh); + bucket_xh(new_first)->xh_num_buckets = cpu_to_le16(num_buckets); + ocfs2_xattr_bucket_journal_dirty(handle, new_first); if (first_hash) - *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); + *first_hash = le32_to_cpu(bucket_xh(new_first)->xh_entries[0].xe_name_hash); + out: - brelse(bh); + ocfs2_xattr_bucket_free(new_first); + ocfs2_xattr_bucket_free(old_first); return ret; } @@ -3534,7 +4123,7 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode, u32 *first_hash) { u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); - int ret, credits = 2 * blk_per_bucket; + int ret, credits = 2 * blk_per_bucket + handle->h_buffer_credits; BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); @@ -3577,43 +4166,49 @@ static int ocfs2_divide_xattr_cluster(struct inode *inode, */ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, handle_t *handle, - struct buffer_head **first_bh, - struct buffer_head **header_bh, + struct ocfs2_xattr_bucket *first, + struct ocfs2_xattr_bucket *target, u64 new_blk, - u64 prev_blk, u32 prev_clusters, u32 *v_start, int *extend) { - int ret = 0; - int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + int ret; mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n", - (unsigned long long)prev_blk, prev_clusters, + (unsigned long long)bucket_blkno(first), prev_clusters, (unsigned long long)new_blk); - if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) + if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) { ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, handle, - first_bh, - header_bh, + first, target, new_blk, - prev_blk, prev_clusters, v_start); - else { - u64 last_blk = prev_blk + bpc * (prev_clusters - 1); - - if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk) - ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh, - last_blk, new_blk, + if (ret) + mlog_errno(ret); + } else { + /* The start of the last cluster in the first extent */ + u64 last_blk = bucket_blkno(first) + + ((prev_clusters - 1) * + ocfs2_clusters_to_blocks(inode->i_sb, 1)); + + if (prev_clusters > 1 && bucket_blkno(target) != last_blk) { + ret = ocfs2_mv_xattr_buckets(inode, handle, + bucket_blkno(first), + last_blk, new_blk, 0, v_start); - else { + if (ret) + mlog_errno(ret); + } else { ret = ocfs2_divide_xattr_cluster(inode, handle, last_blk, new_blk, v_start); + if (ret) + mlog_errno(ret); - if ((*header_bh)->b_blocknr == last_blk && extend) + if ((bucket_blkno(target) == last_blk) && extend) *extend = 0; } } @@ -3639,56 +4234,37 @@ static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, */ static int ocfs2_add_new_xattr_cluster(struct inode *inode, struct buffer_head *root_bh, - struct buffer_head **first_bh, - struct buffer_head **header_bh, + struct ocfs2_xattr_bucket *first, + struct ocfs2_xattr_bucket *target, u32 *num_clusters, u32 prev_cpos, - u64 prev_blkno, - int *extend) + int *extend, + struct ocfs2_xattr_set_ctxt *ctxt) { - int ret, credits; + int ret; u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); u32 prev_clusters = *num_clusters; u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0; u64 block; - handle_t *handle = NULL; - struct ocfs2_alloc_context *data_ac = NULL; - struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle = ctxt->handle; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_extent_tree et; mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, " "previous xattr blkno = %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, - prev_cpos, (unsigned long long)prev_blkno); + prev_cpos, (unsigned long long)bucket_blkno(first)); ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); - ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, - &data_ac, &meta_ac); - if (ret) { - mlog_errno(ret); - goto leave; - } - - credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el, - clusters_to_add); - handle = ocfs2_start_trans(osb, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; - mlog_errno(ret); - goto leave; - } - - ret = ocfs2_journal_access(handle, inode, root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_xb(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret < 0) { mlog_errno(ret); goto leave; } - ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, + ret = __ocfs2_claim_clusters(osb, handle, ctxt->data_ac, 1, clusters_to_add, &bit_off, &num_bits); if (ret < 0) { if (ret != -ENOSPC) @@ -3702,7 +4278,7 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n", num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); - if (prev_blkno + prev_clusters * bpc == block && + if (bucket_blkno(first) + (prev_clusters * bpc) == block && (prev_clusters + num_bits) << osb->s_clustersize_bits <= OCFS2_MAX_XATTR_TREE_LEAF_SIZE) { /* @@ -3721,10 +4297,9 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, } else { ret = ocfs2_adjust_xattr_cross_cluster(inode, handle, - first_bh, - header_bh, + first, + target, block, - prev_blkno, prev_clusters, &v_start, extend); @@ -3734,149 +4309,137 @@ static int ocfs2_add_new_xattr_cluster(struct inode *inode, } } - if (handle->h_buffer_credits < credits) { - /* - * The journal has been restarted before, and don't - * have enough space for the insertion, so extend it - * here. - */ - ret = ocfs2_extend_trans(handle, credits); - if (ret) { - mlog_errno(ret); - goto leave; - } - } mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", num_bits, (unsigned long long)block, v_start); ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block, - num_bits, 0, meta_ac); + num_bits, 0, ctxt->meta_ac); if (ret < 0) { mlog_errno(ret); goto leave; } ret = ocfs2_journal_dirty(handle, root_bh); - if (ret < 0) { + if (ret < 0) mlog_errno(ret); - goto leave; - } leave: - if (handle) - ocfs2_commit_trans(osb, handle); - if (data_ac) - ocfs2_free_alloc_context(data_ac); - if (meta_ac) - ocfs2_free_alloc_context(meta_ac); - return ret; } /* - * Extend a new xattr bucket and move xattrs to the end one by one until - * We meet with start_bh. Only move half of the xattrs to the bucket after it. + * We are given an extent. 'first' is the bucket at the very front of + * the extent. The extent has space for an additional bucket past + * bucket_xh(first)->xh_num_buckets. 'target_blkno' is the block number + * of the target bucket. We wish to shift every bucket past the target + * down one, filling in that additional space. When we get back to the + * target, we split the target between itself and the now-empty bucket + * at target+1 (aka, target_blkno + blks_per_bucket). */ static int ocfs2_extend_xattr_bucket(struct inode *inode, - struct buffer_head *first_bh, - struct buffer_head *start_bh, + handle_t *handle, + struct ocfs2_xattr_bucket *first, + u64 target_blk, u32 num_clusters) { int ret, credits; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); - u64 start_blk = start_bh->b_blocknr, end_blk; - u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb); - handle_t *handle; - struct ocfs2_xattr_header *first_xh = - (struct ocfs2_xattr_header *)first_bh->b_data; - u16 bucket = le16_to_cpu(first_xh->xh_num_buckets); + u64 end_blk; + u16 new_bucket = le16_to_cpu(bucket_xh(first)->xh_num_buckets); mlog(0, "extend xattr bucket in %llu, xattr extend rec starting " - "from %llu, len = %u\n", (unsigned long long)start_blk, - (unsigned long long)first_bh->b_blocknr, num_clusters); + "from %llu, len = %u\n", (unsigned long long)target_blk, + (unsigned long long)bucket_blkno(first), num_clusters); - BUG_ON(bucket >= num_buckets); + /* The extent must have room for an additional bucket */ + BUG_ON(new_bucket >= + (num_clusters * ocfs2_xattr_buckets_per_cluster(osb))); - end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket; + /* end_blk points to the last existing bucket */ + end_blk = bucket_blkno(first) + ((new_bucket - 1) * blk_per_bucket); /* - * We will touch all the buckets after the start_bh(include it). - * Add one more bucket and modify the first_bh. + * end_blk is the start of the last existing bucket. + * Thus, (end_blk - target_blk) covers the target bucket and + * every bucket after it up to, but not including, the last + * existing bucket. Then we add the last existing bucket, the + * new bucket, and the first bucket (3 * blk_per_bucket). */ - credits = end_blk - start_blk + 2 * blk_per_bucket + 1; - handle = ocfs2_start_trans(osb, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; + credits = (end_blk - target_blk) + (3 * blk_per_bucket) + + handle->h_buffer_credits; + ret = ocfs2_extend_trans(handle, credits); + if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, first_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_xattr_bucket_journal_access(handle, first, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto commit; + goto out; } - while (end_blk != start_blk) { + while (end_blk != target_blk) { ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk, end_blk + blk_per_bucket, 0); if (ret) - goto commit; + goto out; end_blk -= blk_per_bucket; } - /* Move half of the xattr in start_blk to the next bucket. */ - ret = ocfs2_divide_xattr_bucket(inode, handle, start_blk, - start_blk + blk_per_bucket, NULL, 0); + /* Move half of the xattr in target_blkno to the next bucket. */ + ret = ocfs2_divide_xattr_bucket(inode, handle, target_blk, + target_blk + blk_per_bucket, NULL, 0); - le16_add_cpu(&first_xh->xh_num_buckets, 1); - ocfs2_journal_dirty(handle, first_bh); + le16_add_cpu(&bucket_xh(first)->xh_num_buckets, 1); + ocfs2_xattr_bucket_journal_dirty(handle, first); -commit: - ocfs2_commit_trans(osb, handle); out: return ret; } /* - * Add new xattr bucket in an extent record and adjust the buckets accordingly. - * xb_bh is the ocfs2_xattr_block. - * We will move all the buckets starting from header_bh to the next place. As - * for this one, half num of its xattrs will be moved to the next one. + * Add new xattr bucket in an extent record and adjust the buckets + * accordingly. xb_bh is the ocfs2_xattr_block, and target is the + * bucket we want to insert into. + * + * In the easy case, we will move all the buckets after target down by + * one. Half of target's xattrs will be moved to the next bucket. * - * We will allocate a new cluster if current cluster is full and adjust - * header_bh and first_bh if the insert place is moved to the new cluster. + * If current cluster is full, we'll allocate a new one. This may not + * be contiguous. The underlying calls will make sure that there is + * space for the insert, shifting buckets around if necessary. + * 'target' may be moved by those calls. */ static int ocfs2_add_new_xattr_bucket(struct inode *inode, struct buffer_head *xb_bh, - struct buffer_head *header_bh) + struct ocfs2_xattr_bucket *target, + struct ocfs2_xattr_set_ctxt *ctxt) { - struct ocfs2_xattr_header *first_xh = NULL; - struct buffer_head *first_bh = NULL; struct ocfs2_xattr_block *xb = (struct ocfs2_xattr_block *)xb_bh->b_data; struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; struct ocfs2_extent_list *el = &xb_root->xt_list; - struct ocfs2_xattr_header *xh = - (struct ocfs2_xattr_header *)header_bh->b_data; - u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); - struct super_block *sb = inode->i_sb; - struct ocfs2_super *osb = OCFS2_SB(sb); + u32 name_hash = + le32_to_cpu(bucket_xh(target)->xh_entries[0].xe_name_hash); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); int ret, num_buckets, extend = 1; u64 p_blkno; u32 e_cpos, num_clusters; + /* The bucket at the front of the extent */ + struct ocfs2_xattr_bucket *first; - mlog(0, "Add new xattr bucket starting form %llu\n", - (unsigned long long)header_bh->b_blocknr); + mlog(0, "Add new xattr bucket starting from %llu\n", + (unsigned long long)bucket_blkno(target)); - /* - * Add refrence for header_bh here because it may be - * changed in ocfs2_add_new_xattr_cluster and we need - * to free it in the end. - */ - get_bh(header_bh); + /* The first bucket of the original extent */ + first = ocfs2_xattr_bucket_new(inode); + if (!first) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos, &num_clusters, el); @@ -3885,40 +4448,45 @@ static int ocfs2_add_new_xattr_bucket(struct inode *inode, goto out; } - ret = ocfs2_read_block(inode, p_blkno, &first_bh); + ret = ocfs2_read_xattr_bucket(first, p_blkno); if (ret) { mlog_errno(ret); goto out; } num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters; - first_xh = (struct ocfs2_xattr_header *)first_bh->b_data; - - if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) { + if (num_buckets == le16_to_cpu(bucket_xh(first)->xh_num_buckets)) { + /* + * This can move first+target if the target bucket moves + * to the new extent. + */ ret = ocfs2_add_new_xattr_cluster(inode, xb_bh, - &first_bh, - &header_bh, + first, + target, &num_clusters, e_cpos, - p_blkno, - &extend); + &extend, + ctxt); if (ret) { mlog_errno(ret); goto out; } } - if (extend) + if (extend) { ret = ocfs2_extend_xattr_bucket(inode, - first_bh, - header_bh, + ctxt->handle, + first, + bucket_blkno(target), num_clusters); - if (ret) - mlog_errno(ret); + if (ret) + mlog_errno(ret); + } + out: - brelse(first_bh); - brelse(header_bh); + ocfs2_xattr_bucket_free(first); + return ret; } @@ -3929,7 +4497,7 @@ static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode, int block_off = offs >> inode->i_sb->s_blocksize_bits; offs = offs % inode->i_sb->s_blocksize; - return bucket->bhs[block_off]->b_data + offs; + return bucket_block(bucket, block_off) + offs; } /* @@ -3984,7 +4552,7 @@ static void ocfs2_xattr_set_entry_normal(struct inode *inode, xe->xe_value_size = 0; val = ocfs2_xattr_bucket_get_val(inode, - &xs->bucket, offs); + xs->bucket, offs); memset(val + OCFS2_XATTR_SIZE(name_len), 0, size - OCFS2_XATTR_SIZE(name_len)); if (OCFS2_XATTR_SIZE(xi->value_len) > 0) @@ -4062,8 +4630,7 @@ set_new_name_value: xh->xh_free_start = cpu_to_le16(offs); } - val = ocfs2_xattr_bucket_get_val(inode, - &xs->bucket, offs - size); + val = ocfs2_xattr_bucket_get_val(inode, xs->bucket, offs - size); xe->xe_name_offset = cpu_to_le16(offs - size); memset(val, 0, size); @@ -4079,125 +4646,45 @@ set_new_name_value: return; } -static int ocfs2_xattr_bucket_handle_journal(struct inode *inode, - handle_t *handle, - struct ocfs2_xattr_search *xs, - struct buffer_head **bhs, - u16 bh_num) -{ - int ret = 0, off, block_off; - struct ocfs2_xattr_entry *xe = xs->here; - - /* - * First calculate all the blocks we should journal_access - * and journal_dirty. The first block should always be touched. - */ - ret = ocfs2_journal_dirty(handle, bhs[0]); - if (ret) - mlog_errno(ret); - - /* calc the data. */ - off = le16_to_cpu(xe->xe_name_offset); - block_off = off >> inode->i_sb->s_blocksize_bits; - ret = ocfs2_journal_dirty(handle, bhs[block_off]); - if (ret) - mlog_errno(ret); - - return ret; -} - /* * Set the xattr entry in the specified bucket. * The bucket is indicated by xs->bucket and it should have the enough * space for the xattr insertion. */ static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, + handle_t *handle, struct ocfs2_xattr_info *xi, struct ocfs2_xattr_search *xs, u32 name_hash, int local) { - int i, ret; - handle_t *handle = NULL; - u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int ret; + u64 blkno; mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n", (unsigned long)xi->value_len, xi->name_index, - (unsigned long long)xs->bucket.bhs[0]->b_blocknr); + (unsigned long long)bucket_blkno(xs->bucket)); - if (!xs->bucket.bhs[1]) { - ret = ocfs2_read_blocks(inode, - xs->bucket.bhs[0]->b_blocknr + 1, - blk_per_bucket - 1, &xs->bucket.bhs[1], - 0); + if (!xs->bucket->bu_bhs[1]) { + blkno = bucket_blkno(xs->bucket); + ocfs2_xattr_bucket_relse(xs->bucket); + ret = ocfs2_read_xattr_bucket(xs->bucket, blkno); if (ret) { mlog_errno(ret); goto out; } } - handle = ocfs2_start_trans(osb, blk_per_bucket); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; + ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { mlog_errno(ret); goto out; } - for (i = 0; i < blk_per_bucket; i++) { - ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i], - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out; - } - } - ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local); + ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket); - /*Only dirty the blocks we have touched in set xattr. */ - ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs, - xs->bucket.bhs, blk_per_bucket); - if (ret) - mlog_errno(ret); -out: - ocfs2_commit_trans(osb, handle); - - return ret; -} - -static int ocfs2_xattr_value_update_size(struct inode *inode, - struct buffer_head *xe_bh, - struct ocfs2_xattr_entry *xe, - u64 new_size) -{ - int ret; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - handle_t *handle = NULL; - - handle = ocfs2_start_trans(osb, 1); - if (IS_ERR(handle)) { - ret = -ENOMEM; - mlog_errno(ret); - goto out; - } - - ret = ocfs2_journal_access(handle, inode, xe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (ret < 0) { - mlog_errno(ret); - goto out_commit; - } - - xe->xe_value_size = cpu_to_le64(new_size); - - ret = ocfs2_journal_dirty(handle, xe_bh); - if (ret < 0) - mlog_errno(ret); - -out_commit: - ocfs2_commit_trans(osb, handle); out: return ret; } @@ -4210,18 +4697,19 @@ out: * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed. */ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, - struct buffer_head *header_bh, + struct ocfs2_xattr_bucket *bucket, int xe_off, - int len) + int len, + struct ocfs2_xattr_set_ctxt *ctxt) { int ret, offset; u64 value_blk; - struct buffer_head *value_bh = NULL; - struct ocfs2_xattr_value_root *xv; struct ocfs2_xattr_entry *xe; - struct ocfs2_xattr_header *xh = - (struct ocfs2_xattr_header *)header_bh->b_data; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); size_t blocksize = inode->i_sb->s_blocksize; + struct ocfs2_xattr_value_buf vb = { + .vb_access = ocfs2_journal_access, + }; xe = &xh->xh_entries[xe_off]; @@ -4234,49 +4722,58 @@ static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, /* We don't allow ocfs2_xattr_value to be stored in different block. */ BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize); - value_blk += header_bh->b_blocknr; - ret = ocfs2_read_block(inode, value_blk, &value_bh); - if (ret) { - mlog_errno(ret); - goto out; - } + vb.vb_bh = bucket->bu_bhs[value_blk]; + BUG_ON(!vb.vb_bh); - xv = (struct ocfs2_xattr_value_root *) - (value_bh->b_data + offset % blocksize); + vb.vb_xv = (struct ocfs2_xattr_value_root *) + (vb.vb_bh->b_data + offset % blocksize); - mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n", - xe_off, (unsigned long long)header_bh->b_blocknr, len); - ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len); + ret = ocfs2_xattr_bucket_journal_access(ctxt->handle, bucket, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; } - ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len); + /* + * From here on out we have to dirty the bucket. The generic + * value calls only modify one of the bucket's bhs, but we need + * to send the bucket at once. So if they error, they *could* have + * modified something. We have to assume they did, and dirty + * the whole bucket. This leaves us in a consistent state. + */ + mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n", + xe_off, (unsigned long long)bucket_blkno(bucket), len); + ret = ocfs2_xattr_value_truncate(inode, &vb, len, ctxt); if (ret) { mlog_errno(ret); - goto out; + goto out_dirty; } + xe->xe_value_size = cpu_to_le64(len); + +out_dirty: + ocfs2_xattr_bucket_journal_dirty(ctxt->handle, bucket); + out: - brelse(value_bh); return ret; } static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode, - struct ocfs2_xattr_search *xs, - int len) + struct ocfs2_xattr_search *xs, + int len, + struct ocfs2_xattr_set_ctxt *ctxt) { int ret, offset; struct ocfs2_xattr_entry *xe = xs->here; struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base; - BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe)); + BUG_ON(!xs->bucket->bu_bhs[0] || !xe || ocfs2_xattr_is_local(xe)); offset = xe - xh->xh_entries; - ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0], - offset, len); + ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket, + offset, len, ctxt); if (ret) mlog_errno(ret); @@ -4284,6 +4781,7 @@ static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode, } static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, + handle_t *handle, struct ocfs2_xattr_search *xs, char *val, int value_len) @@ -4299,7 +4797,8 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); - return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len); + return __ocfs2_xattr_set_value_outside(inode, handle, + xv, val, value_len); } static int ocfs2_rm_xattr_cluster(struct inode *inode, @@ -4343,15 +4842,15 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, } } - handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); + handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); if (IS_ERR(handle)) { ret = -ENOMEM; mlog_errno(ret); goto out; } - ret = ocfs2_journal_access(handle, inode, root_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_xb(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out_commit; @@ -4392,26 +4891,19 @@ out: } static void ocfs2_xattr_bucket_remove_xs(struct inode *inode, + handle_t *handle, struct ocfs2_xattr_search *xs) { - handle_t *handle = NULL; - struct ocfs2_xattr_header *xh = xs->bucket.xh; + struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket); struct ocfs2_xattr_entry *last = &xh->xh_entries[ le16_to_cpu(xh->xh_count) - 1]; int ret = 0; - handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - return; - } - - ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0], - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_xattr_bucket_journal_access(handle, xs->bucket, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); - goto out_commit; + return; } /* Remove the old entry. */ @@ -4420,11 +4912,7 @@ static void ocfs2_xattr_bucket_remove_xs(struct inode *inode, memset(last, 0, sizeof(struct ocfs2_xattr_entry)); le16_add_cpu(&xh->xh_count, -1); - ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]); - if (ret < 0) - mlog_errno(ret); -out_commit: - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + ocfs2_xattr_bucket_journal_dirty(handle, xs->bucket); } /* @@ -4440,7 +4928,8 @@ out_commit: */ static int ocfs2_xattr_set_in_bucket(struct inode *inode, struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs) + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) { int ret, local = 1; size_t value_len; @@ -4468,7 +4957,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, value_len = 0; ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, - value_len); + value_len, + ctxt); if (ret) goto out; @@ -4488,7 +4978,8 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, xi->value_len = OCFS2_XATTR_ROOT_SIZE; } - ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local); + ret = ocfs2_xattr_set_entry_in_bucket(inode, ctxt->handle, xi, xs, + name_hash, local); if (ret) { mlog_errno(ret); goto out; @@ -4499,7 +4990,7 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, /* allocate the space now for the outside block storage. */ ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, - value_len); + value_len, ctxt); if (ret) { mlog_errno(ret); @@ -4509,13 +5000,14 @@ static int ocfs2_xattr_set_in_bucket(struct inode *inode, * storage and we have allocated xattr already, * so need to remove it. */ - ocfs2_xattr_bucket_remove_xs(inode, xs); + ocfs2_xattr_bucket_remove_xs(inode, ctxt->handle, xs); } goto out; } set_value_outside: - ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len); + ret = ocfs2_xattr_bucket_set_value_outside(inode, ctxt->handle, + xs, val, value_len); out: return ret; } @@ -4530,7 +5022,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode, struct ocfs2_xattr_bucket *bucket, const char *name) { - struct ocfs2_xattr_header *xh = bucket->xh; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); if (name_hash != le32_to_cpu(xh->xh_entries[0].xe_name_hash)) @@ -4540,7 +5032,7 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode, xh->xh_entries[0].xe_name_hash) { mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, " "hash = %u\n", - (unsigned long long)bucket->bhs[0]->b_blocknr, + (unsigned long long)bucket_blkno(bucket), le32_to_cpu(xh->xh_entries[0].xe_name_hash)); return -ENOSPC; } @@ -4550,16 +5042,16 @@ static int ocfs2_check_xattr_bucket_collision(struct inode *inode, static int ocfs2_xattr_set_entry_index_block(struct inode *inode, struct ocfs2_xattr_info *xi, - struct ocfs2_xattr_search *xs) + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_set_ctxt *ctxt) { struct ocfs2_xattr_header *xh; struct ocfs2_xattr_entry *xe; u16 count, header_size, xh_free_start; - int i, free, max_free, need, old; + int free, max_free, need, old; size_t value_size = 0, name_len = strlen(xi->name); size_t blocksize = inode->i_sb->s_blocksize; int ret, allocation = 0; - u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); mlog_entry("Set xattr %s in xattr index block\n", xi->name); @@ -4574,7 +5066,7 @@ try_again: mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size " "of %u which exceed block size\n", - (unsigned long long)xs->bucket.bhs[0]->b_blocknr, + (unsigned long long)bucket_blkno(xs->bucket), header_size); if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) @@ -4614,11 +5106,13 @@ try_again: mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, " "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len =" " %u\n", xs->not_found, - (unsigned long long)xs->bucket.bhs[0]->b_blocknr, + (unsigned long long)bucket_blkno(xs->bucket), free, need, max_free, le16_to_cpu(xh->xh_free_start), le16_to_cpu(xh->xh_name_value_len)); - if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { + if (free < need || + (xs->not_found && + count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb))) { if (need <= max_free && count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { /* @@ -4626,7 +5120,8 @@ try_again: * name/value will be moved, the xe shouldn't be changed * in xs. */ - ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket); + ret = ocfs2_defrag_xattr_bucket(inode, ctxt->handle, + xs->bucket); if (ret) { mlog_errno(ret); goto out; @@ -4658,7 +5153,7 @@ try_again: * add a new bucket for the insert. */ ret = ocfs2_check_xattr_bucket_collision(inode, - &xs->bucket, + xs->bucket, xi->name); if (ret) { mlog_errno(ret); @@ -4667,17 +5162,21 @@ try_again: ret = ocfs2_add_new_xattr_bucket(inode, xs->xattr_bh, - xs->bucket.bhs[0]); + xs->bucket, + ctxt); if (ret) { mlog_errno(ret); goto out; } - for (i = 0; i < blk_per_bucket; i++) - brelse(xs->bucket.bhs[i]); - - memset(&xs->bucket, 0, sizeof(xs->bucket)); - + /* + * ocfs2_add_new_xattr_bucket() will have updated + * xs->bucket if it moved, but it will not have updated + * any of the other search fields. Thus, we drop it and + * re-search. Everything should be cached, so it'll be + * quick. + */ + ocfs2_xattr_bucket_relse(xs->bucket); ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh, xi->name_index, xi->name, xs); @@ -4689,7 +5188,7 @@ try_again: } xattr_set: - ret = ocfs2_xattr_set_in_bucket(inode, xi, xs); + ret = ocfs2_xattr_set_in_bucket(inode, xi, xs, ctxt); out: mlog_exit(ret); return ret; @@ -4700,24 +5199,41 @@ static int ocfs2_delete_xattr_in_bucket(struct inode *inode, void *para) { int ret = 0; - struct ocfs2_xattr_header *xh = bucket->xh; + struct ocfs2_xattr_header *xh = bucket_xh(bucket); u16 i; struct ocfs2_xattr_entry *xe; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_xattr_set_ctxt ctxt = {NULL, NULL,}; + int credits = ocfs2_remove_extent_credits(osb->sb) + + ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + + ocfs2_init_dealloc_ctxt(&ctxt.dealloc); for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { xe = &xh->xh_entries[i]; if (ocfs2_xattr_is_local(xe)) continue; - ret = ocfs2_xattr_bucket_value_truncate(inode, - bucket->bhs[0], - i, 0); + ctxt.handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(ctxt.handle)) { + ret = PTR_ERR(ctxt.handle); + mlog_errno(ret); + break; + } + + ret = ocfs2_xattr_bucket_value_truncate(inode, bucket, + i, 0, &ctxt); + + ocfs2_commit_trans(osb, ctxt.handle); if (ret) { mlog_errno(ret); break; } } + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &ctxt.dealloc); return ret; } @@ -4768,6 +5284,74 @@ out: } /* + * 'security' attributes support + */ +static size_t ocfs2_xattr_security_list(struct inode *inode, char *list, + size_t list_size, const char *name, + size_t name_len) +{ + const size_t prefix_len = XATTR_SECURITY_PREFIX_LEN; + const size_t total_len = prefix_len + name_len + 1; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_SECURITY_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_security_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_SECURITY, name, + buffer, size); +} + +static int ocfs2_xattr_security_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_SECURITY, name, value, + size, flags); +} + +int ocfs2_init_security_get(struct inode *inode, + struct inode *dir, + struct ocfs2_security_xattr_info *si) +{ + /* check whether ocfs2 support feature xattr */ + if (!ocfs2_supports_xattr(OCFS2_SB(dir->i_sb))) + return -EOPNOTSUPP; + return security_inode_init_security(inode, dir, &si->name, &si->value, + &si->value_len); +} + +int ocfs2_init_security_set(handle_t *handle, + struct inode *inode, + struct buffer_head *di_bh, + struct ocfs2_security_xattr_info *si, + struct ocfs2_alloc_context *xattr_ac, + struct ocfs2_alloc_context *data_ac) +{ + return ocfs2_xattr_set_handle(handle, inode, di_bh, + OCFS2_XATTR_INDEX_SECURITY, + si->name, si->value, si->value_len, 0, + xattr_ac, data_ac); +} + +struct xattr_handler ocfs2_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = ocfs2_xattr_security_list, + .get = ocfs2_xattr_security_get, + .set = ocfs2_xattr_security_set, +}; + +/* * 'trusted' attributes support */ static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h index 1d8314c7656..5a1ebc789f7 100644 --- a/fs/ocfs2/xattr.h +++ b/fs/ocfs2/xattr.h @@ -30,13 +30,58 @@ enum ocfs2_xattr_type { OCFS2_XATTR_MAX }; +struct ocfs2_security_xattr_info { + int enable; + char *name; + void *value; + size_t value_len; +}; + extern struct xattr_handler ocfs2_xattr_user_handler; extern struct xattr_handler ocfs2_xattr_trusted_handler; +extern struct xattr_handler ocfs2_xattr_security_handler; +#ifdef CONFIG_OCFS2_FS_POSIX_ACL +extern struct xattr_handler ocfs2_xattr_acl_access_handler; +extern struct xattr_handler ocfs2_xattr_acl_default_handler; +#endif extern struct xattr_handler *ocfs2_xattr_handlers[]; ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); +int ocfs2_xattr_get_nolock(struct inode *, struct buffer_head *, int, + const char *, void *, size_t); int ocfs2_xattr_set(struct inode *, int, const char *, const void *, size_t, int); +int ocfs2_xattr_set_handle(handle_t *, struct inode *, struct buffer_head *, + int, const char *, const void *, size_t, int, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); int ocfs2_xattr_remove(struct inode *, struct buffer_head *); +int ocfs2_init_security_get(struct inode *, struct inode *, + struct ocfs2_security_xattr_info *); +int ocfs2_init_security_set(handle_t *, struct inode *, + struct buffer_head *, + struct ocfs2_security_xattr_info *, + struct ocfs2_alloc_context *, + struct ocfs2_alloc_context *); +int ocfs2_calc_security_init(struct inode *, + struct ocfs2_security_xattr_info *, + int *, int *, struct ocfs2_alloc_context **); +int ocfs2_calc_xattr_init(struct inode *, struct buffer_head *, + int, struct ocfs2_security_xattr_info *, + int *, int *, struct ocfs2_alloc_context **); + +/* + * xattrs can live inside an inode, as part of an external xattr block, + * or inside an xattr bucket, which is the leaf of a tree rooted in an + * xattr block. Some of the xattr calls, especially the value setting + * functions, want to treat each of these locations as equal. Let's wrap + * them in a structure that we can pass around instead of raw buffer_heads. + */ +struct ocfs2_xattr_value_buf { + struct buffer_head *vb_bh; + ocfs2_journal_access_func vb_access; + struct ocfs2_xattr_value_root *vb_xv; +}; + #endif /* OCFS2_XATTR_H */ diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index 6afe57c84f8..633e9dc972b 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -39,7 +39,6 @@ struct inode *omfs_new_inode(struct inode *dir, int mode) inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_blocks = 0; inode->i_mapping->a_ops = &omfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; diff --git a/fs/open.c b/fs/open.c index 1cd7d40e999..d882fd2351d 100644 --- a/fs/open.c +++ b/fs/open.c @@ -412,7 +412,7 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len) if (((offset + len) > inode->i_sb->s_maxbytes) || ((offset + len) < 0)) goto out_fput; - if (inode->i_op && inode->i_op->fallocate) + if (inode->i_op->fallocate) ret = inode->i_op->fallocate(inode, mode, offset, len); else ret = -EOPNOTSUPP; diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index d41bdc784de..ffcd04f0012 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -256,9 +256,6 @@ found: break; } - inode->i_gid = 0; - inode->i_uid = 0; - d_add(dentry, inode); return NULL; } diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 6d5b213b8a9..6d720243f5f 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -334,6 +334,7 @@ void delete_partition(struct gendisk *disk, int partno) blk_free_devt(part_devt(part)); rcu_assign_pointer(ptbl->part[partno], NULL); + rcu_assign_pointer(ptbl->last_lookup, NULL); kobject_put(part->holder_dir); device_del(part_to_dev(part)); @@ -384,9 +385,9 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) - snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno); + dev_set_name(pdev, "%sp%d", dname, partno); else - snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno); + dev_set_name(pdev, "%s%d", dname, partno); device_initialize(pdev); pdev->class = &block_class; @@ -447,16 +448,11 @@ void register_disk(struct gendisk *disk) struct block_device *bdev; struct disk_part_iter piter; struct hd_struct *part; - char *s; int err; ddev->parent = disk->driverfs_dev; - strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE); - /* ewww... some of these buggers have / in the name... */ - s = strchr(ddev->bus_id, '/'); - if (s) - *s = '!'; + dev_set_name(ddev, disk->disk_name); /* delay uevents, until we scanned partition table */ ddev->uevent_suppress = 1; diff --git a/fs/pipe.c b/fs/pipe.c index aaf797bd57b..891697112f6 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1016,10 +1016,7 @@ int do_pipe_flags(int *fd, int flags) goto err_fdr; fdw = error; - error = audit_fd_pair(fdr, fdw); - if (error < 0) - goto err_fdw; - + audit_fd_pair(fdr, fdw); fd_install(fdr, fr); fd_install(fdw, fw); fd[0] = fdr; @@ -1027,8 +1024,6 @@ int do_pipe_flags(int *fd, int flags) return 0; - err_fdw: - put_unused_fd(fdw); err_fdr: put_unused_fd(fdr); err_read_pipe: diff --git a/fs/proc/base.c b/fs/proc/base.c index cad92c1ac2b..0c9de19a163 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -65,6 +65,7 @@ #include <linux/mm.h> #include <linux/rcupdate.h> #include <linux/kallsyms.h> +#include <linux/stacktrace.h> #include <linux/resource.h> #include <linux/module.h> #include <linux/mount.h> @@ -109,25 +110,22 @@ struct pid_entry { .op = OP, \ } -#define DIR(NAME, MODE, OTYPE) \ - NOD(NAME, (S_IFDIR|(MODE)), \ - &proc_##OTYPE##_inode_operations, &proc_##OTYPE##_operations, \ - {} ) -#define LNK(NAME, OTYPE) \ +#define DIR(NAME, MODE, iops, fops) \ + NOD(NAME, (S_IFDIR|(MODE)), &iops, &fops, {} ) +#define LNK(NAME, get_link) \ NOD(NAME, (S_IFLNK|S_IRWXUGO), \ &proc_pid_link_inode_operations, NULL, \ - { .proc_get_link = &proc_##OTYPE##_link } ) -#define REG(NAME, MODE, OTYPE) \ - NOD(NAME, (S_IFREG|(MODE)), NULL, \ - &proc_##OTYPE##_operations, {}) -#define INF(NAME, MODE, OTYPE) \ + { .proc_get_link = get_link } ) +#define REG(NAME, MODE, fops) \ + NOD(NAME, (S_IFREG|(MODE)), NULL, &fops, {}) +#define INF(NAME, MODE, read) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_info_file_operations, \ - { .proc_read = &proc_##OTYPE } ) -#define ONE(NAME, MODE, OTYPE) \ + { .proc_read = read } ) +#define ONE(NAME, MODE, show) \ NOD(NAME, (S_IFREG|(MODE)), \ NULL, &proc_single_file_operations, \ - { .proc_show = &proc_##OTYPE } ) + { .proc_show = show } ) /* * Count the number of hardlinks for the pid_entry table, excluding the . @@ -308,9 +306,9 @@ static int proc_pid_auxv(struct task_struct *task, char *buffer) struct mm_struct *mm = get_task_mm(task); if (mm) { unsigned int nwords = 0; - do + do { nwords += 2; - while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ + } while (mm->saved_auxv[nwords - 2] != 0); /* AT_NULL */ res = nwords * sizeof(mm->saved_auxv[0]); if (res > PAGE_SIZE) res = PAGE_SIZE; @@ -340,6 +338,37 @@ static int proc_pid_wchan(struct task_struct *task, char *buffer) } #endif /* CONFIG_KALLSYMS */ +#ifdef CONFIG_STACKTRACE + +#define MAX_STACK_TRACE_DEPTH 64 + +static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct stack_trace trace; + unsigned long *entries; + int i; + + entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); + if (!entries) + return -ENOMEM; + + trace.nr_entries = 0; + trace.max_entries = MAX_STACK_TRACE_DEPTH; + trace.entries = entries; + trace.skip = 0; + save_stack_trace_tsk(task, &trace); + + for (i = 0; i < trace.nr_entries; i++) { + seq_printf(m, "[<%p>] %pS\n", + (void *)entries[i], (void *)entries[i]); + } + kfree(entries); + + return 0; +} +#endif + #ifdef CONFIG_SCHEDSTATS /* * Provides /proc/PID/schedstat @@ -1186,8 +1215,6 @@ static int sched_show(struct seq_file *m, void *v) struct inode *inode = m->private; struct task_struct *p; - WARN_ON(!inode); - p = get_proc_task(inode); if (!p) return -ESRCH; @@ -1205,8 +1232,6 @@ sched_write(struct file *file, const char __user *buf, struct inode *inode = file->f_path.dentry->d_inode; struct task_struct *p; - WARN_ON(!inode); - p = get_proc_task(inode); if (!p) return -ESRCH; @@ -1426,8 +1451,6 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st if (!ei->pid) goto out_unlock; - inode->i_uid = 0; - inode->i_gid = 0; if (task_dumpable(task)) { rcu_read_lock(); cred = __task_cred(task); @@ -1976,13 +1999,11 @@ static struct dentry *proc_pident_lookup(struct inode *dir, const struct pid_entry *ents, unsigned int nents) { - struct inode *inode; struct dentry *error; struct task_struct *task = get_proc_task(dir); const struct pid_entry *p, *last; error = ERR_PTR(-ENOENT); - inode = NULL; if (!task) goto out_no_task; @@ -2138,12 +2159,12 @@ static const struct file_operations proc_pid_attr_operations = { }; static const struct pid_entry attr_dir_stuff[] = { - REG("current", S_IRUGO|S_IWUGO, pid_attr), - REG("prev", S_IRUGO, pid_attr), - REG("exec", S_IRUGO|S_IWUGO, pid_attr), - REG("fscreate", S_IRUGO|S_IWUGO, pid_attr), - REG("keycreate", S_IRUGO|S_IWUGO, pid_attr), - REG("sockcreate", S_IRUGO|S_IWUGO, pid_attr), + REG("current", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("prev", S_IRUGO, proc_pid_attr_operations), + REG("exec", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("fscreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("keycreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), + REG("sockcreate", S_IRUGO|S_IWUGO, proc_pid_attr_operations), }; static int proc_attr_dir_readdir(struct file * filp, @@ -2349,8 +2370,6 @@ static struct dentry *proc_base_instantiate(struct inode *dir, if (!ei->pid) goto out_iput; - inode->i_uid = 0; - inode->i_gid = 0; inode->i_mode = p->mode; if (S_ISDIR(inode->i_mode)) inode->i_nlink = 2; @@ -2465,74 +2484,77 @@ static const struct file_operations proc_task_operations; static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { - DIR("task", S_IRUGO|S_IXUGO, task), - DIR("fd", S_IRUSR|S_IXUSR, fd), - DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo), + DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), + DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), + DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), #ifdef CONFIG_NET - DIR("net", S_IRUGO|S_IXUGO, net), + DIR("net", S_IRUGO|S_IXUGO, proc_net_inode_operations, proc_net_operations), #endif - REG("environ", S_IRUSR, environ), - INF("auxv", S_IRUSR, pid_auxv), - ONE("status", S_IRUGO, pid_status), - ONE("personality", S_IRUSR, pid_personality), - INF("limits", S_IRUSR, pid_limits), + REG("environ", S_IRUSR, proc_environ_operations), + INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("status", S_IRUGO, proc_pid_status), + ONE("personality", S_IRUSR, proc_pid_personality), + INF("limits", S_IRUSR, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG - REG("sched", S_IRUGO|S_IWUSR, pid_sched), + REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUSR, pid_syscall), + INF("syscall", S_IRUSR, proc_pid_syscall), #endif - INF("cmdline", S_IRUGO, pid_cmdline), - ONE("stat", S_IRUGO, tgid_stat), - ONE("statm", S_IRUGO, pid_statm), - REG("maps", S_IRUGO, maps), + INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("stat", S_IRUGO, proc_tgid_stat), + ONE("statm", S_IRUGO, proc_pid_statm), + REG("maps", S_IRUGO, proc_maps_operations), #ifdef CONFIG_NUMA - REG("numa_maps", S_IRUGO, numa_maps), + REG("numa_maps", S_IRUGO, proc_numa_maps_operations), #endif - REG("mem", S_IRUSR|S_IWUSR, mem), - LNK("cwd", cwd), - LNK("root", root), - LNK("exe", exe), - REG("mounts", S_IRUGO, mounts), - REG("mountinfo", S_IRUGO, mountinfo), - REG("mountstats", S_IRUSR, mountstats), + REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), + LNK("cwd", proc_cwd_link), + LNK("root", proc_root_link), + LNK("exe", proc_exe_link), + REG("mounts", S_IRUGO, proc_mounts_operations), + REG("mountinfo", S_IRUGO, proc_mountinfo_operations), + REG("mountstats", S_IRUSR, proc_mountstats_operations), #ifdef CONFIG_PROC_PAGE_MONITOR - REG("clear_refs", S_IWUSR, clear_refs), - REG("smaps", S_IRUGO, smaps), - REG("pagemap", S_IRUSR, pagemap), + REG("clear_refs", S_IWUSR, proc_clear_refs_operations), + REG("smaps", S_IRUGO, proc_smaps_operations), + REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY - DIR("attr", S_IRUGO|S_IXUGO, attr_dir), + DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS - INF("wchan", S_IRUGO, pid_wchan), + INF("wchan", S_IRUGO, proc_pid_wchan), +#endif +#ifdef CONFIG_STACKTRACE + ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS - INF("schedstat", S_IRUGO, pid_schedstat), + INF("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP - REG("latency", S_IRUGO, lstats), + REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET - REG("cpuset", S_IRUGO, cpuset), + REG("cpuset", S_IRUGO, proc_cpuset_operations), #endif #ifdef CONFIG_CGROUPS - REG("cgroup", S_IRUGO, cgroup), + REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif - INF("oom_score", S_IRUGO, oom_score), - REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), + INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), #ifdef CONFIG_AUDITSYSCALL - REG("loginuid", S_IWUSR|S_IRUGO, loginuid), - REG("sessionid", S_IRUGO, sessionid), + REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), + REG("sessionid", S_IRUGO, proc_sessionid_operations), #endif #ifdef CONFIG_FAULT_INJECTION - REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), + REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), #endif #if defined(USE_ELF_CORE_DUMP) && defined(CONFIG_ELF_CORE) - REG("coredump_filter", S_IRUGO|S_IWUSR, coredump_filter), + REG("coredump_filter", S_IRUGO|S_IWUSR, proc_coredump_filter_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, tgid_io_accounting), + INF("io", S_IRUGO, proc_tgid_io_accounting), #endif }; @@ -2805,66 +2827,69 @@ out_no_task: * Tasks */ static const struct pid_entry tid_base_stuff[] = { - DIR("fd", S_IRUSR|S_IXUSR, fd), - DIR("fdinfo", S_IRUSR|S_IXUSR, fdinfo), - REG("environ", S_IRUSR, environ), - INF("auxv", S_IRUSR, pid_auxv), - ONE("status", S_IRUGO, pid_status), - ONE("personality", S_IRUSR, pid_personality), - INF("limits", S_IRUSR, pid_limits), + DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), + DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fd_operations), + REG("environ", S_IRUSR, proc_environ_operations), + INF("auxv", S_IRUSR, proc_pid_auxv), + ONE("status", S_IRUGO, proc_pid_status), + ONE("personality", S_IRUSR, proc_pid_personality), + INF("limits", S_IRUSR, proc_pid_limits), #ifdef CONFIG_SCHED_DEBUG - REG("sched", S_IRUGO|S_IWUSR, pid_sched), + REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif #ifdef CONFIG_HAVE_ARCH_TRACEHOOK - INF("syscall", S_IRUSR, pid_syscall), + INF("syscall", S_IRUSR, proc_pid_syscall), #endif - INF("cmdline", S_IRUGO, pid_cmdline), - ONE("stat", S_IRUGO, tid_stat), - ONE("statm", S_IRUGO, pid_statm), - REG("maps", S_IRUGO, maps), + INF("cmdline", S_IRUGO, proc_pid_cmdline), + ONE("stat", S_IRUGO, proc_tid_stat), + ONE("statm", S_IRUGO, proc_pid_statm), + REG("maps", S_IRUGO, proc_maps_operations), #ifdef CONFIG_NUMA - REG("numa_maps", S_IRUGO, numa_maps), + REG("numa_maps", S_IRUGO, proc_numa_maps_operations), #endif - REG("mem", S_IRUSR|S_IWUSR, mem), - LNK("cwd", cwd), - LNK("root", root), - LNK("exe", exe), - REG("mounts", S_IRUGO, mounts), - REG("mountinfo", S_IRUGO, mountinfo), + REG("mem", S_IRUSR|S_IWUSR, proc_mem_operations), + LNK("cwd", proc_cwd_link), + LNK("root", proc_root_link), + LNK("exe", proc_exe_link), + REG("mounts", S_IRUGO, proc_mounts_operations), + REG("mountinfo", S_IRUGO, proc_mountinfo_operations), #ifdef CONFIG_PROC_PAGE_MONITOR - REG("clear_refs", S_IWUSR, clear_refs), - REG("smaps", S_IRUGO, smaps), - REG("pagemap", S_IRUSR, pagemap), + REG("clear_refs", S_IWUSR, proc_clear_refs_operations), + REG("smaps", S_IRUGO, proc_smaps_operations), + REG("pagemap", S_IRUSR, proc_pagemap_operations), #endif #ifdef CONFIG_SECURITY - DIR("attr", S_IRUGO|S_IXUGO, attr_dir), + DIR("attr", S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations), #endif #ifdef CONFIG_KALLSYMS - INF("wchan", S_IRUGO, pid_wchan), + INF("wchan", S_IRUGO, proc_pid_wchan), +#endif +#ifdef CONFIG_STACKTRACE + ONE("stack", S_IRUSR, proc_pid_stack), #endif #ifdef CONFIG_SCHEDSTATS - INF("schedstat", S_IRUGO, pid_schedstat), + INF("schedstat", S_IRUGO, proc_pid_schedstat), #endif #ifdef CONFIG_LATENCYTOP - REG("latency", S_IRUGO, lstats), + REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET - REG("cpuset", S_IRUGO, cpuset), + REG("cpuset", S_IRUGO, proc_cpuset_operations), #endif #ifdef CONFIG_CGROUPS - REG("cgroup", S_IRUGO, cgroup), + REG("cgroup", S_IRUGO, proc_cgroup_operations), #endif - INF("oom_score", S_IRUGO, oom_score), - REG("oom_adj", S_IRUGO|S_IWUSR, oom_adjust), + INF("oom_score", S_IRUGO, proc_oom_score), + REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), #ifdef CONFIG_AUDITSYSCALL - REG("loginuid", S_IWUSR|S_IRUGO, loginuid), - REG("sessionid", S_IRUSR, sessionid), + REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), + REG("sessionid", S_IRUSR, proc_sessionid_operations), #endif #ifdef CONFIG_FAULT_INJECTION - REG("make-it-fail", S_IRUGO|S_IWUSR, fault_inject), + REG("make-it-fail", S_IRUGO|S_IWUSR, proc_fault_inject_operations), #endif #ifdef CONFIG_TASK_IO_ACCOUNTING - INF("io", S_IRUGO, tid_io_accounting), + INF("io", S_IRUGO, proc_tid_io_accounting), #endif }; diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 60a359b3558..db7fa5cab98 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -14,7 +14,6 @@ #include <linux/stat.h> #include <linux/module.h> #include <linux/mount.h> -#include <linux/smp_lock.h> #include <linux/init.h> #include <linux/idr.h> #include <linux/namei.h> @@ -379,7 +378,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, struct inode *inode = NULL; int error = -ENOENT; - lock_kernel(); spin_lock(&proc_subdir_lock); for (de = de->subdir; de ; de = de->next) { if (de->namelen != dentry->d_name.len) @@ -397,7 +395,6 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, } spin_unlock(&proc_subdir_lock); out_unlock: - unlock_kernel(); if (inode) { dentry->d_op = &proc_dentry_operations; @@ -432,8 +429,6 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, struct inode *inode = filp->f_path.dentry->d_inode; int ret = 0; - lock_kernel(); - ino = inode->i_ino; i = filp->f_pos; switch (i) { @@ -487,7 +482,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, spin_unlock(&proc_subdir_lock); } ret = 1; -out: unlock_kernel(); +out: return ret; } @@ -504,6 +499,7 @@ int proc_readdir(struct file *filp, void *dirent, filldir_t filldir) * the /proc directory. */ static const struct file_operations proc_dir_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = proc_readdir, }; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 2543fd00c65..3e76bb9b3ad 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -35,16 +35,13 @@ struct proc_dir_entry *de_get(struct proc_dir_entry *de) */ void de_put(struct proc_dir_entry *de) { - lock_kernel(); if (!atomic_read(&de->count)) { printk("de_put: entry %s already free!\n", de->name); - unlock_kernel(); return; } if (atomic_dec_and_test(&de->count)) free_proc_entry(de); - unlock_kernel(); } /* diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 3e8aeb8b61c..cd53ff83849 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -41,8 +41,6 @@ do { \ (vmi)->used = 0; \ (vmi)->largest_chunk = 0; \ } while(0) - -extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *); #endif extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index b1675c4e66d..43d23948384 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -74,6 +74,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "LowTotal: %8lu kB\n" "LowFree: %8lu kB\n" #endif +#ifndef CONFIG_MMU + "MmapCopy: %8lu kB\n" +#endif "SwapTotal: %8lu kB\n" "SwapFree: %8lu kB\n" "Dirty: %8lu kB\n" @@ -116,6 +119,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.totalram-i.totalhigh), K(i.freeram-i.freehigh), #endif +#ifndef CONFIG_MMU + K((unsigned long) atomic_read(&mmap_pages_allocated)), +#endif K(i.totalswap), K(i.freeswap), K(global_page_state(NR_FILE_DIRTY)), diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 3f87d263294..b446d7ad0b0 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -33,33 +33,33 @@ #include "internal.h" /* - * display a single VMA to a sequenced file + * display a single region to a sequenced file */ -int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) +static int nommu_region_show(struct seq_file *m, struct vm_region *region) { unsigned long ino = 0; struct file *file; dev_t dev = 0; int flags, len; - flags = vma->vm_flags; - file = vma->vm_file; + flags = region->vm_flags; + file = region->vm_file; if (file) { - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + struct inode *inode = region->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; ino = inode->i_ino; } seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", - vma->vm_start, - vma->vm_end, + region->vm_start, + region->vm_end, flags & VM_READ ? 'r' : '-', flags & VM_WRITE ? 'w' : '-', flags & VM_EXEC ? 'x' : '-', flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', - ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, + ((loff_t)region->vm_pgoff) << PAGE_SHIFT, MAJOR(dev), MINOR(dev), ino, &len); if (file) { @@ -75,61 +75,54 @@ int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) } /* - * display a list of all the VMAs the kernel knows about + * display a list of all the REGIONs the kernel knows about * - nommu kernals have a single flat list */ -static int nommu_vma_list_show(struct seq_file *m, void *v) +static int nommu_region_list_show(struct seq_file *m, void *_p) { - struct vm_area_struct *vma; + struct rb_node *p = _p; - vma = rb_entry((struct rb_node *) v, struct vm_area_struct, vm_rb); - return nommu_vma_show(m, vma); + return nommu_region_show(m, rb_entry(p, struct vm_region, vm_rb)); } -static void *nommu_vma_list_start(struct seq_file *m, loff_t *_pos) +static void *nommu_region_list_start(struct seq_file *m, loff_t *_pos) { - struct rb_node *_rb; + struct rb_node *p; loff_t pos = *_pos; - void *next = NULL; - down_read(&nommu_vma_sem); + down_read(&nommu_region_sem); - for (_rb = rb_first(&nommu_vma_tree); _rb; _rb = rb_next(_rb)) { - if (pos == 0) { - next = _rb; - break; - } - pos--; - } - - return next; + for (p = rb_first(&nommu_region_tree); p; p = rb_next(p)) + if (pos-- == 0) + return p; + return NULL; } -static void nommu_vma_list_stop(struct seq_file *m, void *v) +static void nommu_region_list_stop(struct seq_file *m, void *v) { - up_read(&nommu_vma_sem); + up_read(&nommu_region_sem); } -static void *nommu_vma_list_next(struct seq_file *m, void *v, loff_t *pos) +static void *nommu_region_list_next(struct seq_file *m, void *v, loff_t *pos) { (*pos)++; return rb_next((struct rb_node *) v); } -static const struct seq_operations proc_nommu_vma_list_seqop = { - .start = nommu_vma_list_start, - .next = nommu_vma_list_next, - .stop = nommu_vma_list_stop, - .show = nommu_vma_list_show +static struct seq_operations proc_nommu_region_list_seqop = { + .start = nommu_region_list_start, + .next = nommu_region_list_next, + .stop = nommu_region_list_stop, + .show = nommu_region_list_show }; -static int proc_nommu_vma_list_open(struct inode *inode, struct file *file) +static int proc_nommu_region_list_open(struct inode *inode, struct file *file) { - return seq_open(file, &proc_nommu_vma_list_seqop); + return seq_open(file, &proc_nommu_region_list_seqop); } -static const struct file_operations proc_nommu_vma_list_operations = { - .open = proc_nommu_vma_list_open, +static const struct file_operations proc_nommu_region_list_operations = { + .open = proc_nommu_region_list_open, .read = seq_read, .llseek = seq_lseek, .release = seq_release, @@ -137,7 +130,7 @@ static const struct file_operations proc_nommu_vma_list_operations = { static int __init proc_nommu_init(void) { - proc_create("maps", S_IRUGO, NULL, &proc_nommu_vma_list_operations); + proc_create("maps", S_IRUGO, NULL, &proc_nommu_region_list_operations); return 0; } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 7bc296f424a..04d1270f1c3 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -18,7 +18,6 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/bitops.h> -#include <linux/smp_lock.h> #include <linux/mount.h> #include <linux/nsproxy.h> #include <net/net_namespace.h> @@ -172,6 +171,7 @@ static int proc_tgid_net_readdir(struct file *filp, void *dirent, } const struct file_operations proc_net_operations = { + .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = proc_tgid_net_readdir, }; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 06ed10b7da9..94fcfff6863 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -31,7 +31,6 @@ static struct inode *proc_sys_make_inode(struct super_block *sb, inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_flags |= S_PRIVATE; /* tell selinux to ignore this inode */ inode->i_mode = table->mode; - inode->i_uid = inode->i_gid = 0; if (!table->child) { inode->i_mode |= S_IFREG; inode->i_op = &proc_sys_inode_operations; diff --git a/fs/proc/root.c b/fs/proc/root.c index 7761602af9d..f6299a25594 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -16,7 +16,6 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/bitops.h> -#include <linux/smp_lock.h> #include <linux/mount.h> #include <linux/pid_namespace.h> @@ -162,17 +161,12 @@ static int proc_root_readdir(struct file * filp, unsigned int nr = filp->f_pos; int ret; - lock_kernel(); - if (nr < FIRST_PROCESS_ENTRY) { int error = proc_readdir(filp, dirent, filldir); - if (error <= 0) { - unlock_kernel(); + if (error <= 0) return error; - } filp->f_pos = FIRST_PROCESS_ENTRY; } - unlock_kernel(); ret = proc_pid_readdir(filp, dirent, filldir); return ret; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 3a8bdd7f575..94063840832 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -396,7 +396,9 @@ static int show_smap(struct seq_file *m, void *v) "Private_Clean: %8lu kB\n" "Private_Dirty: %8lu kB\n" "Referenced: %8lu kB\n" - "Swap: %8lu kB\n", + "Swap: %8lu kB\n" + "KernelPageSize: %8lu kB\n" + "MMUPageSize: %8lu kB\n", (vma->vm_end - vma->vm_start) >> 10, mss.resident >> 10, (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), @@ -405,7 +407,9 @@ static int show_smap(struct seq_file *m, void *v) mss.private_clean >> 10, mss.private_dirty >> 10, mss.referenced >> 10, - mss.swap >> 10); + mss.swap >> 10, + vma_kernel_pagesize(vma) >> 10, + vma_mmu_pagesize(vma) >> 10); if (m->count < m->size) /* vma is copied successfully */ m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 219bd79ea89..343ea1216bc 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -9,31 +9,38 @@ /* * Logic: we've got two memory sums for each process, "shared", and - * "non-shared". Shared memory may get counted more then once, for + * "non-shared". Shared memory may get counted more than once, for * each process that owns it. Non-shared memory is counted * accurately. */ void task_mem(struct seq_file *m, struct mm_struct *mm) { - struct vm_list_struct *vml; - unsigned long bytes = 0, sbytes = 0, slack = 0; + struct vm_area_struct *vma; + struct vm_region *region; + struct rb_node *p; + unsigned long bytes = 0, sbytes = 0, slack = 0, size; down_read(&mm->mmap_sem); - for (vml = mm->context.vmlist; vml; vml = vml->next) { - if (!vml->vma) - continue; + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + + bytes += kobjsize(vma); + + region = vma->vm_region; + if (region) { + size = kobjsize(region); + size += region->vm_end - region->vm_start; + } else { + size = vma->vm_end - vma->vm_start; + } - bytes += kobjsize(vml); if (atomic_read(&mm->mm_count) > 1 || - atomic_read(&vml->vma->vm_usage) > 1 - ) { - sbytes += kobjsize((void *) vml->vma->vm_start); - sbytes += kobjsize(vml->vma); + vma->vm_flags & VM_MAYSHARE) { + sbytes += size; } else { - bytes += kobjsize((void *) vml->vma->vm_start); - bytes += kobjsize(vml->vma); - slack += kobjsize((void *) vml->vma->vm_start) - - (vml->vma->vm_end - vml->vma->vm_start); + bytes += size; + if (region) + slack = region->vm_end - vma->vm_end; } } @@ -70,13 +77,14 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) unsigned long task_vsize(struct mm_struct *mm) { - struct vm_list_struct *tbp; + struct vm_area_struct *vma; + struct rb_node *p; unsigned long vsize = 0; down_read(&mm->mmap_sem); - for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) { - if (tbp->vma) - vsize += kobjsize((void *) tbp->vma->vm_start); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + vsize += vma->vm_end - vma->vm_start; } up_read(&mm->mmap_sem); return vsize; @@ -85,15 +93,19 @@ unsigned long task_vsize(struct mm_struct *mm) int task_statm(struct mm_struct *mm, int *shared, int *text, int *data, int *resident) { - struct vm_list_struct *tbp; + struct vm_area_struct *vma; + struct vm_region *region; + struct rb_node *p; int size = kobjsize(mm); down_read(&mm->mmap_sem); - for (tbp = mm->context.vmlist; tbp; tbp = tbp->next) { - size += kobjsize(tbp); - if (tbp->vma) { - size += kobjsize(tbp->vma); - size += kobjsize((void *) tbp->vma->vm_start); + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) { + vma = rb_entry(p, struct vm_area_struct, vm_rb); + size += kobjsize(vma); + region = vma->vm_region; + if (region) { + size += kobjsize(region); + size += region->vm_end - region->vm_start; } } @@ -105,20 +117,62 @@ int task_statm(struct mm_struct *mm, int *shared, int *text, } /* + * display a single VMA to a sequenced file + */ +static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma) +{ + unsigned long ino = 0; + struct file *file; + dev_t dev = 0; + int flags, len; + + flags = vma->vm_flags; + file = vma->vm_file; + + if (file) { + struct inode *inode = vma->vm_file->f_path.dentry->d_inode; + dev = inode->i_sb->s_dev; + ino = inode->i_ino; + } + + seq_printf(m, + "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n", + vma->vm_start, + vma->vm_end, + flags & VM_READ ? 'r' : '-', + flags & VM_WRITE ? 'w' : '-', + flags & VM_EXEC ? 'x' : '-', + flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', + vma->vm_pgoff << PAGE_SHIFT, + MAJOR(dev), MINOR(dev), ino, &len); + + if (file) { + len = 25 + sizeof(void *) * 6 - len; + if (len < 1) + len = 1; + seq_printf(m, "%*c", len, ' '); + seq_path(m, &file->f_path, ""); + } + + seq_putc(m, '\n'); + return 0; +} + +/* * display mapping lines for a particular process's /proc/pid/maps */ -static int show_map(struct seq_file *m, void *_vml) +static int show_map(struct seq_file *m, void *_p) { - struct vm_list_struct *vml = _vml; + struct rb_node *p = _p; - return nommu_vma_show(m, vml->vma); + return nommu_vma_show(m, rb_entry(p, struct vm_area_struct, vm_rb)); } static void *m_start(struct seq_file *m, loff_t *pos) { struct proc_maps_private *priv = m->private; - struct vm_list_struct *vml; struct mm_struct *mm; + struct rb_node *p; loff_t n = *pos; /* pin the task and mm whilst we play with them */ @@ -134,9 +188,9 @@ static void *m_start(struct seq_file *m, loff_t *pos) } /* start from the Nth VMA */ - for (vml = mm->context.vmlist; vml; vml = vml->next) + for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) if (n-- == 0) - return vml; + return p; return NULL; } @@ -152,12 +206,12 @@ static void m_stop(struct seq_file *m, void *_vml) } } -static void *m_next(struct seq_file *m, void *_vml, loff_t *pos) +static void *m_next(struct seq_file *m, void *_p, loff_t *pos) { - struct vm_list_struct *vml = _vml; + struct rb_node *p = _p; (*pos)++; - return vml ? vml->next : NULL; + return p ? rb_next(p) : NULL; } static const struct seq_operations proc_pid_maps_ops = { diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 03ec5950490..5edcc3f92ba 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -47,8 +47,6 @@ static ssize_t read_from_oldmem(char *buf, size_t count, offset = (unsigned long)(*ppos % PAGE_SIZE); pfn = (unsigned long)(*ppos / PAGE_SIZE); - if (pfn > saved_max_pfn) - return -EINVAL; do { if (count > (PAGE_SIZE - offset)) diff --git a/fs/quota.c b/fs/quota.c index b7fe44e0161..4a8c94f05f7 100644 --- a/fs/quota.c +++ b/fs/quota.c @@ -73,7 +73,7 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid case Q_SETQUOTA: case Q_GETQUOTA: /* This is just informative test so we are satisfied without a lock */ - if (!sb_has_quota_enabled(sb, type)) + if (!sb_has_quota_active(sb, type)) return -ESRCH; } @@ -160,6 +160,9 @@ static void quota_sync_sb(struct super_block *sb, int type) int cnt; sb->s_qcop->quota_sync(sb, type); + + if (sb_dqopt(sb)->flags & DQUOT_QUOTA_SYS_FILE) + return; /* This is not very clever (and fast) but currently I don't know about * any other simple way of getting quota data to disk and we must get * them there for userspace to be visible... */ @@ -175,7 +178,7 @@ static void quota_sync_sb(struct super_block *sb, int type) for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; - if (!sb_has_quota_enabled(sb, cnt)) + if (!sb_has_quota_active(sb, cnt)) continue; mutex_lock_nested(&sb_dqopt(sb)->files[cnt]->i_mutex, I_MUTEX_QUOTA); truncate_inode_pages(&sb_dqopt(sb)->files[cnt]->i_data, 0); @@ -201,7 +204,7 @@ restart: for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && type != cnt) continue; - if (!sb_has_quota_enabled(sb, cnt)) + if (!sb_has_quota_active(sb, cnt)) continue; if (!info_dirty(&sb_dqopt(sb)->info[cnt]) && list_empty(&sb_dqopt(sb)->info[cnt].dqi_dirty_list)) @@ -245,7 +248,7 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void __u32 fmt; down_read(&sb_dqopt(sb)->dqptr_sem); - if (!sb_has_quota_enabled(sb, type)) { + if (!sb_has_quota_active(sb, type)) { up_read(&sb_dqopt(sb)->dqptr_sem); return -ESRCH; } diff --git a/fs/quota_tree.c b/fs/quota_tree.c new file mode 100644 index 00000000000..953404c95b1 --- /dev/null +++ b/fs/quota_tree.c @@ -0,0 +1,645 @@ +/* + * vfsv0 quota IO operations on file + */ + +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/dqblk_v2.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/quotaops.h> + +#include <asm/byteorder.h> + +#include "quota_tree.h" + +MODULE_AUTHOR("Jan Kara"); +MODULE_DESCRIPTION("Quota trie support"); +MODULE_LICENSE("GPL"); + +#define __QUOTA_QT_PARANOIA + +typedef char *dqbuf_t; + +static int get_index(struct qtree_mem_dqinfo *info, qid_t id, int depth) +{ + unsigned int epb = info->dqi_usable_bs >> 2; + + depth = info->dqi_qtree_depth - depth - 1; + while (depth--) + id /= epb; + return id % epb; +} + +/* Number of entries in one blocks */ +static inline int qtree_dqstr_in_blk(struct qtree_mem_dqinfo *info) +{ + return (info->dqi_usable_bs - sizeof(struct qt_disk_dqdbheader)) + / info->dqi_entry_size; +} + +static dqbuf_t getdqbuf(size_t size) +{ + dqbuf_t buf = kmalloc(size, GFP_NOFS); + if (!buf) + printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n"); + return buf; +} + +static inline void freedqbuf(dqbuf_t buf) +{ + kfree(buf); +} + +static inline ssize_t read_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf) +{ + struct super_block *sb = info->dqi_sb; + + memset(buf, 0, info->dqi_usable_bs); + return sb->s_op->quota_read(sb, info->dqi_type, (char *)buf, + info->dqi_usable_bs, blk << info->dqi_blocksize_bits); +} + +static inline ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, dqbuf_t buf) +{ + struct super_block *sb = info->dqi_sb; + + return sb->s_op->quota_write(sb, info->dqi_type, (char *)buf, + info->dqi_usable_bs, blk << info->dqi_blocksize_bits); +} + +/* Remove empty block from list and return it */ +static int get_free_dqblk(struct qtree_mem_dqinfo *info) +{ + dqbuf_t buf = getdqbuf(info->dqi_usable_bs); + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + int ret, blk; + + if (!buf) + return -ENOMEM; + if (info->dqi_free_blk) { + blk = info->dqi_free_blk; + ret = read_blk(info, blk, buf); + if (ret < 0) + goto out_buf; + info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); + } + else { + memset(buf, 0, info->dqi_usable_bs); + /* Assure block allocation... */ + ret = write_blk(info, info->dqi_blocks, buf); + if (ret < 0) + goto out_buf; + blk = info->dqi_blocks++; + } + mark_info_dirty(info->dqi_sb, info->dqi_type); + ret = blk; +out_buf: + freedqbuf(buf); + return ret; +} + +/* Insert empty block to the list */ +static int put_free_dqblk(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk) +{ + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + int err; + + dh->dqdh_next_free = cpu_to_le32(info->dqi_free_blk); + dh->dqdh_prev_free = cpu_to_le32(0); + dh->dqdh_entries = cpu_to_le16(0); + err = write_blk(info, blk, buf); + if (err < 0) + return err; + info->dqi_free_blk = blk; + mark_info_dirty(info->dqi_sb, info->dqi_type); + return 0; +} + +/* Remove given block from the list of blocks with free entries */ +static int remove_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk) +{ + dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs); + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + uint nextblk = le32_to_cpu(dh->dqdh_next_free); + uint prevblk = le32_to_cpu(dh->dqdh_prev_free); + int err; + + if (!tmpbuf) + return -ENOMEM; + if (nextblk) { + err = read_blk(info, nextblk, tmpbuf); + if (err < 0) + goto out_buf; + ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = + dh->dqdh_prev_free; + err = write_blk(info, nextblk, tmpbuf); + if (err < 0) + goto out_buf; + } + if (prevblk) { + err = read_blk(info, prevblk, tmpbuf); + if (err < 0) + goto out_buf; + ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_next_free = + dh->dqdh_next_free; + err = write_blk(info, prevblk, tmpbuf); + if (err < 0) + goto out_buf; + } else { + info->dqi_free_entry = nextblk; + mark_info_dirty(info->dqi_sb, info->dqi_type); + } + freedqbuf(tmpbuf); + dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); + /* No matter whether write succeeds block is out of list */ + if (write_blk(info, blk, buf) < 0) + printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk); + return 0; +out_buf: + freedqbuf(tmpbuf); + return err; +} + +/* Insert given block to the beginning of list with free entries */ +static int insert_free_dqentry(struct qtree_mem_dqinfo *info, dqbuf_t buf, uint blk) +{ + dqbuf_t tmpbuf = getdqbuf(info->dqi_usable_bs); + struct qt_disk_dqdbheader *dh = (struct qt_disk_dqdbheader *)buf; + int err; + + if (!tmpbuf) + return -ENOMEM; + dh->dqdh_next_free = cpu_to_le32(info->dqi_free_entry); + dh->dqdh_prev_free = cpu_to_le32(0); + err = write_blk(info, blk, buf); + if (err < 0) + goto out_buf; + if (info->dqi_free_entry) { + err = read_blk(info, info->dqi_free_entry, tmpbuf); + if (err < 0) + goto out_buf; + ((struct qt_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = + cpu_to_le32(blk); + err = write_blk(info, info->dqi_free_entry, tmpbuf); + if (err < 0) + goto out_buf; + } + freedqbuf(tmpbuf); + info->dqi_free_entry = blk; + mark_info_dirty(info->dqi_sb, info->dqi_type); + return 0; +out_buf: + freedqbuf(tmpbuf); + return err; +} + +/* Is the entry in the block free? */ +int qtree_entry_unused(struct qtree_mem_dqinfo *info, char *disk) +{ + int i; + + for (i = 0; i < info->dqi_entry_size; i++) + if (disk[i]) + return 0; + return 1; +} +EXPORT_SYMBOL(qtree_entry_unused); + +/* Find space for dquot */ +static uint find_free_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot, int *err) +{ + uint blk, i; + struct qt_disk_dqdbheader *dh; + dqbuf_t buf = getdqbuf(info->dqi_usable_bs); + char *ddquot; + + *err = 0; + if (!buf) { + *err = -ENOMEM; + return 0; + } + dh = (struct qt_disk_dqdbheader *)buf; + if (info->dqi_free_entry) { + blk = info->dqi_free_entry; + *err = read_blk(info, blk, buf); + if (*err < 0) + goto out_buf; + } else { + blk = get_free_dqblk(info); + if ((int)blk < 0) { + *err = blk; + freedqbuf(buf); + return 0; + } + memset(buf, 0, info->dqi_usable_bs); + /* This is enough as block is already zeroed and entry list is empty... */ + info->dqi_free_entry = blk; + mark_info_dirty(dquot->dq_sb, dquot->dq_type); + } + /* Block will be full? */ + if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { + *err = remove_free_dqentry(info, buf, blk); + if (*err < 0) { + printk(KERN_ERR "VFS: find_free_dqentry(): Can't " + "remove block (%u) from entry free list.\n", + blk); + goto out_buf; + } + } + le16_add_cpu(&dh->dqdh_entries, 1); + /* Find free structure in block */ + for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader); + i < qtree_dqstr_in_blk(info) && !qtree_entry_unused(info, ddquot); + i++, ddquot += info->dqi_entry_size); +#ifdef __QUOTA_QT_PARANOIA + if (i == qtree_dqstr_in_blk(info)) { + printk(KERN_ERR "VFS: find_free_dqentry(): Data block full " + "but it shouldn't.\n"); + *err = -EIO; + goto out_buf; + } +#endif + *err = write_blk(info, blk, buf); + if (*err < 0) { + printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota " + "data block %u.\n", blk); + goto out_buf; + } + dquot->dq_off = (blk << info->dqi_blocksize_bits) + + sizeof(struct qt_disk_dqdbheader) + + i * info->dqi_entry_size; + freedqbuf(buf); + return blk; +out_buf: + freedqbuf(buf); + return 0; +} + +/* Insert reference to structure into the trie */ +static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, + uint *treeblk, int depth) +{ + dqbuf_t buf = getdqbuf(info->dqi_usable_bs); + int ret = 0, newson = 0, newact = 0; + __le32 *ref; + uint newblk; + + if (!buf) + return -ENOMEM; + if (!*treeblk) { + ret = get_free_dqblk(info); + if (ret < 0) + goto out_buf; + *treeblk = ret; + memset(buf, 0, info->dqi_usable_bs); + newact = 1; + } else { + ret = read_blk(info, *treeblk, buf); + if (ret < 0) { + printk(KERN_ERR "VFS: Can't read tree quota block " + "%u.\n", *treeblk); + goto out_buf; + } + } + ref = (__le32 *)buf; + newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (!newblk) + newson = 1; + if (depth == info->dqi_qtree_depth - 1) { +#ifdef __QUOTA_QT_PARANOIA + if (newblk) { + printk(KERN_ERR "VFS: Inserting already present quota " + "entry (block %u).\n", + le32_to_cpu(ref[get_index(info, + dquot->dq_id, depth)])); + ret = -EIO; + goto out_buf; + } +#endif + newblk = find_free_dqentry(info, dquot, &ret); + } else { + ret = do_insert_tree(info, dquot, &newblk, depth+1); + } + if (newson && ret >= 0) { + ref[get_index(info, dquot->dq_id, depth)] = + cpu_to_le32(newblk); + ret = write_blk(info, *treeblk, buf); + } else if (newact && ret < 0) { + put_free_dqblk(info, buf, *treeblk); + } +out_buf: + freedqbuf(buf); + return ret; +} + +/* Wrapper for inserting quota structure into tree */ +static inline int dq_insert_tree(struct qtree_mem_dqinfo *info, + struct dquot *dquot) +{ + int tmp = QT_TREEOFF; + return do_insert_tree(info, dquot, &tmp, 0); +} + +/* + * We don't have to be afraid of deadlocks as we never have quotas on quota files... + */ +int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + int type = dquot->dq_type; + struct super_block *sb = dquot->dq_sb; + ssize_t ret; + dqbuf_t ddquot = getdqbuf(info->dqi_entry_size); + + if (!ddquot) + return -ENOMEM; + + /* dq_off is guarded by dqio_mutex */ + if (!dquot->dq_off) { + ret = dq_insert_tree(info, dquot); + if (ret < 0) { + printk(KERN_ERR "VFS: Error %zd occurred while " + "creating quota.\n", ret); + freedqbuf(ddquot); + return ret; + } + } + spin_lock(&dq_data_lock); + info->dqi_ops->mem2disk_dqblk(ddquot, dquot); + spin_unlock(&dq_data_lock); + ret = sb->s_op->quota_write(sb, type, (char *)ddquot, + info->dqi_entry_size, dquot->dq_off); + if (ret != info->dqi_entry_size) { + printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", + sb->s_id); + if (ret >= 0) + ret = -ENOSPC; + } else { + ret = 0; + } + dqstats.writes++; + freedqbuf(ddquot); + + return ret; +} +EXPORT_SYMBOL(qtree_write_dquot); + +/* Free dquot entry in data block */ +static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, + uint blk) +{ + struct qt_disk_dqdbheader *dh; + dqbuf_t buf = getdqbuf(info->dqi_usable_bs); + int ret = 0; + + if (!buf) + return -ENOMEM; + if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { + printk(KERN_ERR "VFS: Quota structure has offset to other " + "block (%u) than it should (%u).\n", blk, + (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); + goto out_buf; + } + ret = read_blk(info, blk, buf); + if (ret < 0) { + printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk); + goto out_buf; + } + dh = (struct qt_disk_dqdbheader *)buf; + le16_add_cpu(&dh->dqdh_entries, -1); + if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */ + ret = remove_free_dqentry(info, buf, blk); + if (ret >= 0) + ret = put_free_dqblk(info, buf, blk); + if (ret < 0) { + printk(KERN_ERR "VFS: Can't move quota data block (%u) " + "to free list.\n", blk); + goto out_buf; + } + } else { + memset(buf + + (dquot->dq_off & ((1 << info->dqi_blocksize_bits) - 1)), + 0, info->dqi_entry_size); + if (le16_to_cpu(dh->dqdh_entries) == + qtree_dqstr_in_blk(info) - 1) { + /* Insert will write block itself */ + ret = insert_free_dqentry(info, buf, blk); + if (ret < 0) { + printk(KERN_ERR "VFS: Can't insert quota data " + "block (%u) to free entry list.\n", blk); + goto out_buf; + } + } else { + ret = write_blk(info, blk, buf); + if (ret < 0) { + printk(KERN_ERR "VFS: Can't write quota data " + "block %u\n", blk); + goto out_buf; + } + } + } + dquot->dq_off = 0; /* Quota is now unattached */ +out_buf: + freedqbuf(buf); + return ret; +} + +/* Remove reference to dquot from tree */ +static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, + uint *blk, int depth) +{ + dqbuf_t buf = getdqbuf(info->dqi_usable_bs); + int ret = 0; + uint newblk; + __le32 *ref = (__le32 *)buf; + + if (!buf) + return -ENOMEM; + ret = read_blk(info, *blk, buf); + if (ret < 0) { + printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); + goto out_buf; + } + newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (depth == info->dqi_qtree_depth - 1) { + ret = free_dqentry(info, dquot, newblk); + newblk = 0; + } else { + ret = remove_tree(info, dquot, &newblk, depth+1); + } + if (ret >= 0 && !newblk) { + int i; + ref[get_index(info, dquot->dq_id, depth)] = cpu_to_le32(0); + /* Block got empty? */ + for (i = 0; + i < (info->dqi_usable_bs >> 2) && !ref[i]; + i++); + /* Don't put the root block into the free block list */ + if (i == (info->dqi_usable_bs >> 2) + && *blk != QT_TREEOFF) { + put_free_dqblk(info, buf, *blk); + *blk = 0; + } else { + ret = write_blk(info, *blk, buf); + if (ret < 0) + printk(KERN_ERR "VFS: Can't write quota tree " + "block %u.\n", *blk); + } + } +out_buf: + freedqbuf(buf); + return ret; +} + +/* Delete dquot from tree */ +int qtree_delete_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + uint tmp = QT_TREEOFF; + + if (!dquot->dq_off) /* Even not allocated? */ + return 0; + return remove_tree(info, dquot, &tmp, 0); +} +EXPORT_SYMBOL(qtree_delete_dquot); + +/* Find entry in block */ +static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot, uint blk) +{ + dqbuf_t buf = getdqbuf(info->dqi_usable_bs); + loff_t ret = 0; + int i; + char *ddquot; + + if (!buf) + return -ENOMEM; + ret = read_blk(info, blk, buf); + if (ret < 0) { + printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + goto out_buf; + } + for (i = 0, ddquot = ((char *)buf) + sizeof(struct qt_disk_dqdbheader); + i < qtree_dqstr_in_blk(info) && !info->dqi_ops->is_id(ddquot, dquot); + i++, ddquot += info->dqi_entry_size); + if (i == qtree_dqstr_in_blk(info)) { + printk(KERN_ERR "VFS: Quota for id %u referenced " + "but not present.\n", dquot->dq_id); + ret = -EIO; + goto out_buf; + } else { + ret = (blk << info->dqi_blocksize_bits) + sizeof(struct + qt_disk_dqdbheader) + i * info->dqi_entry_size; + } +out_buf: + freedqbuf(buf); + return ret; +} + +/* Find entry for given id in the tree */ +static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot, uint blk, int depth) +{ + dqbuf_t buf = getdqbuf(info->dqi_usable_bs); + loff_t ret = 0; + __le32 *ref = (__le32 *)buf; + + if (!buf) + return -ENOMEM; + ret = read_blk(info, blk, buf); + if (ret < 0) { + printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + goto out_buf; + } + ret = 0; + blk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); + if (!blk) /* No reference? */ + goto out_buf; + if (depth < info->dqi_qtree_depth - 1) + ret = find_tree_dqentry(info, dquot, blk, depth+1); + else + ret = find_block_dqentry(info, dquot, blk); +out_buf: + freedqbuf(buf); + return ret; +} + +/* Find entry for given id in the tree - wrapper function */ +static inline loff_t find_dqentry(struct qtree_mem_dqinfo *info, + struct dquot *dquot) +{ + return find_tree_dqentry(info, dquot, QT_TREEOFF, 0); +} + +int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + int type = dquot->dq_type; + struct super_block *sb = dquot->dq_sb; + loff_t offset; + dqbuf_t ddquot; + int ret = 0; + +#ifdef __QUOTA_QT_PARANOIA + /* Invalidated quota? */ + if (!sb_dqopt(dquot->dq_sb)->files[type]) { + printk(KERN_ERR "VFS: Quota invalidated while reading!\n"); + return -EIO; + } +#endif + /* Do we know offset of the dquot entry in the quota file? */ + if (!dquot->dq_off) { + offset = find_dqentry(info, dquot); + if (offset <= 0) { /* Entry not present? */ + if (offset < 0) + printk(KERN_ERR "VFS: Can't read quota " + "structure for id %u.\n", dquot->dq_id); + dquot->dq_off = 0; + set_bit(DQ_FAKE_B, &dquot->dq_flags); + memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); + ret = offset; + goto out; + } + dquot->dq_off = offset; + } + ddquot = getdqbuf(info->dqi_entry_size); + if (!ddquot) + return -ENOMEM; + ret = sb->s_op->quota_read(sb, type, (char *)ddquot, + info->dqi_entry_size, dquot->dq_off); + if (ret != info->dqi_entry_size) { + if (ret >= 0) + ret = -EIO; + printk(KERN_ERR "VFS: Error while reading quota " + "structure for id %u.\n", dquot->dq_id); + set_bit(DQ_FAKE_B, &dquot->dq_flags); + memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); + freedqbuf(ddquot); + goto out; + } + spin_lock(&dq_data_lock); + info->dqi_ops->disk2mem_dqblk(dquot, ddquot); + if (!dquot->dq_dqb.dqb_bhardlimit && + !dquot->dq_dqb.dqb_bsoftlimit && + !dquot->dq_dqb.dqb_ihardlimit && + !dquot->dq_dqb.dqb_isoftlimit) + set_bit(DQ_FAKE_B, &dquot->dq_flags); + spin_unlock(&dq_data_lock); + freedqbuf(ddquot); +out: + dqstats.reads++; + return ret; +} +EXPORT_SYMBOL(qtree_read_dquot); + +/* Check whether dquot should not be deleted. We know we are + * the only one operating on dquot (thanks to dq_lock) */ +int qtree_release_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) +{ + if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)) + return qtree_delete_dquot(info, dquot); + return 0; +} +EXPORT_SYMBOL(qtree_release_dquot); diff --git a/fs/quota_tree.h b/fs/quota_tree.h new file mode 100644 index 00000000000..a1ab8db81a5 --- /dev/null +++ b/fs/quota_tree.h @@ -0,0 +1,25 @@ +/* + * Definitions of structures for vfsv0 quota format + */ + +#ifndef _LINUX_QUOTA_TREE_H +#define _LINUX_QUOTA_TREE_H + +#include <linux/types.h> +#include <linux/quota.h> + +/* + * Structure of header of block with quota structures. It is padded to 16 bytes so + * there will be space for exactly 21 quota-entries in a block + */ +struct qt_disk_dqdbheader { + __le32 dqdh_next_free; /* Number of next block with free entry */ + __le32 dqdh_prev_free; /* Number of previous block with free entry */ + __le16 dqdh_entries; /* Number of valid entries in block */ + __le16 dqdh_pad1; + __le32 dqdh_pad2; +}; + +#define QT_TREEOFF 1 /* Offset of tree in file in blocks */ + +#endif /* _LINUX_QUOTAIO_TREE_H */ diff --git a/fs/quota_v1.c b/fs/quota_v1.c index 5ae15b13eeb..b4af1c69ad1 100644 --- a/fs/quota_v1.c +++ b/fs/quota_v1.c @@ -3,25 +3,39 @@ #include <linux/quota.h> #include <linux/quotaops.h> #include <linux/dqblk_v1.h> -#include <linux/quotaio_v1.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> #include <asm/byteorder.h> +#include "quotaio_v1.h" + MODULE_AUTHOR("Jan Kara"); MODULE_DESCRIPTION("Old quota format support"); MODULE_LICENSE("GPL"); +#define QUOTABLOCK_BITS 10 +#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) + +static inline qsize_t v1_stoqb(qsize_t space) +{ + return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS; +} + +static inline qsize_t v1_qbtos(qsize_t blocks) +{ + return blocks << QUOTABLOCK_BITS; +} + static void v1_disk2mem_dqblk(struct mem_dqblk *m, struct v1_disk_dqblk *d) { m->dqb_ihardlimit = d->dqb_ihardlimit; m->dqb_isoftlimit = d->dqb_isoftlimit; m->dqb_curinodes = d->dqb_curinodes; - m->dqb_bhardlimit = d->dqb_bhardlimit; - m->dqb_bsoftlimit = d->dqb_bsoftlimit; - m->dqb_curspace = ((qsize_t)d->dqb_curblocks) << QUOTABLOCK_BITS; + m->dqb_bhardlimit = v1_qbtos(d->dqb_bhardlimit); + m->dqb_bsoftlimit = v1_qbtos(d->dqb_bsoftlimit); + m->dqb_curspace = v1_qbtos(d->dqb_curblocks); m->dqb_itime = d->dqb_itime; m->dqb_btime = d->dqb_btime; } @@ -31,9 +45,9 @@ static void v1_mem2disk_dqblk(struct v1_disk_dqblk *d, struct mem_dqblk *m) d->dqb_ihardlimit = m->dqb_ihardlimit; d->dqb_isoftlimit = m->dqb_isoftlimit; d->dqb_curinodes = m->dqb_curinodes; - d->dqb_bhardlimit = m->dqb_bhardlimit; - d->dqb_bsoftlimit = m->dqb_bsoftlimit; - d->dqb_curblocks = toqb(m->dqb_curspace); + d->dqb_bhardlimit = v1_stoqb(m->dqb_bhardlimit); + d->dqb_bsoftlimit = v1_stoqb(m->dqb_bsoftlimit); + d->dqb_curblocks = v1_stoqb(m->dqb_curspace); d->dqb_itime = m->dqb_itime; d->dqb_btime = m->dqb_btime; } diff --git a/fs/quota_v2.c b/fs/quota_v2.c index b53827dc02d..b618b563635 100644 --- a/fs/quota_v2.c +++ b/fs/quota_v2.c @@ -6,7 +6,6 @@ #include <linux/fs.h> #include <linux/mount.h> #include <linux/dqblk_v2.h> -#include <linux/quotaio_v2.h> #include <linux/kernel.h> #include <linux/init.h> #include <linux/module.h> @@ -15,16 +14,37 @@ #include <asm/byteorder.h> +#include "quota_tree.h" +#include "quotaio_v2.h" + MODULE_AUTHOR("Jan Kara"); MODULE_DESCRIPTION("Quota format v2 support"); MODULE_LICENSE("GPL"); #define __QUOTA_V2_PARANOIA -typedef char *dqbuf_t; +static void v2_mem2diskdqb(void *dp, struct dquot *dquot); +static void v2_disk2memdqb(struct dquot *dquot, void *dp); +static int v2_is_id(void *dp, struct dquot *dquot); + +static struct qtree_fmt_operations v2_qtree_ops = { + .mem2disk_dqblk = v2_mem2diskdqb, + .disk2mem_dqblk = v2_disk2memdqb, + .is_id = v2_is_id, +}; + +#define QUOTABLOCK_BITS 10 +#define QUOTABLOCK_SIZE (1 << QUOTABLOCK_BITS) -#define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff) -#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader))) +static inline qsize_t v2_stoqb(qsize_t space) +{ + return (space + QUOTABLOCK_SIZE - 1) >> QUOTABLOCK_BITS; +} + +static inline qsize_t v2_qbtos(qsize_t blocks) +{ + return blocks << QUOTABLOCK_BITS; +} /* Check whether given file is really vfsv0 quotafile */ static int v2_check_quota_file(struct super_block *sb, int type) @@ -50,7 +70,8 @@ static int v2_check_quota_file(struct super_block *sb, int type) static int v2_read_file_info(struct super_block *sb, int type) { struct v2_disk_dqinfo dinfo; - struct mem_dqinfo *info = sb_dqopt(sb)->info+type; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct qtree_mem_dqinfo *qinfo; ssize_t size; size = sb->s_op->quota_read(sb, type, (char *)&dinfo, @@ -60,15 +81,29 @@ static int v2_read_file_info(struct super_block *sb, int type) sb->s_id); return -1; } + info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS); + if (!info->dqi_priv) { + printk(KERN_WARNING + "Not enough memory for quota information structure.\n"); + return -1; + } + qinfo = info->dqi_priv; /* limits are stored as unsigned 32-bit data */ info->dqi_maxblimit = 0xffffffff; info->dqi_maxilimit = 0xffffffff; info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); info->dqi_flags = le32_to_cpu(dinfo.dqi_flags); - info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); - info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); - info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); + qinfo->dqi_sb = sb; + qinfo->dqi_type = type; + qinfo->dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); + qinfo->dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); + qinfo->dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); + qinfo->dqi_blocksize_bits = V2_DQBLKSIZE_BITS; + qinfo->dqi_usable_bs = 1 << V2_DQBLKSIZE_BITS; + qinfo->dqi_qtree_depth = qtree_depth(qinfo); + qinfo->dqi_entry_size = sizeof(struct v2_disk_dqblk); + qinfo->dqi_ops = &v2_qtree_ops; return 0; } @@ -76,7 +111,8 @@ static int v2_read_file_info(struct super_block *sb, int type) static int v2_write_file_info(struct super_block *sb, int type) { struct v2_disk_dqinfo dinfo; - struct mem_dqinfo *info = sb_dqopt(sb)->info+type; + struct mem_dqinfo *info = sb_dqinfo(sb, type); + struct qtree_mem_dqinfo *qinfo = info->dqi_priv; ssize_t size; spin_lock(&dq_data_lock); @@ -85,9 +121,9 @@ static int v2_write_file_info(struct super_block *sb, int type) dinfo.dqi_igrace = cpu_to_le32(info->dqi_igrace); dinfo.dqi_flags = cpu_to_le32(info->dqi_flags & DQF_MASK); spin_unlock(&dq_data_lock); - dinfo.dqi_blocks = cpu_to_le32(info->u.v2_i.dqi_blocks); - dinfo.dqi_free_blk = cpu_to_le32(info->u.v2_i.dqi_free_blk); - dinfo.dqi_free_entry = cpu_to_le32(info->u.v2_i.dqi_free_entry); + dinfo.dqi_blocks = cpu_to_le32(qinfo->dqi_blocks); + dinfo.dqi_free_blk = cpu_to_le32(qinfo->dqi_free_blk); + dinfo.dqi_free_entry = cpu_to_le32(qinfo->dqi_free_entry); size = sb->s_op->quota_write(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); if (size != sizeof(struct v2_disk_dqinfo)) { @@ -98,574 +134,75 @@ static int v2_write_file_info(struct super_block *sb, int type) return 0; } -static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d) +static void v2_disk2memdqb(struct dquot *dquot, void *dp) { + struct v2_disk_dqblk *d = dp, empty; + struct mem_dqblk *m = &dquot->dq_dqb; + m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit); m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes); m->dqb_itime = le64_to_cpu(d->dqb_itime); - m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit); - m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit); + m->dqb_bhardlimit = v2_qbtos(le32_to_cpu(d->dqb_bhardlimit)); + m->dqb_bsoftlimit = v2_qbtos(le32_to_cpu(d->dqb_bsoftlimit)); m->dqb_curspace = le64_to_cpu(d->dqb_curspace); m->dqb_btime = le64_to_cpu(d->dqb_btime); + /* We need to escape back all-zero structure */ + memset(&empty, 0, sizeof(struct v2_disk_dqblk)); + empty.dqb_itime = cpu_to_le64(1); + if (!memcmp(&empty, dp, sizeof(struct v2_disk_dqblk))) + m->dqb_itime = 0; } -static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id) +static void v2_mem2diskdqb(void *dp, struct dquot *dquot) { + struct v2_disk_dqblk *d = dp; + struct mem_dqblk *m = &dquot->dq_dqb; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit); d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit); d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes); d->dqb_itime = cpu_to_le64(m->dqb_itime); - d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit); - d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit); + d->dqb_bhardlimit = cpu_to_le32(v2_stoqb(m->dqb_bhardlimit)); + d->dqb_bsoftlimit = cpu_to_le32(v2_stoqb(m->dqb_bsoftlimit)); d->dqb_curspace = cpu_to_le64(m->dqb_curspace); d->dqb_btime = cpu_to_le64(m->dqb_btime); - d->dqb_id = cpu_to_le32(id); -} - -static dqbuf_t getdqbuf(void) -{ - dqbuf_t buf = kmalloc(V2_DQBLKSIZE, GFP_NOFS); - if (!buf) - printk(KERN_WARNING "VFS: Not enough memory for quota buffers.\n"); - return buf; -} - -static inline void freedqbuf(dqbuf_t buf) -{ - kfree(buf); -} - -static inline ssize_t read_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf) -{ - memset(buf, 0, V2_DQBLKSIZE); - return sb->s_op->quota_read(sb, type, (char *)buf, - V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS); -} - -static inline ssize_t write_blk(struct super_block *sb, int type, uint blk, dqbuf_t buf) -{ - return sb->s_op->quota_write(sb, type, (char *)buf, - V2_DQBLKSIZE, blk << V2_DQBLKSIZE_BITS); -} - -/* Remove empty block from list and return it */ -static int get_free_dqblk(struct super_block *sb, int type) -{ - dqbuf_t buf = getdqbuf(); - struct mem_dqinfo *info = sb_dqinfo(sb, type); - struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; - int ret, blk; - - if (!buf) - return -ENOMEM; - if (info->u.v2_i.dqi_free_blk) { - blk = info->u.v2_i.dqi_free_blk; - if ((ret = read_blk(sb, type, blk, buf)) < 0) - goto out_buf; - info->u.v2_i.dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); - } - else { - memset(buf, 0, V2_DQBLKSIZE); - /* Assure block allocation... */ - if ((ret = write_blk(sb, type, info->u.v2_i.dqi_blocks, buf)) < 0) - goto out_buf; - blk = info->u.v2_i.dqi_blocks++; - } - mark_info_dirty(sb, type); - ret = blk; -out_buf: - freedqbuf(buf); - return ret; -} - -/* Insert empty block to the list */ -static int put_free_dqblk(struct super_block *sb, int type, dqbuf_t buf, uint blk) -{ - struct mem_dqinfo *info = sb_dqinfo(sb, type); - struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; - int err; - - dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_blk); - dh->dqdh_prev_free = cpu_to_le32(0); - dh->dqdh_entries = cpu_to_le16(0); - info->u.v2_i.dqi_free_blk = blk; - mark_info_dirty(sb, type); - /* Some strange block. We had better leave it... */ - if ((err = write_blk(sb, type, blk, buf)) < 0) - return err; - return 0; + d->dqb_id = cpu_to_le32(dquot->dq_id); + if (qtree_entry_unused(info, dp)) + d->dqb_itime = cpu_to_le64(1); } -/* Remove given block from the list of blocks with free entries */ -static int remove_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk) +static int v2_is_id(void *dp, struct dquot *dquot) { - dqbuf_t tmpbuf = getdqbuf(); - struct mem_dqinfo *info = sb_dqinfo(sb, type); - struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; - uint nextblk = le32_to_cpu(dh->dqdh_next_free), prevblk = le32_to_cpu(dh->dqdh_prev_free); - int err; + struct v2_disk_dqblk *d = dp; + struct qtree_mem_dqinfo *info = + sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; - if (!tmpbuf) - return -ENOMEM; - if (nextblk) { - if ((err = read_blk(sb, type, nextblk, tmpbuf)) < 0) - goto out_buf; - ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = dh->dqdh_prev_free; - if ((err = write_blk(sb, type, nextblk, tmpbuf)) < 0) - goto out_buf; - } - if (prevblk) { - if ((err = read_blk(sb, type, prevblk, tmpbuf)) < 0) - goto out_buf; - ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_next_free = dh->dqdh_next_free; - if ((err = write_blk(sb, type, prevblk, tmpbuf)) < 0) - goto out_buf; - } - else { - info->u.v2_i.dqi_free_entry = nextblk; - mark_info_dirty(sb, type); - } - freedqbuf(tmpbuf); - dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); - /* No matter whether write succeeds block is out of list */ - if (write_blk(sb, type, blk, buf) < 0) - printk(KERN_ERR "VFS: Can't write block (%u) with free entries.\n", blk); - return 0; -out_buf: - freedqbuf(tmpbuf); - return err; -} - -/* Insert given block to the beginning of list with free entries */ -static int insert_free_dqentry(struct super_block *sb, int type, dqbuf_t buf, uint blk) -{ - dqbuf_t tmpbuf = getdqbuf(); - struct mem_dqinfo *info = sb_dqinfo(sb, type); - struct v2_disk_dqdbheader *dh = (struct v2_disk_dqdbheader *)buf; - int err; - - if (!tmpbuf) - return -ENOMEM; - dh->dqdh_next_free = cpu_to_le32(info->u.v2_i.dqi_free_entry); - dh->dqdh_prev_free = cpu_to_le32(0); - if ((err = write_blk(sb, type, blk, buf)) < 0) - goto out_buf; - if (info->u.v2_i.dqi_free_entry) { - if ((err = read_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0) - goto out_buf; - ((struct v2_disk_dqdbheader *)tmpbuf)->dqdh_prev_free = cpu_to_le32(blk); - if ((err = write_blk(sb, type, info->u.v2_i.dqi_free_entry, tmpbuf)) < 0) - goto out_buf; - } - freedqbuf(tmpbuf); - info->u.v2_i.dqi_free_entry = blk; - mark_info_dirty(sb, type); - return 0; -out_buf: - freedqbuf(tmpbuf); - return err; -} - -/* Find space for dquot */ -static uint find_free_dqentry(struct dquot *dquot, int *err) -{ - struct super_block *sb = dquot->dq_sb; - struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type; - uint blk, i; - struct v2_disk_dqdbheader *dh; - struct v2_disk_dqblk *ddquot; - struct v2_disk_dqblk fakedquot; - dqbuf_t buf; - - *err = 0; - if (!(buf = getdqbuf())) { - *err = -ENOMEM; + if (qtree_entry_unused(info, dp)) return 0; - } - dh = (struct v2_disk_dqdbheader *)buf; - ddquot = GETENTRIES(buf); - if (info->u.v2_i.dqi_free_entry) { - blk = info->u.v2_i.dqi_free_entry; - if ((*err = read_blk(sb, dquot->dq_type, blk, buf)) < 0) - goto out_buf; - } - else { - blk = get_free_dqblk(sb, dquot->dq_type); - if ((int)blk < 0) { - *err = blk; - freedqbuf(buf); - return 0; - } - memset(buf, 0, V2_DQBLKSIZE); - /* This is enough as block is already zeroed and entry list is empty... */ - info->u.v2_i.dqi_free_entry = blk; - mark_info_dirty(sb, dquot->dq_type); - } - if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK) /* Block will be full? */ - if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) { - printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk); - goto out_buf; - } - le16_add_cpu(&dh->dqdh_entries, 1); - memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk)); - /* Find free structure in block */ - for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++); -#ifdef __QUOTA_V2_PARANOIA - if (i == V2_DQSTRINBLK) { - printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n"); - *err = -EIO; - goto out_buf; - } -#endif - if ((*err = write_blk(sb, dquot->dq_type, blk, buf)) < 0) { - printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk); - goto out_buf; - } - dquot->dq_off = (blk<<V2_DQBLKSIZE_BITS)+sizeof(struct v2_disk_dqdbheader)+i*sizeof(struct v2_disk_dqblk); - freedqbuf(buf); - return blk; -out_buf: - freedqbuf(buf); - return 0; -} - -/* Insert reference to structure into the trie */ -static int do_insert_tree(struct dquot *dquot, uint *treeblk, int depth) -{ - struct super_block *sb = dquot->dq_sb; - dqbuf_t buf; - int ret = 0, newson = 0, newact = 0; - __le32 *ref; - uint newblk; - - if (!(buf = getdqbuf())) - return -ENOMEM; - if (!*treeblk) { - ret = get_free_dqblk(sb, dquot->dq_type); - if (ret < 0) - goto out_buf; - *treeblk = ret; - memset(buf, 0, V2_DQBLKSIZE); - newact = 1; - } - else { - if ((ret = read_blk(sb, dquot->dq_type, *treeblk, buf)) < 0) { - printk(KERN_ERR "VFS: Can't read tree quota block %u.\n", *treeblk); - goto out_buf; - } - } - ref = (__le32 *)buf; - newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]); - if (!newblk) - newson = 1; - if (depth == V2_DQTREEDEPTH-1) { -#ifdef __QUOTA_V2_PARANOIA - if (newblk) { - printk(KERN_ERR "VFS: Inserting already present quota entry (block %u).\n", le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)])); - ret = -EIO; - goto out_buf; - } -#endif - newblk = find_free_dqentry(dquot, &ret); - } - else - ret = do_insert_tree(dquot, &newblk, depth+1); - if (newson && ret >= 0) { - ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk); - ret = write_blk(sb, dquot->dq_type, *treeblk, buf); - } - else if (newact && ret < 0) - put_free_dqblk(sb, dquot->dq_type, buf, *treeblk); -out_buf: - freedqbuf(buf); - return ret; + return le32_to_cpu(d->dqb_id) == dquot->dq_id; } -/* Wrapper for inserting quota structure into tree */ -static inline int dq_insert_tree(struct dquot *dquot) +static int v2_read_dquot(struct dquot *dquot) { - int tmp = V2_DQTREEOFF; - return do_insert_tree(dquot, &tmp, 0); + return qtree_read_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); } -/* - * We don't have to be afraid of deadlocks as we never have quotas on quota files... - */ static int v2_write_dquot(struct dquot *dquot) { - int type = dquot->dq_type; - ssize_t ret; - struct v2_disk_dqblk ddquot, empty; - - /* dq_off is guarded by dqio_mutex */ - if (!dquot->dq_off) - if ((ret = dq_insert_tree(dquot)) < 0) { - printk(KERN_ERR "VFS: Error %zd occurred while creating quota.\n", ret); - return ret; - } - spin_lock(&dq_data_lock); - mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id); - /* Argh... We may need to write structure full of zeroes but that would be - * treated as an empty place by the rest of the code. Format change would - * be definitely cleaner but the problems probably are not worth it */ - memset(&empty, 0, sizeof(struct v2_disk_dqblk)); - if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk))) - ddquot.dqb_itime = cpu_to_le64(1); - spin_unlock(&dq_data_lock); - ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type, - (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off); - if (ret != sizeof(struct v2_disk_dqblk)) { - printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id); - if (ret >= 0) - ret = -ENOSPC; - } - else - ret = 0; - dqstats.writes++; - - return ret; + return qtree_write_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); } -/* Free dquot entry in data block */ -static int free_dqentry(struct dquot *dquot, uint blk) -{ - struct super_block *sb = dquot->dq_sb; - int type = dquot->dq_type; - struct v2_disk_dqdbheader *dh; - dqbuf_t buf = getdqbuf(); - int ret = 0; - - if (!buf) - return -ENOMEM; - if (dquot->dq_off >> V2_DQBLKSIZE_BITS != blk) { - printk(KERN_ERR "VFS: Quota structure has offset to other " - "block (%u) than it should (%u).\n", blk, - (uint)(dquot->dq_off >> V2_DQBLKSIZE_BITS)); - goto out_buf; - } - if ((ret = read_blk(sb, type, blk, buf)) < 0) { - printk(KERN_ERR "VFS: Can't read quota data block %u\n", blk); - goto out_buf; - } - dh = (struct v2_disk_dqdbheader *)buf; - le16_add_cpu(&dh->dqdh_entries, -1); - if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */ - if ((ret = remove_free_dqentry(sb, type, buf, blk)) < 0 || - (ret = put_free_dqblk(sb, type, buf, blk)) < 0) { - printk(KERN_ERR "VFS: Can't move quota data block (%u) " - "to free list.\n", blk); - goto out_buf; - } - } - else { - memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0, - sizeof(struct v2_disk_dqblk)); - if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) { - /* Insert will write block itself */ - if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) { - printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk); - goto out_buf; - } - } - else - if ((ret = write_blk(sb, type, blk, buf)) < 0) { - printk(KERN_ERR "VFS: Can't write quota data " - "block %u\n", blk); - goto out_buf; - } - } - dquot->dq_off = 0; /* Quota is now unattached */ -out_buf: - freedqbuf(buf); - return ret; -} - -/* Remove reference to dquot from tree */ -static int remove_tree(struct dquot *dquot, uint *blk, int depth) -{ - struct super_block *sb = dquot->dq_sb; - int type = dquot->dq_type; - dqbuf_t buf = getdqbuf(); - int ret = 0; - uint newblk; - __le32 *ref = (__le32 *)buf; - - if (!buf) - return -ENOMEM; - if ((ret = read_blk(sb, type, *blk, buf)) < 0) { - printk(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); - goto out_buf; - } - newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]); - if (depth == V2_DQTREEDEPTH-1) { - ret = free_dqentry(dquot, newblk); - newblk = 0; - } - else - ret = remove_tree(dquot, &newblk, depth+1); - if (ret >= 0 && !newblk) { - int i; - ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(0); - for (i = 0; i < V2_DQBLKSIZE && !buf[i]; i++); /* Block got empty? */ - /* Don't put the root block into the free block list */ - if (i == V2_DQBLKSIZE && *blk != V2_DQTREEOFF) { - put_free_dqblk(sb, type, buf, *blk); - *blk = 0; - } - else - if ((ret = write_blk(sb, type, *blk, buf)) < 0) - printk(KERN_ERR "VFS: Can't write quota tree " - "block %u.\n", *blk); - } -out_buf: - freedqbuf(buf); - return ret; -} - -/* Delete dquot from tree */ -static int v2_delete_dquot(struct dquot *dquot) -{ - uint tmp = V2_DQTREEOFF; - - if (!dquot->dq_off) /* Even not allocated? */ - return 0; - return remove_tree(dquot, &tmp, 0); -} - -/* Find entry in block */ -static loff_t find_block_dqentry(struct dquot *dquot, uint blk) -{ - dqbuf_t buf = getdqbuf(); - loff_t ret = 0; - int i; - struct v2_disk_dqblk *ddquot = GETENTRIES(buf); - - if (!buf) - return -ENOMEM; - if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) { - printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); - goto out_buf; - } - if (dquot->dq_id) - for (i = 0; i < V2_DQSTRINBLK && - le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++); - else { /* ID 0 as a bit more complicated searching... */ - struct v2_disk_dqblk fakedquot; - - memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk)); - for (i = 0; i < V2_DQSTRINBLK; i++) - if (!le32_to_cpu(ddquot[i].dqb_id) && - memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk))) - break; - } - if (i == V2_DQSTRINBLK) { - printk(KERN_ERR "VFS: Quota for id %u referenced " - "but not present.\n", dquot->dq_id); - ret = -EIO; - goto out_buf; - } - else - ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct - v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk); -out_buf: - freedqbuf(buf); - return ret; -} - -/* Find entry for given id in the tree */ -static loff_t find_tree_dqentry(struct dquot *dquot, uint blk, int depth) -{ - dqbuf_t buf = getdqbuf(); - loff_t ret = 0; - __le32 *ref = (__le32 *)buf; - - if (!buf) - return -ENOMEM; - if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) { - printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); - goto out_buf; - } - ret = 0; - blk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]); - if (!blk) /* No reference? */ - goto out_buf; - if (depth < V2_DQTREEDEPTH-1) - ret = find_tree_dqentry(dquot, blk, depth+1); - else - ret = find_block_dqentry(dquot, blk); -out_buf: - freedqbuf(buf); - return ret; -} - -/* Find entry for given id in the tree - wrapper function */ -static inline loff_t find_dqentry(struct dquot *dquot) -{ - return find_tree_dqentry(dquot, V2_DQTREEOFF, 0); -} - -static int v2_read_dquot(struct dquot *dquot) +static int v2_release_dquot(struct dquot *dquot) { - int type = dquot->dq_type; - loff_t offset; - struct v2_disk_dqblk ddquot, empty; - int ret = 0; - -#ifdef __QUOTA_V2_PARANOIA - /* Invalidated quota? */ - if (!dquot->dq_sb || !sb_dqopt(dquot->dq_sb)->files[type]) { - printk(KERN_ERR "VFS: Quota invalidated while reading!\n"); - return -EIO; - } -#endif - offset = find_dqentry(dquot); - if (offset <= 0) { /* Entry not present? */ - if (offset < 0) - printk(KERN_ERR "VFS: Can't read quota " - "structure for id %u.\n", dquot->dq_id); - dquot->dq_off = 0; - set_bit(DQ_FAKE_B, &dquot->dq_flags); - memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); - ret = offset; - } - else { - dquot->dq_off = offset; - if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, - (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset)) - != sizeof(struct v2_disk_dqblk)) { - if (ret >= 0) - ret = -EIO; - printk(KERN_ERR "VFS: Error while reading quota " - "structure for id %u.\n", dquot->dq_id); - memset(&ddquot, 0, sizeof(struct v2_disk_dqblk)); - } - else { - ret = 0; - /* We need to escape back all-zero structure */ - memset(&empty, 0, sizeof(struct v2_disk_dqblk)); - empty.dqb_itime = cpu_to_le64(1); - if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk))) - ddquot.dqb_itime = 0; - } - disk2memdqb(&dquot->dq_dqb, &ddquot); - if (!dquot->dq_dqb.dqb_bhardlimit && - !dquot->dq_dqb.dqb_bsoftlimit && - !dquot->dq_dqb.dqb_ihardlimit && - !dquot->dq_dqb.dqb_isoftlimit) - set_bit(DQ_FAKE_B, &dquot->dq_flags); - } - dqstats.reads++; - - return ret; + return qtree_release_dquot(sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv, dquot); } -/* Check whether dquot should not be deleted. We know we are - * the only one operating on dquot (thanks to dq_lock) */ -static int v2_release_dquot(struct dquot *dquot) +static int v2_free_file_info(struct super_block *sb, int type) { - if (test_bit(DQ_FAKE_B, &dquot->dq_flags) && !(dquot->dq_dqb.dqb_curinodes | dquot->dq_dqb.dqb_curspace)) - return v2_delete_dquot(dquot); + kfree(sb_dqinfo(sb, type)->dqi_priv); return 0; } @@ -673,7 +210,7 @@ static struct quota_format_ops v2_format_ops = { .check_quota_file = v2_check_quota_file, .read_file_info = v2_read_file_info, .write_file_info = v2_write_file_info, - .free_file_info = NULL, + .free_file_info = v2_free_file_info, .read_dqblk = v2_read_dquot, .commit_dqblk = v2_write_dquot, .release_dqblk = v2_release_dquot, diff --git a/fs/quotaio_v1.h b/fs/quotaio_v1.h new file mode 100644 index 00000000000..746654b5de7 --- /dev/null +++ b/fs/quotaio_v1.h @@ -0,0 +1,33 @@ +#ifndef _LINUX_QUOTAIO_V1_H +#define _LINUX_QUOTAIO_V1_H + +#include <linux/types.h> + +/* + * The following constants define the amount of time given a user + * before the soft limits are treated as hard limits (usually resulting + * in an allocation failure). The timer is started when the user crosses + * their soft limit, it is reset when they go below their soft limit. + */ +#define MAX_IQ_TIME 604800 /* (7*24*60*60) 1 week */ +#define MAX_DQ_TIME 604800 /* (7*24*60*60) 1 week */ + +/* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is an array of these structures + * indexed by user or group number. + */ +struct v1_disk_dqblk { + __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */ + __u32 dqb_bsoftlimit; /* preferred limit on disk blks */ + __u32 dqb_curblocks; /* current block count */ + __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __u32 dqb_isoftlimit; /* preferred inode limit */ + __u32 dqb_curinodes; /* current # allocated inodes */ + time_t dqb_btime; /* time limit for excessive disk use */ + time_t dqb_itime; /* time limit for excessive inode use */ +}; + +#define v1_dqoff(UID) ((loff_t)((UID) * sizeof (struct v1_disk_dqblk))) + +#endif /* _LINUX_QUOTAIO_V1_H */ diff --git a/fs/quotaio_v2.h b/fs/quotaio_v2.h new file mode 100644 index 00000000000..530fe580685 --- /dev/null +++ b/fs/quotaio_v2.h @@ -0,0 +1,60 @@ +/* + * Definitions of structures for vfsv0 quota format + */ + +#ifndef _LINUX_QUOTAIO_V2_H +#define _LINUX_QUOTAIO_V2_H + +#include <linux/types.h> +#include <linux/quota.h> + +/* + * Definitions of magics and versions of current quota files + */ +#define V2_INITQMAGICS {\ + 0xd9c01f11, /* USRQUOTA */\ + 0xd9c01927 /* GRPQUOTA */\ +} + +#define V2_INITQVERSIONS {\ + 0, /* USRQUOTA */\ + 0 /* GRPQUOTA */\ +} + +/* First generic header */ +struct v2_disk_dqheader { + __le32 dqh_magic; /* Magic number identifying file */ + __le32 dqh_version; /* File version */ +}; + +/* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is a radix tree whose leaves point + * to blocks of these structures. + */ +struct v2_disk_dqblk { + __le32 dqb_id; /* id this quota applies to */ + __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __le32 dqb_isoftlimit; /* preferred inode limit */ + __le32 dqb_curinodes; /* current # allocated inodes */ + __le32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */ + __le32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */ + __le64 dqb_curspace; /* current space occupied (in bytes) */ + __le64 dqb_btime; /* time limit for excessive disk use */ + __le64 dqb_itime; /* time limit for excessive inode use */ +}; + +/* Header with type and version specific information */ +struct v2_disk_dqinfo { + __le32 dqi_bgrace; /* Time before block soft limit becomes hard limit */ + __le32 dqi_igrace; /* Time before inode soft limit becomes hard limit */ + __le32 dqi_flags; /* Flags for quotafile (DQF_*) */ + __le32 dqi_blocks; /* Number of blocks in file */ + __le32 dqi_free_blk; /* Number of first free block in the list */ + __le32 dqi_free_entry; /* Number of block with at least one free entry */ +}; + +#define V2_DQINFOOFF sizeof(struct v2_disk_dqheader) /* Offset of info header in file */ +#define V2_DQBLKSIZE_BITS 10 /* Size of leaf block in tree */ + +#endif /* _LINUX_QUOTAIO_V2_H */ diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 76acdbc3461..b9b567a2837 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -262,11 +262,11 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file, ret = -ENOMEM; pages = kzalloc(lpages * sizeof(struct page *), GFP_KERNEL); if (!pages) - goto out; + goto out_free; nr = find_get_pages(inode->i_mapping, pgoff, lpages, pages); if (nr != lpages) - goto out; /* leave if some pages were missing */ + goto out_free_pages; /* leave if some pages were missing */ /* check the pages for physical adjacency */ ptr = pages; @@ -274,19 +274,18 @@ unsigned long ramfs_nommu_get_unmapped_area(struct file *file, page++; for (loop = lpages; loop > 1; loop--) if (*ptr++ != page++) - goto out; + goto out_free_pages; /* okay - all conditions fulfilled */ ret = (unsigned long) page_address(pages[0]); - out: - if (pages) { - ptr = pages; - for (loop = lpages; loop > 0; loop--) - put_page(*ptr++); - kfree(pages); - } - +out_free_pages: + ptr = pages; + for (loop = nr; loop > 0; loop--) + put_page(*ptr++); +out_free: + kfree(pages); +out: return ret; } diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index a83a3518ae3..b7e6ac706b8 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -57,7 +57,6 @@ struct inode *ramfs_get_inode(struct super_block *sb, int mode, dev_t dev) inode->i_mode = mode; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_blocks = 0; inode->i_mapping->a_ops = &ramfs_aops; inode->i_mapping->backing_dev_info = &ramfs_backing_dev_info; mapping_set_gfp_mask(inode->i_mapping, GFP_HIGHUSER); diff --git a/fs/read_write.c b/fs/read_write.c index 969a6d9c020..5cc6924eb15 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -50,6 +50,14 @@ generic_file_llseek_unlocked(struct file *file, loff_t offset, int origin) offset += inode->i_size; break; case SEEK_CUR: + /* + * Here we special-case the lseek(fd, 0, SEEK_CUR) + * position-querying operation. Avoid rewriting the "same" + * f_pos value back to the file because a concurrent read(), + * write() or lseek() might have altered it + */ + if (offset == 0) + return file->f_pos; offset += file->f_pos; break; } @@ -105,6 +113,10 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) offset += i_size_read(file->f_path.dentry->d_inode); break; case SEEK_CUR: + if (offset == 0) { + retval = file->f_pos; + goto out; + } offset += file->f_pos; } retval = -EINVAL; @@ -115,6 +127,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int origin) } retval = offset; } +out: unlock_kernel(); return retval; } diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 145c2d3e5e0..55fce92cdf1 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -1782,6 +1782,12 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, goto out_bad_inode; } args.objectid = inode->i_ino = le32_to_cpu(ih.ih_key.k_objectid); + if (old_format_only(sb)) + make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, + TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT); + else + make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, + TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); memcpy(INODE_PKEY(inode), &(ih.ih_key), KEY_SIZE); args.dirid = le32_to_cpu(ih.ih_key.k_dir_id); if (insert_inode_locked4(inode, args.objectid, @@ -1834,13 +1840,6 @@ int reiserfs_new_inode(struct reiserfs_transaction_handle *th, reiserfs_init_acl_default(inode); reiserfs_init_xattr_rwsem(inode); - if (old_format_only(sb)) - make_le_item_head(&ih, NULL, KEY_FORMAT_3_5, SD_OFFSET, - TYPE_STAT_DATA, SD_V1_SIZE, MAX_US_INT); - else - make_le_item_head(&ih, NULL, KEY_FORMAT_3_6, SD_OFFSET, - TYPE_STAT_DATA, SD_SIZE, MAX_US_INT); - /* key to search for correct place for new stat data */ _make_cpu_key(&key, KEY_FORMAT_3_6, le32_to_cpu(ih.ih_key.k_dir_id), le32_to_cpu(ih.ih_key.k_objectid), SD_OFFSET, @@ -2561,7 +2560,7 @@ static int reiserfs_write_begin(struct file *file, } index = pos >> PAGE_CACHE_SHIFT; - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; *pagep = page; diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 663a91f5dce..c55651f1407 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -649,6 +649,8 @@ static struct dquot_operations reiserfs_quota_operations = { .release_dquot = reiserfs_release_dquot, .mark_dirty = reiserfs_mark_dquot_dirty, .write_info = reiserfs_write_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, }; static struct quotactl_ops reiserfs_qctl_operations = { @@ -994,8 +996,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin if (c == 'u' || c == 'g') { int qtype = c == 'u' ? USRQUOTA : GRPQUOTA; - if ((sb_any_quota_enabled(s) || - sb_any_quota_suspended(s)) && + if (sb_any_quota_loaded(s) && (!*arg != !REISERFS_SB(s)->s_qf_names[qtype])) { reiserfs_warning(s, "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); @@ -1041,8 +1042,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin "reiserfs_parse_options: unknown quota format specified."); return 0; } - if ((sb_any_quota_enabled(s) || - sb_any_quota_suspended(s)) && + if (sb_any_quota_loaded(s) && *qfmt != REISERFS_SB(s)->s_jquota_fmt) { reiserfs_warning(s, "reiserfs_parse_options: cannot change journaled quota options when quota turned on."); @@ -1067,7 +1067,7 @@ static int reiserfs_parse_options(struct super_block *s, char *options, /* strin } /* This checking is not precise wrt the quota type but for our purposes it is sufficient */ if (!(*mount_options & (1 << REISERFS_QUOTA)) - && sb_any_quota_enabled(s)) { + && sb_any_quota_loaded(s)) { reiserfs_warning(s, "reiserfs_parse_options: quota options must be present when quota is turned on."); return 0; diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c index 60d2f822e87..98a232f7196 100644 --- a/fs/romfs/inode.c +++ b/fs/romfs/inode.c @@ -490,7 +490,7 @@ static mode_t romfs_modemap[] = static struct inode * romfs_iget(struct super_block *sb, unsigned long ino) { - int nextfh; + int nextfh, ret; struct romfs_inode ri; struct inode *i; @@ -524,14 +524,13 @@ romfs_iget(struct super_block *sb, unsigned long ino) i->i_size = be32_to_cpu(ri.size); i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0; i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0; - i->i_uid = i->i_gid = 0; /* Precalculate the data offset */ - ino = romfs_strnlen(i, ino+ROMFH_SIZE, ROMFS_MAXFN); - if (ino >= 0) - ino = ((ROMFH_SIZE+ino+1+ROMFH_PAD)&ROMFH_MASK); - else - ino = 0; + ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN); + if (ret >= 0) + ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK; + else + ino = 0; ROMFS_I(i)->i_metasize = ino; ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK); diff --git a/fs/select.c b/fs/select.c index 87df51eadcf..08b91beed80 100644 --- a/fs/select.c +++ b/fs/select.c @@ -109,11 +109,11 @@ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, void poll_initwait(struct poll_wqueues *pwq) { init_poll_funcptr(&pwq->pt, __pollwait); + pwq->polling_task = current; pwq->error = 0; pwq->table = NULL; pwq->inline_index = 0; } - EXPORT_SYMBOL(poll_initwait); static void free_poll_entry(struct poll_table_entry *entry) @@ -142,12 +142,10 @@ void poll_freewait(struct poll_wqueues *pwq) free_page((unsigned long) old); } } - EXPORT_SYMBOL(poll_freewait); -static struct poll_table_entry *poll_get_entry(poll_table *_p) +static struct poll_table_entry *poll_get_entry(struct poll_wqueues *p) { - struct poll_wqueues *p = container_of(_p, struct poll_wqueues, pt); struct poll_table_page *table = p->table; if (p->inline_index < N_INLINE_POLL_ENTRIES) @@ -159,7 +157,6 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p) new_table = (struct poll_table_page *) __get_free_page(GFP_KERNEL); if (!new_table) { p->error = -ENOMEM; - __set_current_state(TASK_RUNNING); return NULL; } new_table->entry = new_table->entries; @@ -171,20 +168,75 @@ static struct poll_table_entry *poll_get_entry(poll_table *_p) return table->entry++; } +static int pollwake(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + struct poll_wqueues *pwq = wait->private; + DECLARE_WAITQUEUE(dummy_wait, pwq->polling_task); + + /* + * Although this function is called under waitqueue lock, LOCK + * doesn't imply write barrier and the users expect write + * barrier semantics on wakeup functions. The following + * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up() + * and is paired with set_mb() in poll_schedule_timeout. + */ + smp_wmb(); + pwq->triggered = 1; + + /* + * Perform the default wake up operation using a dummy + * waitqueue. + * + * TODO: This is hacky but there currently is no interface to + * pass in @sync. @sync is scheduled to be removed and once + * that happens, wake_up_process() can be used directly. + */ + return default_wake_function(&dummy_wait, mode, sync, key); +} + /* Add a new entry */ static void __pollwait(struct file *filp, wait_queue_head_t *wait_address, poll_table *p) { - struct poll_table_entry *entry = poll_get_entry(p); + struct poll_wqueues *pwq = container_of(p, struct poll_wqueues, pt); + struct poll_table_entry *entry = poll_get_entry(pwq); if (!entry) return; get_file(filp); entry->filp = filp; entry->wait_address = wait_address; - init_waitqueue_entry(&entry->wait, current); + init_waitqueue_func_entry(&entry->wait, pollwake); + entry->wait.private = pwq; add_wait_queue(wait_address, &entry->wait); } +int poll_schedule_timeout(struct poll_wqueues *pwq, int state, + ktime_t *expires, unsigned long slack) +{ + int rc = -EINTR; + + set_current_state(state); + if (!pwq->triggered) + rc = schedule_hrtimeout_range(expires, slack, HRTIMER_MODE_ABS); + __set_current_state(TASK_RUNNING); + + /* + * Prepare for the next iteration. + * + * The following set_mb() serves two purposes. First, it's + * the counterpart rmb of the wmb in pollwake() such that data + * written before wake up is always visible after wake up. + * Second, the full barrier guarantees that triggered clearing + * doesn't pass event check of the next iteration. Note that + * this problem doesn't exist for the first iteration as + * add_wait_queue() has full barrier semantics. + */ + set_mb(pwq->triggered, 0); + + return rc; +} +EXPORT_SYMBOL(poll_schedule_timeout); + /** * poll_select_set_timeout - helper function to setup the timeout value * @to: pointer to timespec variable for the final timeout @@ -340,8 +392,6 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) for (;;) { unsigned long *rinp, *routp, *rexp, *inp, *outp, *exp; - set_current_state(TASK_INTERRUPTIBLE); - inp = fds->in; outp = fds->out; exp = fds->ex; rinp = fds->res_in; routp = fds->res_out; rexp = fds->res_ex; @@ -411,10 +461,10 @@ int do_select(int n, fd_set_bits *fds, struct timespec *end_time) to = &expire; } - if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + if (!poll_schedule_timeout(&table, TASK_INTERRUPTIBLE, + to, slack)) timed_out = 1; } - __set_current_state(TASK_RUNNING); poll_freewait(&table); @@ -666,7 +716,6 @@ static int do_poll(unsigned int nfds, struct poll_list *list, for (;;) { struct poll_list *walk; - set_current_state(TASK_INTERRUPTIBLE); for (walk = list; walk != NULL; walk = walk->next) { struct pollfd * pfd, * pfd_end; @@ -709,10 +758,9 @@ static int do_poll(unsigned int nfds, struct poll_list *list, to = &expire; } - if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS)) + if (!poll_schedule_timeout(wait, TASK_INTERRUPTIBLE, to, slack)) timed_out = 1; } - __set_current_state(TASK_RUNNING); return count; } diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index e4f8d51a555..92d5e8ffb63 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -297,7 +297,7 @@ static int smb_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { pgoff_t index = pos >> PAGE_CACHE_SHIFT; - *pagep = __grab_cache_page(mapping, index); + *pagep = grab_cache_page_write_begin(mapping, index, flags); if (!*pagep) return -ENOMEM; return 0; diff --git a/fs/splice.c b/fs/splice.c index 1abab5cee4b..a54b3e3f10a 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -21,6 +21,7 @@ #include <linux/file.h> #include <linux/pagemap.h> #include <linux/splice.h> +#include <linux/memcontrol.h> #include <linux/mm_inline.h> #include <linux/swap.h> #include <linux/writeback.h> diff --git a/fs/squashfs/Makefile b/fs/squashfs/Makefile new file mode 100644 index 00000000000..8258cf9a031 --- /dev/null +++ b/fs/squashfs/Makefile @@ -0,0 +1,8 @@ +# +# Makefile for the linux squashfs routines. +# + +obj-$(CONFIG_SQUASHFS) += squashfs.o +squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o +squashfs-y += namei.o super.o symlink.o +#squashfs-y += squashfs2_0.o diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c new file mode 100644 index 00000000000..c837dfc2b3c --- /dev/null +++ b/fs/squashfs/block.c @@ -0,0 +1,274 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * block.c + */ + +/* + * This file implements the low-level routines to read and decompress + * datablocks and metadata blocks. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/string.h> +#include <linux/buffer_head.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Read the metadata block length, this is stored in the first two + * bytes of the metadata block. + */ +static struct buffer_head *get_block_length(struct super_block *sb, + u64 *cur_index, int *offset, int *length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + struct buffer_head *bh; + + bh = sb_bread(sb, *cur_index); + if (bh == NULL) + return NULL; + + if (msblk->devblksize - *offset == 1) { + *length = (unsigned char) bh->b_data[*offset]; + put_bh(bh); + bh = sb_bread(sb, ++(*cur_index)); + if (bh == NULL) + return NULL; + *length |= (unsigned char) bh->b_data[0] << 8; + *offset = 1; + } else { + *length = (unsigned char) bh->b_data[*offset] | + (unsigned char) bh->b_data[*offset + 1] << 8; + *offset += 2; + } + + return bh; +} + + +/* + * Read and decompress a metadata block or datablock. Length is non-zero + * if a datablock is being read (the size is stored elsewhere in the + * filesystem), otherwise the length is obtained from the first two bytes of + * the metadata block. A bit in the length field indicates if the block + * is stored uncompressed in the filesystem (usually because compression + * generated a larger block - this does occasionally happen with zlib). + */ +int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, + int length, u64 *next_index, int srclength) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + struct buffer_head **bh; + int offset = index & ((1 << msblk->devblksize_log2) - 1); + u64 cur_index = index >> msblk->devblksize_log2; + int bytes, compressed, b = 0, k = 0, page = 0, avail; + + + bh = kcalloc((msblk->block_size >> msblk->devblksize_log2) + 1, + sizeof(*bh), GFP_KERNEL); + if (bh == NULL) + return -ENOMEM; + + if (length) { + /* + * Datablock. + */ + bytes = -offset; + compressed = SQUASHFS_COMPRESSED_BLOCK(length); + length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length); + if (next_index) + *next_index = index + length; + + TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n", + index, compressed ? "" : "un", length, srclength); + + if (length < 0 || length > srclength || + (index + length) > msblk->bytes_used) + goto read_failure; + + for (b = 0; bytes < length; b++, cur_index++) { + bh[b] = sb_getblk(sb, cur_index); + if (bh[b] == NULL) + goto block_release; + bytes += msblk->devblksize; + } + ll_rw_block(READ, b, bh); + } else { + /* + * Metadata block. + */ + if ((index + 2) > msblk->bytes_used) + goto read_failure; + + bh[0] = get_block_length(sb, &cur_index, &offset, &length); + if (bh[0] == NULL) + goto read_failure; + b = 1; + + bytes = msblk->devblksize - offset; + compressed = SQUASHFS_COMPRESSED(length); + length = SQUASHFS_COMPRESSED_SIZE(length); + if (next_index) + *next_index = index + length + 2; + + TRACE("Block @ 0x%llx, %scompressed size %d\n", index, + compressed ? "" : "un", length); + + if (length < 0 || length > srclength || + (index + length) > msblk->bytes_used) + goto block_release; + + for (; bytes < length; b++) { + bh[b] = sb_getblk(sb, ++cur_index); + if (bh[b] == NULL) + goto block_release; + bytes += msblk->devblksize; + } + ll_rw_block(READ, b - 1, bh + 1); + } + + if (compressed) { + int zlib_err = 0, zlib_init = 0; + + /* + * Uncompress block. + */ + + mutex_lock(&msblk->read_data_mutex); + + msblk->stream.avail_out = 0; + msblk->stream.avail_in = 0; + + bytes = length; + do { + if (msblk->stream.avail_in == 0 && k < b) { + avail = min(bytes, msblk->devblksize - offset); + bytes -= avail; + wait_on_buffer(bh[k]); + if (!buffer_uptodate(bh[k])) + goto release_mutex; + + if (avail == 0) { + offset = 0; + put_bh(bh[k++]); + continue; + } + + msblk->stream.next_in = bh[k]->b_data + offset; + msblk->stream.avail_in = avail; + offset = 0; + } + + if (msblk->stream.avail_out == 0) { + msblk->stream.next_out = buffer[page++]; + msblk->stream.avail_out = PAGE_CACHE_SIZE; + } + + if (!zlib_init) { + zlib_err = zlib_inflateInit(&msblk->stream); + if (zlib_err != Z_OK) { + ERROR("zlib_inflateInit returned" + " unexpected result 0x%x," + " srclength %d\n", zlib_err, + srclength); + goto release_mutex; + } + zlib_init = 1; + } + + zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH); + + if (msblk->stream.avail_in == 0 && k < b) + put_bh(bh[k++]); + } while (zlib_err == Z_OK); + + if (zlib_err != Z_STREAM_END) { + ERROR("zlib_inflate returned unexpected result" + " 0x%x, srclength %d, avail_in %d," + " avail_out %d\n", zlib_err, srclength, + msblk->stream.avail_in, + msblk->stream.avail_out); + goto release_mutex; + } + + zlib_err = zlib_inflateEnd(&msblk->stream); + if (zlib_err != Z_OK) { + ERROR("zlib_inflateEnd returned unexpected result 0x%x," + " srclength %d\n", zlib_err, srclength); + goto release_mutex; + } + length = msblk->stream.total_out; + mutex_unlock(&msblk->read_data_mutex); + } else { + /* + * Block is uncompressed. + */ + int i, in, pg_offset = 0; + + for (i = 0; i < b; i++) { + wait_on_buffer(bh[i]); + if (!buffer_uptodate(bh[i])) + goto block_release; + } + + for (bytes = length; k < b; k++) { + in = min(bytes, msblk->devblksize - offset); + bytes -= in; + while (in) { + if (pg_offset == PAGE_CACHE_SIZE) { + page++; + pg_offset = 0; + } + avail = min_t(int, in, PAGE_CACHE_SIZE - + pg_offset); + memcpy(buffer[page] + pg_offset, + bh[k]->b_data + offset, avail); + in -= avail; + pg_offset += avail; + offset += avail; + } + offset = 0; + put_bh(bh[k]); + } + } + + kfree(bh); + return length; + +release_mutex: + mutex_unlock(&msblk->read_data_mutex); + +block_release: + for (; k < b; k++) + put_bh(bh[k]); + +read_failure: + ERROR("sb_bread failed reading block 0x%llx\n", cur_index); + kfree(bh); + return -EIO; +} diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c new file mode 100644 index 00000000000..f29eda16d25 --- /dev/null +++ b/fs/squashfs/cache.c @@ -0,0 +1,412 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * cache.c + */ + +/* + * Blocks in Squashfs are compressed. To avoid repeatedly decompressing + * recently accessed data Squashfs uses two small metadata and fragment caches. + * + * This file implements a generic cache implementation used for both caches, + * plus functions layered ontop of the generic cache implementation to + * access the metadata and fragment caches. + * + * To avoid out of memory and fragmentation isssues with vmalloc the cache + * uses sequences of kmalloced PAGE_CACHE_SIZE buffers. + * + * It should be noted that the cache is not used for file datablocks, these + * are decompressed and cached in the page-cache in the normal way. The + * cache is only used to temporarily cache fragment and metadata blocks + * which have been read as as a result of a metadata (i.e. inode or + * directory) or fragment access. Because metadata and fragments are packed + * together into blocks (to gain greater compression) the read of a particular + * piece of metadata or fragment will retrieve other metadata/fragments which + * have been packed with it, these because of locality-of-reference may be read + * in the near future. Temporarily caching them ensures they are available for + * near future access without requiring an additional read and decompress. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/slab.h> +#include <linux/vmalloc.h> +#include <linux/sched.h> +#include <linux/spinlock.h> +#include <linux/wait.h> +#include <linux/zlib.h> +#include <linux/pagemap.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Look-up block in cache, and increment usage count. If not in cache, read + * and decompress it from disk. + */ +struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, + struct squashfs_cache *cache, u64 block, int length) +{ + int i, n; + struct squashfs_cache_entry *entry; + + spin_lock(&cache->lock); + + while (1) { + for (i = 0; i < cache->entries; i++) + if (cache->entry[i].block == block) + break; + + if (i == cache->entries) { + /* + * Block not in cache, if all cache entries are used + * go to sleep waiting for one to become available. + */ + if (cache->unused == 0) { + cache->num_waiters++; + spin_unlock(&cache->lock); + wait_event(cache->wait_queue, cache->unused); + spin_lock(&cache->lock); + cache->num_waiters--; + continue; + } + + /* + * At least one unused cache entry. A simple + * round-robin strategy is used to choose the entry to + * be evicted from the cache. + */ + i = cache->next_blk; + for (n = 0; n < cache->entries; n++) { + if (cache->entry[i].refcount == 0) + break; + i = (i + 1) % cache->entries; + } + + cache->next_blk = (i + 1) % cache->entries; + entry = &cache->entry[i]; + + /* + * Initialise choosen cache entry, and fill it in from + * disk. + */ + cache->unused--; + entry->block = block; + entry->refcount = 1; + entry->pending = 1; + entry->num_waiters = 0; + entry->error = 0; + spin_unlock(&cache->lock); + + entry->length = squashfs_read_data(sb, entry->data, + block, length, &entry->next_index, + cache->block_size); + + spin_lock(&cache->lock); + + if (entry->length < 0) + entry->error = entry->length; + + entry->pending = 0; + + /* + * While filling this entry one or more other processes + * have looked it up in the cache, and have slept + * waiting for it to become available. + */ + if (entry->num_waiters) { + spin_unlock(&cache->lock); + wake_up_all(&entry->wait_queue); + } else + spin_unlock(&cache->lock); + + goto out; + } + + /* + * Block already in cache. Increment refcount so it doesn't + * get reused until we're finished with it, if it was + * previously unused there's one less cache entry available + * for reuse. + */ + entry = &cache->entry[i]; + if (entry->refcount == 0) + cache->unused--; + entry->refcount++; + + /* + * If the entry is currently being filled in by another process + * go to sleep waiting for it to become available. + */ + if (entry->pending) { + entry->num_waiters++; + spin_unlock(&cache->lock); + wait_event(entry->wait_queue, !entry->pending); + } else + spin_unlock(&cache->lock); + + goto out; + } + +out: + TRACE("Got %s %d, start block %lld, refcount %d, error %d\n", + cache->name, i, entry->block, entry->refcount, entry->error); + + if (entry->error) + ERROR("Unable to read %s cache entry [%llx]\n", cache->name, + block); + return entry; +} + + +/* + * Release cache entry, once usage count is zero it can be reused. + */ +void squashfs_cache_put(struct squashfs_cache_entry *entry) +{ + struct squashfs_cache *cache = entry->cache; + + spin_lock(&cache->lock); + entry->refcount--; + if (entry->refcount == 0) { + cache->unused++; + /* + * If there's any processes waiting for a block to become + * available, wake one up. + */ + if (cache->num_waiters) { + spin_unlock(&cache->lock); + wake_up(&cache->wait_queue); + return; + } + } + spin_unlock(&cache->lock); +} + +/* + * Delete cache reclaiming all kmalloced buffers. + */ +void squashfs_cache_delete(struct squashfs_cache *cache) +{ + int i, j; + + if (cache == NULL) + return; + + for (i = 0; i < cache->entries; i++) { + if (cache->entry[i].data) { + for (j = 0; j < cache->pages; j++) + kfree(cache->entry[i].data[j]); + kfree(cache->entry[i].data); + } + } + + kfree(cache->entry); + kfree(cache); +} + + +/* + * Initialise cache allocating the specified number of entries, each of + * size block_size. To avoid vmalloc fragmentation issues each entry + * is allocated as a sequence of kmalloced PAGE_CACHE_SIZE buffers. + */ +struct squashfs_cache *squashfs_cache_init(char *name, int entries, + int block_size) +{ + int i, j; + struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL); + + if (cache == NULL) { + ERROR("Failed to allocate %s cache\n", name); + return NULL; + } + + cache->entry = kcalloc(entries, sizeof(*(cache->entry)), GFP_KERNEL); + if (cache->entry == NULL) { + ERROR("Failed to allocate %s cache\n", name); + goto cleanup; + } + + cache->next_blk = 0; + cache->unused = entries; + cache->entries = entries; + cache->block_size = block_size; + cache->pages = block_size >> PAGE_CACHE_SHIFT; + cache->name = name; + cache->num_waiters = 0; + spin_lock_init(&cache->lock); + init_waitqueue_head(&cache->wait_queue); + + for (i = 0; i < entries; i++) { + struct squashfs_cache_entry *entry = &cache->entry[i]; + + init_waitqueue_head(&cache->entry[i].wait_queue); + entry->cache = cache; + entry->block = SQUASHFS_INVALID_BLK; + entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL); + if (entry->data == NULL) { + ERROR("Failed to allocate %s cache entry\n", name); + goto cleanup; + } + + for (j = 0; j < cache->pages; j++) { + entry->data[j] = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + if (entry->data[j] == NULL) { + ERROR("Failed to allocate %s buffer\n", name); + goto cleanup; + } + } + } + + return cache; + +cleanup: + squashfs_cache_delete(cache); + return NULL; +} + + +/* + * Copy upto length bytes from cache entry to buffer starting at offset bytes + * into the cache entry. If there's not length bytes then copy the number of + * bytes available. In all cases return the number of bytes copied. + */ +int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry, + int offset, int length) +{ + int remaining = length; + + if (length == 0) + return 0; + else if (buffer == NULL) + return min(length, entry->length - offset); + + while (offset < entry->length) { + void *buff = entry->data[offset / PAGE_CACHE_SIZE] + + (offset % PAGE_CACHE_SIZE); + int bytes = min_t(int, entry->length - offset, + PAGE_CACHE_SIZE - (offset % PAGE_CACHE_SIZE)); + + if (bytes >= remaining) { + memcpy(buffer, buff, remaining); + remaining = 0; + break; + } + + memcpy(buffer, buff, bytes); + buffer += bytes; + remaining -= bytes; + offset += bytes; + } + + return length - remaining; +} + + +/* + * Read length bytes from metadata position <block, offset> (block is the + * start of the compressed block on disk, and offset is the offset into + * the block once decompressed). Data is packed into consecutive blocks, + * and length bytes may require reading more than one block. + */ +int squashfs_read_metadata(struct super_block *sb, void *buffer, + u64 *block, int *offset, int length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int bytes, copied = length; + struct squashfs_cache_entry *entry; + + TRACE("Entered squashfs_read_metadata [%llx:%x]\n", *block, *offset); + + while (length) { + entry = squashfs_cache_get(sb, msblk->block_cache, *block, 0); + if (entry->error) + return entry->error; + else if (*offset >= entry->length) + return -EIO; + + bytes = squashfs_copy_data(buffer, entry, *offset, length); + if (buffer) + buffer += bytes; + length -= bytes; + *offset += bytes; + + if (*offset == entry->length) { + *block = entry->next_index; + *offset = 0; + } + + squashfs_cache_put(entry); + } + + return copied; +} + + +/* + * Look-up in the fragmment cache the fragment located at <start_block> in the + * filesystem. If necessary read and decompress it from disk. + */ +struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *sb, + u64 start_block, int length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + + return squashfs_cache_get(sb, msblk->fragment_cache, start_block, + length); +} + + +/* + * Read and decompress the datablock located at <start_block> in the + * filesystem. The cache is used here to avoid duplicating locking and + * read/decompress code. + */ +struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb, + u64 start_block, int length) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + + return squashfs_cache_get(sb, msblk->read_page, start_block, length); +} + + +/* + * Read a filesystem table (uncompressed sequence of bytes) from disk + */ +int squashfs_read_table(struct super_block *sb, void *buffer, u64 block, + int length) +{ + int pages = (length + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int i, res; + void **data = kcalloc(pages, sizeof(void *), GFP_KERNEL); + if (data == NULL) + return -ENOMEM; + + for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) + data[i] = buffer; + res = squashfs_read_data(sb, data, block, length | + SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length); + kfree(data); + return res; +} diff --git a/fs/squashfs/dir.c b/fs/squashfs/dir.c new file mode 100644 index 00000000000..566b0eaed86 --- /dev/null +++ b/fs/squashfs/dir.c @@ -0,0 +1,235 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * dir.c + */ + +/* + * This file implements code to read directories from disk. + * + * See namei.c for a description of directory organisation on disk. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/slab.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +static const unsigned char squashfs_filetype_table[] = { + DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_FIFO, DT_SOCK +}; + +/* + * Lookup offset (f_pos) in the directory index, returning the + * metadata block containing it. + * + * If we get an error reading the index then return the part of the index + * (if any) we have managed to read - the index isn't essential, just + * quicker. + */ +static int get_dir_index_using_offset(struct super_block *sb, + u64 *next_block, int *next_offset, u64 index_start, int index_offset, + int i_count, u64 f_pos) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int err, i, index, length = 0; + struct squashfs_dir_index dir_index; + + TRACE("Entered get_dir_index_using_offset, i_count %d, f_pos %lld\n", + i_count, f_pos); + + /* + * Translate from external f_pos to the internal f_pos. This + * is offset by 3 because we invent "." and ".." entries which are + * not actually stored in the directory. + */ + if (f_pos < 3) + return f_pos; + f_pos -= 3; + + for (i = 0; i < i_count; i++) { + err = squashfs_read_metadata(sb, &dir_index, &index_start, + &index_offset, sizeof(dir_index)); + if (err < 0) + break; + + index = le32_to_cpu(dir_index.index); + if (index > f_pos) + /* + * Found the index we're looking for. + */ + break; + + err = squashfs_read_metadata(sb, NULL, &index_start, + &index_offset, le32_to_cpu(dir_index.size) + 1); + if (err < 0) + break; + + length = index; + *next_block = le32_to_cpu(dir_index.start_block) + + msblk->directory_table; + } + + *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE; + + /* + * Translate back from internal f_pos to external f_pos. + */ + return length + 3; +} + + +static int squashfs_readdir(struct file *file, void *dirent, filldir_t filldir) +{ + struct inode *inode = file->f_dentry->d_inode; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + u64 block = squashfs_i(inode)->start + msblk->directory_table; + int offset = squashfs_i(inode)->offset, length = 0, dir_count, size, + type, err; + unsigned int inode_number; + struct squashfs_dir_header dirh; + struct squashfs_dir_entry *dire; + + TRACE("Entered squashfs_readdir [%llx:%x]\n", block, offset); + + dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL); + if (dire == NULL) { + ERROR("Failed to allocate squashfs_dir_entry\n"); + goto finish; + } + + /* + * Return "." and ".." entries as the first two filenames in the + * directory. To maximise compression these two entries are not + * stored in the directory, and so we invent them here. + * + * It also means that the external f_pos is offset by 3 from the + * on-disk directory f_pos. + */ + while (file->f_pos < 3) { + char *name; + int i_ino; + + if (file->f_pos == 0) { + name = "."; + size = 1; + i_ino = inode->i_ino; + } else { + name = ".."; + size = 2; + i_ino = squashfs_i(inode)->parent; + } + + TRACE("Calling filldir(%p, %s, %d, %lld, %d, %d)\n", + dirent, name, size, file->f_pos, i_ino, + squashfs_filetype_table[1]); + + if (filldir(dirent, name, size, file->f_pos, i_ino, + squashfs_filetype_table[1]) < 0) { + TRACE("Filldir returned less than 0\n"); + goto finish; + } + + file->f_pos += size; + } + + length = get_dir_index_using_offset(inode->i_sb, &block, &offset, + squashfs_i(inode)->dir_idx_start, + squashfs_i(inode)->dir_idx_offset, + squashfs_i(inode)->dir_idx_cnt, + file->f_pos); + + while (length < i_size_read(inode)) { + /* + * Read directory header + */ + err = squashfs_read_metadata(inode->i_sb, &dirh, &block, + &offset, sizeof(dirh)); + if (err < 0) + goto failed_read; + + length += sizeof(dirh); + + dir_count = le32_to_cpu(dirh.count) + 1; + while (dir_count--) { + /* + * Read directory entry. + */ + err = squashfs_read_metadata(inode->i_sb, dire, &block, + &offset, sizeof(*dire)); + if (err < 0) + goto failed_read; + + size = le16_to_cpu(dire->size) + 1; + + err = squashfs_read_metadata(inode->i_sb, dire->name, + &block, &offset, size); + if (err < 0) + goto failed_read; + + length += sizeof(*dire) + size; + + if (file->f_pos >= length) + continue; + + dire->name[size] = '\0'; + inode_number = le32_to_cpu(dirh.inode_number) + + ((short) le16_to_cpu(dire->inode_number)); + type = le16_to_cpu(dire->type); + + TRACE("Calling filldir(%p, %s, %d, %lld, %x:%x, %d, %d)" + "\n", dirent, dire->name, size, + file->f_pos, + le32_to_cpu(dirh.start_block), + le16_to_cpu(dire->offset), + inode_number, + squashfs_filetype_table[type]); + + if (filldir(dirent, dire->name, size, file->f_pos, + inode_number, + squashfs_filetype_table[type]) < 0) { + TRACE("Filldir returned less than 0\n"); + goto finish; + } + + file->f_pos = length; + } + } + +finish: + kfree(dire); + return 0; + +failed_read: + ERROR("Unable to read directory block [%llx:%x]\n", block, offset); + kfree(dire); + return 0; +} + + +const struct file_operations squashfs_dir_ops = { + .read = generic_read_dir, + .readdir = squashfs_readdir +}; diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c new file mode 100644 index 00000000000..69e971d5ddc --- /dev/null +++ b/fs/squashfs/export.c @@ -0,0 +1,155 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * export.c + */ + +/* + * This file implements code to make Squashfs filesystems exportable (NFS etc.) + * + * The export code uses an inode lookup table to map inode numbers passed in + * filehandles to an inode location on disk. This table is stored compressed + * into metadata blocks. A second index table is used to locate these. This + * second index table for speed of access (and because it is small) is read at + * mount time and cached in memory. + * + * The inode lookup table is used only by the export code, inode disk + * locations are directly encoded in directories, enabling direct access + * without an intermediate lookup for all operations except the export ops. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/dcache.h> +#include <linux/exportfs.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Look-up inode number (ino) in table, returning the inode location. + */ +static long long squashfs_inode_lookup(struct super_block *sb, int ino_num) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int blk = SQUASHFS_LOOKUP_BLOCK(ino_num - 1); + int offset = SQUASHFS_LOOKUP_BLOCK_OFFSET(ino_num - 1); + u64 start = le64_to_cpu(msblk->inode_lookup_table[blk]); + __le64 ino; + int err; + + TRACE("Entered squashfs_inode_lookup, inode_number = %d\n", ino_num); + + err = squashfs_read_metadata(sb, &ino, &start, &offset, sizeof(ino)); + if (err < 0) + return err; + + TRACE("squashfs_inode_lookup, inode = 0x%llx\n", + (u64) le64_to_cpu(ino)); + + return le64_to_cpu(ino); +} + + +static struct dentry *squashfs_export_iget(struct super_block *sb, + unsigned int ino_num) +{ + long long ino; + struct dentry *dentry = ERR_PTR(-ENOENT); + + TRACE("Entered squashfs_export_iget\n"); + + ino = squashfs_inode_lookup(sb, ino_num); + if (ino >= 0) + dentry = d_obtain_alias(squashfs_iget(sb, ino, ino_num)); + + return dentry; +} + + +static struct dentry *squashfs_fh_to_dentry(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if ((fh_type != FILEID_INO32_GEN && fh_type != FILEID_INO32_GEN_PARENT) + || fh_len < 2) + return NULL; + + return squashfs_export_iget(sb, fid->i32.ino); +} + + +static struct dentry *squashfs_fh_to_parent(struct super_block *sb, + struct fid *fid, int fh_len, int fh_type) +{ + if (fh_type != FILEID_INO32_GEN_PARENT || fh_len < 4) + return NULL; + + return squashfs_export_iget(sb, fid->i32.parent_ino); +} + + +static struct dentry *squashfs_get_parent(struct dentry *child) +{ + struct inode *inode = child->d_inode; + unsigned int parent_ino = squashfs_i(inode)->parent; + + return squashfs_export_iget(inode->i_sb, parent_ino); +} + + +/* + * Read uncompressed inode lookup table indexes off disk into memory + */ +__le64 *squashfs_read_inode_lookup_table(struct super_block *sb, + u64 lookup_table_start, unsigned int inodes) +{ + unsigned int length = SQUASHFS_LOOKUP_BLOCK_BYTES(inodes); + __le64 *inode_lookup_table; + int err; + + TRACE("In read_inode_lookup_table, length %d\n", length); + + /* Allocate inode lookup table indexes */ + inode_lookup_table = kmalloc(length, GFP_KERNEL); + if (inode_lookup_table == NULL) { + ERROR("Failed to allocate inode lookup table\n"); + return ERR_PTR(-ENOMEM); + } + + err = squashfs_read_table(sb, inode_lookup_table, lookup_table_start, + length); + if (err < 0) { + ERROR("unable to read inode lookup table\n"); + kfree(inode_lookup_table); + return ERR_PTR(err); + } + + return inode_lookup_table; +} + + +const struct export_operations squashfs_export_ops = { + .fh_to_dentry = squashfs_fh_to_dentry, + .fh_to_parent = squashfs_fh_to_parent, + .get_parent = squashfs_get_parent +}; diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c new file mode 100644 index 00000000000..717767d831d --- /dev/null +++ b/fs/squashfs/file.c @@ -0,0 +1,502 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * file.c + */ + +/* + * This file contains code for handling regular files. A regular file + * consists of a sequence of contiguous compressed blocks, and/or a + * compressed fragment block (tail-end packed block). The compressed size + * of each datablock is stored in a block list contained within the + * file inode (itself stored in one or more compressed metadata blocks). + * + * To speed up access to datablocks when reading 'large' files (256 Mbytes or + * larger), the code implements an index cache that caches the mapping from + * block index to datablock location on disk. + * + * The index cache allows Squashfs to handle large files (up to 1.75 TiB) while + * retaining a simple and space-efficient block list on disk. The cache + * is split into slots, caching up to eight 224 GiB files (128 KiB blocks). + * Larger files use multiple slots, with 1.75 TiB files using all 8 slots. + * The index cache is designed to be memory efficient, and by default uses + * 16 KiB. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/pagemap.h> +#include <linux/mutex.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Locate cache slot in range [offset, index] for specified inode. If + * there's more than one return the slot closest to index. + */ +static struct meta_index *locate_meta_index(struct inode *inode, int offset, + int index) +{ + struct meta_index *meta = NULL; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int i; + + mutex_lock(&msblk->meta_index_mutex); + + TRACE("locate_meta_index: index %d, offset %d\n", index, offset); + + if (msblk->meta_index == NULL) + goto not_allocated; + + for (i = 0; i < SQUASHFS_META_SLOTS; i++) { + if (msblk->meta_index[i].inode_number == inode->i_ino && + msblk->meta_index[i].offset >= offset && + msblk->meta_index[i].offset <= index && + msblk->meta_index[i].locked == 0) { + TRACE("locate_meta_index: entry %d, offset %d\n", i, + msblk->meta_index[i].offset); + meta = &msblk->meta_index[i]; + offset = meta->offset; + } + } + + if (meta) + meta->locked = 1; + +not_allocated: + mutex_unlock(&msblk->meta_index_mutex); + + return meta; +} + + +/* + * Find and initialise an empty cache slot for index offset. + */ +static struct meta_index *empty_meta_index(struct inode *inode, int offset, + int skip) +{ + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + struct meta_index *meta = NULL; + int i; + + mutex_lock(&msblk->meta_index_mutex); + + TRACE("empty_meta_index: offset %d, skip %d\n", offset, skip); + + if (msblk->meta_index == NULL) { + /* + * First time cache index has been used, allocate and + * initialise. The cache index could be allocated at + * mount time but doing it here means it is allocated only + * if a 'large' file is read. + */ + msblk->meta_index = kcalloc(SQUASHFS_META_SLOTS, + sizeof(*(msblk->meta_index)), GFP_KERNEL); + if (msblk->meta_index == NULL) { + ERROR("Failed to allocate meta_index\n"); + goto failed; + } + for (i = 0; i < SQUASHFS_META_SLOTS; i++) { + msblk->meta_index[i].inode_number = 0; + msblk->meta_index[i].locked = 0; + } + msblk->next_meta_index = 0; + } + + for (i = SQUASHFS_META_SLOTS; i && + msblk->meta_index[msblk->next_meta_index].locked; i--) + msblk->next_meta_index = (msblk->next_meta_index + 1) % + SQUASHFS_META_SLOTS; + + if (i == 0) { + TRACE("empty_meta_index: failed!\n"); + goto failed; + } + + TRACE("empty_meta_index: returned meta entry %d, %p\n", + msblk->next_meta_index, + &msblk->meta_index[msblk->next_meta_index]); + + meta = &msblk->meta_index[msblk->next_meta_index]; + msblk->next_meta_index = (msblk->next_meta_index + 1) % + SQUASHFS_META_SLOTS; + + meta->inode_number = inode->i_ino; + meta->offset = offset; + meta->skip = skip; + meta->entries = 0; + meta->locked = 1; + +failed: + mutex_unlock(&msblk->meta_index_mutex); + return meta; +} + + +static void release_meta_index(struct inode *inode, struct meta_index *meta) +{ + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + mutex_lock(&msblk->meta_index_mutex); + meta->locked = 0; + mutex_unlock(&msblk->meta_index_mutex); +} + + +/* + * Read the next n blocks from the block list, starting from + * metadata block <start_block, offset>. + */ +static long long read_indexes(struct super_block *sb, int n, + u64 *start_block, int *offset) +{ + int err, i; + long long block = 0; + __le32 *blist = kmalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + + if (blist == NULL) { + ERROR("read_indexes: Failed to allocate block_list\n"); + return -ENOMEM; + } + + while (n) { + int blocks = min_t(int, n, PAGE_CACHE_SIZE >> 2); + + err = squashfs_read_metadata(sb, blist, start_block, + offset, blocks << 2); + if (err < 0) { + ERROR("read_indexes: reading block [%llx:%x]\n", + *start_block, *offset); + goto failure; + } + + for (i = 0; i < blocks; i++) { + int size = le32_to_cpu(blist[i]); + block += SQUASHFS_COMPRESSED_SIZE_BLOCK(size); + } + n -= blocks; + } + + kfree(blist); + return block; + +failure: + kfree(blist); + return err; +} + + +/* + * Each cache index slot has SQUASHFS_META_ENTRIES, each of which + * can cache one index -> datablock/blocklist-block mapping. We wish + * to distribute these over the length of the file, entry[0] maps index x, + * entry[1] maps index x + skip, entry[2] maps index x + 2 * skip, and so on. + * The larger the file, the greater the skip factor. The skip factor is + * limited to the size of the metadata cache (SQUASHFS_CACHED_BLKS) to ensure + * the number of metadata blocks that need to be read fits into the cache. + * If the skip factor is limited in this way then the file will use multiple + * slots. + */ +static inline int calculate_skip(int blocks) +{ + int skip = blocks / ((SQUASHFS_META_ENTRIES + 1) + * SQUASHFS_META_INDEXES); + return min(SQUASHFS_CACHED_BLKS - 1, skip + 1); +} + + +/* + * Search and grow the index cache for the specified inode, returning the + * on-disk locations of the datablock and block list metadata block + * <index_block, index_offset> for index (scaled to nearest cache index). + */ +static int fill_meta_index(struct inode *inode, int index, + u64 *index_block, int *index_offset, u64 *data_block) +{ + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int skip = calculate_skip(i_size_read(inode) >> msblk->block_log); + int offset = 0; + struct meta_index *meta; + struct meta_entry *meta_entry; + u64 cur_index_block = squashfs_i(inode)->block_list_start; + int cur_offset = squashfs_i(inode)->offset; + u64 cur_data_block = squashfs_i(inode)->start; + int err, i; + + /* + * Scale index to cache index (cache slot entry) + */ + index /= SQUASHFS_META_INDEXES * skip; + + while (offset < index) { + meta = locate_meta_index(inode, offset + 1, index); + + if (meta == NULL) { + meta = empty_meta_index(inode, offset + 1, skip); + if (meta == NULL) + goto all_done; + } else { + offset = index < meta->offset + meta->entries ? index : + meta->offset + meta->entries - 1; + meta_entry = &meta->meta_entry[offset - meta->offset]; + cur_index_block = meta_entry->index_block + + msblk->inode_table; + cur_offset = meta_entry->offset; + cur_data_block = meta_entry->data_block; + TRACE("get_meta_index: offset %d, meta->offset %d, " + "meta->entries %d\n", offset, meta->offset, + meta->entries); + TRACE("get_meta_index: index_block 0x%llx, offset 0x%x" + " data_block 0x%llx\n", cur_index_block, + cur_offset, cur_data_block); + } + + /* + * If necessary grow cache slot by reading block list. Cache + * slot is extended up to index or to the end of the slot, in + * which case further slots will be used. + */ + for (i = meta->offset + meta->entries; i <= index && + i < meta->offset + SQUASHFS_META_ENTRIES; i++) { + int blocks = skip * SQUASHFS_META_INDEXES; + long long res = read_indexes(inode->i_sb, blocks, + &cur_index_block, &cur_offset); + + if (res < 0) { + if (meta->entries == 0) + /* + * Don't leave an empty slot on read + * error allocated to this inode... + */ + meta->inode_number = 0; + err = res; + goto failed; + } + + cur_data_block += res; + meta_entry = &meta->meta_entry[i - meta->offset]; + meta_entry->index_block = cur_index_block - + msblk->inode_table; + meta_entry->offset = cur_offset; + meta_entry->data_block = cur_data_block; + meta->entries++; + offset++; + } + + TRACE("get_meta_index: meta->offset %d, meta->entries %d\n", + meta->offset, meta->entries); + + release_meta_index(inode, meta); + } + +all_done: + *index_block = cur_index_block; + *index_offset = cur_offset; + *data_block = cur_data_block; + + /* + * Scale cache index (cache slot entry) to index + */ + return offset * SQUASHFS_META_INDEXES * skip; + +failed: + release_meta_index(inode, meta); + return err; +} + + +/* + * Get the on-disk location and compressed size of the datablock + * specified by index. Fill_meta_index() does most of the work. + */ +static int read_blocklist(struct inode *inode, int index, u64 *block) +{ + u64 start; + long long blks; + int offset; + __le32 size; + int res = fill_meta_index(inode, index, &start, &offset, block); + + TRACE("read_blocklist: res %d, index %d, start 0x%llx, offset" + " 0x%x, block 0x%llx\n", res, index, start, offset, + *block); + + if (res < 0) + return res; + + /* + * res contains the index of the mapping returned by fill_meta_index(), + * this will likely be less than the desired index (because the + * meta_index cache works at a higher granularity). Read any + * extra block indexes needed. + */ + if (res < index) { + blks = read_indexes(inode->i_sb, index - res, &start, &offset); + if (blks < 0) + return (int) blks; + *block += blks; + } + + /* + * Read length of block specified by index. + */ + res = squashfs_read_metadata(inode->i_sb, &size, &start, &offset, + sizeof(size)); + if (res < 0) + return res; + return le32_to_cpu(size); +} + + +static int squashfs_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; + int bytes, i, offset = 0, sparse = 0; + struct squashfs_cache_entry *buffer = NULL; + void *pageaddr; + + int mask = (1 << (msblk->block_log - PAGE_CACHE_SHIFT)) - 1; + int index = page->index >> (msblk->block_log - PAGE_CACHE_SHIFT); + int start_index = page->index & ~mask; + int end_index = start_index | mask; + int file_end = i_size_read(inode) >> msblk->block_log; + + TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n", + page->index, squashfs_i(inode)->start); + + if (page->index >= ((i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT)) + goto out; + + if (index < file_end || squashfs_i(inode)->fragment_block == + SQUASHFS_INVALID_BLK) { + /* + * Reading a datablock from disk. Need to read block list + * to get location and block size. + */ + u64 block = 0; + int bsize = read_blocklist(inode, index, &block); + if (bsize < 0) + goto error_out; + + if (bsize == 0) { /* hole */ + bytes = index == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + sparse = 1; + } else { + /* + * Read and decompress datablock. + */ + buffer = squashfs_get_datablock(inode->i_sb, + block, bsize); + if (buffer->error) { + ERROR("Unable to read page, block %llx, size %x" + "\n", block, bsize); + squashfs_cache_put(buffer); + goto error_out; + } + bytes = buffer->length; + } + } else { + /* + * Datablock is stored inside a fragment (tail-end packed + * block). + */ + buffer = squashfs_get_fragment(inode->i_sb, + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + + if (buffer->error) { + ERROR("Unable to read page, block %llx, size %x\n", + squashfs_i(inode)->fragment_block, + squashfs_i(inode)->fragment_size); + squashfs_cache_put(buffer); + goto error_out; + } + bytes = i_size_read(inode) & (msblk->block_size - 1); + offset = squashfs_i(inode)->fragment_offset; + } + + /* + * Loop copying datablock into pages. As the datablock likely covers + * many PAGE_CACHE_SIZE pages (default block size is 128 KiB) explicitly + * grab the pages from the page cache, except for the page that we've + * been called to fill. + */ + for (i = start_index; i <= end_index && bytes > 0; i++, + bytes -= PAGE_CACHE_SIZE, offset += PAGE_CACHE_SIZE) { + struct page *push_page; + int avail = sparse ? 0 : min_t(int, bytes, PAGE_CACHE_SIZE); + + TRACE("bytes %d, i %d, available_bytes %d\n", bytes, i, avail); + + push_page = (i == page->index) ? page : + grab_cache_page_nowait(page->mapping, i); + + if (!push_page) + continue; + + if (PageUptodate(push_page)) + goto skip_page; + + pageaddr = kmap_atomic(push_page, KM_USER0); + squashfs_copy_data(pageaddr, buffer, offset, avail); + memset(pageaddr + avail, 0, PAGE_CACHE_SIZE - avail); + kunmap_atomic(pageaddr, KM_USER0); + flush_dcache_page(push_page); + SetPageUptodate(push_page); +skip_page: + unlock_page(push_page); + if (i != page->index) + page_cache_release(push_page); + } + + if (!sparse) + squashfs_cache_put(buffer); + + return 0; + +error_out: + SetPageError(page); +out: + pageaddr = kmap_atomic(page, KM_USER0); + memset(pageaddr, 0, PAGE_CACHE_SIZE); + kunmap_atomic(pageaddr, KM_USER0); + flush_dcache_page(page); + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + + return 0; +} + + +const struct address_space_operations squashfs_aops = { + .readpage = squashfs_readpage +}; diff --git a/fs/squashfs/fragment.c b/fs/squashfs/fragment.c new file mode 100644 index 00000000000..b5a2c15bbbc --- /dev/null +++ b/fs/squashfs/fragment.c @@ -0,0 +1,98 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * fragment.c + */ + +/* + * This file implements code to handle compressed fragments (tail-end packed + * datablocks). + * + * Regular files contain a fragment index which is mapped to a fragment + * location on disk and compressed size using a fragment lookup table. + * Like everything in Squashfs this fragment lookup table is itself stored + * compressed into metadata blocks. A second index table is used to locate + * these. This second index table for speed of access (and because it + * is small) is read at mount time and cached in memory. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/slab.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Look-up fragment using the fragment index table. Return the on disk + * location of the fragment and its compressed size + */ +int squashfs_frag_lookup(struct super_block *sb, unsigned int fragment, + u64 *fragment_block) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int block = SQUASHFS_FRAGMENT_INDEX(fragment); + int offset = SQUASHFS_FRAGMENT_INDEX_OFFSET(fragment); + u64 start_block = le64_to_cpu(msblk->fragment_index[block]); + struct squashfs_fragment_entry fragment_entry; + int size; + + size = squashfs_read_metadata(sb, &fragment_entry, &start_block, + &offset, sizeof(fragment_entry)); + if (size < 0) + return size; + + *fragment_block = le64_to_cpu(fragment_entry.start_block); + size = le32_to_cpu(fragment_entry.size); + + return size; +} + + +/* + * Read the uncompressed fragment lookup table indexes off disk into memory + */ +__le64 *squashfs_read_fragment_index_table(struct super_block *sb, + u64 fragment_table_start, unsigned int fragments) +{ + unsigned int length = SQUASHFS_FRAGMENT_INDEX_BYTES(fragments); + __le64 *fragment_index; + int err; + + /* Allocate fragment lookup table indexes */ + fragment_index = kmalloc(length, GFP_KERNEL); + if (fragment_index == NULL) { + ERROR("Failed to allocate fragment index table\n"); + return ERR_PTR(-ENOMEM); + } + + err = squashfs_read_table(sb, fragment_index, fragment_table_start, + length); + if (err < 0) { + ERROR("unable to read fragment index table\n"); + kfree(fragment_index); + return ERR_PTR(err); + } + + return fragment_index; +} diff --git a/fs/squashfs/id.c b/fs/squashfs/id.c new file mode 100644 index 00000000000..3795b837ba2 --- /dev/null +++ b/fs/squashfs/id.c @@ -0,0 +1,94 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * id.c + */ + +/* + * This file implements code to handle uids and gids. + * + * For space efficiency regular files store uid and gid indexes, which are + * converted to 32-bit uids/gids using an id look up table. This table is + * stored compressed into metadata blocks. A second index table is used to + * locate these. This second index table for speed of access (and because it + * is small) is read at mount time and cached in memory. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/slab.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Map uid/gid index into real 32-bit uid/gid using the id look up table + */ +int squashfs_get_id(struct super_block *sb, unsigned int index, + unsigned int *id) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int block = SQUASHFS_ID_BLOCK(index); + int offset = SQUASHFS_ID_BLOCK_OFFSET(index); + u64 start_block = le64_to_cpu(msblk->id_table[block]); + __le32 disk_id; + int err; + + err = squashfs_read_metadata(sb, &disk_id, &start_block, &offset, + sizeof(disk_id)); + if (err < 0) + return err; + + *id = le32_to_cpu(disk_id); + return 0; +} + + +/* + * Read uncompressed id lookup table indexes from disk into memory + */ +__le64 *squashfs_read_id_index_table(struct super_block *sb, + u64 id_table_start, unsigned short no_ids) +{ + unsigned int length = SQUASHFS_ID_BLOCK_BYTES(no_ids); + __le64 *id_table; + int err; + + TRACE("In read_id_index_table, length %d\n", length); + + /* Allocate id lookup table indexes */ + id_table = kmalloc(length, GFP_KERNEL); + if (id_table == NULL) { + ERROR("Failed to allocate id index table\n"); + return ERR_PTR(-ENOMEM); + } + + err = squashfs_read_table(sb, id_table, id_table_start, length); + if (err < 0) { + ERROR("unable to read id index table\n"); + kfree(id_table); + return ERR_PTR(err); + } + + return id_table; +} diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c new file mode 100644 index 00000000000..7a63398bb85 --- /dev/null +++ b/fs/squashfs/inode.c @@ -0,0 +1,346 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * inode.c + */ + +/* + * This file implements code to create and read inodes from disk. + * + * Inodes in Squashfs are identified by a 48-bit inode which encodes the + * location of the compressed metadata block containing the inode, and the byte + * offset into that block where the inode is placed (<block, offset>). + * + * To maximise compression there are different inodes for each file type + * (regular file, directory, device, etc.), the inode contents and length + * varying with the type. + * + * To further maximise compression, two types of regular file inode and + * directory inode are defined: inodes optimised for frequently occurring + * regular files and directories, and extended types where extra + * information has to be stored. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Initialise VFS inode with the base inode information common to all + * Squashfs inode types. Sqsh_ino contains the unswapped base inode + * off disk. + */ +static int squashfs_new_inode(struct super_block *sb, struct inode *inode, + struct squashfs_base_inode *sqsh_ino) +{ + int err; + + err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->uid), &inode->i_uid); + if (err) + return err; + + err = squashfs_get_id(sb, le16_to_cpu(sqsh_ino->guid), &inode->i_gid); + if (err) + return err; + + inode->i_ino = le32_to_cpu(sqsh_ino->inode_number); + inode->i_mtime.tv_sec = le32_to_cpu(sqsh_ino->mtime); + inode->i_atime.tv_sec = inode->i_mtime.tv_sec; + inode->i_ctime.tv_sec = inode->i_mtime.tv_sec; + inode->i_mode = le16_to_cpu(sqsh_ino->mode); + inode->i_size = 0; + + return err; +} + + +struct inode *squashfs_iget(struct super_block *sb, long long ino, + unsigned int ino_number) +{ + struct inode *inode = iget_locked(sb, ino_number); + int err; + + TRACE("Entered squashfs_iget\n"); + + if (!inode) + return ERR_PTR(-ENOMEM); + if (!(inode->i_state & I_NEW)) + return inode; + + err = squashfs_read_inode(inode, ino); + if (err) { + iget_failed(inode); + return ERR_PTR(err); + } + + unlock_new_inode(inode); + return inode; +} + + +/* + * Initialise VFS inode by reading inode from inode table (compressed + * metadata). The format and amount of data read depends on type. + */ +int squashfs_read_inode(struct inode *inode, long long ino) +{ + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + u64 block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; + int err, type, offset = SQUASHFS_INODE_OFFSET(ino); + union squashfs_inode squashfs_ino; + struct squashfs_base_inode *sqshb_ino = &squashfs_ino.base; + + TRACE("Entered squashfs_read_inode\n"); + + /* + * Read inode base common to all inode types. + */ + err = squashfs_read_metadata(sb, sqshb_ino, &block, + &offset, sizeof(*sqshb_ino)); + if (err < 0) + goto failed_read; + + err = squashfs_new_inode(sb, inode, sqshb_ino); + if (err) + goto failed_read; + + block = SQUASHFS_INODE_BLK(ino) + msblk->inode_table; + offset = SQUASHFS_INODE_OFFSET(ino); + + type = le16_to_cpu(sqshb_ino->inode_type); + switch (type) { + case SQUASHFS_REG_TYPE: { + unsigned int frag_offset, frag_size, frag; + u64 frag_blk; + struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + frag = le32_to_cpu(sqsh_ino->fragment); + if (frag != SQUASHFS_INVALID_FRAG) { + frag_offset = le32_to_cpu(sqsh_ino->offset); + frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); + if (frag_size < 0) { + err = frag_size; + goto failed_read; + } + } else { + frag_blk = SQUASHFS_INVALID_BLK; + frag_size = 0; + frag_offset = 0; + } + + inode->i_nlink = 1; + inode->i_size = le32_to_cpu(sqsh_ino->file_size); + inode->i_fop = &generic_ro_fops; + inode->i_mode |= S_IFREG; + inode->i_blocks = ((inode->i_size - 1) >> 9) + 1; + squashfs_i(inode)->fragment_block = frag_blk; + squashfs_i(inode)->fragment_size = frag_size; + squashfs_i(inode)->fragment_offset = frag_offset; + squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->block_list_start = block; + squashfs_i(inode)->offset = offset; + inode->i_data.a_ops = &squashfs_aops; + + TRACE("File inode %x:%x, start_block %llx, block_list_start " + "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), + offset, squashfs_i(inode)->start, block, offset); + break; + } + case SQUASHFS_LREG_TYPE: { + unsigned int frag_offset, frag_size, frag; + u64 frag_blk; + struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + frag = le32_to_cpu(sqsh_ino->fragment); + if (frag != SQUASHFS_INVALID_FRAG) { + frag_offset = le32_to_cpu(sqsh_ino->offset); + frag_size = squashfs_frag_lookup(sb, frag, &frag_blk); + if (frag_size < 0) { + err = frag_size; + goto failed_read; + } + } else { + frag_blk = SQUASHFS_INVALID_BLK; + frag_size = 0; + frag_offset = 0; + } + + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + inode->i_size = le64_to_cpu(sqsh_ino->file_size); + inode->i_fop = &generic_ro_fops; + inode->i_mode |= S_IFREG; + inode->i_blocks = ((inode->i_size - + le64_to_cpu(sqsh_ino->sparse) - 1) >> 9) + 1; + + squashfs_i(inode)->fragment_block = frag_blk; + squashfs_i(inode)->fragment_size = frag_size; + squashfs_i(inode)->fragment_offset = frag_offset; + squashfs_i(inode)->start = le64_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->block_list_start = block; + squashfs_i(inode)->offset = offset; + inode->i_data.a_ops = &squashfs_aops; + + TRACE("File inode %x:%x, start_block %llx, block_list_start " + "%llx, offset %x\n", SQUASHFS_INODE_BLK(ino), + offset, squashfs_i(inode)->start, block, offset); + break; + } + case SQUASHFS_DIR_TYPE: { + struct squashfs_dir_inode *sqsh_ino = &squashfs_ino.dir; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + inode->i_size = le16_to_cpu(sqsh_ino->file_size); + inode->i_op = &squashfs_dir_inode_ops; + inode->i_fop = &squashfs_dir_ops; + inode->i_mode |= S_IFDIR; + squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); + squashfs_i(inode)->dir_idx_cnt = 0; + squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); + + TRACE("Directory inode %x:%x, start_block %llx, offset %x\n", + SQUASHFS_INODE_BLK(ino), offset, + squashfs_i(inode)->start, + le16_to_cpu(sqsh_ino->offset)); + break; + } + case SQUASHFS_LDIR_TYPE: { + struct squashfs_ldir_inode *sqsh_ino = &squashfs_ino.ldir; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + inode->i_size = le32_to_cpu(sqsh_ino->file_size); + inode->i_op = &squashfs_dir_inode_ops; + inode->i_fop = &squashfs_dir_ops; + inode->i_mode |= S_IFDIR; + squashfs_i(inode)->start = le32_to_cpu(sqsh_ino->start_block); + squashfs_i(inode)->offset = le16_to_cpu(sqsh_ino->offset); + squashfs_i(inode)->dir_idx_start = block; + squashfs_i(inode)->dir_idx_offset = offset; + squashfs_i(inode)->dir_idx_cnt = le16_to_cpu(sqsh_ino->i_count); + squashfs_i(inode)->parent = le32_to_cpu(sqsh_ino->parent_inode); + + TRACE("Long directory inode %x:%x, start_block %llx, offset " + "%x\n", SQUASHFS_INODE_BLK(ino), offset, + squashfs_i(inode)->start, + le16_to_cpu(sqsh_ino->offset)); + break; + } + case SQUASHFS_SYMLINK_TYPE: + case SQUASHFS_LSYMLINK_TYPE: { + struct squashfs_symlink_inode *sqsh_ino = &squashfs_ino.symlink; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + inode->i_size = le32_to_cpu(sqsh_ino->symlink_size); + inode->i_op = &page_symlink_inode_operations; + inode->i_data.a_ops = &squashfs_symlink_aops; + inode->i_mode |= S_IFLNK; + squashfs_i(inode)->start = block; + squashfs_i(inode)->offset = offset; + + TRACE("Symbolic link inode %x:%x, start_block %llx, offset " + "%x\n", SQUASHFS_INODE_BLK(ino), offset, + block, offset); + break; + } + case SQUASHFS_BLKDEV_TYPE: + case SQUASHFS_CHRDEV_TYPE: + case SQUASHFS_LBLKDEV_TYPE: + case SQUASHFS_LCHRDEV_TYPE: { + struct squashfs_dev_inode *sqsh_ino = &squashfs_ino.dev; + unsigned int rdev; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_CHRDEV_TYPE) + inode->i_mode |= S_IFCHR; + else + inode->i_mode |= S_IFBLK; + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + rdev = le32_to_cpu(sqsh_ino->rdev); + init_special_inode(inode, inode->i_mode, new_decode_dev(rdev)); + + TRACE("Device inode %x:%x, rdev %x\n", + SQUASHFS_INODE_BLK(ino), offset, rdev); + break; + } + case SQUASHFS_FIFO_TYPE: + case SQUASHFS_SOCKET_TYPE: + case SQUASHFS_LFIFO_TYPE: + case SQUASHFS_LSOCKET_TYPE: { + struct squashfs_ipc_inode *sqsh_ino = &squashfs_ino.ipc; + + err = squashfs_read_metadata(sb, sqsh_ino, &block, &offset, + sizeof(*sqsh_ino)); + if (err < 0) + goto failed_read; + + if (type == SQUASHFS_FIFO_TYPE) + inode->i_mode |= S_IFIFO; + else + inode->i_mode |= S_IFSOCK; + inode->i_nlink = le32_to_cpu(sqsh_ino->nlink); + init_special_inode(inode, inode->i_mode, 0); + break; + } + default: + ERROR("Unknown inode type %d in squashfs_iget!\n", type); + return -EINVAL; + } + + return 0; + +failed_read: + ERROR("Unable to read inode 0x%llx\n", ino); + return err; +} diff --git a/fs/squashfs/namei.c b/fs/squashfs/namei.c new file mode 100644 index 00000000000..9e398653b22 --- /dev/null +++ b/fs/squashfs/namei.c @@ -0,0 +1,242 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * namei.c + */ + +/* + * This file implements code to do filename lookup in directories. + * + * Like inodes, directories are packed into compressed metadata blocks, stored + * in a directory table. Directories are accessed using the start address of + * the metablock containing the directory and the offset into the + * decompressed block (<block, offset>). + * + * Directories are organised in a slightly complex way, and are not simply + * a list of file names. The organisation takes advantage of the + * fact that (in most cases) the inodes of the files will be in the same + * compressed metadata block, and therefore, can share the start block. + * Directories are therefore organised in a two level list, a directory + * header containing the shared start block value, and a sequence of directory + * entries, each of which share the shared start block. A new directory header + * is written once/if the inode start block changes. The directory + * header/directory entry list is repeated as many times as necessary. + * + * Directories are sorted, and can contain a directory index to speed up + * file lookup. Directory indexes store one entry per metablock, each entry + * storing the index/filename mapping to the first directory header + * in each metadata block. Directories are sorted in alphabetical order, + * and at lookup the index is scanned linearly looking for the first filename + * alphabetically larger than the filename being looked up. At this point the + * location of the metadata block the filename is in has been found. + * The general idea of the index is ensure only one metadata block needs to be + * decompressed to do a lookup irrespective of the length of the directory. + * This scheme has the advantage that it doesn't require extra memory overhead + * and doesn't require much extra storage on disk. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/dcache.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +/* + * Lookup name in the directory index, returning the location of the metadata + * block containing it, and the directory index this represents. + * + * If we get an error reading the index then return the part of the index + * (if any) we have managed to read - the index isn't essential, just + * quicker. + */ +static int get_dir_index_using_name(struct super_block *sb, + u64 *next_block, int *next_offset, u64 index_start, + int index_offset, int i_count, const char *name, + int len) +{ + struct squashfs_sb_info *msblk = sb->s_fs_info; + int i, size, length = 0, err; + struct squashfs_dir_index *index; + char *str; + + TRACE("Entered get_dir_index_using_name, i_count %d\n", i_count); + + index = kmalloc(sizeof(*index) + SQUASHFS_NAME_LEN * 2 + 2, GFP_KERNEL); + if (index == NULL) { + ERROR("Failed to allocate squashfs_dir_index\n"); + goto out; + } + + str = &index->name[SQUASHFS_NAME_LEN + 1]; + strncpy(str, name, len); + str[len] = '\0'; + + for (i = 0; i < i_count; i++) { + err = squashfs_read_metadata(sb, index, &index_start, + &index_offset, sizeof(*index)); + if (err < 0) + break; + + + size = le32_to_cpu(index->size) + 1; + + err = squashfs_read_metadata(sb, index->name, &index_start, + &index_offset, size); + if (err < 0) + break; + + index->name[size] = '\0'; + + if (strcmp(index->name, str) > 0) + break; + + length = le32_to_cpu(index->index); + *next_block = le32_to_cpu(index->start_block) + + msblk->directory_table; + } + + *next_offset = (length + *next_offset) % SQUASHFS_METADATA_SIZE; + kfree(index); + +out: + /* + * Return index (f_pos) of the looked up metadata block. Translate + * from internal f_pos to external f_pos which is offset by 3 because + * we invent "." and ".." entries which are not actually stored in the + * directory. + */ + return length + 3; +} + + +static struct dentry *squashfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + const unsigned char *name = dentry->d_name.name; + int len = dentry->d_name.len; + struct inode *inode = NULL; + struct squashfs_sb_info *msblk = dir->i_sb->s_fs_info; + struct squashfs_dir_header dirh; + struct squashfs_dir_entry *dire; + u64 block = squashfs_i(dir)->start + msblk->directory_table; + int offset = squashfs_i(dir)->offset; + int err, length = 0, dir_count, size; + + TRACE("Entered squashfs_lookup [%llx:%x]\n", block, offset); + + dire = kmalloc(sizeof(*dire) + SQUASHFS_NAME_LEN + 1, GFP_KERNEL); + if (dire == NULL) { + ERROR("Failed to allocate squashfs_dir_entry\n"); + return ERR_PTR(-ENOMEM); + } + + if (len > SQUASHFS_NAME_LEN) { + err = -ENAMETOOLONG; + goto failed; + } + + length = get_dir_index_using_name(dir->i_sb, &block, &offset, + squashfs_i(dir)->dir_idx_start, + squashfs_i(dir)->dir_idx_offset, + squashfs_i(dir)->dir_idx_cnt, name, len); + + while (length < i_size_read(dir)) { + /* + * Read directory header. + */ + err = squashfs_read_metadata(dir->i_sb, &dirh, &block, + &offset, sizeof(dirh)); + if (err < 0) + goto read_failure; + + length += sizeof(dirh); + + dir_count = le32_to_cpu(dirh.count) + 1; + while (dir_count--) { + /* + * Read directory entry. + */ + err = squashfs_read_metadata(dir->i_sb, dire, &block, + &offset, sizeof(*dire)); + if (err < 0) + goto read_failure; + + size = le16_to_cpu(dire->size) + 1; + + err = squashfs_read_metadata(dir->i_sb, dire->name, + &block, &offset, size); + if (err < 0) + goto read_failure; + + length += sizeof(*dire) + size; + + if (name[0] < dire->name[0]) + goto exit_lookup; + + if (len == size && !strncmp(name, dire->name, len)) { + unsigned int blk, off, ino_num; + long long ino; + blk = le32_to_cpu(dirh.start_block); + off = le16_to_cpu(dire->offset); + ino_num = le32_to_cpu(dirh.inode_number) + + (short) le16_to_cpu(dire->inode_number); + ino = SQUASHFS_MKINODE(blk, off); + + TRACE("calling squashfs_iget for directory " + "entry %s, inode %x:%x, %d\n", name, + blk, off, ino_num); + + inode = squashfs_iget(dir->i_sb, ino, ino_num); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto failed; + } + + goto exit_lookup; + } + } + } + +exit_lookup: + kfree(dire); + if (inode) + return d_splice_alias(inode, dentry); + d_add(dentry, inode); + return ERR_PTR(0); + +read_failure: + ERROR("Unable to read directory block [%llx:%x]\n", + squashfs_i(dir)->start + msblk->directory_table, + squashfs_i(dir)->offset); +failed: + kfree(dire); + return ERR_PTR(err); +} + + +const struct inode_operations squashfs_dir_inode_ops = { + .lookup = squashfs_lookup +}; diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h new file mode 100644 index 00000000000..6b2515d027d --- /dev/null +++ b/fs/squashfs/squashfs.h @@ -0,0 +1,90 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * squashfs.h + */ + +#define TRACE(s, args...) pr_debug("SQUASHFS: "s, ## args) + +#define ERROR(s, args...) pr_err("SQUASHFS error: "s, ## args) + +#define WARNING(s, args...) pr_warning("SQUASHFS: "s, ## args) + +static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) +{ + return list_entry(inode, struct squashfs_inode_info, vfs_inode); +} + +/* block.c */ +extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, + int); + +/* cache.c */ +extern struct squashfs_cache *squashfs_cache_init(char *, int, int); +extern void squashfs_cache_delete(struct squashfs_cache *); +extern struct squashfs_cache_entry *squashfs_cache_get(struct super_block *, + struct squashfs_cache *, u64, int); +extern void squashfs_cache_put(struct squashfs_cache_entry *); +extern int squashfs_copy_data(void *, struct squashfs_cache_entry *, int, int); +extern int squashfs_read_metadata(struct super_block *, void *, u64 *, + int *, int); +extern struct squashfs_cache_entry *squashfs_get_fragment(struct super_block *, + u64, int); +extern struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *, + u64, int); +extern int squashfs_read_table(struct super_block *, void *, u64, int); + +/* export.c */ +extern __le64 *squashfs_read_inode_lookup_table(struct super_block *, u64, + unsigned int); + +/* fragment.c */ +extern int squashfs_frag_lookup(struct super_block *, unsigned int, u64 *); +extern __le64 *squashfs_read_fragment_index_table(struct super_block *, + u64, unsigned int); + +/* id.c */ +extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *); +extern __le64 *squashfs_read_id_index_table(struct super_block *, u64, + unsigned short); + +/* inode.c */ +extern struct inode *squashfs_iget(struct super_block *, long long, + unsigned int); +extern int squashfs_read_inode(struct inode *, long long); + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations squashfs_dir_ops; + +/* export.c */ +extern const struct export_operations squashfs_export_ops; + +/* file.c */ +extern const struct address_space_operations squashfs_aops; + +/* namei.c */ +extern const struct inode_operations squashfs_dir_inode_ops; + +/* symlink.c */ +extern const struct address_space_operations squashfs_symlink_aops; diff --git a/fs/squashfs/squashfs_fs.h b/fs/squashfs/squashfs_fs.h new file mode 100644 index 00000000000..6840da1bf21 --- /dev/null +++ b/fs/squashfs/squashfs_fs.h @@ -0,0 +1,381 @@ +#ifndef SQUASHFS_FS +#define SQUASHFS_FS +/* + * Squashfs + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * squashfs_fs.h + */ + +#define SQUASHFS_CACHED_FRAGMENTS CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE +#define SQUASHFS_MAJOR 4 +#define SQUASHFS_MINOR 0 +#define SQUASHFS_MAGIC 0x73717368 +#define SQUASHFS_START 0 + +/* size of metadata (inode and directory) blocks */ +#define SQUASHFS_METADATA_SIZE 8192 +#define SQUASHFS_METADATA_LOG 13 + +/* default size of data blocks */ +#define SQUASHFS_FILE_SIZE 131072 +#define SQUASHFS_FILE_LOG 17 + +#define SQUASHFS_FILE_MAX_SIZE 1048576 +#define SQUASHFS_FILE_MAX_LOG 20 + +/* Max number of uids and gids */ +#define SQUASHFS_IDS 65536 + +/* Max length of filename (not 255) */ +#define SQUASHFS_NAME_LEN 256 + +#define SQUASHFS_INVALID_FRAG (0xffffffffU) +#define SQUASHFS_INVALID_BLK (-1LL) + +/* Filesystem flags */ +#define SQUASHFS_NOI 0 +#define SQUASHFS_NOD 1 +#define SQUASHFS_NOF 3 +#define SQUASHFS_NO_FRAG 4 +#define SQUASHFS_ALWAYS_FRAG 5 +#define SQUASHFS_DUPLICATE 6 +#define SQUASHFS_EXPORT 7 + +#define SQUASHFS_BIT(flag, bit) ((flag >> bit) & 1) + +#define SQUASHFS_UNCOMPRESSED_INODES(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NOI) + +#define SQUASHFS_UNCOMPRESSED_DATA(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NOD) + +#define SQUASHFS_UNCOMPRESSED_FRAGMENTS(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NOF) + +#define SQUASHFS_NO_FRAGMENTS(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_NO_FRAG) + +#define SQUASHFS_ALWAYS_FRAGMENTS(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_ALWAYS_FRAG) + +#define SQUASHFS_DUPLICATES(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_DUPLICATE) + +#define SQUASHFS_EXPORTABLE(flags) SQUASHFS_BIT(flags, \ + SQUASHFS_EXPORT) + +/* Max number of types and file types */ +#define SQUASHFS_DIR_TYPE 1 +#define SQUASHFS_REG_TYPE 2 +#define SQUASHFS_SYMLINK_TYPE 3 +#define SQUASHFS_BLKDEV_TYPE 4 +#define SQUASHFS_CHRDEV_TYPE 5 +#define SQUASHFS_FIFO_TYPE 6 +#define SQUASHFS_SOCKET_TYPE 7 +#define SQUASHFS_LDIR_TYPE 8 +#define SQUASHFS_LREG_TYPE 9 +#define SQUASHFS_LSYMLINK_TYPE 10 +#define SQUASHFS_LBLKDEV_TYPE 11 +#define SQUASHFS_LCHRDEV_TYPE 12 +#define SQUASHFS_LFIFO_TYPE 13 +#define SQUASHFS_LSOCKET_TYPE 14 + +/* Flag whether block is compressed or uncompressed, bit is set if block is + * uncompressed */ +#define SQUASHFS_COMPRESSED_BIT (1 << 15) + +#define SQUASHFS_COMPRESSED_SIZE(B) (((B) & ~SQUASHFS_COMPRESSED_BIT) ? \ + (B) & ~SQUASHFS_COMPRESSED_BIT : SQUASHFS_COMPRESSED_BIT) + +#define SQUASHFS_COMPRESSED(B) (!((B) & SQUASHFS_COMPRESSED_BIT)) + +#define SQUASHFS_COMPRESSED_BIT_BLOCK (1 << 24) + +#define SQUASHFS_COMPRESSED_SIZE_BLOCK(B) ((B) & \ + ~SQUASHFS_COMPRESSED_BIT_BLOCK) + +#define SQUASHFS_COMPRESSED_BLOCK(B) (!((B) & SQUASHFS_COMPRESSED_BIT_BLOCK)) + +/* + * Inode number ops. Inodes consist of a compressed block number, and an + * uncompressed offset within that block + */ +#define SQUASHFS_INODE_BLK(A) ((unsigned int) ((A) >> 16)) + +#define SQUASHFS_INODE_OFFSET(A) ((unsigned int) ((A) & 0xffff)) + +#define SQUASHFS_MKINODE(A, B) ((long long)(((long long) (A)\ + << 16) + (B))) + +/* Translate between VFS mode and squashfs mode */ +#define SQUASHFS_MODE(A) ((A) & 0xfff) + +/* fragment and fragment table defines */ +#define SQUASHFS_FRAGMENT_BYTES(A) \ + ((A) * sizeof(struct squashfs_fragment_entry)) + +#define SQUASHFS_FRAGMENT_INDEX(A) (SQUASHFS_FRAGMENT_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_FRAGMENT_INDEX_OFFSET(A) (SQUASHFS_FRAGMENT_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_FRAGMENT_INDEXES(A) ((SQUASHFS_FRAGMENT_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_FRAGMENT_INDEX_BYTES(A) (SQUASHFS_FRAGMENT_INDEXES(A) *\ + sizeof(u64)) + +/* inode lookup table defines */ +#define SQUASHFS_LOOKUP_BYTES(A) ((A) * sizeof(u64)) + +#define SQUASHFS_LOOKUP_BLOCK(A) (SQUASHFS_LOOKUP_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_LOOKUP_BLOCK_OFFSET(A) (SQUASHFS_LOOKUP_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_LOOKUP_BLOCKS(A) ((SQUASHFS_LOOKUP_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_LOOKUP_BLOCK_BYTES(A) (SQUASHFS_LOOKUP_BLOCKS(A) *\ + sizeof(u64)) + +/* uid/gid lookup table defines */ +#define SQUASHFS_ID_BYTES(A) ((A) * sizeof(unsigned int)) + +#define SQUASHFS_ID_BLOCK(A) (SQUASHFS_ID_BYTES(A) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_ID_BLOCK_OFFSET(A) (SQUASHFS_ID_BYTES(A) % \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_ID_BLOCKS(A) ((SQUASHFS_ID_BYTES(A) + \ + SQUASHFS_METADATA_SIZE - 1) / \ + SQUASHFS_METADATA_SIZE) + +#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ + sizeof(u64)) + +/* cached data constants for filesystem */ +#define SQUASHFS_CACHED_BLKS 8 + +#define SQUASHFS_MAX_FILE_SIZE_LOG 64 + +#define SQUASHFS_MAX_FILE_SIZE (1LL << \ + (SQUASHFS_MAX_FILE_SIZE_LOG - 2)) + +#define SQUASHFS_MARKER_BYTE 0xff + +/* meta index cache */ +#define SQUASHFS_META_INDEXES (SQUASHFS_METADATA_SIZE / sizeof(unsigned int)) +#define SQUASHFS_META_ENTRIES 127 +#define SQUASHFS_META_SLOTS 8 + +struct meta_entry { + u64 data_block; + unsigned int index_block; + unsigned short offset; + unsigned short pad; +}; + +struct meta_index { + unsigned int inode_number; + unsigned int offset; + unsigned short entries; + unsigned short skip; + unsigned short locked; + unsigned short pad; + struct meta_entry meta_entry[SQUASHFS_META_ENTRIES]; +}; + + +/* + * definitions for structures on disk + */ +#define ZLIB_COMPRESSION 1 + +struct squashfs_super_block { + __le32 s_magic; + __le32 inodes; + __le32 mkfs_time; + __le32 block_size; + __le32 fragments; + __le16 compression; + __le16 block_log; + __le16 flags; + __le16 no_ids; + __le16 s_major; + __le16 s_minor; + __le64 root_inode; + __le64 bytes_used; + __le64 id_table_start; + __le64 xattr_table_start; + __le64 inode_table_start; + __le64 directory_table_start; + __le64 fragment_table_start; + __le64 lookup_table_start; +}; + +struct squashfs_dir_index { + __le32 index; + __le32 start_block; + __le32 size; + unsigned char name[0]; +}; + +struct squashfs_base_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; +}; + +struct squashfs_ipc_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; +}; + +struct squashfs_dev_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 rdev; +}; + +struct squashfs_symlink_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 symlink_size; + char symlink[0]; +}; + +struct squashfs_reg_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 start_block; + __le32 fragment; + __le32 offset; + __le32 file_size; + __le16 block_list[0]; +}; + +struct squashfs_lreg_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le64 start_block; + __le64 file_size; + __le64 sparse; + __le32 nlink; + __le32 fragment; + __le32 offset; + __le32 xattr; + __le16 block_list[0]; +}; + +struct squashfs_dir_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 start_block; + __le32 nlink; + __le16 file_size; + __le16 offset; + __le32 parent_inode; +}; + +struct squashfs_ldir_inode { + __le16 inode_type; + __le16 mode; + __le16 uid; + __le16 guid; + __le32 mtime; + __le32 inode_number; + __le32 nlink; + __le32 file_size; + __le32 start_block; + __le32 parent_inode; + __le16 i_count; + __le16 offset; + __le32 xattr; + struct squashfs_dir_index index[0]; +}; + +union squashfs_inode { + struct squashfs_base_inode base; + struct squashfs_dev_inode dev; + struct squashfs_symlink_inode symlink; + struct squashfs_reg_inode reg; + struct squashfs_lreg_inode lreg; + struct squashfs_dir_inode dir; + struct squashfs_ldir_inode ldir; + struct squashfs_ipc_inode ipc; +}; + +struct squashfs_dir_entry { + __le16 offset; + __le16 inode_number; + __le16 type; + __le16 size; + char name[0]; +}; + +struct squashfs_dir_header { + __le32 count; + __le32 start_block; + __le32 inode_number; +}; + +struct squashfs_fragment_entry { + __le64 start_block; + __le32 size; + unsigned int unused; +}; + +#endif diff --git a/fs/squashfs/squashfs_fs_i.h b/fs/squashfs/squashfs_fs_i.h new file mode 100644 index 00000000000..fbfca30c0c6 --- /dev/null +++ b/fs/squashfs/squashfs_fs_i.h @@ -0,0 +1,45 @@ +#ifndef SQUASHFS_FS_I +#define SQUASHFS_FS_I +/* + * Squashfs + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * squashfs_fs_i.h + */ + +struct squashfs_inode_info { + u64 start; + int offset; + union { + struct { + u64 fragment_block; + int fragment_size; + int fragment_offset; + u64 block_list_start; + }; + struct { + u64 dir_idx_start; + int dir_idx_offset; + int dir_idx_cnt; + int parent; + }; + }; + struct inode vfs_inode; +}; +#endif diff --git a/fs/squashfs/squashfs_fs_sb.h b/fs/squashfs/squashfs_fs_sb.h new file mode 100644 index 00000000000..c8c65614dd1 --- /dev/null +++ b/fs/squashfs/squashfs_fs_sb.h @@ -0,0 +1,76 @@ +#ifndef SQUASHFS_FS_SB +#define SQUASHFS_FS_SB +/* + * Squashfs + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * squashfs_fs_sb.h + */ + +#include "squashfs_fs.h" + +struct squashfs_cache { + char *name; + int entries; + int next_blk; + int num_waiters; + int unused; + int block_size; + int pages; + spinlock_t lock; + wait_queue_head_t wait_queue; + struct squashfs_cache_entry *entry; +}; + +struct squashfs_cache_entry { + u64 block; + int length; + int refcount; + u64 next_index; + int pending; + int error; + int num_waiters; + wait_queue_head_t wait_queue; + struct squashfs_cache *cache; + void **data; +}; + +struct squashfs_sb_info { + int devblksize; + int devblksize_log2; + struct squashfs_cache *block_cache; + struct squashfs_cache *fragment_cache; + struct squashfs_cache *read_page; + int next_meta_index; + __le64 *id_table; + __le64 *fragment_index; + unsigned int *fragment_index_2; + struct mutex read_data_mutex; + struct mutex meta_index_mutex; + struct meta_index *meta_index; + z_stream stream; + __le64 *inode_lookup_table; + u64 inode_table; + u64 directory_table; + unsigned int block_size; + unsigned short block_log; + long long bytes_used; + unsigned int inodes; +}; +#endif diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c new file mode 100644 index 00000000000..a0466d7467b --- /dev/null +++ b/fs/squashfs/super.c @@ -0,0 +1,440 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * super.c + */ + +/* + * This file implements code to read the superblock, read and initialise + * in-memory structures at mount time, and all the VFS glue code to register + * the filesystem. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/slab.h> +#include <linux/mutex.h> +#include <linux/pagemap.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +static struct file_system_type squashfs_fs_type; +static struct super_operations squashfs_super_ops; + +static int supported_squashfs_filesystem(short major, short minor, short comp) +{ + if (major < SQUASHFS_MAJOR) { + ERROR("Major/Minor mismatch, older Squashfs %d.%d " + "filesystems are unsupported\n", major, minor); + return -EINVAL; + } else if (major > SQUASHFS_MAJOR || minor > SQUASHFS_MINOR) { + ERROR("Major/Minor mismatch, trying to mount newer " + "%d.%d filesystem\n", major, minor); + ERROR("Please update your kernel\n"); + return -EINVAL; + } + + if (comp != ZLIB_COMPRESSION) + return -EINVAL; + + return 0; +} + + +static int squashfs_fill_super(struct super_block *sb, void *data, int silent) +{ + struct squashfs_sb_info *msblk; + struct squashfs_super_block *sblk = NULL; + char b[BDEVNAME_SIZE]; + struct inode *root; + long long root_inode; + unsigned short flags; + unsigned int fragments; + u64 lookup_table_start; + int err; + + TRACE("Entered squashfs_fill_superblock\n"); + + sb->s_fs_info = kzalloc(sizeof(*msblk), GFP_KERNEL); + if (sb->s_fs_info == NULL) { + ERROR("Failed to allocate squashfs_sb_info\n"); + return -ENOMEM; + } + msblk = sb->s_fs_info; + + msblk->stream.workspace = kmalloc(zlib_inflate_workspacesize(), + GFP_KERNEL); + if (msblk->stream.workspace == NULL) { + ERROR("Failed to allocate zlib workspace\n"); + goto failure; + } + + sblk = kzalloc(sizeof(*sblk), GFP_KERNEL); + if (sblk == NULL) { + ERROR("Failed to allocate squashfs_super_block\n"); + goto failure; + } + + msblk->devblksize = sb_min_blocksize(sb, BLOCK_SIZE); + msblk->devblksize_log2 = ffz(~msblk->devblksize); + + mutex_init(&msblk->read_data_mutex); + mutex_init(&msblk->meta_index_mutex); + + /* + * msblk->bytes_used is checked in squashfs_read_table to ensure reads + * are not beyond filesystem end. But as we're using + * squashfs_read_table here to read the superblock (including the value + * of bytes_used) we need to set it to an initial sensible dummy value + */ + msblk->bytes_used = sizeof(*sblk); + err = squashfs_read_table(sb, sblk, SQUASHFS_START, sizeof(*sblk)); + + if (err < 0) { + ERROR("unable to read squashfs_super_block\n"); + goto failed_mount; + } + + /* Check it is a SQUASHFS superblock */ + sb->s_magic = le32_to_cpu(sblk->s_magic); + if (sb->s_magic != SQUASHFS_MAGIC) { + if (!silent) + ERROR("Can't find a SQUASHFS superblock on %s\n", + bdevname(sb->s_bdev, b)); + err = -EINVAL; + goto failed_mount; + } + + /* Check the MAJOR & MINOR versions and compression type */ + err = supported_squashfs_filesystem(le16_to_cpu(sblk->s_major), + le16_to_cpu(sblk->s_minor), + le16_to_cpu(sblk->compression)); + if (err < 0) + goto failed_mount; + + err = -EINVAL; + + /* + * Check if there's xattrs in the filesystem. These are not + * supported in this version, so warn that they will be ignored. + */ + if (le64_to_cpu(sblk->xattr_table_start) != SQUASHFS_INVALID_BLK) + ERROR("Xattrs in filesystem, these will be ignored\n"); + + /* Check the filesystem does not extend beyond the end of the + block device */ + msblk->bytes_used = le64_to_cpu(sblk->bytes_used); + if (msblk->bytes_used < 0 || msblk->bytes_used > + i_size_read(sb->s_bdev->bd_inode)) + goto failed_mount; + + /* Check block size for sanity */ + msblk->block_size = le32_to_cpu(sblk->block_size); + if (msblk->block_size > SQUASHFS_FILE_MAX_SIZE) + goto failed_mount; + + msblk->block_log = le16_to_cpu(sblk->block_log); + if (msblk->block_log > SQUASHFS_FILE_MAX_LOG) + goto failed_mount; + + /* Check the root inode for sanity */ + root_inode = le64_to_cpu(sblk->root_inode); + if (SQUASHFS_INODE_OFFSET(root_inode) > SQUASHFS_METADATA_SIZE) + goto failed_mount; + + msblk->inode_table = le64_to_cpu(sblk->inode_table_start); + msblk->directory_table = le64_to_cpu(sblk->directory_table_start); + msblk->inodes = le32_to_cpu(sblk->inodes); + flags = le16_to_cpu(sblk->flags); + + TRACE("Found valid superblock on %s\n", bdevname(sb->s_bdev, b)); + TRACE("Inodes are %scompressed\n", SQUASHFS_UNCOMPRESSED_INODES(flags) + ? "un" : ""); + TRACE("Data is %scompressed\n", SQUASHFS_UNCOMPRESSED_DATA(flags) + ? "un" : ""); + TRACE("Filesystem size %lld bytes\n", msblk->bytes_used); + TRACE("Block size %d\n", msblk->block_size); + TRACE("Number of inodes %d\n", msblk->inodes); + TRACE("Number of fragments %d\n", le32_to_cpu(sblk->fragments)); + TRACE("Number of ids %d\n", le16_to_cpu(sblk->no_ids)); + TRACE("sblk->inode_table_start %llx\n", msblk->inode_table); + TRACE("sblk->directory_table_start %llx\n", msblk->directory_table); + TRACE("sblk->fragment_table_start %llx\n", + (u64) le64_to_cpu(sblk->fragment_table_start)); + TRACE("sblk->id_table_start %llx\n", + (u64) le64_to_cpu(sblk->id_table_start)); + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_flags |= MS_RDONLY; + sb->s_op = &squashfs_super_ops; + + err = -ENOMEM; + + msblk->block_cache = squashfs_cache_init("metadata", + SQUASHFS_CACHED_BLKS, SQUASHFS_METADATA_SIZE); + if (msblk->block_cache == NULL) + goto failed_mount; + + /* Allocate read_page block */ + msblk->read_page = squashfs_cache_init("data", 1, msblk->block_size); + if (msblk->read_page == NULL) { + ERROR("Failed to allocate read_page block\n"); + goto failed_mount; + } + + /* Allocate and read id index table */ + msblk->id_table = squashfs_read_id_index_table(sb, + le64_to_cpu(sblk->id_table_start), le16_to_cpu(sblk->no_ids)); + if (IS_ERR(msblk->id_table)) { + err = PTR_ERR(msblk->id_table); + msblk->id_table = NULL; + goto failed_mount; + } + + fragments = le32_to_cpu(sblk->fragments); + if (fragments == 0) + goto allocate_lookup_table; + + msblk->fragment_cache = squashfs_cache_init("fragment", + SQUASHFS_CACHED_FRAGMENTS, msblk->block_size); + if (msblk->fragment_cache == NULL) { + err = -ENOMEM; + goto failed_mount; + } + + /* Allocate and read fragment index table */ + msblk->fragment_index = squashfs_read_fragment_index_table(sb, + le64_to_cpu(sblk->fragment_table_start), fragments); + if (IS_ERR(msblk->fragment_index)) { + err = PTR_ERR(msblk->fragment_index); + msblk->fragment_index = NULL; + goto failed_mount; + } + +allocate_lookup_table: + lookup_table_start = le64_to_cpu(sblk->lookup_table_start); + if (lookup_table_start == SQUASHFS_INVALID_BLK) + goto allocate_root; + + /* Allocate and read inode lookup table */ + msblk->inode_lookup_table = squashfs_read_inode_lookup_table(sb, + lookup_table_start, msblk->inodes); + if (IS_ERR(msblk->inode_lookup_table)) { + err = PTR_ERR(msblk->inode_lookup_table); + msblk->inode_lookup_table = NULL; + goto failed_mount; + } + + sb->s_export_op = &squashfs_export_ops; + +allocate_root: + root = new_inode(sb); + if (!root) { + err = -ENOMEM; + goto failed_mount; + } + + err = squashfs_read_inode(root, root_inode); + if (err) { + iget_failed(root); + goto failed_mount; + } + insert_inode_hash(root); + + sb->s_root = d_alloc_root(root); + if (sb->s_root == NULL) { + ERROR("Root inode create failed\n"); + err = -ENOMEM; + iput(root); + goto failed_mount; + } + + TRACE("Leaving squashfs_fill_super\n"); + kfree(sblk); + return 0; + +failed_mount: + squashfs_cache_delete(msblk->block_cache); + squashfs_cache_delete(msblk->fragment_cache); + squashfs_cache_delete(msblk->read_page); + kfree(msblk->inode_lookup_table); + kfree(msblk->fragment_index); + kfree(msblk->id_table); + kfree(msblk->stream.workspace); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; + kfree(sblk); + return err; + +failure: + kfree(msblk->stream.workspace); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; + return -ENOMEM; +} + + +static int squashfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct squashfs_sb_info *msblk = dentry->d_sb->s_fs_info; + + TRACE("Entered squashfs_statfs\n"); + + buf->f_type = SQUASHFS_MAGIC; + buf->f_bsize = msblk->block_size; + buf->f_blocks = ((msblk->bytes_used - 1) >> msblk->block_log) + 1; + buf->f_bfree = buf->f_bavail = 0; + buf->f_files = msblk->inodes; + buf->f_ffree = 0; + buf->f_namelen = SQUASHFS_NAME_LEN; + + return 0; +} + + +static int squashfs_remount(struct super_block *sb, int *flags, char *data) +{ + *flags |= MS_RDONLY; + return 0; +} + + +static void squashfs_put_super(struct super_block *sb) +{ + if (sb->s_fs_info) { + struct squashfs_sb_info *sbi = sb->s_fs_info; + squashfs_cache_delete(sbi->block_cache); + squashfs_cache_delete(sbi->fragment_cache); + squashfs_cache_delete(sbi->read_page); + kfree(sbi->id_table); + kfree(sbi->fragment_index); + kfree(sbi->meta_index); + kfree(sbi->stream.workspace); + kfree(sb->s_fs_info); + sb->s_fs_info = NULL; + } +} + + +static int squashfs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, + struct vfsmount *mnt) +{ + return get_sb_bdev(fs_type, flags, dev_name, data, squashfs_fill_super, + mnt); +} + + +static struct kmem_cache *squashfs_inode_cachep; + + +static void init_once(void *foo) +{ + struct squashfs_inode_info *ei = foo; + + inode_init_once(&ei->vfs_inode); +} + + +static int __init init_inodecache(void) +{ + squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache", + sizeof(struct squashfs_inode_info), 0, + SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once); + + return squashfs_inode_cachep ? 0 : -ENOMEM; +} + + +static void destroy_inodecache(void) +{ + kmem_cache_destroy(squashfs_inode_cachep); +} + + +static int __init init_squashfs_fs(void) +{ + int err = init_inodecache(); + + if (err) + return err; + + err = register_filesystem(&squashfs_fs_type); + if (err) { + destroy_inodecache(); + return err; + } + + printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) " + "Phillip Lougher\n"); + + return 0; +} + + +static void __exit exit_squashfs_fs(void) +{ + unregister_filesystem(&squashfs_fs_type); + destroy_inodecache(); +} + + +static struct inode *squashfs_alloc_inode(struct super_block *sb) +{ + struct squashfs_inode_info *ei = + kmem_cache_alloc(squashfs_inode_cachep, GFP_KERNEL); + + return ei ? &ei->vfs_inode : NULL; +} + + +static void squashfs_destroy_inode(struct inode *inode) +{ + kmem_cache_free(squashfs_inode_cachep, squashfs_i(inode)); +} + + +static struct file_system_type squashfs_fs_type = { + .owner = THIS_MODULE, + .name = "squashfs", + .get_sb = squashfs_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV +}; + +static struct super_operations squashfs_super_ops = { + .alloc_inode = squashfs_alloc_inode, + .destroy_inode = squashfs_destroy_inode, + .statfs = squashfs_statfs, + .put_super = squashfs_put_super, + .remount_fs = squashfs_remount +}; + +module_init(init_squashfs_fs); +module_exit(exit_squashfs_fs); +MODULE_DESCRIPTION("squashfs 4.0, a compressed read-only filesystem"); +MODULE_AUTHOR("Phillip Lougher <phillip@lougher.demon.co.uk>"); +MODULE_LICENSE("GPL"); diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c new file mode 100644 index 00000000000..83d87880aac --- /dev/null +++ b/fs/squashfs/symlink.c @@ -0,0 +1,118 @@ +/* + * Squashfs - a compressed read only filesystem for Linux + * + * Copyright (c) 2002, 2003, 2004, 2005, 2006, 2007, 2008 + * Phillip Lougher <phillip@lougher.demon.co.uk> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2, + * or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. + * + * symlink.c + */ + +/* + * This file implements code to handle symbolic links. + * + * The data contents of symbolic links are stored inside the symbolic + * link inode within the inode table. This allows the normally small symbolic + * link to be compressed as part of the inode table, achieving much greater + * compression than if the symbolic link was compressed individually. + */ + +#include <linux/fs.h> +#include <linux/vfs.h> +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/string.h> +#include <linux/pagemap.h> +#include <linux/zlib.h> + +#include "squashfs_fs.h" +#include "squashfs_fs_sb.h" +#include "squashfs_fs_i.h" +#include "squashfs.h" + +static int squashfs_symlink_readpage(struct file *file, struct page *page) +{ + struct inode *inode = page->mapping->host; + struct super_block *sb = inode->i_sb; + struct squashfs_sb_info *msblk = sb->s_fs_info; + int index = page->index << PAGE_CACHE_SHIFT; + u64 block = squashfs_i(inode)->start; + int offset = squashfs_i(inode)->offset; + int length = min_t(int, i_size_read(inode) - index, PAGE_CACHE_SIZE); + int bytes, copied; + void *pageaddr; + struct squashfs_cache_entry *entry; + + TRACE("Entered squashfs_symlink_readpage, page index %ld, start block " + "%llx, offset %x\n", page->index, block, offset); + + /* + * Skip index bytes into symlink metadata. + */ + if (index) { + bytes = squashfs_read_metadata(sb, NULL, &block, &offset, + index); + if (bytes < 0) { + ERROR("Unable to read symlink [%llx:%x]\n", + squashfs_i(inode)->start, + squashfs_i(inode)->offset); + goto error_out; + } + } + + /* + * Read length bytes from symlink metadata. Squashfs_read_metadata + * is not used here because it can sleep and we want to use + * kmap_atomic to map the page. Instead call the underlying + * squashfs_cache_get routine. As length bytes may overlap metadata + * blocks, we may need to call squashfs_cache_get multiple times. + */ + for (bytes = 0; bytes < length; offset = 0, bytes += copied) { + entry = squashfs_cache_get(sb, msblk->block_cache, block, 0); + if (entry->error) { + ERROR("Unable to read symlink [%llx:%x]\n", + squashfs_i(inode)->start, + squashfs_i(inode)->offset); + squashfs_cache_put(entry); + goto error_out; + } + + pageaddr = kmap_atomic(page, KM_USER0); + copied = squashfs_copy_data(pageaddr + bytes, entry, offset, + length - bytes); + if (copied == length - bytes) + memset(pageaddr + length, 0, PAGE_CACHE_SIZE - length); + else + block = entry->next_index; + kunmap_atomic(pageaddr, KM_USER0); + squashfs_cache_put(entry); + } + + flush_dcache_page(page); + SetPageUptodate(page); + unlock_page(page); + return 0; + +error_out: + SetPageError(page); + unlock_page(page); + return 0; +} + + +const struct address_space_operations squashfs_symlink_aops = { + .readpage = squashfs_symlink_readpage +}; diff --git a/fs/stat.c b/fs/stat.c index 7c46fbeb8b7..7e12a6f8279 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -305,7 +305,7 @@ asmlinkage long sys_readlinkat(int dfd, const char __user *pathname, struct inode *inode = path.dentry->d_inode; error = -EINVAL; - if (inode->i_op && inode->i_op->readlink) { + if (inode->i_op->readlink) { error = security_inode_readlink(path.dentry); if (!error) { touch_atime(path.mnt, path.dentry); diff --git a/fs/super.c b/fs/super.c index ddba069d7a9..ed080c41716 100644 --- a/fs/super.c +++ b/fs/super.c @@ -38,6 +38,7 @@ #include <linux/kobject.h> #include <linux/mutex.h> #include <linux/file.h> +#include <linux/async.h> #include <asm/uaccess.h> #include "internal.h" @@ -71,6 +72,7 @@ static struct super_block *alloc_super(struct file_system_type *type) INIT_HLIST_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); INIT_LIST_HEAD(&s->s_dentry_lru); + INIT_LIST_HEAD(&s->s_async_list); init_rwsem(&s->s_umount); mutex_init(&s->s_lock); lockdep_set_class(&s->s_umount, &type->s_umount_key); @@ -289,11 +291,18 @@ void generic_shutdown_super(struct super_block *sb) { const struct super_operations *sop = sb->s_op; + if (sb->s_root) { shrink_dcache_for_umount(sb); fsync_super(sb); lock_super(sb); sb->s_flags &= ~MS_ACTIVE; + + /* + * wait for asynchronous fs operations to finish before going further + */ + async_synchronize_full_special(&sb->s_async_list); + /* bad name - it should be evict_inodes() */ invalidate_inodes(sb); lock_kernel(); @@ -461,6 +470,7 @@ restart: sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); + async_synchronize_full_special(&sb->s_async_list); if (sb->s_root && (wait || sb->s_dirt)) sb->s_op->sync_fs(sb, wait); up_read(&sb->s_umount); @@ -800,6 +810,7 @@ int get_sb_bdev(struct file_system_type *fs_type, } s->s_flags |= MS_ACTIVE; + bdev->bd_super = s; } return simple_set_mnt(mnt, s); @@ -819,6 +830,7 @@ void kill_block_super(struct super_block *sb) struct block_device *bdev = sb->s_bdev; fmode_t mode = sb->s_mode; + bdev->bd_super = 0; generic_shutdown_super(sb); sync_blockdev(bdev); close_bdev_exclusive(bdev, mode); diff --git a/fs/sync.c b/fs/sync.c index 2967562d416..ac02b56548b 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -75,14 +75,39 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync) return ret; } -long do_fsync(struct file *file, int datasync) +/** + * vfs_fsync - perform a fsync or fdatasync on a file + * @file: file to sync + * @dentry: dentry of @file + * @data: only perform a fdatasync operation + * + * Write back data and metadata for @file to disk. If @datasync is + * set only metadata needed to access modified file data is written. + * + * In case this function is called from nfsd @file may be %NULL and + * only @dentry is set. This can only happen when the filesystem + * implements the export_operations API. + */ +int vfs_fsync(struct file *file, struct dentry *dentry, int datasync) { - int ret; - int err; - struct address_space *mapping = file->f_mapping; + const struct file_operations *fop; + struct address_space *mapping; + int err, ret; + + /* + * Get mapping and operations from the file in case we have + * as file, or get the default values for them in case we + * don't have a struct file available. Damn nfsd.. + */ + if (file) { + mapping = file->f_mapping; + fop = file->f_op; + } else { + mapping = dentry->d_inode->i_mapping; + fop = dentry->d_inode->i_fop; + } - if (!file->f_op || !file->f_op->fsync) { - /* Why? We can still call filemap_fdatawrite */ + if (!fop || !fop->fsync) { ret = -EINVAL; goto out; } @@ -94,7 +119,7 @@ long do_fsync(struct file *file, int datasync) * livelocks in fsync_buffers_list(). */ mutex_lock(&mapping->host->i_mutex); - err = file->f_op->fsync(file, file->f_path.dentry, datasync); + err = fop->fsync(file, dentry, datasync); if (!ret) ret = err; mutex_unlock(&mapping->host->i_mutex); @@ -104,15 +129,16 @@ long do_fsync(struct file *file, int datasync) out: return ret; } +EXPORT_SYMBOL(vfs_fsync); -static long __do_fsync(unsigned int fd, int datasync) +static int do_fsync(unsigned int fd, int datasync) { struct file *file; int ret = -EBADF; file = fget(fd); if (file) { - ret = do_fsync(file, datasync); + ret = vfs_fsync(file, file->f_path.dentry, datasync); fput(file); } return ret; @@ -120,12 +146,12 @@ static long __do_fsync(unsigned int fd, int datasync) asmlinkage long sys_fsync(unsigned int fd) { - return __do_fsync(fd, 0); + return do_fsync(fd, 0); } asmlinkage long sys_fdatasync(unsigned int fd) { - return __do_fsync(fd, 1); + return do_fsync(fd, 1); } /* @@ -269,7 +295,7 @@ int do_sync_mapping_range(struct address_space *mapping, loff_t offset, if (flags & SYNC_FILE_RANGE_WRITE) { ret = __filemap_fdatawrite_range(mapping, offset, endbyte, - WB_SYNC_NONE); + WB_SYNC_ALL); if (ret < 0) goto out; } diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index eb53c632f85..dfa3d94cfc7 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -107,8 +107,6 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) static inline void set_default_inode_attr(struct inode * inode, mode_t mode) { inode->i_mode = mode; - inode->i_uid = 0; - inode->i_gid = 0; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; } @@ -149,7 +147,6 @@ static void sysfs_init_inode(struct sysfs_dirent *sd, struct inode *inode) { struct bin_attribute *bin_attr; - inode->i_blocks = 0; inode->i_mapping->a_ops = &sysfs_aops; inode->i_mapping->backing_dev_info = &sysfs_backing_dev_info; inode->i_op = &sysfs_inode_operations; diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 91ceeda7e5b..e35b54d5059 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -40,7 +40,7 @@ config UBIFS_FS_ZLIB depends on UBIFS_FS default y help - Zlib copresses better then LZO but it is slower. Say 'Y' if unsure. + Zlib compresses better than LZO but it is slower. Say 'Y' if unsure. # Debugging-related stuff config UBIFS_FS_DEBUG diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 0e5e54d8292..175f9c590b7 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -142,7 +142,7 @@ static long long get_liability(struct ubifs_info *c) * * This function is called when an operation cannot be budgeted because there * is supposedly no free space. But in most cases there is some free space: - * o budgeting is pessimistic, so it always budgets more then it is actually + * o budgeting is pessimistic, so it always budgets more than it is actually * needed, so shrinking the liability is one way to make free space - the * cached data will take less space then it was budgeted for; * o GC may turn some dark space into free space (budgeting treats dark space @@ -606,7 +606,7 @@ void ubifs_release_budget(struct ubifs_info *c, struct ubifs_budget_req *req) * @c: UBIFS file-system description object * * This function converts budget which was allocated for a new page of data to - * the budget of changing an existing page of data. The latter is smaller then + * the budget of changing an existing page of data. The latter is smaller than * the former, so this function only does simple re-calculation and does not * involve any write-back. */ diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index fe82d2464d4..bf37374567f 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -219,7 +219,8 @@ static void release_existing_page_budget(struct ubifs_info *c) } static int write_begin_slow(struct address_space *mapping, - loff_t pos, unsigned len, struct page **pagep) + loff_t pos, unsigned len, struct page **pagep, + unsigned flags) { struct inode *inode = mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; @@ -247,7 +248,7 @@ static int write_begin_slow(struct address_space *mapping, if (unlikely(err)) return err; - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (unlikely(!page)) { ubifs_release_budget(c, &req); return -ENOMEM; @@ -438,7 +439,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, return -EROFS; /* Try out the fast-path part first */ - page = __grab_cache_page(mapping, index); + page = grab_cache_page_write_begin(mapping, index, flags); if (unlikely(!page)) return -ENOMEM; @@ -483,7 +484,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); - return write_begin_slow(mapping, pos, len, pagep); + return write_begin_slow(mapping, pos, len, pagep, flags); } /* diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index 0bef6501d58..9832f9abe28 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -45,7 +45,7 @@ #define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ /* - * GC may need to move more then one LEB to make progress. The below constants + * GC may need to move more than one LEB to make progress. The below constants * define "soft" and "hard" limits on the number of LEBs the garbage collector * may move. */ diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 10ae25b7d1d..9b7c54e0cd2 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -191,7 +191,7 @@ again: if (wbuf->lnum != -1 && avail >= len) { /* * Someone else has switched the journal head and we have - * enough space now. This happens when more then one process is + * enough space now. This happens when more than one process is * trying to write to the same journal head at the same time. */ dbg_jnl("return LEB %d back, already have LEB %d:%d", diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c index f248533841a..e7bab52a141 100644 --- a/fs/ubifs/shrinker.c +++ b/fs/ubifs/shrinker.c @@ -151,7 +151,7 @@ static int shrink_tnc(struct ubifs_info *c, int nr, int age, int *contention) * @contention: if any contention, this is set to %1 * * This function walks the list of mounted UBIFS file-systems and frees clean - * znodes which are older then @age, until at least @nr znodes are freed. + * znodes which are older than @age, until at least @nr znodes are freed. * Returns the number of freed znodes. */ static int shrink_tnc_trees(int nr, int age, int *contention) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 0d7564b95f8..89556ee7251 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -432,12 +432,19 @@ static int ubifs_sync_fs(struct super_block *sb, int wait) int i, err; struct ubifs_info *c = sb->s_fs_info; struct writeback_control wbc = { - .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_HOLD, + .sync_mode = wait ? WB_SYNC_ALL : WB_SYNC_NONE, .range_start = 0, .range_end = LLONG_MAX, .nr_to_write = LONG_MAX, }; + /* + * Note by akpm about WB_SYNC_NONE used above: zero @wait is just an + * advisory thing to help the file system shove lots of data into the + * queues. If some gets missed then it'll be picked up on the second + * '->sync_fs()' call, with non-zero @wait. + */ + if (sb->s_flags & MS_RDONLY) return 0; diff --git a/fs/xattr.c b/fs/xattr.c index 468377e6653..237804cd6b5 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -175,7 +175,7 @@ vfs_listxattr(struct dentry *d, char *list, size_t size) if (error) return error; error = -EOPNOTSUPP; - if (d->d_inode->i_op && d->d_inode->i_op->listxattr) { + if (d->d_inode->i_op->listxattr) { error = d->d_inode->i_op->listxattr(d, list, size); } else { error = security_inode_listsecurity(d->d_inode, list, size); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 36f6cc703ef..be846d606ae 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1348,7 +1348,7 @@ xfs_finish_flags( { int ronly = (mp->m_flags & XFS_MOUNT_RDONLY); - /* Fail a mount where the logbuf is smaller then the log stripe */ + /* Fail a mount where the logbuf is smaller than the log stripe */ if (xfs_sb_version_haslogv2(&mp->m_sb)) { if (mp->m_logbsize <= 0 && mp->m_sb.sb_logsunit > XLOG_BIG_RECORD_BSIZE) { |