diff options
Diffstat (limited to 'fs')
298 files changed, 15600 insertions, 7690 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 047c791427a..c061c3f18e7 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -55,7 +55,7 @@ enum { Opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_debug, "debug=%x"}, {Opt_dfltuid, "dfltuid=%u"}, {Opt_dfltgid, "dfltgid=%u"}, diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 88e3787c6ea..e298fe19409 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -119,6 +119,7 @@ int v9fs_dir_release(struct inode *inode, struct file *filp) const struct file_operations v9fs_dir_operations = { .read = generic_read_dir, + .llseek = generic_file_llseek, .readdir = v9fs_dir_readdir, .open = v9fs_file_open, .release = v9fs_dir_release, diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index c95295c6504..e83aa5ebe86 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -626,8 +626,7 @@ static struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, return NULL; error: - if (fid) - p9_client_clunk(fid); + p9_client_clunk(fid); return ERR_PTR(result); } diff --git a/fs/Kconfig b/fs/Kconfig index d3873583360..d0a1174fb51 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -136,37 +136,51 @@ config EXT3_FS_SECURITY If you are not using a security module that requires using extended attributes for file security labels, say N. -config EXT4DEV_FS - tristate "Ext4dev/ext4 extended fs support development (EXPERIMENTAL)" - depends on EXPERIMENTAL +config EXT4_FS + tristate "The Extended 4 (ext4) filesystem" select JBD2 select CRC16 help - Ext4dev is a predecessor filesystem of the next generation - extended fs ext4, based on ext3 filesystem code. It will be - renamed ext4 fs later, once ext4dev is mature and stabilized. + This is the next generation of the ext3 filesystem. Unlike the change from ext2 filesystem to ext3 filesystem, - the on-disk format of ext4dev is not the same as ext3 any more: - it is based on extent maps and it supports 48-bit physical block - numbers. These combined on-disk format changes will allow - ext4dev/ext4 to handle more than 16 TB filesystem volumes -- - a hard limit that ext3 cannot overcome without changing the - on-disk format. - - Other than extent maps and 48-bit block numbers, ext4dev also is - likely to have other new features such as persistent preallocation, - high resolution time stamps, and larger file support etc. These - features will be added to ext4dev gradually. + the on-disk format of ext4 is not forwards compatible with + ext3; it is based on extent maps and it supports 48-bit + physical block numbers. The ext4 filesystem also supports delayed + allocation, persistent preallocation, high resolution time stamps, + and a number of other features to improve performance and speed + up fsck time. For more information, please see the web pages at + http://ext4.wiki.kernel.org. + + The ext4 filesystem will support mounting an ext3 + filesystem; while there will be some performance gains from + the delayed allocation and inode table readahead, the best + performance gains will require enabling ext4 features in the + filesystem, or formating a new filesystem as an ext4 + filesystem initially. To compile this file system support as a module, choose M here. The - module will be called ext4dev. + module will be called ext4. If unsure, say N. -config EXT4DEV_FS_XATTR - bool "Ext4dev extended attributes" - depends on EXT4DEV_FS +config EXT4DEV_COMPAT + bool "Enable ext4dev compatibility" + depends on EXT4_FS + help + Starting with 2.6.28, the name of the ext4 filesystem was + renamed from ext4dev to ext4. Unfortunately there are some + legacy userspace programs (such as klibc's fstype) have + "ext4dev" hardcoded. + + To enable backwards compatibility so that systems that are + still expecting to mount ext4 filesystems using ext4dev, + chose Y here. This feature will go away by 2.6.31, so + please arrange to get your userspace programs fixed! + +config EXT4_FS_XATTR + bool "Ext4 extended attributes" + depends on EXT4_FS default y help Extended attributes are name:value pairs associated with inodes by @@ -175,11 +189,11 @@ config EXT4DEV_FS_XATTR If unsure, say N. - You need this for POSIX ACL support on ext4dev/ext4. + You need this for POSIX ACL support on ext4. -config EXT4DEV_FS_POSIX_ACL - bool "Ext4dev POSIX Access Control Lists" - depends on EXT4DEV_FS_XATTR +config EXT4_FS_POSIX_ACL + bool "Ext4 POSIX Access Control Lists" + depends on EXT4_FS_XATTR select FS_POSIX_ACL help POSIX Access Control Lists (ACLs) support permissions for users and @@ -190,14 +204,14 @@ config EXT4DEV_FS_POSIX_ACL If you don't know what Access Control Lists are, say N -config EXT4DEV_FS_SECURITY - bool "Ext4dev Security Labels" - depends on EXT4DEV_FS_XATTR +config EXT4_FS_SECURITY + bool "Ext4 Security Labels" + depends on EXT4_FS_XATTR help Security labels support alternative access control models implemented by security modules like SELinux. This option enables an extended attribute handler for file security - labels in the ext4dev/ext4 filesystem. + labels in the ext4 filesystem. If you are not using a security module that requires using extended attributes for file security labels, say N. @@ -206,17 +220,16 @@ config JBD tristate help This is a generic journalling layer for block devices. It is - currently used by the ext3 and OCFS2 file systems, but it could - also be used to add journal support to other file systems or block + currently used by the ext3 file system, but it could also be + used to add journal support to other file systems or block devices such as RAID or LVM. - If you are using the ext3 or OCFS2 file systems, you need to - say Y here. If you are not using ext3 OCFS2 then you will probably - want to say N. + If you are using the ext3 file system, you need to say Y here. + If you are not using ext3 then you will probably want to say N. To compile this device as a module, choose M here: the module will be - called jbd. If you are compiling ext3 or OCFS2 into the kernel, - you cannot compile this code as a module. + called jbd. If you are compiling ext3 into the kernel, you + cannot compile this code as a module. config JBD_DEBUG bool "JBD (ext3) debugging support" @@ -240,22 +253,23 @@ config JBD2 help This is a generic journaling layer for block devices that support both 32-bit and 64-bit block numbers. It is currently used by - the ext4dev/ext4 filesystem, but it could also be used to add + the ext4 and OCFS2 filesystems, but it could also be used to add journal support to other file systems or block devices such as RAID or LVM. - If you are using ext4dev/ext4, you need to say Y here. If you are not - using ext4dev/ext4 then you will probably want to say N. + If you are using ext4 or OCFS2, you need to say Y here. + If you are not using ext4 or OCFS2 then you will + probably want to say N. To compile this device as a module, choose M here. The module will be - called jbd2. If you are compiling ext4dev/ext4 into the kernel, + called jbd2. If you are compiling ext4 or OCFS2 into the kernel, you cannot compile this code as a module. config JBD2_DEBUG - bool "JBD2 (ext4dev/ext4) debugging support" + bool "JBD2 (ext4) debugging support" depends on JBD2 && DEBUG_FS help - If you are using the ext4dev/ext4 journaled file system (or + If you are using the ext4 journaled file system (or potentially any other filesystem/device using JBD2), this option allows you to enable debugging output while the system is running, in order to help track down any problems you are having. @@ -270,9 +284,9 @@ config JBD2_DEBUG config FS_MBCACHE # Meta block cache for Extended Attributes (ext2/ext3/ext4) tristate - depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4DEV_FS_XATTR - default y if EXT2_FS=y || EXT3_FS=y || EXT4DEV_FS=y - default m if EXT2_FS=m || EXT3_FS=m || EXT4DEV_FS=m + depends on EXT2_FS_XATTR || EXT3_FS_XATTR || EXT4_FS_XATTR + default y if EXT2_FS=y || EXT3_FS=y || EXT4_FS=y + default m if EXT2_FS=m || EXT3_FS=m || EXT4_FS=m config REISERFS_FS tristate "Reiserfs support" @@ -419,6 +433,14 @@ config FS_POSIX_ACL bool default n +config FILE_LOCKING + bool "Enable POSIX file locking API" if EMBEDDED + default y + help + This option enables standard file locking support, required + for filesystems like NFS and for the flock() system + call. Disabling this option saves about 11k. + source "fs/xfs/Kconfig" source "fs/gfs2/Kconfig" @@ -426,7 +448,7 @@ config OCFS2_FS tristate "OCFS2 file system support" depends on NET && SYSFS select CONFIGFS_FS - select JBD + select JBD2 select CRC32 help OCFS2 is a general purpose extent based shared disk cluster file @@ -497,6 +519,16 @@ config OCFS2_DEBUG_FS this option for debugging only as it is likely to decrease performance of the filesystem. +config OCFS2_COMPAT_JBD + bool "Use JBD for compatibility" + depends on OCFS2_FS + default n + select JBD + help + The ocfs2 filesystem now uses JBD2 for its journalling. JBD2 + is backwards compatible with JBD. It is safe to say N here. + However, if you really want to use the original JBD, say Y here. + endif # BLOCK config DNOTIFY @@ -1765,6 +1797,28 @@ config SUNRPC_XPRT_RDMA If unsure, say N. +config SUNRPC_REGISTER_V4 + bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)" + depends on SUNRPC && EXPERIMENTAL + default n + help + Sun added support for registering RPC services at an IPv6 + address by creating two new versions of the rpcbind protocol + (RFC 1833). + + This option enables support in the kernel RPC server for + registering kernel RPC services via version 4 of the rpcbind + protocol. If you enable this option, you must run a portmapper + daemon that supports rpcbind protocol version 4. + + Serving NFS over IPv6 from knfsd (the kernel's NFS server) + requires that you enable this option and use a portmapper that + supports rpcbind version 4. + + If unsure, say N to get traditional behavior (register kernel + RPC services using only rpcbind version 2). Distributions + using the legacy Linux portmapper daemon must say N here. + config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" depends on SUNRPC && EXPERIMENTAL @@ -1930,6 +1984,16 @@ config CIFS_WEAK_PW_HASH If unsure, say N. +config CIFS_UPCALL + bool "Kerberos/SPNEGO advanced session setup" + depends on CIFS && KEYS + help + Enables an upcall mechanism for CIFS which accesses + userspace helper utilities to provide SPNEGO packaged (RFC 4178) + Kerberos tickets which are needed to mount to certain secure servers + (for which more secure Kerberos authentication is required). If + unsure, say N. + config CIFS_XATTR bool "CIFS extended attributes" depends on CIFS @@ -1982,17 +2046,6 @@ config CIFS_EXPERIMENTAL (which is disabled by default). See the file fs/cifs/README for more details. If unsure, say N. -config CIFS_UPCALL - bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)" - depends on CIFS_EXPERIMENTAL - depends on KEYS - help - Enables an upcall mechanism for CIFS which accesses - userspace helper utilities to provide SPNEGO packaged (RFC 4178) - Kerberos tickets which are needed to mount to certain secure servers - (for which more secure Kerberos authentication is required). If - unsure, say N. - config CIFS_DFS_UPCALL bool "DFS feature support (EXPERIMENTAL)" depends on CIFS_EXPERIMENTAL diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 4a551af6f3f..801db134181 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -25,7 +25,7 @@ config BINFMT_ELF config COMPAT_BINFMT_ELF bool - depends on COMPAT && MMU + depends on COMPAT && BINFMT_ELF config BINFMT_ELF_FDPIC bool "Kernel support for FDPIC ELF binaries" @@ -59,10 +59,12 @@ config BINFMT_SHARED_FLAT help Support FLAT shared libraries +config HAVE_AOUT + def_bool n + config BINFMT_AOUT tristate "Kernel support for a.out and ECOFF binaries" - depends on ARCH_SUPPORTS_AOUT && \ - (X86_32 || ALPHA || ARM || M68K) + depends on HAVE_AOUT ---help--- A.out (Assembler.OUTput) is a set of formats for libraries and executables used in the earliest versions of UNIX. Linux used diff --git a/fs/Makefile b/fs/Makefile index a1482a5eff1..2168c902d5c 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -7,8 +7,8 @@ obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ - ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ - attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ + ioctl.o readdir.o select.o fifo.o dcache.o inode.o \ + attr.o bad_inode.o file.o filesystems.o namespace.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o drop_caches.o splice.o sync.o utimes.o \ stack.o @@ -27,6 +27,8 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o +obj-$(CONFIG_AIO) += aio.o +obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o nfsd-$(CONFIG_NFSD) := nfsctl.o @@ -69,7 +71,7 @@ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 -obj-$(CONFIG_EXT4DEV_FS) += ext4/ # Before ext2 so root fs can be ext4dev +obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4 obj-$(CONFIG_JBD) += jbd/ obj-$(CONFIG_JBD2) += jbd2/ obj-$(CONFIG_EXT2_FS) += ext2/ diff --git a/fs/adfs/dir.c b/fs/adfs/dir.c index fc1a8dc64d7..85a30e92980 100644 --- a/fs/adfs/dir.c +++ b/fs/adfs/dir.c @@ -197,6 +197,7 @@ out: const struct file_operations adfs_dir_operations = { .read = generic_read_dir, + .llseek = generic_file_llseek, .readdir = adfs_readdir, .fsync = file_fsync, }; diff --git a/fs/adfs/super.c b/fs/adfs/super.c index 26f3b43726b..7f83a46f2b7 100644 --- a/fs/adfs/super.c +++ b/fs/adfs/super.c @@ -157,7 +157,7 @@ static int adfs_show_options(struct seq_file *seq, struct vfsmount *mnt) enum {Opt_uid, Opt_gid, Opt_ownmask, Opt_othmask, Opt_err}; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_ownmask, "ownmask=%o"}, diff --git a/fs/affs/dir.c b/fs/affs/dir.c index 6e3f282424b..7b36904dbea 100644 --- a/fs/affs/dir.c +++ b/fs/affs/dir.c @@ -19,6 +19,7 @@ static int affs_readdir(struct file *, void *, filldir_t); const struct file_operations affs_dir_operations = { .read = generic_read_dir, + .llseek = generic_file_llseek, .readdir = affs_readdir, .fsync = file_fsync, }; diff --git a/fs/affs/super.c b/fs/affs/super.c index 3a89094f93d..8989c93193e 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -135,7 +135,7 @@ enum { Opt_verbose, Opt_volume, Opt_ignore, Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_bs, "bs=%u"}, {Opt_mode, "mode=%o"}, {Opt_mufs, "mufs"}, diff --git a/fs/afs/file.c b/fs/afs/file.c index 525f7c56e06..a3901769a96 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -50,8 +50,8 @@ const struct address_space_operations afs_fs_aops = { .launder_page = afs_launder_page, .releasepage = afs_releasepage, .invalidatepage = afs_invalidatepage, - .prepare_write = afs_prepare_write, - .commit_write = afs_commit_write, + .write_begin = afs_write_begin, + .write_end = afs_write_end, .writepage = afs_writepage, .writepages = afs_writepages, }; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 3cb6920ff30..67f259d99cd 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -728,8 +728,12 @@ extern int afs_volume_release_fileserver(struct afs_vnode *, */ extern int afs_set_page_dirty(struct page *); extern void afs_put_writeback(struct afs_writeback *); -extern int afs_prepare_write(struct file *, struct page *, unsigned, unsigned); -extern int afs_commit_write(struct file *, struct page *, unsigned, unsigned); +extern int afs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata); +extern int afs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata); extern int afs_writepage(struct page *, struct writeback_control *); extern int afs_writepages(struct address_space *, struct writeback_control *); extern int afs_write_inode(struct inode *, int); diff --git a/fs/afs/super.c b/fs/afs/super.c index 250d8c4d66e..aee239a048c 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -64,7 +64,7 @@ enum { afs_opt_vol, }; -static match_table_t afs_options_list = { +static const match_table_t afs_options_list = { { afs_opt_cell, "cell=%s" }, { afs_opt_rwpath, "rwpath" }, { afs_opt_vol, "vol=%s" }, diff --git a/fs/afs/write.c b/fs/afs/write.c index 065b4e10681..d6b85dab35f 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -84,15 +84,23 @@ void afs_put_writeback(struct afs_writeback *wb) * partly or wholly fill a page that's under preparation for writing */ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, - unsigned start, unsigned len, struct page *page) + loff_t pos, unsigned len, struct page *page) { + loff_t i_size; + unsigned eof; int ret; - _enter(",,%u,%u", start, len); + _enter(",,%llu,%u", (unsigned long long)pos, len); - ASSERTCMP(start + len, <=, PAGE_SIZE); + ASSERTCMP(len, <=, PAGE_CACHE_SIZE); - ret = afs_vnode_fetch_data(vnode, key, start, len, page); + i_size = i_size_read(&vnode->vfs_inode); + if (pos + len > i_size) + eof = i_size; + else + eof = PAGE_CACHE_SIZE; + + ret = afs_vnode_fetch_data(vnode, key, 0, eof, page); if (ret < 0) { if (ret == -ENOENT) { _debug("got NOENT from server" @@ -107,109 +115,55 @@ static int afs_fill_page(struct afs_vnode *vnode, struct key *key, } /* - * prepare a page for being written to - */ -static int afs_prepare_page(struct afs_vnode *vnode, struct page *page, - struct key *key, unsigned offset, unsigned to) -{ - unsigned eof, tail, start, stop, len; - loff_t i_size, pos; - void *p; - int ret; - - _enter(""); - - if (offset == 0 && to == PAGE_SIZE) - return 0; - - p = kmap_atomic(page, KM_USER0); - - i_size = i_size_read(&vnode->vfs_inode); - pos = (loff_t) page->index << PAGE_SHIFT; - if (pos >= i_size) { - /* partial write, page beyond EOF */ - _debug("beyond"); - if (offset > 0) - memset(p, 0, offset); - if (to < PAGE_SIZE) - memset(p + to, 0, PAGE_SIZE - to); - kunmap_atomic(p, KM_USER0); - return 0; - } - - if (i_size - pos >= PAGE_SIZE) { - /* partial write, page entirely before EOF */ - _debug("before"); - tail = eof = PAGE_SIZE; - } else { - /* partial write, page overlaps EOF */ - eof = i_size - pos; - _debug("overlap %u", eof); - tail = max(eof, to); - if (tail < PAGE_SIZE) - memset(p + tail, 0, PAGE_SIZE - tail); - if (offset > eof) - memset(p + eof, 0, PAGE_SIZE - eof); - } - - kunmap_atomic(p, KM_USER0); - - ret = 0; - if (offset > 0 || eof > to) { - /* need to fill one or two bits that aren't going to be written - * (cover both fillers in one read if there are two) */ - start = (offset > 0) ? 0 : to; - stop = (eof > to) ? eof : offset; - len = stop - start; - _debug("wr=%u-%u av=0-%u rd=%u@%u", - offset, to, eof, start, len); - ret = afs_fill_page(vnode, key, start, len, page); - } - - _leave(" = %d", ret); - return ret; -} - -/* * prepare to perform part of a write to a page - * - the caller holds the page locked, preventing it from being written out or - * modified by anyone else */ -int afs_prepare_write(struct file *file, struct page *page, - unsigned offset, unsigned to) +int afs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { struct afs_writeback *candidate, *wb; struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); + struct page *page; struct key *key = file->private_data; - pgoff_t index; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + len; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; int ret; _enter("{%x:%u},{%lx},%u,%u", - vnode->fid.vid, vnode->fid.vnode, page->index, offset, to); + vnode->fid.vid, vnode->fid.vnode, index, from, to); candidate = kzalloc(sizeof(*candidate), GFP_KERNEL); if (!candidate) return -ENOMEM; candidate->vnode = vnode; - candidate->first = candidate->last = page->index; - candidate->offset_first = offset; + candidate->first = candidate->last = index; + candidate->offset_first = from; candidate->to_last = to; candidate->usage = 1; candidate->state = AFS_WBACK_PENDING; init_waitqueue_head(&candidate->waitq); + page = __grab_cache_page(mapping, index); + if (!page) { + kfree(candidate); + return -ENOMEM; + } + *pagep = page; + /* page won't leak in error case: it eventually gets cleaned off LRU */ + if (!PageUptodate(page)) { _debug("not up to date"); - ret = afs_prepare_page(vnode, page, key, offset, to); + ret = afs_fill_page(vnode, key, pos, len, page); if (ret < 0) { kfree(candidate); _leave(" = %d [prep]", ret); return ret; } + SetPageUptodate(page); } try_again: - index = page->index; spin_lock(&vnode->writeback_lock); /* see if this page is already pending a writeback under a suitable key @@ -242,8 +196,8 @@ try_again: subsume_in_current_wb: _debug("subsume"); ASSERTRANGE(wb->first, <=, index, <=, wb->last); - if (index == wb->first && offset < wb->offset_first) - wb->offset_first = offset; + if (index == wb->first && from < wb->offset_first) + wb->offset_first = from; if (index == wb->last && to > wb->to_last) wb->to_last = to; spin_unlock(&vnode->writeback_lock); @@ -289,17 +243,17 @@ flush_conflicting_wb: /* * finalise part of a write to a page */ -int afs_commit_write(struct file *file, struct page *page, - unsigned offset, unsigned to) +int afs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { struct afs_vnode *vnode = AFS_FS_I(file->f_dentry->d_inode); loff_t i_size, maybe_i_size; - _enter("{%x:%u},{%lx},%u,%u", - vnode->fid.vid, vnode->fid.vnode, page->index, offset, to); + _enter("{%x:%u},{%lx}", + vnode->fid.vid, vnode->fid.vnode, page->index); - maybe_i_size = (loff_t) page->index << PAGE_SHIFT; - maybe_i_size += to; + maybe_i_size = pos + copied; i_size = i_size_read(&vnode->vfs_inode); if (maybe_i_size > i_size) { @@ -310,12 +264,13 @@ int afs_commit_write(struct file *file, struct page *page, spin_unlock(&vnode->writeback_lock); } - SetPageUptodate(page); set_page_dirty(page); if (PageDirty(page)) _debug("dirtied"); + unlock_page(page); + page_cache_release(page); - return 0; + return copied; } /* diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index dda510d31f8..b70eea1e8c5 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -59,7 +59,7 @@ static const struct super_operations autofs_sops = { enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto}; -static match_table_t autofs_tokens = { +static const match_table_t autofs_tokens = { {Opt_fd, "fd=%u"}, {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, diff --git a/fs/autofs4/Makefile b/fs/autofs4/Makefile index f2c3b79e94d..a811c1f7d9a 100644 --- a/fs/autofs4/Makefile +++ b/fs/autofs4/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_AUTOFS4_FS) += autofs4.o -autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o +autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 69a2f5c9231..e0f16da00e5 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -14,6 +14,7 @@ /* Internal header file for autofs */ #include <linux/auto_fs4.h> +#include <linux/auto_dev-ioctl.h> #include <linux/mutex.h> #include <linux/list.h> @@ -21,6 +22,11 @@ #define AUTOFS_IOC_FIRST AUTOFS_IOC_READY #define AUTOFS_IOC_COUNT 32 +#define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION) +#define AUTOFS_DEV_IOCTL_IOC_COUNT (AUTOFS_IOC_COUNT - 11) + +#define AUTOFS_TYPE_TRIGGER (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET) + #include <linux/kernel.h> #include <linux/slab.h> #include <linux/time.h> @@ -35,11 +41,27 @@ /* #define DEBUG */ #ifdef DEBUG -#define DPRINTK(fmt,args...) do { printk(KERN_DEBUG "pid %d: %s: " fmt "\n" , current->pid , __func__ , ##args); } while(0) +#define DPRINTK(fmt, args...) \ +do { \ + printk(KERN_DEBUG "pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##args); \ +} while (0) #else -#define DPRINTK(fmt,args...) do {} while(0) +#define DPRINTK(fmt, args...) do {} while (0) #endif +#define AUTOFS_WARN(fmt, args...) \ +do { \ + printk(KERN_WARNING "pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##args); \ +} while (0) + +#define AUTOFS_ERROR(fmt, args...) \ +do { \ + printk(KERN_ERR "pid %d: %s: " fmt "\n", \ + current->pid, __func__, ##args); \ +} while (0) + /* Unified info structure. This is pointed to by both the dentry and inode structures. Each file in the filesystem has an instance of this structure. It holds a reference to the dentry, so dentries are never @@ -61,6 +83,9 @@ struct autofs_info { unsigned long last_used; atomic_t count; + uid_t uid; + gid_t gid; + mode_t mode; size_t size; @@ -92,10 +117,6 @@ struct autofs_wait_queue { #define AUTOFS_SBI_MAGIC 0x6d4a556d -#define AUTOFS_TYPE_INDIRECT 0x0001 -#define AUTOFS_TYPE_DIRECT 0x0002 -#define AUTOFS_TYPE_OFFSET 0x0004 - struct autofs_sb_info { u32 magic; int pipefd; @@ -169,6 +190,17 @@ int autofs4_expire_run(struct super_block *, struct vfsmount *, struct autofs_packet_expire __user *); int autofs4_expire_multi(struct super_block *, struct vfsmount *, struct autofs_sb_info *, int __user *); +struct dentry *autofs4_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); +struct dentry *autofs4_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); + +/* Device node initialization */ + +int autofs_dev_ioctl_init(void); +void autofs_dev_ioctl_exit(void); /* Operations structures */ diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c new file mode 100644 index 00000000000..625abf5422e --- /dev/null +++ b/fs/autofs4/dev-ioctl.c @@ -0,0 +1,863 @@ +/* + * Copyright 2008 Red Hat, Inc. All rights reserved. + * Copyright 2008 Ian Kent <raven@themaw.net> + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + */ + +#include <linux/module.h> +#include <linux/vmalloc.h> +#include <linux/miscdevice.h> +#include <linux/init.h> +#include <linux/wait.h> +#include <linux/namei.h> +#include <linux/fcntl.h> +#include <linux/file.h> +#include <linux/fdtable.h> +#include <linux/sched.h> +#include <linux/compat.h> +#include <linux/syscalls.h> +#include <linux/smp_lock.h> +#include <linux/magic.h> +#include <linux/dcache.h> +#include <linux/uaccess.h> + +#include "autofs_i.h" + +/* + * This module implements an interface for routing autofs ioctl control + * commands via a miscellaneous device file. + * + * The alternate interface is needed because we need to be able open + * an ioctl file descriptor on an autofs mount that may be covered by + * another mount. This situation arises when starting automount(8) + * or other user space daemon which uses direct mounts or offset + * mounts (used for autofs lazy mount/umount of nested mount trees), + * which have been left busy at at service shutdown. + */ + +#define AUTOFS_DEV_IOCTL_SIZE sizeof(struct autofs_dev_ioctl) + +typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *, + struct autofs_dev_ioctl *); + +static int check_name(const char *name) +{ + if (!strchr(name, '/')) + return -EINVAL; + return 0; +} + +/* + * Check a string doesn't overrun the chunk of + * memory we copied from user land. + */ +static int invalid_str(char *str, void *end) +{ + while ((void *) str <= end) + if (!*str++) + return 0; + return -EINVAL; +} + +/* + * Check that the user compiled against correct version of autofs + * misc device code. + * + * As well as checking the version compatibility this always copies + * the kernel interface version out. + */ +static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) +{ + int err = 0; + + if ((AUTOFS_DEV_IOCTL_VERSION_MAJOR != param->ver_major) || + (AUTOFS_DEV_IOCTL_VERSION_MINOR < param->ver_minor)) { + AUTOFS_WARN("ioctl control interface version mismatch: " + "kernel(%u.%u), user(%u.%u), cmd(%d)", + AUTOFS_DEV_IOCTL_VERSION_MAJOR, + AUTOFS_DEV_IOCTL_VERSION_MINOR, + param->ver_major, param->ver_minor, cmd); + err = -EINVAL; + } + + /* Fill in the kernel version. */ + param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; + param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; + + return err; +} + +/* + * Copy parameter control struct, including a possible path allocated + * at the end of the struct. + */ +static struct autofs_dev_ioctl *copy_dev_ioctl(struct autofs_dev_ioctl __user *in) +{ + struct autofs_dev_ioctl tmp, *ads; + + if (copy_from_user(&tmp, in, sizeof(tmp))) + return ERR_PTR(-EFAULT); + + if (tmp.size < sizeof(tmp)) + return ERR_PTR(-EINVAL); + + ads = kmalloc(tmp.size, GFP_KERNEL); + if (!ads) + return ERR_PTR(-ENOMEM); + + if (copy_from_user(ads, in, tmp.size)) { + kfree(ads); + return ERR_PTR(-EFAULT); + } + + return ads; +} + +static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) +{ + kfree(param); + return; +} + +/* + * Check sanity of parameter control fields and if a path is present + * check that it has a "/" and is terminated. + */ +static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) +{ + int err = -EINVAL; + + if (check_dev_ioctl_version(cmd, param)) { + AUTOFS_WARN("invalid device control module version " + "supplied for cmd(0x%08x)", cmd); + goto out; + } + + if (param->size > sizeof(*param)) { + err = check_name(param->path); + if (err) { + AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", + cmd); + goto out; + } + + err = invalid_str(param->path, + (void *) ((size_t) param + param->size)); + if (err) { + AUTOFS_WARN("invalid path supplied for cmd(0x%08x)", + cmd); + goto out; + } + } + + err = 0; +out: + return err; +} + +/* + * Get the autofs super block info struct from the file opened on + * the autofs mount point. + */ +static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f) +{ + struct autofs_sb_info *sbi = NULL; + struct inode *inode; + + if (f) { + inode = f->f_path.dentry->d_inode; + sbi = autofs4_sbi(inode->i_sb); + } + return sbi; +} + +/* Return autofs module protocol version */ +static int autofs_dev_ioctl_protover(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->arg1 = sbi->version; + return 0; +} + +/* Return autofs module protocol sub version */ +static int autofs_dev_ioctl_protosubver(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->arg1 = sbi->sub_version; + return 0; +} + +/* + * Walk down the mount stack looking for an autofs mount that + * has the requested device number (aka. new_encode_dev(sb->s_dev). + */ +static int autofs_dev_ioctl_find_super(struct nameidata *nd, dev_t devno) +{ + struct dentry *dentry; + struct inode *inode; + struct super_block *sb; + dev_t s_dev; + unsigned int err; + + err = -ENOENT; + + /* Lookup the dentry name at the base of our mount point */ + dentry = d_lookup(nd->path.dentry, &nd->last); + if (!dentry) + goto out; + + dput(nd->path.dentry); + nd->path.dentry = dentry; + + /* And follow the mount stack looking for our autofs mount */ + while (follow_down(&nd->path.mnt, &nd->path.dentry)) { + inode = nd->path.dentry->d_inode; + if (!inode) + break; + + sb = inode->i_sb; + s_dev = new_encode_dev(sb->s_dev); + if (devno == s_dev) { + if (sb->s_magic == AUTOFS_SUPER_MAGIC) { + err = 0; + break; + } + } + } +out: + return err; +} + +/* + * Walk down the mount stack looking for an autofs mount that + * has the requested mount type (ie. indirect, direct or offset). + */ +static int autofs_dev_ioctl_find_sbi_type(struct nameidata *nd, unsigned int type) +{ + struct dentry *dentry; + struct autofs_info *ino; + unsigned int err; + + err = -ENOENT; + + /* Lookup the dentry name at the base of our mount point */ + dentry = d_lookup(nd->path.dentry, &nd->last); + if (!dentry) + goto out; + + dput(nd->path.dentry); + nd->path.dentry = dentry; + + /* And follow the mount stack looking for our autofs mount */ + while (follow_down(&nd->path.mnt, &nd->path.dentry)) { + ino = autofs4_dentry_ino(nd->path.dentry); + if (ino && ino->sbi->type & type) { + err = 0; + break; + } + } +out: + return err; +} + +static void autofs_dev_ioctl_fd_install(unsigned int fd, struct file *file) +{ + struct files_struct *files = current->files; + struct fdtable *fdt; + + spin_lock(&files->file_lock); + fdt = files_fdtable(files); + BUG_ON(fdt->fd[fd] != NULL); + rcu_assign_pointer(fdt->fd[fd], file); + FD_SET(fd, fdt->close_on_exec); + spin_unlock(&files->file_lock); +} + + +/* + * Open a file descriptor on the autofs mount point corresponding + * to the given path and device number (aka. new_encode_dev(sb->s_dev)). + */ +static int autofs_dev_ioctl_open_mountpoint(const char *path, dev_t devid) +{ + struct file *filp; + struct nameidata nd; + int err, fd; + + fd = get_unused_fd(); + if (likely(fd >= 0)) { + /* Get nameidata of the parent directory */ + err = path_lookup(path, LOOKUP_PARENT, &nd); + if (err) + goto out; + + /* + * Search down, within the parent, looking for an + * autofs super block that has the device number + * corresponding to the autofs fs we want to open. + */ + err = autofs_dev_ioctl_find_super(&nd, devid); + if (err) { + path_put(&nd.path); + goto out; + } + + filp = dentry_open(nd.path.dentry, nd.path.mnt, O_RDONLY); + if (IS_ERR(filp)) { + err = PTR_ERR(filp); + goto out; + } + + autofs_dev_ioctl_fd_install(fd, filp); + } + + return fd; + +out: + put_unused_fd(fd); + return err; +} + +/* Open a file descriptor on an autofs mount point */ +static int autofs_dev_ioctl_openmount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + const char *path; + dev_t devid; + int err, fd; + + /* param->path has already been checked */ + if (!param->arg1) + return -EINVAL; + + param->ioctlfd = -1; + + path = param->path; + devid = param->arg1; + + err = 0; + fd = autofs_dev_ioctl_open_mountpoint(path, devid); + if (unlikely(fd < 0)) { + err = fd; + goto out; + } + + param->ioctlfd = fd; +out: + return err; +} + +/* Close file descriptor allocated above (user can also use close(2)). */ +static int autofs_dev_ioctl_closemount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + return sys_close(param->ioctlfd); +} + +/* + * Send "ready" status for an existing wait (either a mount or an expire + * request). + */ +static int autofs_dev_ioctl_ready(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs_wqt_t token; + + token = (autofs_wqt_t) param->arg1; + return autofs4_wait_release(sbi, token, 0); +} + +/* + * Send "fail" status for an existing wait (either a mount or an expire + * request). + */ +static int autofs_dev_ioctl_fail(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs_wqt_t token; + int status; + + token = (autofs_wqt_t) param->arg1; + status = param->arg2 ? param->arg2 : -ENOENT; + return autofs4_wait_release(sbi, token, status); +} + +/* + * Set the pipe fd for kernel communication to the daemon. + * + * Normally this is set at mount using an option but if we + * are reconnecting to a busy mount then we need to use this + * to tell the autofs mount about the new kernel pipe fd. In + * order to protect mounts against incorrectly setting the + * pipefd we also require that the autofs mount be catatonic. + * + * This also sets the process group id used to identify the + * controlling process (eg. the owning automount(8) daemon). + */ +static int autofs_dev_ioctl_setpipefd(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + int pipefd; + int err = 0; + + if (param->arg1 == -1) + return -EINVAL; + + pipefd = param->arg1; + + mutex_lock(&sbi->wq_mutex); + if (!sbi->catatonic) { + mutex_unlock(&sbi->wq_mutex); + return -EBUSY; + } else { + struct file *pipe = fget(pipefd); + if (!pipe->f_op || !pipe->f_op->write) { + err = -EPIPE; + fput(pipe); + goto out; + } + sbi->oz_pgrp = task_pgrp_nr(current); + sbi->pipefd = pipefd; + sbi->pipe = pipe; + sbi->catatonic = 0; + } +out: + mutex_unlock(&sbi->wq_mutex); + return err; +} + +/* + * Make the autofs mount point catatonic, no longer responsive to + * mount requests. Also closes the kernel pipe file descriptor. + */ +static int autofs_dev_ioctl_catatonic(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs4_catatonic_mode(sbi); + return 0; +} + +/* Set the autofs mount timeout */ +static int autofs_dev_ioctl_timeout(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + unsigned long timeout; + + timeout = param->arg1; + param->arg1 = sbi->exp_timeout / HZ; + sbi->exp_timeout = timeout * HZ; + return 0; +} + +/* + * Return the uid and gid of the last request for the mount + * + * When reconstructing an autofs mount tree with active mounts + * we need to re-connect to mounts that may have used the original + * process uid and gid (or string variations of them) for mount + * lookups within the map entry. + */ +static int autofs_dev_ioctl_requester(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct autofs_info *ino; + struct nameidata nd; + const char *path; + dev_t devid; + int err = -ENOENT; + + if (param->size <= sizeof(*param)) { + err = -EINVAL; + goto out; + } + + path = param->path; + devid = sbi->sb->s_dev; + + param->arg1 = param->arg2 = -1; + + /* Get nameidata of the parent directory */ + err = path_lookup(path, LOOKUP_PARENT, &nd); + if (err) + goto out; + + err = autofs_dev_ioctl_find_super(&nd, devid); + if (err) + goto out_release; + + ino = autofs4_dentry_ino(nd.path.dentry); + if (ino) { + err = 0; + autofs4_expire_wait(nd.path.dentry); + spin_lock(&sbi->fs_lock); + param->arg1 = ino->uid; + param->arg2 = ino->gid; + spin_unlock(&sbi->fs_lock); + } + +out_release: + path_put(&nd.path); +out: + return err; +} + +/* + * Call repeatedly until it returns -EAGAIN, meaning there's nothing + * more that can be done. + */ +static int autofs_dev_ioctl_expire(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct dentry *dentry; + struct vfsmount *mnt; + int err = -EAGAIN; + int how; + + how = param->arg1; + mnt = fp->f_path.mnt; + + if (sbi->type & AUTOFS_TYPE_TRIGGER) + dentry = autofs4_expire_direct(sbi->sb, mnt, sbi, how); + else + dentry = autofs4_expire_indirect(sbi->sb, mnt, sbi, how); + + if (dentry) { + struct autofs_info *ino = autofs4_dentry_ino(dentry); + + /* + * This is synchronous because it makes the daemon a + * little easier + */ + err = autofs4_wait(sbi, dentry, NFY_EXPIRE); + + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_MOUNTPOINT) { + ino->flags &= ~AUTOFS_INF_MOUNTPOINT; + sbi->sb->s_root->d_mounted++; + } + ino->flags &= ~AUTOFS_INF_EXPIRING; + complete_all(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + dput(dentry); + } + + return err; +} + +/* Check if autofs mount point is in use */ +static int autofs_dev_ioctl_askumount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->arg1 = 0; + if (may_umount(fp->f_path.mnt)) + param->arg1 = 1; + return 0; +} + +/* + * Check if the given path is a mountpoint. + * + * If we are supplied with the file descriptor of an autofs + * mount we're looking for a specific mount. In this case + * the path is considered a mountpoint if it is itself a + * mountpoint or contains a mount, such as a multi-mount + * without a root mount. In this case we return 1 if the + * path is a mount point and the super magic of the covering + * mount if there is one or 0 if it isn't a mountpoint. + * + * If we aren't supplied with a file descriptor then we + * lookup the nameidata of the path and check if it is the + * root of a mount. If a type is given we are looking for + * a particular autofs mount and if we don't find a match + * we return fail. If the located nameidata path is the + * root of a mount we return 1 along with the super magic + * of the mount or 0 otherwise. + * + * In both cases the the device number (as returned by + * new_encode_dev()) is also returned. + */ +static int autofs_dev_ioctl_ismountpoint(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct nameidata nd; + const char *path; + unsigned int type; + int err = -ENOENT; + + if (param->size <= sizeof(*param)) { + err = -EINVAL; + goto out; + } + + path = param->path; + type = param->arg1; + + param->arg1 = 0; + param->arg2 = 0; + + if (!fp || param->ioctlfd == -1) { + if (type == AUTOFS_TYPE_ANY) { + struct super_block *sb; + + err = path_lookup(path, LOOKUP_FOLLOW, &nd); + if (err) + goto out; + + sb = nd.path.dentry->d_sb; + param->arg1 = new_encode_dev(sb->s_dev); + } else { + struct autofs_info *ino; + + err = path_lookup(path, LOOKUP_PARENT, &nd); + if (err) + goto out; + + err = autofs_dev_ioctl_find_sbi_type(&nd, type); + if (err) + goto out_release; + + ino = autofs4_dentry_ino(nd.path.dentry); + param->arg1 = autofs4_get_dev(ino->sbi); + } + + err = 0; + if (nd.path.dentry->d_inode && + nd.path.mnt->mnt_root == nd.path.dentry) { + err = 1; + param->arg2 = nd.path.dentry->d_inode->i_sb->s_magic; + } + } else { + dev_t devid = new_encode_dev(sbi->sb->s_dev); + + err = path_lookup(path, LOOKUP_PARENT, &nd); + if (err) + goto out; + + err = autofs_dev_ioctl_find_super(&nd, devid); + if (err) + goto out_release; + + param->arg1 = autofs4_get_dev(sbi); + + err = have_submounts(nd.path.dentry); + + if (nd.path.mnt->mnt_mountpoint != nd.path.mnt->mnt_root) { + if (follow_down(&nd.path.mnt, &nd.path.dentry)) { + struct inode *inode = nd.path.dentry->d_inode; + param->arg2 = inode->i_sb->s_magic; + } + } + } + +out_release: + path_put(&nd.path); +out: + return err; +} + +/* + * Our range of ioctl numbers isn't 0 based so we need to shift + * the array index by _IOC_NR(AUTOFS_CTL_IOC_FIRST) for the table + * lookup. + */ +#define cmd_idx(cmd) (cmd - _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST)) + +static ioctl_fn lookup_dev_ioctl(unsigned int cmd) +{ + static struct { + int cmd; + ioctl_fn fn; + } _ioctls[] = { + {cmd_idx(AUTOFS_DEV_IOCTL_VERSION_CMD), NULL}, + {cmd_idx(AUTOFS_DEV_IOCTL_PROTOVER_CMD), + autofs_dev_ioctl_protover}, + {cmd_idx(AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD), + autofs_dev_ioctl_protosubver}, + {cmd_idx(AUTOFS_DEV_IOCTL_OPENMOUNT_CMD), + autofs_dev_ioctl_openmount}, + {cmd_idx(AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD), + autofs_dev_ioctl_closemount}, + {cmd_idx(AUTOFS_DEV_IOCTL_READY_CMD), + autofs_dev_ioctl_ready}, + {cmd_idx(AUTOFS_DEV_IOCTL_FAIL_CMD), + autofs_dev_ioctl_fail}, + {cmd_idx(AUTOFS_DEV_IOCTL_SETPIPEFD_CMD), + autofs_dev_ioctl_setpipefd}, + {cmd_idx(AUTOFS_DEV_IOCTL_CATATONIC_CMD), + autofs_dev_ioctl_catatonic}, + {cmd_idx(AUTOFS_DEV_IOCTL_TIMEOUT_CMD), + autofs_dev_ioctl_timeout}, + {cmd_idx(AUTOFS_DEV_IOCTL_REQUESTER_CMD), + autofs_dev_ioctl_requester}, + {cmd_idx(AUTOFS_DEV_IOCTL_EXPIRE_CMD), + autofs_dev_ioctl_expire}, + {cmd_idx(AUTOFS_DEV_IOCTL_ASKUMOUNT_CMD), + autofs_dev_ioctl_askumount}, + {cmd_idx(AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD), + autofs_dev_ioctl_ismountpoint} + }; + unsigned int idx = cmd_idx(cmd); + + return (idx >= ARRAY_SIZE(_ioctls)) ? NULL : _ioctls[idx].fn; +} + +/* ioctl dispatcher */ +static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __user *user) +{ + struct autofs_dev_ioctl *param; + struct file *fp; + struct autofs_sb_info *sbi; + unsigned int cmd_first, cmd; + ioctl_fn fn = NULL; + int err = 0; + + /* only root can play with this */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST); + cmd = _IOC_NR(command); + + if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) || + cmd - cmd_first >= AUTOFS_DEV_IOCTL_IOC_COUNT) { + return -ENOTTY; + } + + /* Copy the parameters into kernel space. */ + param = copy_dev_ioctl(user); + if (IS_ERR(param)) + return PTR_ERR(param); + + err = validate_dev_ioctl(command, param); + if (err) + goto out; + + /* The validate routine above always sets the version */ + if (cmd == AUTOFS_DEV_IOCTL_VERSION_CMD) + goto done; + + fn = lookup_dev_ioctl(cmd); + if (!fn) { + AUTOFS_WARN("unknown command 0x%08x", command); + return -ENOTTY; + } + + fp = NULL; + sbi = NULL; + + /* + * For obvious reasons the openmount can't have a file + * descriptor yet. We don't take a reference to the + * file during close to allow for immediate release. + */ + if (cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && + cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) { + fp = fget(param->ioctlfd); + if (!fp) { + if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) + goto cont; + err = -EBADF; + goto out; + } + + if (!fp->f_op) { + err = -ENOTTY; + fput(fp); + goto out; + } + + sbi = autofs_dev_ioctl_sbi(fp); + if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) { + err = -EINVAL; + fput(fp); + goto out; + } + + /* + * Admin needs to be able to set the mount catatonic in + * order to be able to perform the re-open. + */ + if (!autofs4_oz_mode(sbi) && + cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) { + err = -EACCES; + fput(fp); + goto out; + } + } +cont: + err = fn(fp, sbi, param); + + if (fp) + fput(fp); +done: + if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE)) + err = -EFAULT; +out: + free_dev_ioctl(param); + return err; +} + +static long autofs_dev_ioctl(struct file *file, uint command, ulong u) +{ + int err; + err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u); + return (long) err; +} + +#ifdef CONFIG_COMPAT +static long autofs_dev_ioctl_compat(struct file *file, uint command, ulong u) +{ + return (long) autofs_dev_ioctl(file, command, (ulong) compat_ptr(u)); +} +#else +#define autofs_dev_ioctl_compat NULL +#endif + +static const struct file_operations _dev_ioctl_fops = { + .unlocked_ioctl = autofs_dev_ioctl, + .compat_ioctl = autofs_dev_ioctl_compat, + .owner = THIS_MODULE, +}; + +static struct miscdevice _autofs_dev_ioctl_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = AUTOFS_DEVICE_NAME, + .fops = &_dev_ioctl_fops +}; + +/* Register/deregister misc character device */ +int autofs_dev_ioctl_init(void) +{ + int r; + + r = misc_register(&_autofs_dev_ioctl_misc); + if (r) { + AUTOFS_ERROR("misc_register failed for control device"); + return r; + } + + return 0; +} + +void autofs_dev_ioctl_exit(void) +{ + misc_deregister(&_autofs_dev_ioctl_misc); + return; +} + diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index cdabb796ff0..cde2f8e8935 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -244,10 +244,10 @@ cont: } /* Check if we can expire a direct mount (possibly a tree) */ -static struct dentry *autofs4_expire_direct(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +struct dentry *autofs4_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) { unsigned long timeout; struct dentry *root = dget(sb->s_root); @@ -283,10 +283,10 @@ static struct dentry *autofs4_expire_direct(struct super_block *sb, * - it is unused by any user process * - it has been unused for exp_timeout time */ -static struct dentry *autofs4_expire_indirect(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +struct dentry *autofs4_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) { unsigned long timeout; struct dentry *root = sb->s_root; @@ -479,7 +479,7 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, if (arg && get_user(do_now, arg)) return -EFAULT; - if (sbi->type & AUTOFS_TYPE_DIRECT) + if (sbi->type & AUTOFS_TYPE_TRIGGER) dentry = autofs4_expire_direct(sb, mnt, sbi, do_now); else dentry = autofs4_expire_indirect(sb, mnt, sbi, do_now); diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c index 723a1c5e361..9722e4bd895 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs4/init.c @@ -29,11 +29,20 @@ static struct file_system_type autofs_fs_type = { static int __init init_autofs4_fs(void) { - return register_filesystem(&autofs_fs_type); + int err; + + err = register_filesystem(&autofs_fs_type); + if (err) + return err; + + autofs_dev_ioctl_init(); + + return err; } static void __exit exit_autofs4_fs(void) { + autofs_dev_ioctl_exit(); unregister_filesystem(&autofs_fs_type); } diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 7bb3e5ba053..c7e65bb30ba 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -53,6 +53,8 @@ struct autofs_info *autofs4_init_ino(struct autofs_info *ino, atomic_set(&ino->count, 0); } + ino->uid = 0; + ino->gid = 0; ino->mode = mode; ino->last_used = jiffies; @@ -213,7 +215,7 @@ static const struct super_operations autofs4_sops = { enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, Opt_indirect, Opt_direct, Opt_offset}; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_fd, "fd=%u"}, {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, @@ -288,7 +290,7 @@ static int parse_options(char *options, int *pipefd, uid_t *uid, gid_t *gid, *type = AUTOFS_TYPE_DIRECT; break; case Opt_offset: - *type = AUTOFS_TYPE_DIRECT | AUTOFS_TYPE_OFFSET; + *type = AUTOFS_TYPE_OFFSET; break; default: return 1; @@ -336,7 +338,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) sbi->sb = s; sbi->version = 0; sbi->sub_version = 0; - sbi->type = 0; + sbi->type = AUTOFS_TYPE_INDIRECT; sbi->min_proto = 0; sbi->max_proto = 0; mutex_init(&sbi->wq_mutex); @@ -378,7 +380,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) } root_inode->i_fop = &autofs4_root_operations; - root_inode->i_op = sbi->type & AUTOFS_TYPE_DIRECT ? + root_inode->i_op = sbi->type & AUTOFS_TYPE_TRIGGER ? &autofs4_direct_root_inode_operations : &autofs4_indirect_root_inode_operations; diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index bcfb2dc0a61..2a41c2a7fc5 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -36,6 +36,7 @@ const struct file_operations autofs4_root_operations = { .release = dcache_dir_close, .read = generic_read_dir, .readdir = dcache_readdir, + .llseek = dcache_dir_lseek, .ioctl = autofs4_root_ioctl, }; @@ -44,6 +45,7 @@ const struct file_operations autofs4_dir_operations = { .release = dcache_dir_close, .read = generic_read_dir, .readdir = dcache_readdir, + .llseek = dcache_dir_lseek, }; const struct inode_operations autofs4_indirect_root_inode_operations = { diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 35216d18d8b..4b67c2a2d77 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -337,7 +337,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, * is very similar for indirect mounts except only dentrys * in the root of the autofs file system may be negative. */ - if (sbi->type & (AUTOFS_TYPE_DIRECT|AUTOFS_TYPE_OFFSET)) + if (sbi->type & AUTOFS_TYPE_TRIGGER) return -ENOENT; else if (!IS_ROOT(dentry->d_parent)) return -ENOENT; @@ -348,7 +348,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, return -ENOMEM; /* If this is a direct mount request create a dummy name */ - if (IS_ROOT(dentry) && (sbi->type & AUTOFS_TYPE_DIRECT)) + if (IS_ROOT(dentry) && sbi->type & AUTOFS_TYPE_TRIGGER) qstr.len = sprintf(name, "%p", dentry); else { qstr.len = autofs4_getpath(sbi, dentry, &name); @@ -406,11 +406,11 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, type = autofs_ptype_expire_multi; } else { if (notify == NFY_MOUNT) - type = (sbi->type & AUTOFS_TYPE_DIRECT) ? + type = (sbi->type & AUTOFS_TYPE_TRIGGER) ? autofs_ptype_missing_direct : autofs_ptype_missing_indirect; else - type = (sbi->type & AUTOFS_TYPE_DIRECT) ? + type = (sbi->type & AUTOFS_TYPE_TRIGGER) ? autofs_ptype_expire_direct : autofs_ptype_expire_indirect; } @@ -457,6 +457,40 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry, status = wq->status; + /* + * For direct and offset mounts we need to track the requester's + * uid and gid in the dentry info struct. This is so it can be + * supplied, on request, by the misc device ioctl interface. + * This is needed during daemon resatart when reconnecting + * to existing, active, autofs mounts. The uid and gid (and + * related string values) may be used for macro substitution + * in autofs mount maps. + */ + if (!status) { + struct autofs_info *ino; + struct dentry *de = NULL; + + /* direct mount or browsable map */ + ino = autofs4_dentry_ino(dentry); + if (!ino) { + /* If not lookup actual dentry used */ + de = d_lookup(dentry->d_parent, &dentry->d_name); + if (de) + ino = autofs4_dentry_ino(de); + } + + /* Set mount requester */ + if (ino) { + spin_lock(&sbi->fs_lock); + ino->uid = wq->uid; + ino->gid = wq->gid; + spin_unlock(&sbi->fs_lock); + } + + if (de) + dput(de); + } + /* Are we the last process to need status? */ mutex_lock(&sbi->wq_mutex); if (!--wq->wait_ctr) diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h index e2595c2c403..7893eaa1e58 100644 --- a/fs/befs/befs_fs_types.h +++ b/fs/befs/befs_fs_types.h @@ -55,8 +55,12 @@ enum super_flags { }; #define BEFS_BYTEORDER_NATIVE 0x42494745 +#define BEFS_BYTEORDER_NATIVE_LE (__force fs32)cpu_to_le32(BEFS_BYTEORDER_NATIVE) +#define BEFS_BYTEORDER_NATIVE_BE (__force fs32)cpu_to_be32(BEFS_BYTEORDER_NATIVE) #define BEFS_SUPER_MAGIC BEFS_SUPER_MAGIC1 +#define BEFS_SUPER_MAGIC1_LE (__force fs32)cpu_to_le32(BEFS_SUPER_MAGIC1) +#define BEFS_SUPER_MAGIC1_BE (__force fs32)cpu_to_be32(BEFS_SUPER_MAGIC1) /* * Flags of inode diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 02c6e62b72f..b6dfee37c7b 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -66,6 +66,7 @@ static struct kmem_cache *befs_inode_cachep; static const struct file_operations befs_dir_operations = { .read = generic_read_dir, .readdir = befs_readdir, + .llseek = generic_file_llseek, }; static const struct inode_operations befs_dir_inode_operations = { @@ -649,7 +650,7 @@ enum { Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err, }; -static match_table_t befs_tokens = { +static const match_table_t befs_tokens = { {Opt_uid, "uid=%d"}, {Opt_gid, "gid=%d"}, {Opt_charset, "iocharset=%s"}, @@ -808,8 +809,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent) /* account for offset of super block on x86 */ disk_sb = (befs_super_block *) bh->b_data; - if ((le32_to_cpu(disk_sb->magic1) == BEFS_SUPER_MAGIC1) || - (be32_to_cpu(disk_sb->magic1) == BEFS_SUPER_MAGIC1)) { + if ((disk_sb->magic1 == BEFS_SUPER_MAGIC1_LE) || + (disk_sb->magic1 == BEFS_SUPER_MAGIC1_BE)) { befs_debug(sb, "Using PPC superblock location"); } else { befs_debug(sb, "Using x86 superblock location"); diff --git a/fs/befs/super.c b/fs/befs/super.c index 8c3401ff6d6..41f2b4d0093 100644 --- a/fs/befs/super.c +++ b/fs/befs/super.c @@ -26,10 +26,10 @@ befs_load_sb(struct super_block *sb, befs_super_block * disk_sb) befs_sb_info *befs_sb = BEFS_SB(sb); /* Check the byte order of the filesystem */ - if (le32_to_cpu(disk_sb->fs_byte_order) == BEFS_BYTEORDER_NATIVE) + if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_LE) befs_sb->byte_order = BEFS_BYTESEX_LE; - else if (be32_to_cpu(disk_sb->fs_byte_order) == BEFS_BYTEORDER_NATIVE) - befs_sb->byte_order = BEFS_BYTESEX_BE; + else if (disk_sb->fs_byte_order == BEFS_BYTEORDER_NATIVE_BE) + befs_sb->byte_order = BEFS_BYTESEX_BE; befs_sb->magic1 = fs32_to_cpu(sb, disk_sb->magic1); befs_sb->magic2 = fs32_to_cpu(sb, disk_sb->magic2); diff --git a/fs/bfs/dir.c b/fs/bfs/dir.c index 87ee5ccee34..ed8feb052df 100644 --- a/fs/bfs/dir.c +++ b/fs/bfs/dir.c @@ -125,8 +125,8 @@ static int bfs_create(struct inode *dir, struct dentry *dentry, int mode, inode->i_ino); if (err) { inode_dec_link_count(inode); - iput(inode); mutex_unlock(&info->bfs_lock); + iput(inode); return err; } mutex_unlock(&info->bfs_lock); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 655ed8d30a8..83d72006e29 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -683,7 +683,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) * switch really is going to happen - do this in * flush_thread(). - akpm */ - SET_PERSONALITY(loc->elf_ex, 0); + SET_PERSONALITY(loc->elf_ex); interpreter = open_exec(elf_interpreter); retval = PTR_ERR(interpreter); @@ -734,7 +734,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto out_free_dentry; } else { /* Executables without an interpreter also need a personality */ - SET_PERSONALITY(loc->elf_ex, 0); + SET_PERSONALITY(loc->elf_ex); } /* Flush all traces of the currently running executable */ @@ -748,7 +748,7 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) /* Do this immediately, since STACK_TOP as used in setup_arg_pages may depend on the personality. */ - SET_PERSONALITY(loc->elf_ex, 0); + SET_PERSONALITY(loc->elf_ex); if (elf_read_implies_exec(loc->elf_ex, executable_stack)) current->personality |= READ_IMPLIES_EXEC; @@ -1333,20 +1333,15 @@ static void fill_prstatus(struct elf_prstatus *prstatus, prstatus->pr_pgrp = task_pgrp_vnr(p); prstatus->pr_sid = task_session_vnr(p); if (thread_group_leader(p)) { + struct task_cputime cputime; + /* - * This is the record for the group leader. Add in the - * cumulative times of previous dead threads. This total - * won't include the time of each live thread whose state - * is included in the core dump. The final total reported - * to our parent process when it calls wait4 will include - * those sums as well as the little bit more time it takes - * this and each other thread to finish dying after the - * core dump synchronization phase. + * This is the record for the group leader. It shows the + * group-wide total, not its individual thread total. */ - cputime_to_timeval(cputime_add(p->utime, p->signal->utime), - &prstatus->pr_utime); - cputime_to_timeval(cputime_add(p->stime, p->signal->stime), - &prstatus->pr_stime); + thread_group_cputime(p, &cputime); + cputime_to_timeval(cputime.utime, &prstatus->pr_utime); + cputime_to_timeval(cputime.stime, &prstatus->pr_stime); } else { cputime_to_timeval(p->utime, &prstatus->pr_utime); cputime_to_timeval(p->stime, &prstatus->pr_stime); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 80c1f952ef7..0e8367c5462 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -25,6 +25,7 @@ #include <linux/fcntl.h> #include <linux/slab.h> #include <linux/pagemap.h> +#include <linux/security.h> #include <linux/highmem.h> #include <linux/highuid.h> #include <linux/personality.h> @@ -455,8 +456,19 @@ error_kill: } /*****************************************************************************/ + +#ifndef ELF_BASE_PLATFORM /* - * present useful information to the program + * AT_BASE_PLATFORM indicates the "real" hardware/microarchitecture. + * If the arch defines ELF_BASE_PLATFORM (in asm/elf.h), the value + * will be copied to the user stack in the same manner as AT_PLATFORM. + */ +#define ELF_BASE_PLATFORM NULL +#endif + +/* + * present useful information to the program by shovelling it onto the new + * process's stack */ static int create_elf_fdpic_tables(struct linux_binprm *bprm, struct mm_struct *mm, @@ -466,15 +478,19 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, unsigned long sp, csp, nitems; elf_caddr_t __user *argv, *envp; size_t platform_len = 0, len; - char *k_platform; - char __user *u_platform, *p; + char *k_platform, *k_base_platform; + char __user *u_platform, *u_base_platform, *p; long hwcap; int loop; int nr; /* reset for each csp adjustment */ - /* we're going to shovel a whole load of stuff onto the stack */ #ifdef CONFIG_MMU - sp = bprm->p; + /* In some cases (e.g. Hyper-Threading), we want to avoid L1 evictions + * by the processes running on the same package. One thing we can do is + * to shuffle the initial stack for them, so we give the architecture + * an opportunity to do so here. + */ + sp = arch_align_stack(bprm->p); #else sp = mm->start_stack; @@ -483,11 +499,14 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, return -EFAULT; #endif - /* get hold of platform and hardware capabilities masks for the machine - * we are running on. In some cases (Sparc), this info is impossible - * to get, in others (i386) it is merely difficult. - */ hwcap = ELF_HWCAP; + + /* + * If this architecture has a platform capability string, copy it + * to userspace. In some cases (Sparc), this info is impossible + * for userspace to get any other way, in others (i386) it is + * merely difficult. + */ k_platform = ELF_PLATFORM; u_platform = NULL; @@ -499,19 +518,20 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, return -EFAULT; } -#if defined(__i386__) && defined(CONFIG_SMP) - /* in some cases (e.g. Hyper-Threading), we want to avoid L1 evictions - * by the processes running on the same package. One thing we can do is - * to shuffle the initial stack for them. - * - * the conditionals here are unneeded, but kept in to make the code - * behaviour the same as pre change unless we have hyperthreaded - * processors. This keeps Mr Marcelo Person happier but should be - * removed for 2.5 + /* + * If this architecture has a "base" platform capability + * string, copy it to userspace. */ - if (smp_num_siblings > 1) - sp = sp - ((current->pid % 64) << 7); -#endif + k_base_platform = ELF_BASE_PLATFORM; + u_base_platform = NULL; + + if (k_base_platform) { + platform_len = strlen(k_base_platform) + 1; + sp -= platform_len; + u_base_platform = (char __user *) sp; + if (__copy_to_user(u_base_platform, k_base_platform, platform_len) != 0) + return -EFAULT; + } sp &= ~7UL; @@ -541,9 +561,13 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, } /* force 16 byte _final_ alignment here for generality */ -#define DLINFO_ITEMS 13 +#define DLINFO_ITEMS 15 + + nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) + + (k_base_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH; - nitems = 1 + DLINFO_ITEMS + (k_platform ? 1 : 0) + AT_VECTOR_SIZE_ARCH; + if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) + nitems++; csp = sp; sp -= nitems * 2 * sizeof(unsigned long); @@ -575,6 +599,19 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, (elf_addr_t) (unsigned long) u_platform); } + if (k_base_platform) { + nr = 0; + csp -= 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_BASE_PLATFORM, + (elf_addr_t) (unsigned long) u_base_platform); + } + + if (bprm->interp_flags & BINPRM_FLAGS_EXECFD) { + nr = 0; + csp -= 2 * sizeof(unsigned long); + NEW_AUX_ENT(AT_EXECFD, bprm->interp_data); + } + nr = 0; csp -= DLINFO_ITEMS * 2 * sizeof(unsigned long); NEW_AUX_ENT(AT_HWCAP, hwcap); @@ -590,6 +627,8 @@ static int create_elf_fdpic_tables(struct linux_binprm *bprm, NEW_AUX_ENT(AT_EUID, (elf_addr_t) current->euid); NEW_AUX_ENT(AT_GID, (elf_addr_t) current->gid); NEW_AUX_ENT(AT_EGID, (elf_addr_t) current->egid); + NEW_AUX_ENT(AT_SECURE, security_bprm_secureexec(bprm)); + NEW_AUX_ENT(AT_EXECFN, bprm->exec); #ifdef ARCH_DLINFO nr = 0; diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index f9c88d0c8ce..32fb00b52cd 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -43,7 +43,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs) return -ENOEXEC; } - bprm->sh_bang = 1; /* Well, the bang-shell is implicit... */ + bprm->recursion_depth++; /* Well, the bang-shell is implicit... */ allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index dfc0197905c..ccb781a6a80 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -229,13 +229,13 @@ static int decompress_exec( ret = 10; if (buf[3] & EXTRA_FIELD) { ret += 2 + buf[10] + (buf[11] << 8); - if (unlikely(LBUFSIZE == ret)) { + if (unlikely(LBUFSIZE <= ret)) { DBG_FLT("binfmt_flat: buffer overflow (EXTRA)?\n"); goto out_free_buf; } } if (buf[3] & ORIG_NAME) { - for (; ret < LBUFSIZE && (buf[ret] != 0); ret++) + while (ret < LBUFSIZE && buf[ret++] != 0) ; if (unlikely(LBUFSIZE == ret)) { DBG_FLT("binfmt_flat: buffer overflow (ORIG_NAME)?\n"); @@ -243,7 +243,7 @@ static int decompress_exec( } } if (buf[3] & COMMENT) { - for (; ret < LBUFSIZE && (buf[ret] != 0); ret++) + while (ret < LBUFSIZE && buf[ret++] != 0) ; if (unlikely(LBUFSIZE == ret)) { DBG_FLT("binfmt_flat: buffer overflow (COMMENT)?\n"); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 8d7e88e02e0..f2744ab4e5b 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -117,7 +117,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto _ret; retval = -ENOEXEC; - if (bprm->misc_bang) + if (bprm->recursion_depth > BINPRM_MAX_RECURSION) goto _ret; /* to keep locking time low, we copy the interpreter string */ @@ -197,7 +197,7 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (retval < 0) goto _error; - bprm->misc_bang = 1; + bprm->recursion_depth++; retval = search_binary_handler (bprm, regs); if (retval < 0) diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index 9e3963f7ebf..08343505e18 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -22,14 +22,15 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) char interp[BINPRM_BUF_SIZE]; int retval; - if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') || (bprm->sh_bang)) + if ((bprm->buf[0] != '#') || (bprm->buf[1] != '!') || + (bprm->recursion_depth > BINPRM_MAX_RECURSION)) return -ENOEXEC; /* * This section does the #! interpretation. * Sorta complicated, but hopefully it will work. -TYT */ - bprm->sh_bang = 1; + bprm->recursion_depth++; allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index 68be580ba28..74e587a5279 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -306,3 +306,5 @@ static void __exit exit_som_binfmt(void) core_initcall(init_som_binfmt); module_exit(exit_som_binfmt); + +MODULE_LICENSE("GPL"); diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index c3e174b35fe..19caf7c962a 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -107,7 +107,8 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs) BUG_ON(bip == NULL); /* A cloned bio doesn't own the integrity metadata */ - if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL) + if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY) + && bip->bip_buf != NULL) kfree(bip->bip_buf); mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]); @@ -150,6 +151,24 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL(bio_integrity_add_page); +static int bdev_integrity_enabled(struct block_device *bdev, int rw) +{ + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi == NULL) + return 0; + + if (rw == READ && bi->verify_fn != NULL && + (bi->flags & INTEGRITY_FLAG_READ)) + return 1; + + if (rw == WRITE && bi->generate_fn != NULL && + (bi->flags & INTEGRITY_FLAG_WRITE)) + return 1; + + return 0; +} + /** * bio_integrity_enabled - Check whether integrity can be passed * @bio: bio to check @@ -313,6 +332,14 @@ static void bio_integrity_generate(struct bio *bio) } } +static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) +{ + if (bi) + return bi->tuple_size; + + return 0; +} + /** * bio_integrity_prep - Prepare bio for integrity I/O * @bio: bio to prepare @@ -30,7 +30,7 @@ static struct kmem_cache *bio_slab __read_mostly; -mempool_t *bio_split_pool __read_mostly; +static mempool_t *bio_split_pool __read_mostly; /* * if you change this list, also change bvec_alloc or things will @@ -60,25 +60,46 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct struct bio_vec *bvl; /* - * see comment near bvec_array define! + * If 'bs' is given, lookup the pool and do the mempool alloc. + * If not, this is a bio_kmalloc() allocation and just do a + * kzalloc() for the exact number of vecs right away. */ - switch (nr) { - case 1 : *idx = 0; break; - case 2 ... 4: *idx = 1; break; - case 5 ... 16: *idx = 2; break; - case 17 ... 64: *idx = 3; break; - case 65 ... 128: *idx = 4; break; - case 129 ... BIO_MAX_PAGES: *idx = 5; break; + if (bs) { + /* + * see comment near bvec_array define! + */ + switch (nr) { + case 1: + *idx = 0; + break; + case 2 ... 4: + *idx = 1; + break; + case 5 ... 16: + *idx = 2; + break; + case 17 ... 64: + *idx = 3; + break; + case 65 ... 128: + *idx = 4; + break; + case 129 ... BIO_MAX_PAGES: + *idx = 5; + break; default: return NULL; - } - /* - * idx now points to the pool we want to allocate from - */ + } - bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); - if (bvl) - memset(bvl, 0, bvec_nr_vecs(*idx) * sizeof(struct bio_vec)); + /* + * idx now points to the pool we want to allocate from + */ + bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); + if (bvl) + memset(bvl, 0, + bvec_nr_vecs(*idx) * sizeof(struct bio_vec)); + } else + bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask); return bvl; } @@ -107,10 +128,17 @@ static void bio_fs_destructor(struct bio *bio) bio_free(bio, fs_bio_set); } +static void bio_kmalloc_destructor(struct bio *bio) +{ + kfree(bio->bi_io_vec); + kfree(bio); +} + void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; + bio->bi_comp_cpu = -1; atomic_set(&bio->bi_cnt, 1); } @@ -118,19 +146,25 @@ void bio_init(struct bio *bio) * bio_alloc_bioset - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator * @nr_iovecs: number of iovecs to pre-allocate - * @bs: the bio_set to allocate from + * @bs: the bio_set to allocate from. If %NULL, just use kmalloc * * Description: - * bio_alloc_bioset will first try it's on mempool to satisfy the allocation. + * bio_alloc_bioset will first try its own mempool to satisfy the allocation. * If %__GFP_WAIT is set then we will block on the internal pool waiting - * for a &struct bio to become free. + * for a &struct bio to become free. If a %NULL @bs is passed in, we will + * fall back to just using @kmalloc to allocate the required memory. * * allocate bio and iovecs from the memory pools specified by the - * bio_set structure. + * bio_set structure, or @kmalloc if none given. **/ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { - struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask); + struct bio *bio; + + if (bs) + bio = mempool_alloc(bs->bio_pool, gfp_mask); + else + bio = kmalloc(sizeof(*bio), gfp_mask); if (likely(bio)) { struct bio_vec *bvl = NULL; @@ -141,7 +175,10 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); if (unlikely(!bvl)) { - mempool_free(bio, bs->bio_pool); + if (bs) + mempool_free(bio, bs->bio_pool); + else + kfree(bio); bio = NULL; goto out; } @@ -164,6 +201,23 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) return bio; } +/* + * Like bio_alloc(), but doesn't use a mempool backing. This means that + * it CAN fail, but while bio_alloc() can only be used for allocations + * that have a short (finite) life span, bio_kmalloc() should be used + * for more permanent bio allocations (like allocating some bio's for + * initalization or setup purposes). + */ +struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) +{ + struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); + + if (bio) + bio->bi_destructor = bio_kmalloc_destructor; + + return bio; +} + void zero_fill_bio(struct bio *bio) { unsigned long flags; @@ -208,14 +262,6 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio) return bio->bi_phys_segments; } -inline int bio_hw_segments(struct request_queue *q, struct bio *bio) -{ - if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) - blk_recount_segments(q, bio); - - return bio->bi_hw_segments; -} - /** * __bio_clone - clone a bio * @bio: destination bio @@ -350,8 +396,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page */ while (bio->bi_phys_segments >= q->max_phys_segments - || bio->bi_hw_segments >= q->max_hw_segments - || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) { + || bio->bi_phys_segments >= q->max_hw_segments) { if (retried_segments) return 0; @@ -395,13 +440,11 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page } /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) || - BIOVEC_VIRT_MERGEABLE(bvec-1, bvec))) + if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) bio->bi_flags &= ~(1 << BIO_SEG_VALID); bio->bi_vcnt++; bio->bi_phys_segments++; - bio->bi_hw_segments++; done: bio->bi_size += len; return len; @@ -449,16 +492,19 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, struct bio_map_data { struct bio_vec *iovecs; - int nr_sgvecs; struct sg_iovec *sgvecs; + int nr_sgvecs; + int is_our_pages; }; static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, - struct sg_iovec *iov, int iov_count) + struct sg_iovec *iov, int iov_count, + int is_our_pages) { memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt); memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); bmd->nr_sgvecs = iov_count; + bmd->is_our_pages = is_our_pages; bio->bi_private = bmd; } @@ -469,20 +515,21 @@ static void bio_free_map_data(struct bio_map_data *bmd) kfree(bmd); } -static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count) +static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, + gfp_t gfp_mask) { - struct bio_map_data *bmd = kmalloc(sizeof(*bmd), GFP_KERNEL); + struct bio_map_data *bmd = kmalloc(sizeof(*bmd), gfp_mask); if (!bmd) return NULL; - bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, GFP_KERNEL); + bmd->iovecs = kmalloc(sizeof(struct bio_vec) * nr_segs, gfp_mask); if (!bmd->iovecs) { kfree(bmd); return NULL; } - bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, GFP_KERNEL); + bmd->sgvecs = kmalloc(sizeof(struct sg_iovec) * iov_count, gfp_mask); if (bmd->sgvecs) return bmd; @@ -491,8 +538,9 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count) return NULL; } -static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, - int uncopy) +static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, + struct sg_iovec *iov, int iov_count, int uncopy, + int do_free_page) { int ret = 0, i; struct bio_vec *bvec; @@ -502,7 +550,7 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, __bio_for_each_segment(bvec, bio, i, 0) { char *bv_addr = page_address(bvec->bv_page); - unsigned int bv_len = bvec->bv_len; + unsigned int bv_len = iovecs[i].bv_len; while (bv_len && iov_idx < iov_count) { unsigned int bytes; @@ -535,7 +583,7 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, } } - if (uncopy) + if (do_free_page) __free_page(bvec->bv_page); } @@ -552,10 +600,11 @@ static int __bio_copy_iov(struct bio *bio, struct sg_iovec *iov, int iov_count, int bio_uncopy_user(struct bio *bio) { struct bio_map_data *bmd = bio->bi_private; - int ret; - - ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs, 1); + int ret = 0; + if (!bio_flagged(bio, BIO_NULL_MAPPED)) + ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, + bmd->nr_sgvecs, 1, bmd->is_our_pages); bio_free_map_data(bmd); bio_put(bio); return ret; @@ -564,16 +613,20 @@ int bio_uncopy_user(struct bio *bio) /** * bio_copy_user_iov - copy user data to bio * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) * @iov: the iovec. * @iov_count: number of elements in the iovec * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags * * Prepares and returns a bio for indirect user io, bouncing data * to/from kernel pages as necessary. Must be paired with * call bio_uncopy_user() on io completion. */ -struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, - int iov_count, int write_to_vm) +struct bio *bio_copy_user_iov(struct request_queue *q, + struct rq_map_data *map_data, + struct sg_iovec *iov, int iov_count, + int write_to_vm, gfp_t gfp_mask) { struct bio_map_data *bmd; struct bio_vec *bvec; @@ -596,25 +649,38 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, len += iov[i].iov_len; } - bmd = bio_alloc_map_data(nr_pages, iov_count); + bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); if (!bmd) return ERR_PTR(-ENOMEM); ret = -ENOMEM; - bio = bio_alloc(GFP_KERNEL, nr_pages); + bio = bio_alloc(gfp_mask, nr_pages); if (!bio) goto out_bmd; bio->bi_rw |= (!write_to_vm << BIO_RW); ret = 0; + i = 0; while (len) { - unsigned int bytes = PAGE_SIZE; + unsigned int bytes; + + if (map_data) + bytes = 1U << (PAGE_SHIFT + map_data->page_order); + else + bytes = PAGE_SIZE; if (bytes > len) bytes = len; - page = alloc_page(q->bounce_gfp | GFP_KERNEL); + if (map_data) { + if (i == map_data->nr_entries) { + ret = -ENOMEM; + break; + } + page = map_data->pages[i++]; + } else + page = alloc_page(q->bounce_gfp | gfp_mask); if (!page) { ret = -ENOMEM; break; @@ -633,16 +699,17 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, * success */ if (!write_to_vm) { - ret = __bio_copy_iov(bio, iov, iov_count, 0); + ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0); if (ret) goto cleanup; } - bio_set_map_data(bmd, bio, iov, iov_count); + bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1); return bio; cleanup: - bio_for_each_segment(bvec, bio, i) - __free_page(bvec->bv_page); + if (!map_data) + bio_for_each_segment(bvec, bio, i) + __free_page(bvec->bv_page); bio_put(bio); out_bmd: @@ -653,29 +720,32 @@ out_bmd: /** * bio_copy_user - copy user data to bio * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) * @uaddr: start of user address * @len: length in bytes * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags * * Prepares and returns a bio for indirect user io, bouncing data * to/from kernel pages as necessary. Must be paired with * call bio_uncopy_user() on io completion. */ -struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr, - unsigned int len, int write_to_vm) +struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, + unsigned long uaddr, unsigned int len, + int write_to_vm, gfp_t gfp_mask) { struct sg_iovec iov; iov.iov_base = (void __user *)uaddr; iov.iov_len = len; - return bio_copy_user_iov(q, &iov, 1, write_to_vm); + return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); } static struct bio *__bio_map_user_iov(struct request_queue *q, struct block_device *bdev, struct sg_iovec *iov, int iov_count, - int write_to_vm) + int write_to_vm, gfp_t gfp_mask) { int i, j; int nr_pages = 0; @@ -701,12 +771,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, if (!nr_pages) return ERR_PTR(-EINVAL); - bio = bio_alloc(GFP_KERNEL, nr_pages); + bio = bio_alloc(gfp_mask, nr_pages); if (!bio) return ERR_PTR(-ENOMEM); ret = -ENOMEM; - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask); if (!pages) goto out; @@ -785,19 +855,21 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, * @uaddr: start of user address * @len: length in bytes * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags * * Map the user space address into a bio suitable for io to a block * device. Returns an error pointer in case of error. */ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, - unsigned long uaddr, unsigned int len, int write_to_vm) + unsigned long uaddr, unsigned int len, int write_to_vm, + gfp_t gfp_mask) { struct sg_iovec iov; iov.iov_base = (void __user *)uaddr; iov.iov_len = len; - return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm); + return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); } /** @@ -807,18 +879,19 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, * @iov: the iovec. * @iov_count: number of elements in the iovec * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags * * Map the user space address into a bio suitable for io to a block * device. Returns an error pointer in case of error. */ struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, struct sg_iovec *iov, int iov_count, - int write_to_vm) + int write_to_vm, gfp_t gfp_mask) { struct bio *bio; - bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm); - + bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm, + gfp_mask); if (IS_ERR(bio)) return bio; @@ -942,19 +1015,22 @@ static void bio_copy_kern_endio(struct bio *bio, int err) { struct bio_vec *bvec; const int read = bio_data_dir(bio) == READ; - char *p = bio->bi_private; + struct bio_map_data *bmd = bio->bi_private; int i; + char *p = bmd->sgvecs[0].iov_base; __bio_for_each_segment(bvec, bio, i, 0) { char *addr = page_address(bvec->bv_page); + int len = bmd->iovecs[i].bv_len; if (read && !err) - memcpy(p, addr, bvec->bv_len); + memcpy(p, addr, len); __free_page(bvec->bv_page); - p += bvec->bv_len; + p += len; } + bio_free_map_data(bmd); bio_put(bio); } @@ -972,38 +1048,13 @@ static void bio_copy_kern_endio(struct bio *bio, int err) struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask, int reading) { - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int nr_pages = end - start; struct bio *bio; struct bio_vec *bvec; - int i, ret; - - bio = bio_alloc(gfp_mask, nr_pages); - if (!bio) - return ERR_PTR(-ENOMEM); - - while (len) { - struct page *page; - unsigned int bytes = PAGE_SIZE; - - if (bytes > len) - bytes = len; - - page = alloc_page(q->bounce_gfp | gfp_mask); - if (!page) { - ret = -ENOMEM; - goto cleanup; - } - - if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) { - ret = -EINVAL; - goto cleanup; - } + int i; - len -= bytes; - } + bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask); + if (IS_ERR(bio)) + return bio; if (!reading) { void *p = data; @@ -1016,16 +1067,9 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, } } - bio->bi_private = data; bio->bi_end_io = bio_copy_kern_endio; - return bio; -cleanup: - bio_for_each_segment(bvec, bio, i) - __free_page(bvec->bv_page); - - bio_put(bio); - return ERR_PTR(ret); + return bio; } /* @@ -1212,9 +1256,9 @@ static void bio_pair_end_2(struct bio *bi, int err) * split a bio - only worry about a bio with a single page * in it's iovec */ -struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) +struct bio_pair *bio_split(struct bio *bi, int first_sectors) { - struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO); + struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO); if (!bp) return bp; @@ -1248,7 +1292,7 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) bp->bio2.bi_end_io = bio_pair_end_2; bp->bio1.bi_private = bi; - bp->bio2.bi_private = pool; + bp->bio2.bi_private = bio_split_pool; if (bio_integrity(bi)) bio_integrity_split(bi, bp, first_sectors); @@ -1256,6 +1300,42 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) return bp; } +/** + * bio_sector_offset - Find hardware sector offset in bio + * @bio: bio to inspect + * @index: bio_vec index + * @offset: offset in bv_page + * + * Return the number of hardware sectors between beginning of bio + * and an end point indicated by a bio_vec index and an offset + * within that vector's page. + */ +sector_t bio_sector_offset(struct bio *bio, unsigned short index, + unsigned int offset) +{ + unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue); + struct bio_vec *bv; + sector_t sectors; + int i; + + sectors = 0; + + if (index >= bio->bi_idx) + index = bio->bi_vcnt - 1; + + __bio_for_each_segment(bv, bio, i, 0) { + if (i == index) { + if (offset > bv->bv_offset) + sectors += (offset - bv->bv_offset) / sector_sz; + break; + } + + sectors += bv->bv_len / sector_sz; + } + + return sectors; +} +EXPORT_SYMBOL(bio_sector_offset); /* * create memory pools for biovec's in a bio_set. @@ -1358,6 +1438,7 @@ static int __init init_bio(void) subsys_initcall(init_bio); EXPORT_SYMBOL(bio_alloc); +EXPORT_SYMBOL(bio_kmalloc); EXPORT_SYMBOL(bio_put); EXPORT_SYMBOL(bio_free); EXPORT_SYMBOL(bio_endio); @@ -1365,7 +1446,6 @@ EXPORT_SYMBOL(bio_init); EXPORT_SYMBOL(__bio_clone); EXPORT_SYMBOL(bio_clone); EXPORT_SYMBOL(bio_phys_segments); -EXPORT_SYMBOL(bio_hw_segments); EXPORT_SYMBOL(bio_add_page); EXPORT_SYMBOL(bio_add_pc_page); EXPORT_SYMBOL(bio_get_nr_vecs); @@ -1375,7 +1455,6 @@ EXPORT_SYMBOL(bio_map_kern); EXPORT_SYMBOL(bio_copy_kern); EXPORT_SYMBOL(bio_pair_release); EXPORT_SYMBOL(bio_split); -EXPORT_SYMBOL(bio_split_pool); EXPORT_SYMBOL(bio_copy_user); EXPORT_SYMBOL(bio_uncopy_user); EXPORT_SYMBOL(bioset_create); diff --git a/fs/block_dev.c b/fs/block_dev.c index aff54219e04..218408eed1b 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -540,22 +540,6 @@ EXPORT_SYMBOL(bd_release); * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 */ -static struct kobject *bdev_get_kobj(struct block_device *bdev) -{ - if (bdev->bd_contains != bdev) - return kobject_get(&bdev->bd_part->dev.kobj); - else - return kobject_get(&bdev->bd_disk->dev.kobj); -} - -static struct kobject *bdev_get_holder(struct block_device *bdev) -{ - if (bdev->bd_contains != bdev) - return kobject_get(bdev->bd_part->holder_dir); - else - return kobject_get(bdev->bd_disk->holder_dir); -} - static int add_symlink(struct kobject *from, struct kobject *to) { if (!from || !to) @@ -604,11 +588,11 @@ static int bd_holder_grab_dirs(struct block_device *bdev, if (!bo->hdev) goto fail_put_sdir; - bo->sdev = bdev_get_kobj(bdev); + bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj); if (!bo->sdev) goto fail_put_hdev; - bo->hdir = bdev_get_holder(bdev); + bo->hdir = kobject_get(bdev->bd_part->holder_dir); if (!bo->hdir) goto fail_put_sdev; @@ -868,6 +852,87 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode) EXPORT_SYMBOL(open_by_devnum); +/** + * flush_disk - invalidates all buffer-cache entries on a disk + * + * @bdev: struct block device to be flushed + * + * Invalidates all buffer-cache entries on a disk. It should be called + * when a disk has been changed -- either by a media change or online + * resize. + */ +static void flush_disk(struct block_device *bdev) +{ + if (__invalidate_device(bdev)) { + char name[BDEVNAME_SIZE] = ""; + + if (bdev->bd_disk) + disk_name(bdev->bd_disk, 0, name); + printk(KERN_WARNING "VFS: busy inodes on changed media or " + "resized disk %s\n", name); + } + + if (!bdev->bd_disk) + return; + if (disk_partitionable(bdev->bd_disk)) + bdev->bd_invalidated = 1; +} + +/** + * check_disk_size_change - checks for disk size change and adjusts bdev size. + * @disk: struct gendisk to check + * @bdev: struct bdev to adjust. + * + * This routine checks to see if the bdev size does not match the disk size + * and adjusts it if it differs. + */ +void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) +{ + loff_t disk_size, bdev_size; + + disk_size = (loff_t)get_capacity(disk) << 9; + bdev_size = i_size_read(bdev->bd_inode); + if (disk_size != bdev_size) { + char name[BDEVNAME_SIZE]; + + disk_name(disk, 0, name); + printk(KERN_INFO + "%s: detected capacity change from %lld to %lld\n", + name, bdev_size, disk_size); + i_size_write(bdev->bd_inode, disk_size); + flush_disk(bdev); + } +} +EXPORT_SYMBOL(check_disk_size_change); + +/** + * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back + * @disk: struct gendisk to be revalidated + * + * This routine is a wrapper for lower-level driver's revalidate_disk + * call-backs. It is used to do common pre and post operations needed + * for all revalidate_disk operations. + */ +int revalidate_disk(struct gendisk *disk) +{ + struct block_device *bdev; + int ret = 0; + + if (disk->fops->revalidate_disk) + ret = disk->fops->revalidate_disk(disk); + + bdev = bdget_disk(disk, 0); + if (!bdev) + return ret; + + mutex_lock(&bdev->bd_mutex); + check_disk_size_change(disk, bdev); + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return ret; +} +EXPORT_SYMBOL(revalidate_disk); + /* * This routine checks whether a removable media has been changed, * and invalidates all buffer-cache-entries in that case. This @@ -887,13 +952,9 @@ int check_disk_change(struct block_device *bdev) if (!bdops->media_changed(bdev->bd_disk)) return 0; - if (__invalidate_device(bdev)) - printk("VFS: busy inodes on changed media.\n"); - + flush_disk(bdev); if (bdops->revalidate_disk) bdops->revalidate_disk(bdev->bd_disk); - if (bdev->bd_disk->minors > 1) - bdev->bd_invalidated = 1; return 1; } @@ -927,10 +988,10 @@ static int __blkdev_put(struct block_device *bdev, int for_part); static int do_open(struct block_device *bdev, struct file *file, int for_part) { - struct module *owner = NULL; struct gendisk *disk; + struct hd_struct *part = NULL; int ret; - int part; + int partno; int perm = 0; if (file->f_mode & FMODE_READ) @@ -948,25 +1009,27 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) ret = -ENXIO; file->f_mapping = bdev->bd_inode->i_mapping; + lock_kernel(); - disk = get_gendisk(bdev->bd_dev, &part); - if (!disk) { - unlock_kernel(); - bdput(bdev); - return ret; - } - owner = disk->fops->owner; + + disk = get_gendisk(bdev->bd_dev, &partno); + if (!disk) + goto out_unlock_kernel; + part = disk_get_part(disk, partno); + if (!part) + goto out_unlock_kernel; mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { bdev->bd_disk = disk; + bdev->bd_part = part; bdev->bd_contains = bdev; - if (!part) { + if (!partno) { struct backing_dev_info *bdi; if (disk->fops->open) { ret = disk->fops->open(bdev->bd_inode, file); if (ret) - goto out_first; + goto out_clear; } if (!bdev->bd_openers) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); @@ -978,36 +1041,36 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) if (bdev->bd_invalidated) rescan_partitions(disk, bdev); } else { - struct hd_struct *p; struct block_device *whole; whole = bdget_disk(disk, 0); ret = -ENOMEM; if (!whole) - goto out_first; + goto out_clear; BUG_ON(for_part); ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1); if (ret) - goto out_first; + goto out_clear; bdev->bd_contains = whole; - p = disk->part[part - 1]; bdev->bd_inode->i_data.backing_dev_info = whole->bd_inode->i_data.backing_dev_info; - if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) { + if (!(disk->flags & GENHD_FL_UP) || + !part || !part->nr_sects) { ret = -ENXIO; - goto out_first; + goto out_clear; } - kobject_get(&p->dev.kobj); - bdev->bd_part = p; - bd_set_size(bdev, (loff_t) p->nr_sects << 9); + bd_set_size(bdev, (loff_t)part->nr_sects << 9); } } else { + disk_put_part(part); put_disk(disk); - module_put(owner); + module_put(disk->fops->owner); + part = NULL; + disk = NULL; if (bdev->bd_contains == bdev) { if (bdev->bd_disk->fops->open) { ret = bdev->bd_disk->fops->open(bdev->bd_inode, file); if (ret) - goto out; + goto out_unlock_bdev; } if (bdev->bd_invalidated) rescan_partitions(bdev->bd_disk, bdev); @@ -1020,19 +1083,24 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) unlock_kernel(); return 0; -out_first: + out_clear: bdev->bd_disk = NULL; + bdev->bd_part = NULL; bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, 1); bdev->bd_contains = NULL; - put_disk(disk); - module_put(owner); -out: + out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); + out_unlock_kernel: unlock_kernel(); - if (ret) - bdput(bdev); + + disk_put_part(part); + if (disk) + module_put(disk->fops->owner); + put_disk(disk); + bdput(bdev); + return ret; } @@ -1117,11 +1185,8 @@ static int __blkdev_put(struct block_device *bdev, int for_part) put_disk(disk); module_put(owner); - - if (bdev->bd_contains != bdev) { - kobject_put(&bdev->bd_part->dev.kobj); - bdev->bd_part = NULL; - } + disk_put_part(bdev->bd_part); + bdev->bd_part = NULL; bdev->bd_disk = NULL; bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; if (bdev != bdev->bd_contains) @@ -1197,10 +1262,9 @@ EXPORT_SYMBOL(ioctl_by_bdev); /** * lookup_bdev - lookup a struct block_device by name - * * @path: special file representing the block device * - * Get a reference to the blockdevice at @path in the current + * Get a reference to the blockdevice at @pathname in the current * namespace if possible and return it. Return ERR_PTR(error) * otherwise. */ diff --git a/fs/buffer.c b/fs/buffer.c index 38653e36e22..ac78d4c19b3 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2926,14 +2926,17 @@ int submit_bh(int rw, struct buffer_head * bh) BUG_ON(!buffer_mapped(bh)); BUG_ON(!bh->b_end_io); - if (buffer_ordered(bh) && (rw == WRITE)) - rw = WRITE_BARRIER; + /* + * Mask in barrier bit for a write (could be either a WRITE or a + * WRITE_SYNC + */ + if (buffer_ordered(bh) && (rw & WRITE)) + rw |= WRITE_BARRIER; /* - * Only clear out a write error when rewriting, should this - * include WRITE_SYNC as well? + * Only clear out a write error when rewriting */ - if (test_set_buffer_req(bh) && (rw == WRITE || rw == WRITE_BARRIER)) + if (test_set_buffer_req(bh) && (rw & WRITE)) clear_buffer_write_io_error(bh); /* diff --git a/fs/char_dev.c b/fs/char_dev.c index 3cb7cda3d78..262fa10e213 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -22,9 +22,6 @@ #include <linux/mutex.h> #include <linux/backing-dev.h> -#ifdef CONFIG_KMOD -#include <linux/kmod.h> -#endif #include "internal.h" /* diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index f5d0083e09f..06e521a945c 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -4,7 +4,15 @@ Fix premature write failure on congested networks (we would give up on EAGAIN from the socket too quickly on large writes). Cifs_mkdir and cifs_create now respect the setgid bit on parent dir. Fix endian problems in acl (mode from/to cifs acl) on bigendian -architectures. +architectures. Fix problems with preserving timestamps on copying open +files (e.g. "cp -a") to Windows servers. For mkdir and create honor setgid bit +on parent directory when server supports Unix Extensions but not POSIX +create. Update cifs.upcall version to handle new Kerberos sec flags +(this requires update of cifs.upcall program from Samba). Fix memory leak +on dns_upcall (resolving DFS referralls). Fix plain text password +authentication (requires setting SecurityFlags to 0x30030 to enable +lanman and plain text though). Fix writes to be at correct offset when +file is open with O_APPEND and file is on a directio (forcediretio) mount. Version 1.53 ------------ diff --git a/fs/cifs/README b/fs/cifs/README index 2bd6fe556f8..bd2343d4c6a 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -542,10 +542,20 @@ SecurityFlags Flags which control security negotiation and hashing mechanisms (as "must use") on the other hand does not make much sense. Default flags are 0x07007 - (NTLM, NTLMv2 and packet signing allowed). Maximum + (NTLM, NTLMv2 and packet signing allowed). The maximum allowable flags if you want to allow mounts to servers using weaker password hashes is 0x37037 (lanman, - plaintext, ntlm, ntlmv2, signing allowed): + plaintext, ntlm, ntlmv2, signing allowed). Some + SecurityFlags require the corresponding menuconfig + options to be enabled (lanman and plaintext require + CONFIG_CIFS_WEAK_PW_HASH for example). Enabling + plaintext authentication currently requires also + enabling lanman authentication in the security flags + because the cifs module only supports sending + laintext passwords using the older lanman dialect + form of the session setup SMB. (e.g. for authentication + using plain text passwords, set the SecurityFlags + to 0x30030): may use packet signing 0x00001 must use packet signing 0x01001 @@ -642,8 +652,30 @@ The statistics for the number of total SMBs and oplock breaks are different in that they represent all for that share, not just those for which the server returned success. -Also note that "cat /proc/fs/cifs/DebugData" will display information about +Also note that "cat /proc/fs/cifs/DebugData" will display information about the active sessions and the shares that are mounted. -Enabling Kerberos (extended security) works when CONFIG_CIFS_EXPERIMENTAL is -on but requires a user space helper (from the Samba project). NTLM and NTLMv2 and -LANMAN support do not require this helper. + +Enabling Kerberos (extended security) works but requires version 1.2 or later +of the helper program cifs.upcall to be present and to be configured in the +/etc/request-key.conf file. The cifs.upcall helper program is from the Samba +project(http://www.samba.org). NTLM and NTLMv2 and LANMAN support do not +require this helper. Note that NTLMv2 security (which does not require the +cifs.upcall helper program), instead of using Kerberos, is sufficient for +some use cases. + +Enabling DFS support (used to access shares transparently in an MS-DFS +global name space) requires that CONFIG_CIFS_EXPERIMENTAL be enabled. In +addition, DFS support for target shares which are specified as UNC +names which begin with host names (rather than IP addresses) requires +a user space helper (such as cifs.upcall) to be present in order to +translate host names to ip address, and the user space helper must also +be configured in the file /etc/request-key.conf + +To use cifs Kerberos and DFS support, the Linux keyutils package should be +installed and something like the following lines should be added to the +/etc/request-key.conf file: + +create cifs.spnego * * /usr/local/sbin/cifs.upcall %k +create dns_resolver * * /usr/local/sbin/cifs.upcall %k + + diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index 5fabd2caf93..1b09f167006 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -476,6 +476,7 @@ decode_negTokenInit(unsigned char *security_blob, int length, unsigned int cls, con, tag, oidlen, rc; bool use_ntlmssp = false; bool use_kerberos = false; + bool use_mskerberos = false; *secType = NTLM; /* BB eventually make Kerberos or NLTMSSP the default*/ @@ -574,10 +575,12 @@ decode_negTokenInit(unsigned char *security_blob, int length, *(oid + 1), *(oid + 2), *(oid + 3))); if (compare_oid(oid, oidlen, MSKRB5_OID, - MSKRB5_OID_LEN)) - use_kerberos = true; + MSKRB5_OID_LEN) && + !use_kerberos) + use_mskerberos = true; else if (compare_oid(oid, oidlen, KRB5_OID, - KRB5_OID_LEN)) + KRB5_OID_LEN) && + !use_mskerberos) use_kerberos = true; else if (compare_oid(oid, oidlen, NTLMSSP_OID, NTLMSSP_OID_LEN)) @@ -630,6 +633,8 @@ decode_negTokenInit(unsigned char *security_blob, int length, if (use_kerberos) *secType = Kerberos; + else if (use_mskerberos) + *secType = MSKerberos; else if (use_ntlmssp) *secType = NTLMSSP; diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 2434ab0e879..fcee9298b62 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -66,11 +66,28 @@ struct key_type cifs_spnego_key_type = { .describe = user_describe, }; -#define MAX_VER_STR_LEN 8 /* length of longest version string e.g. - strlen("ver=0xFF") */ -#define MAX_MECH_STR_LEN 13 /* length of longest security mechanism name, eg - in future could have strlen(";sec=ntlmsspi") */ -#define MAX_IPV6_ADDR_LEN 42 /* eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */ +/* length of longest version string e.g. strlen("ver=0xFF") */ +#define MAX_VER_STR_LEN 8 + +/* length of longest security mechanism name, eg in future could have + * strlen(";sec=ntlmsspi") */ +#define MAX_MECH_STR_LEN 13 + +/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/60 */ +#define MAX_IPV6_ADDR_LEN 42 + +/* strlen of "host=" */ +#define HOST_KEY_LEN 5 + +/* strlen of ";ip4=" or ";ip6=" */ +#define IP_KEY_LEN 5 + +/* strlen of ";uid=0x" */ +#define UID_KEY_LEN 7 + +/* strlen of ";user=" */ +#define USER_KEY_LEN 6 + /* get a key struct with a SPNEGO security blob, suitable for session setup */ struct key * cifs_get_spnego_key(struct cifsSesInfo *sesInfo) @@ -84,11 +101,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) /* length of fields (with semicolons): ver=0xyz ip4=ipaddress host=hostname sec=mechanism uid=0xFF user=username */ desc_len = MAX_VER_STR_LEN + - 6 /* len of "host=" */ + strlen(hostname) + - 5 /* len of ";ipv4=" */ + MAX_IPV6_ADDR_LEN + + HOST_KEY_LEN + strlen(hostname) + + IP_KEY_LEN + MAX_IPV6_ADDR_LEN + MAX_MECH_STR_LEN + - 7 /* len of ";uid=0x" */ + (sizeof(uid_t) * 2) + - 6 /* len of ";user=" */ + strlen(sesInfo->userName) + 1; + UID_KEY_LEN + (sizeof(uid_t) * 2) + + USER_KEY_LEN + strlen(sesInfo->userName) + 1; spnego_key = ERR_PTR(-ENOMEM); description = kzalloc(desc_len, GFP_KERNEL); @@ -114,9 +131,11 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) dp = description + strlen(description); - /* for now, only sec=krb5 is valid */ + /* for now, only sec=krb5 and sec=mskrb5 are valid */ if (server->secType == Kerberos) sprintf(dp, ";sec=krb5"); + else if (server->secType == MSKerberos) + sprintf(dp, ";sec=mskrb5"); else goto out; diff --git a/fs/cifs/cifs_spnego.h b/fs/cifs/cifs_spnego.h index 05a34b17a1a..e4041ec4d71 100644 --- a/fs/cifs/cifs_spnego.h +++ b/fs/cifs/cifs_spnego.h @@ -23,7 +23,7 @@ #ifndef _CIFS_SPNEGO_H #define _CIFS_SPNEGO_H -#define CIFS_SPNEGO_UPCALL_VERSION 1 +#define CIFS_SPNEGO_UPCALL_VERSION 2 /* * The version field should always be set to CIFS_SPNEGO_UPCALL_VERSION. diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 83fd40dc1ef..bd5f13d3845 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -294,6 +294,7 @@ void calc_lanman_hash(struct cifsSesInfo *ses, char *lnm_session_key) if ((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0) if (extended_security & CIFSSEC_MAY_PLNTXT) { + memset(lnm_session_key, 0, CIFS_SESS_KEY_SIZE); memcpy(lnm_session_key, password_with_pad, CIFS_ENCPWD_SIZE); return; diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 135c965c413..f7b4a5cd837 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -41,7 +41,7 @@ extern int cifs_create(struct inode *, struct dentry *, int, struct nameidata *); extern struct dentry *cifs_lookup(struct inode *, struct dentry *, struct nameidata *); -extern int cifs_unlink(struct inode *, struct dentry *); +extern int cifs_unlink(struct inode *dir, struct dentry *dentry); extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); extern int cifs_mknod(struct inode *, struct dentry *, int, dev_t); extern int cifs_mkdir(struct inode *, struct dentry *, int); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 7e1cf262eff..0d22479d99b 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -80,7 +80,8 @@ enum securityEnum { NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ RawNTLMSSP, /* NTLMSSP without SPNEGO */ NTLMSSP, /* NTLMSSP via SPNEGO */ - Kerberos /* Kerberos via SPNEGO */ + Kerberos, /* Kerberos via SPNEGO */ + MSKerberos, /* MS Kerberos via SPNEGO */ }; enum protocolEnum { @@ -308,6 +309,7 @@ struct cifs_search_info { __u32 resume_key; char *ntwrk_buf_start; char *srch_entries_start; + char *last_entry; char *presume_name; unsigned int resume_name_len; bool endOfSearch:1; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index a729d083e6f..0cff7fe986e 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -179,6 +179,8 @@ extern int CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon, extern int CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon, const FILE_BASIC_INFO *data, __u16 fid, __u32 pid_of_opener); +extern int CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon, + bool delete_file, __u16 fid, __u32 pid_of_opener); #if 0 extern int CIFSSMBSetAttrLegacy(int xid, struct cifsTconInfo *tcon, char *fileName, __u16 dos_attributes, @@ -229,7 +231,7 @@ extern int CIFSSMBRename(const int xid, struct cifsTconInfo *tcon, const struct nls_table *nls_codepage, int remap_special_chars); extern int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon, - int netfid, char *target_name, + int netfid, const char *target_name, const struct nls_table *nls_codepage, int remap_special_chars); extern int CIFSCreateHardLink(const int xid, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 994de7c9047..6f4ffe15d68 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -2017,7 +2017,7 @@ renameRetry: } int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon, - int netfid, char *target_name, + int netfid, const char *target_name, const struct nls_table *nls_codepage, int remap) { struct smb_com_transaction2_sfi_req *pSMB = NULL; @@ -2071,7 +2071,7 @@ int CIFSSMBRenameOpenFile(const int xid, struct cifsTconInfo *pTcon, remap); } rename_info->target_name_len = cpu_to_le32(2 * len_of_str); - count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str) + 2; + count = 12 /* sizeof(struct set_file_rename) */ + (2 * len_of_str); byte_count += count; pSMB->DataCount = cpu_to_le16(count); pSMB->TotalDataCount = pSMB->DataCount; @@ -3614,6 +3614,8 @@ findFirstRetry: /* BB remember to free buffer if error BB */ rc = validate_t2((struct smb_t2_rsp *)pSMBr); if (rc == 0) { + unsigned int lnoff; + if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) psrch_inf->unicode = true; else @@ -3636,6 +3638,17 @@ findFirstRetry: le16_to_cpu(parms->SearchCount); psrch_inf->index_of_last_entry = 2 /* skip . and .. */ + psrch_inf->entries_in_buffer; + lnoff = le16_to_cpu(parms->LastNameOffset); + if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < + lnoff) { + cERROR(1, ("ignoring corrupt resume name")); + psrch_inf->last_entry = NULL; + return rc; + } + + psrch_inf->last_entry = psrch_inf->srch_entries_start + + lnoff; + *pnetfid = parms->SearchHandle; } else { cifs_buf_release(pSMB); @@ -3725,6 +3738,8 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon, rc = validate_t2((struct smb_t2_rsp *)pSMBr); if (rc == 0) { + unsigned int lnoff; + /* BB fixme add lock for file (srch_info) struct here */ if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) psrch_inf->unicode = true; @@ -3751,6 +3766,16 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon, le16_to_cpu(parms->SearchCount); psrch_inf->index_of_last_entry += psrch_inf->entries_in_buffer; + lnoff = le16_to_cpu(parms->LastNameOffset); + if (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE < + lnoff) { + cERROR(1, ("ignoring corrupt resume name")); + psrch_inf->last_entry = NULL; + return rc; + } else + psrch_inf->last_entry = + psrch_inf->srch_entries_start + lnoff; + /* cFYI(1,("fnxt2 entries in buf %d index_of_last %d", psrch_inf->entries_in_buffer, psrch_inf->index_of_last_entry)); */ @@ -4876,6 +4901,61 @@ CIFSSMBSetFileInfo(const int xid, struct cifsTconInfo *tcon, return rc; } +int +CIFSSMBSetFileDisposition(const int xid, struct cifsTconInfo *tcon, + bool delete_file, __u16 fid, __u32 pid_of_opener) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + char *data_offset; + int rc = 0; + __u16 params, param_offset, offset, byte_count, count; + + cFYI(1, ("Set File Disposition (via SetFileInfo)")); + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMB->hdr.Pid = cpu_to_le16((__u16)pid_of_opener); + pSMB->hdr.PidHigh = cpu_to_le16((__u16)(pid_of_opener >> 16)); + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + + count = 1; + pSMB->MaxParameterCount = cpu_to_le16(2); + /* BB find max SMB PDU from sess */ + pSMB->MaxDataCount = cpu_to_le16(1000); + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->Fid = fid; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_FILE_DISPOSITION_INFO); + pSMB->Reserved4 = 0; + pSMB->hdr.smb_buf_length += byte_count; + pSMB->ByteCount = cpu_to_le16(byte_count); + *data_offset = delete_file ? 1 : 0; + rc = SendReceiveNoRsp(xid, tcon->ses, (struct smb_hdr *) pSMB, 0); + if (rc) + cFYI(1, ("Send error in SetFileDisposition = %d", rc)); + + return rc; +} int CIFSSMBSetPathInfo(const int xid, struct cifsTconInfo *tcon, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 0711db65afe..4c13bcdb92a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3598,19 +3598,21 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, char ntlm_session_key[CIFS_SESS_KEY_SIZE]; bool ntlmv2_flag = false; int first_time = 0; + struct TCP_Server_Info *server = pSesInfo->server; /* what if server changes its buffer size after dropping the session? */ - if (pSesInfo->server->maxBuf == 0) /* no need to send on reconnect */ { + if (server->maxBuf == 0) /* no need to send on reconnect */ { rc = CIFSSMBNegotiate(xid, pSesInfo); - if (rc == -EAGAIN) /* retry only once on 1st time connection */ { + if (rc == -EAGAIN) { + /* retry only once on 1st time connection */ rc = CIFSSMBNegotiate(xid, pSesInfo); if (rc == -EAGAIN) rc = -EHOSTDOWN; } if (rc == 0) { spin_lock(&GlobalMid_Lock); - if (pSesInfo->server->tcpStatus != CifsExiting) - pSesInfo->server->tcpStatus = CifsGood; + if (server->tcpStatus != CifsExiting) + server->tcpStatus = CifsGood; else rc = -EHOSTDOWN; spin_unlock(&GlobalMid_Lock); @@ -3623,23 +3625,22 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, goto ss_err_exit; pSesInfo->flags = 0; - pSesInfo->capabilities = pSesInfo->server->capabilities; + pSesInfo->capabilities = server->capabilities; if (linuxExtEnabled == 0) pSesInfo->capabilities &= (~CAP_UNIX); /* pSesInfo->sequence_number = 0;*/ cFYI(1, ("Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d", - pSesInfo->server->secMode, - pSesInfo->server->capabilities, - pSesInfo->server->timeAdj)); + server->secMode, server->capabilities, server->timeAdj)); + if (experimEnabled < 2) rc = CIFS_SessSetup(xid, pSesInfo, first_time, nls_info); else if (extended_security && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) - && (pSesInfo->server->secType == NTLMSSP)) { + && (server->secType == NTLMSSP)) { rc = -EOPNOTSUPP; } else if (extended_security && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) - && (pSesInfo->server->secType == RawNTLMSSP)) { + && (server->secType == RawNTLMSSP)) { cFYI(1, ("NTLMSSP sesssetup")); rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag, nls_info); @@ -3668,12 +3669,12 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, } else { SMBNTencrypt(pSesInfo->password, - pSesInfo->server->cryptKey, + server->cryptKey, ntlm_session_key); if (first_time) cifs_calculate_mac_key( - &pSesInfo->server->mac_signing_key, + &server->mac_signing_key, ntlm_session_key, pSesInfo->password); } @@ -3686,13 +3687,13 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, nls_info); } } else { /* old style NTLM 0.12 session setup */ - SMBNTencrypt(pSesInfo->password, pSesInfo->server->cryptKey, + SMBNTencrypt(pSesInfo->password, server->cryptKey, ntlm_session_key); if (first_time) - cifs_calculate_mac_key( - &pSesInfo->server->mac_signing_key, - ntlm_session_key, pSesInfo->password); + cifs_calculate_mac_key(&server->mac_signing_key, + ntlm_session_key, + pSesInfo->password); rc = CIFSSessSetup(xid, pSesInfo, ntlm_session_key, nls_info); } diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index f730ef35499..1e0c1bd8f2e 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -29,38 +29,13 @@ #include "cifsproto.h" #include "cifs_debug.h" -static int dns_resolver_instantiate(struct key *key, const void *data, - size_t datalen) -{ - int rc = 0; - char *ip; - - ip = kmalloc(datalen+1, GFP_KERNEL); - if (!ip) - return -ENOMEM; - - memcpy(ip, data, datalen); - ip[datalen] = '\0'; - - rcu_assign_pointer(key->payload.data, ip); - - return rc; -} - -struct key_type key_type_dns_resolver = { - .name = "dns_resolver", - .def_datalen = sizeof(struct in_addr), - .describe = user_describe, - .instantiate = dns_resolver_instantiate, - .match = user_match, -}; - /* Checks if supplied name is IP address * returns: * 1 - name is IP * 0 - name is not IP */ -static int is_ip(const char *name) +static int +is_ip(const char *name) { int rc; struct sockaddr_in sin_server; @@ -82,6 +57,47 @@ static int is_ip(const char *name) return 0; } +static int +dns_resolver_instantiate(struct key *key, const void *data, + size_t datalen) +{ + int rc = 0; + char *ip; + + ip = kmalloc(datalen + 1, GFP_KERNEL); + if (!ip) + return -ENOMEM; + + memcpy(ip, data, datalen); + ip[datalen] = '\0'; + + /* make sure this looks like an address */ + if (!is_ip((const char *) ip)) { + kfree(ip); + return -EINVAL; + } + + key->type_data.x[0] = datalen; + rcu_assign_pointer(key->payload.data, ip); + + return rc; +} + +static void +dns_resolver_destroy(struct key *key) +{ + kfree(key->payload.data); +} + +struct key_type key_type_dns_resolver = { + .name = "dns_resolver", + .def_datalen = sizeof(struct in_addr), + .describe = user_describe, + .instantiate = dns_resolver_instantiate, + .destroy = dns_resolver_destroy, + .match = user_match, +}; + /* Resolves server name to ip address. * input: * unc - server UNC @@ -133,6 +149,7 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) rkey = request_key(&key_type_dns_resolver, name, ""); if (!IS_ERR(rkey)) { + len = rkey->type_data.x[0]; data = rkey->payload.data; } else { cERROR(1, ("%s: unable to resolve: %s", __func__, name)); @@ -141,11 +158,9 @@ dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) skip_upcall: if (data) { - len = strlen(data); - *ip_addr = kmalloc(len+1, GFP_KERNEL); + *ip_addr = kmalloc(len + 1, GFP_KERNEL); if (*ip_addr) { - memcpy(*ip_addr, data, len); - (*ip_addr)[len] = '\0'; + memcpy(*ip_addr, data, len + 1); if (!IS_ERR(rkey)) cFYI(1, ("%s: resolved: %s to %s", __func__, name, diff --git a/fs/cifs/file.c b/fs/cifs/file.c index ff14d14903a..c4a8a060512 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -107,7 +107,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, /* want handles we can use to read with first in the list so we do not have to walk the - list to search for one in prepare_write */ + list to search for one in write_begin */ if ((file->f_flags & O_ACCMODE) == O_WRONLY) { list_add_tail(&pCifsFile->flist, &pCifsInode->openFileList); @@ -833,6 +833,10 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, return -EBADF; open_file = (struct cifsFileInfo *) file->private_data; + rc = generic_write_checks(file, poffset, &write_size, 0); + if (rc) + return rc; + xid = GetXid(); if (*poffset > file->f_path.dentry->d_inode->i_size) @@ -911,7 +915,7 @@ ssize_t cifs_user_write(struct file *file, const char __user *write_data, } static ssize_t cifs_write(struct file *file, const char *write_data, - size_t write_size, loff_t *poffset) + size_t write_size, loff_t *poffset) { int rc = 0; unsigned int bytes_written = 0; @@ -1061,6 +1065,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode) struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode) { struct cifsFileInfo *open_file; + bool any_available = false; int rc; /* Having a null inode here (because mapping->host was set to zero by @@ -1076,8 +1081,10 @@ struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *cifs_inode) read_lock(&GlobalSMBSeslock); refind_writable: list_for_each_entry(open_file, &cifs_inode->openFileList, flist) { - if (open_file->closePend) + if (open_file->closePend || + (!any_available && open_file->pid != current->tgid)) continue; + if (open_file->pfile && ((open_file->pfile->f_flags & O_RDWR) || (open_file->pfile->f_flags & O_WRONLY))) { @@ -1127,6 +1134,11 @@ refind_writable: of the loop here. */ } } + /* couldn't find useable FH with same pid, try any available */ + if (!any_available) { + any_available = true; + goto refind_writable; + } read_unlock(&GlobalSMBSeslock); return NULL; } @@ -1443,49 +1455,52 @@ static int cifs_writepage(struct page *page, struct writeback_control *wbc) return rc; } -static int cifs_commit_write(struct file *file, struct page *page, - unsigned offset, unsigned to) +static int cifs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { - int xid; - int rc = 0; - struct inode *inode = page->mapping->host; - loff_t position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - char *page_data; + int rc; + struct inode *inode = mapping->host; - xid = GetXid(); - cFYI(1, ("commit write for page %p up to position %lld for %d", - page, position, to)); - spin_lock(&inode->i_lock); - if (position > inode->i_size) - i_size_write(inode, position); + cFYI(1, ("write_end for page %p from pos %lld with %d bytes", + page, pos, copied)); + + if (!PageUptodate(page) && copied == PAGE_CACHE_SIZE) + SetPageUptodate(page); - spin_unlock(&inode->i_lock); if (!PageUptodate(page)) { - position = ((loff_t)page->index << PAGE_CACHE_SHIFT) + offset; - /* can not rely on (or let) writepage write this data */ - if (to < offset) { - cFYI(1, ("Illegal offsets, can not copy from %d to %d", - offset, to)); - FreeXid(xid); - return rc; - } + char *page_data; + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + int xid; + + xid = GetXid(); /* this is probably better than directly calling partialpage_write since in this function the file handle is known which we might as well leverage */ /* BB check if anything else missing out of ppw such as updating last write time */ page_data = kmap(page); - rc = cifs_write(file, page_data + offset, to-offset, - &position); - if (rc > 0) - rc = 0; - /* else if (rc < 0) should we set writebehind rc? */ + rc = cifs_write(file, page_data + offset, copied, &pos); + /* if (rc < 0) should we set writebehind rc? */ kunmap(page); + + FreeXid(xid); } else { + rc = copied; + pos += copied; set_page_dirty(page); } - FreeXid(xid); + if (rc > 0) { + spin_lock(&inode->i_lock); + if (pos > inode->i_size) + i_size_write(inode, pos); + spin_unlock(&inode->i_lock); + } + + unlock_page(page); + page_cache_release(page); + return rc; } @@ -2031,49 +2046,44 @@ bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 end_of_file) return true; } -static int cifs_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int cifs_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { - int rc = 0; - loff_t i_size; - loff_t offset; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + loff_t offset = pos & (PAGE_CACHE_SIZE - 1); + + cFYI(1, ("write_begin from %lld len %d", (long long)pos, len)); - cFYI(1, ("prepare write for page %p from %d to %d", page, from, to)); - if (PageUptodate(page)) + *pagep = __grab_cache_page(mapping, index); + if (!*pagep) + return -ENOMEM; + + if (PageUptodate(*pagep)) return 0; /* If we are writing a full page it will be up to date, no need to read from the server */ - if ((to == PAGE_CACHE_SIZE) && (from == 0)) { - SetPageUptodate(page); + if (len == PAGE_CACHE_SIZE && flags & AOP_FLAG_UNINTERRUPTIBLE) return 0; - } - offset = (loff_t)page->index << PAGE_CACHE_SHIFT; - i_size = i_size_read(page->mapping->host); + if ((file->f_flags & O_ACCMODE) != O_WRONLY) { + int rc; - if ((offset >= i_size) || - ((from == 0) && (offset + to) >= i_size)) { - /* - * We don't need to read data beyond the end of the file. - * zero it, and set the page uptodate - */ - simple_prepare_write(file, page, from, to); - SetPageUptodate(page); - } else if ((file->f_flags & O_ACCMODE) != O_WRONLY) { /* might as well read a page, it is fast enough */ - rc = cifs_readpage_worker(file, page, &offset); + rc = cifs_readpage_worker(file, *pagep, &offset); + + /* we do not need to pass errors back + e.g. if we do not have read access to the file + because cifs_write_end will attempt synchronous writes + -- shaggy */ } else { /* we could try using another file handle if there is one - but how would we lock it to prevent close of that handle racing with this read? In any case - this will be written out by commit_write so is fine */ + this will be written out by write_end so is fine */ } - /* we do not need to pass errors back - e.g. if we do not have read access to the file - because cifs_commit_write will do the right thing. -- shaggy */ - return 0; } @@ -2082,8 +2092,8 @@ const struct address_space_operations cifs_addr_ops = { .readpages = cifs_readpages, .writepage = cifs_writepage, .writepages = cifs_writepages, - .prepare_write = cifs_prepare_write, - .commit_write = cifs_commit_write, + .write_begin = cifs_write_begin, + .write_end = cifs_write_end, .set_page_dirty = __set_page_dirty_nobuffers, /* .sync_page = cifs_sync_page, */ /* .direct_IO = */ @@ -2098,8 +2108,8 @@ const struct address_space_operations cifs_addr_ops_smallbuf = { .readpage = cifs_readpage, .writepage = cifs_writepage, .writepages = cifs_writepages, - .prepare_write = cifs_prepare_write, - .commit_write = cifs_commit_write, + .write_begin = cifs_write_begin, + .write_end = cifs_write_end, .set_page_dirty = __set_page_dirty_nobuffers, /* .sync_page = cifs_sync_page, */ /* .direct_IO = */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 848286861c3..a8c833345fc 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -546,7 +546,8 @@ int cifs_get_inode_info(struct inode **pinode, if ((inode->i_mode & S_IWUGO) == 0 && (attr & ATTR_READONLY) == 0) inode->i_mode |= (S_IWUGO & default_mode); - inode->i_mode &= ~S_IFMT; + + inode->i_mode &= ~S_IFMT; } /* clear write bits if ATTR_READONLY is set */ if (attr & ATTR_READONLY) @@ -664,40 +665,201 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino) return inode; } -int cifs_unlink(struct inode *inode, struct dentry *direntry) +static int +cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid, + char *full_path, __u32 dosattr) +{ + int rc; + int oplock = 0; + __u16 netfid; + __u32 netpid; + bool set_time = false; + struct cifsFileInfo *open_file; + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsTconInfo *pTcon = cifs_sb->tcon; + FILE_BASIC_INFO info_buf; + + if (attrs->ia_valid & ATTR_ATIME) { + set_time = true; + info_buf.LastAccessTime = + cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime)); + } else + info_buf.LastAccessTime = 0; + + if (attrs->ia_valid & ATTR_MTIME) { + set_time = true; + info_buf.LastWriteTime = + cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime)); + } else + info_buf.LastWriteTime = 0; + + /* + * Samba throws this field away, but windows may actually use it. + * Do not set ctime unless other time stamps are changed explicitly + * (i.e. by utimes()) since we would then have a mix of client and + * server times. + */ + if (set_time && (attrs->ia_valid & ATTR_CTIME)) { + cFYI(1, ("CIFS - CTIME changed")); + info_buf.ChangeTime = + cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime)); + } else + info_buf.ChangeTime = 0; + + info_buf.CreationTime = 0; /* don't change */ + info_buf.Attributes = cpu_to_le32(dosattr); + + /* + * If the file is already open for write, just use that fileid + */ + open_file = find_writable_file(cifsInode); + if (open_file) { + netfid = open_file->netfid; + netpid = open_file->pid; + goto set_via_filehandle; + } + + /* + * NT4 apparently returns success on this call, but it doesn't + * really work. + */ + if (!(pTcon->ses->flags & CIFS_SES_NT4)) { + rc = CIFSSMBSetPathInfo(xid, pTcon, full_path, + &info_buf, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc == 0) { + cifsInode->cifsAttrs = dosattr; + goto out; + } else if (rc != -EOPNOTSUPP && rc != -EINVAL) + goto out; + } + + cFYI(1, ("calling SetFileInfo since SetPathInfo for " + "times not supported by this server")); + rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, + SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, + CREATE_NOT_DIR, &netfid, &oplock, + NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (rc != 0) { + if (rc == -EIO) + rc = -EINVAL; + goto out; + } + + netpid = current->tgid; + +set_via_filehandle: + rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid); + if (!rc) + cifsInode->cifsAttrs = dosattr; + + if (open_file == NULL) + CIFSSMBClose(xid, pTcon, netfid); + else + atomic_dec(&open_file->wrtPending); +out: + return rc; +} + +/* + * open the given file (if it isn't already), set the DELETE_ON_CLOSE bit + * and rename it to a random name that hopefully won't conflict with + * anything else. + */ +static int +cifs_rename_pending_delete(char *full_path, struct inode *inode, int xid) +{ + int oplock = 0; + int rc; + __u16 netfid; + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsTconInfo *tcon = cifs_sb->tcon; + __u32 dosattr; + FILE_BASIC_INFO *info_buf; + + rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN, + DELETE|FILE_WRITE_ATTRIBUTES, + CREATE_NOT_DIR|CREATE_DELETE_ON_CLOSE, + &netfid, &oplock, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc != 0) + goto out; + + /* set ATTR_HIDDEN and clear ATTR_READONLY */ + cifsInode = CIFS_I(inode); + dosattr = cifsInode->cifsAttrs & ~ATTR_READONLY; + if (dosattr == 0) + dosattr |= ATTR_NORMAL; + dosattr |= ATTR_HIDDEN; + + info_buf = kzalloc(sizeof(*info_buf), GFP_KERNEL); + if (info_buf == NULL) { + rc = -ENOMEM; + goto out_close; + } + info_buf->Attributes = cpu_to_le32(dosattr); + rc = CIFSSMBSetFileInfo(xid, tcon, info_buf, netfid, current->tgid); + kfree(info_buf); + if (rc != 0) + goto out_close; + cifsInode->cifsAttrs = dosattr; + + /* silly-rename the file */ + CIFSSMBRenameOpenFile(xid, tcon, netfid, NULL, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + /* set DELETE_ON_CLOSE */ + rc = CIFSSMBSetFileDisposition(xid, tcon, true, netfid, current->tgid); + + /* + * some samba versions return -ENOENT when we try to set the file + * disposition here. Likely a samba bug, but work around it for now + */ + if (rc == -ENOENT) + rc = 0; + +out_close: + CIFSSMBClose(xid, tcon, netfid); +out: + return rc; +} + +int cifs_unlink(struct inode *dir, struct dentry *dentry) { int rc = 0; int xid; - struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *pTcon; char *full_path = NULL; - struct cifsInodeInfo *cifsInode; - FILE_BASIC_INFO *pinfo_buf; + struct inode *inode = dentry->d_inode; + struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct super_block *sb = dir->i_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifsTconInfo *tcon = cifs_sb->tcon; + struct iattr *attrs = NULL; + __u32 dosattr = 0, origattr = 0; - cFYI(1, ("cifs_unlink, inode = 0x%p", inode)); + cFYI(1, ("cifs_unlink, dir=0x%p, dentry=0x%p", dir, dentry)); xid = GetXid(); - if (inode) - cifs_sb = CIFS_SB(inode->i_sb); - else - cifs_sb = CIFS_SB(direntry->d_sb); - pTcon = cifs_sb->tcon; - - /* Unlink can be called from rename so we can not grab the sem here - since we deadlock otherwise */ -/* mutex_lock(&direntry->d_sb->s_vfs_rename_mutex);*/ - full_path = build_path_from_dentry(direntry); -/* mutex_unlock(&direntry->d_sb->s_vfs_rename_mutex);*/ + /* Unlink can be called from rename so we can not take the + * sb->s_vfs_rename_mutex here */ + full_path = build_path_from_dentry(dentry); if (full_path == NULL) { FreeXid(xid); return -ENOMEM; } - if ((pTcon->ses->capabilities & CAP_UNIX) && + if ((tcon->ses->capabilities & CAP_UNIX) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & - le64_to_cpu(pTcon->fsUnixInfo.Capability))) { - rc = CIFSPOSIXDelFile(xid, pTcon, full_path, + le64_to_cpu(tcon->fsUnixInfo.Capability))) { + rc = CIFSPOSIXDelFile(xid, tcon, full_path, SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); cFYI(1, ("posix del rc %d", rc)); @@ -705,125 +867,60 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry) goto psx_del_no_retry; } - rc = CIFSSMBDelFile(xid, pTcon, full_path, cifs_sb->local_nls, +retry_std_delete: + rc = CIFSSMBDelFile(xid, tcon, full_path, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + psx_del_no_retry: if (!rc) { - if (direntry->d_inode) - drop_nlink(direntry->d_inode); + if (inode) + drop_nlink(inode); } else if (rc == -ENOENT) { - d_drop(direntry); + d_drop(dentry); } else if (rc == -ETXTBSY) { - int oplock = 0; - __u16 netfid; - - rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, DELETE, - CREATE_NOT_DIR | CREATE_DELETE_ON_CLOSE, - &netfid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - CIFSSMBRenameOpenFile(xid, pTcon, netfid, NULL, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - CIFSSMBClose(xid, pTcon, netfid); - if (direntry->d_inode) - drop_nlink(direntry->d_inode); + rc = cifs_rename_pending_delete(full_path, inode, xid); + if (rc == 0) + drop_nlink(inode); + } else if (rc == -EACCES && dosattr == 0) { + attrs = kzalloc(sizeof(*attrs), GFP_KERNEL); + if (attrs == NULL) { + rc = -ENOMEM; + goto out_reval; } - } else if (rc == -EACCES) { - /* try only if r/o attribute set in local lookup data? */ - pinfo_buf = kzalloc(sizeof(FILE_BASIC_INFO), GFP_KERNEL); - if (pinfo_buf) { - /* ATTRS set to normal clears r/o bit */ - pinfo_buf->Attributes = cpu_to_le32(ATTR_NORMAL); - if (!(pTcon->ses->flags & CIFS_SES_NT4)) - rc = CIFSSMBSetPathInfo(xid, pTcon, full_path, - pinfo_buf, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - else - rc = -EOPNOTSUPP; - if (rc == -EOPNOTSUPP) { - int oplock = 0; - __u16 netfid; - /* rc = CIFSSMBSetAttrLegacy(xid, pTcon, - full_path, - (__u16)ATTR_NORMAL, - cifs_sb->local_nls); - For some strange reason it seems that NT4 eats the - old setattr call without actually setting the - attributes so on to the third attempted workaround - */ - - /* BB could scan to see if we already have it open - and pass in pid of opener to function */ - rc = CIFSSMBOpen(xid, pTcon, full_path, - FILE_OPEN, SYNCHRONIZE | - FILE_WRITE_ATTRIBUTES, 0, - &netfid, &oplock, NULL, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - rc = CIFSSMBSetFileInfo(xid, pTcon, - pinfo_buf, - netfid, - current->tgid); - CIFSSMBClose(xid, pTcon, netfid); - } - } - kfree(pinfo_buf); - } - if (rc == 0) { - rc = CIFSSMBDelFile(xid, pTcon, full_path, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (!rc) { - if (direntry->d_inode) - drop_nlink(direntry->d_inode); - } else if (rc == -ETXTBSY) { - int oplock = 0; - __u16 netfid; - - rc = CIFSSMBOpen(xid, pTcon, full_path, - FILE_OPEN, DELETE, - CREATE_NOT_DIR | - CREATE_DELETE_ON_CLOSE, - &netfid, &oplock, NULL, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - CIFSSMBRenameOpenFile(xid, pTcon, - netfid, NULL, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - CIFSSMBClose(xid, pTcon, netfid); - if (direntry->d_inode) - drop_nlink(direntry->d_inode); - } - /* BB if rc = -ETXTBUSY goto the rename logic BB */ - } - } - } - if (direntry->d_inode) { - cifsInode = CIFS_I(direntry->d_inode); - cifsInode->time = 0; /* will force revalidate to get info - when needed */ - direntry->d_inode->i_ctime = current_fs_time(inode->i_sb); + /* try to reset dos attributes */ + origattr = cifsInode->cifsAttrs; + if (origattr == 0) + origattr |= ATTR_NORMAL; + dosattr = origattr & ~ATTR_READONLY; + if (dosattr == 0) + dosattr |= ATTR_NORMAL; + dosattr |= ATTR_HIDDEN; + + rc = cifs_set_file_info(inode, attrs, xid, full_path, dosattr); + if (rc != 0) + goto out_reval; + + goto retry_std_delete; } + + /* undo the setattr if we errored out and it's needed */ + if (rc != 0 && dosattr != 0) + cifs_set_file_info(inode, attrs, xid, full_path, origattr); + +out_reval: if (inode) { - inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); cifsInode = CIFS_I(inode); - cifsInode->time = 0; /* force revalidate of dir as well */ + cifsInode->time = 0; /* will force revalidate to get info + when needed */ + inode->i_ctime = current_fs_time(sb); } + dir->i_ctime = dir->i_mtime = current_fs_time(sb); + cifsInode = CIFS_I(dir); + CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ kfree(full_path); + kfree(attrs); FreeXid(xid); return rc; } @@ -868,7 +965,7 @@ static void posix_fill_in_inode(struct inode *tmp_inode, int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) { - int rc = 0; + int rc = 0, tmprc; int xid; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; @@ -930,6 +1027,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) kfree(pInfo); goto mkdir_get_info; } + /* Is an i_ino of zero legal? */ /* Are there sanity checks we can use to ensure that the server is really filling in that field? */ @@ -1018,12 +1116,20 @@ mkdir_get_info: if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && (mode & S_IWUGO) == 0) { FILE_BASIC_INFO pInfo; + struct cifsInodeInfo *cifsInode; + u32 dosattrs; + memset(&pInfo, 0, sizeof(pInfo)); - pInfo.Attributes = cpu_to_le32(ATTR_READONLY); - CIFSSMBSetPathInfo(xid, pTcon, full_path, - &pInfo, cifs_sb->local_nls, + cifsInode = CIFS_I(newinode); + dosattrs = cifsInode->cifsAttrs|ATTR_READONLY; + pInfo.Attributes = cpu_to_le32(dosattrs); + tmprc = CIFSSMBSetPathInfo(xid, pTcon, + full_path, &pInfo, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + if (tmprc == 0) + cifsInode->cifsAttrs = dosattrs; } if (direntry->d_inode) { if (cifs_sb->mnt_cifs_flags & @@ -1095,117 +1201,141 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) return rc; } +static int +cifs_do_rename(int xid, struct dentry *from_dentry, const char *fromPath, + struct dentry *to_dentry, const char *toPath) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(from_dentry->d_sb); + struct cifsTconInfo *pTcon = cifs_sb->tcon; + __u16 srcfid; + int oplock, rc; + + /* try path-based rename first */ + rc = CIFSSMBRename(xid, pTcon, fromPath, toPath, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + /* + * don't bother with rename by filehandle unless file is busy and + * source Note that cross directory moves do not work with + * rename by filehandle to various Windows servers. + */ + if (rc == 0 || rc != -ETXTBSY) + return rc; + + /* open the file to be renamed -- we need DELETE perms */ + rc = CIFSSMBOpen(xid, pTcon, fromPath, FILE_OPEN, DELETE, + CREATE_NOT_DIR, &srcfid, &oplock, NULL, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + if (rc == 0) { + rc = CIFSSMBRenameOpenFile(xid, pTcon, srcfid, + (const char *) to_dentry->d_name.name, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + + CIFSSMBClose(xid, pTcon, srcfid); + } + + return rc; +} + int cifs_rename(struct inode *source_inode, struct dentry *source_direntry, struct inode *target_inode, struct dentry *target_direntry) { - char *fromName; - char *toName; + char *fromName = NULL; + char *toName = NULL; struct cifs_sb_info *cifs_sb_source; struct cifs_sb_info *cifs_sb_target; struct cifsTconInfo *pTcon; + FILE_UNIX_BASIC_INFO *info_buf_source = NULL; + FILE_UNIX_BASIC_INFO *info_buf_target; int xid; - int rc = 0; - - xid = GetXid(); + int rc; cifs_sb_target = CIFS_SB(target_inode->i_sb); cifs_sb_source = CIFS_SB(source_inode->i_sb); pTcon = cifs_sb_source->tcon; + xid = GetXid(); + + /* + * BB: this might be allowed if same server, but different share. + * Consider adding support for this + */ if (pTcon != cifs_sb_target->tcon) { - FreeXid(xid); - return -EXDEV; /* BB actually could be allowed if same server, - but different share. - Might eventually add support for this */ + rc = -EXDEV; + goto cifs_rename_exit; } - /* we already have the rename sem so we do not need to grab it again - here to protect the path integrity */ + /* + * we already have the rename sem so we do not need to + * grab it again here to protect the path integrity + */ fromName = build_path_from_dentry(source_direntry); + if (fromName == NULL) { + rc = -ENOMEM; + goto cifs_rename_exit; + } + toName = build_path_from_dentry(target_direntry); - if ((fromName == NULL) || (toName == NULL)) { + if (toName == NULL) { rc = -ENOMEM; goto cifs_rename_exit; } - rc = CIFSSMBRename(xid, pTcon, fromName, toName, - cifs_sb_source->local_nls, - cifs_sb_source->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + rc = cifs_do_rename(xid, source_direntry, fromName, + target_direntry, toName); + if (rc == -EEXIST) { - /* check if they are the same file because rename of hardlinked - files is a noop */ - FILE_UNIX_BASIC_INFO *info_buf_source; - FILE_UNIX_BASIC_INFO *info_buf_target; - - info_buf_source = - kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); - if (info_buf_source != NULL) { + if (pTcon->unix_ext) { + /* + * Are src and dst hardlinks of same inode? We can + * only tell with unix extensions enabled + */ + info_buf_source = + kmalloc(2 * sizeof(FILE_UNIX_BASIC_INFO), + GFP_KERNEL); + if (info_buf_source == NULL) + goto unlink_target; + info_buf_target = info_buf_source + 1; - if (pTcon->unix_ext) - rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName, - info_buf_source, - cifs_sb_source->local_nls, - cifs_sb_source->mnt_cifs_flags & + rc = CIFSSMBUnixQPathInfo(xid, pTcon, fromName, + info_buf_source, + cifs_sb_source->local_nls, + cifs_sb_source->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - /* else rc is still EEXIST so will fall through to - unlink the target and retry rename */ - if (rc == 0) { - rc = CIFSSMBUnixQPathInfo(xid, pTcon, toName, - info_buf_target, + if (rc != 0) + goto unlink_target; + + rc = CIFSSMBUnixQPathInfo(xid, pTcon, + toName, info_buf_target, cifs_sb_target->local_nls, /* remap based on source sb */ cifs_sb_source->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - } - if ((rc == 0) && - (info_buf_source->UniqueId == - info_buf_target->UniqueId)) { - /* do not rename since the files are hardlinked which - is a noop */ - } else { - /* we either can not tell the files are hardlinked - (as with Windows servers) or files are not - hardlinked so delete the target manually before - renaming to follow POSIX rather than Windows - semantics */ - cifs_unlink(target_inode, target_direntry); - rc = CIFSSMBRename(xid, pTcon, fromName, - toName, - cifs_sb_source->local_nls, - cifs_sb_source->mnt_cifs_flags - & CIFS_MOUNT_MAP_SPECIAL_CHR); - } - kfree(info_buf_source); - } /* if we can not get memory just leave rc as EEXIST */ - } - - if (rc) - cFYI(1, ("rename rc %d", rc)); - - if ((rc == -EIO) || (rc == -EEXIST)) { - int oplock = 0; - __u16 netfid; - - /* BB FIXME Is Generic Read correct for rename? */ - /* if renaming directory - we should not say CREATE_NOT_DIR, - need to test renaming open directory, also GENERIC_READ - might not right be right access to request */ - rc = CIFSSMBOpen(xid, pTcon, fromName, FILE_OPEN, GENERIC_READ, - CREATE_NOT_DIR, &netfid, &oplock, NULL, - cifs_sb_source->local_nls, - cifs_sb_source->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc == 0) { - rc = CIFSSMBRenameOpenFile(xid, pTcon, netfid, toName, - cifs_sb_source->local_nls, - cifs_sb_source->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - CIFSSMBClose(xid, pTcon, netfid); - } + + if (rc == 0 && (info_buf_source->UniqueId == + info_buf_target->UniqueId)) + /* same file, POSIX says that this is a noop */ + goto cifs_rename_exit; + } /* else ... BB we could add the same check for Windows by + checking the UniqueId via FILE_INTERNAL_INFO */ +unlink_target: + /* + * we either can not tell the files are hardlinked (as with + * Windows servers) or files are not hardlinked. Delete the + * target manually before renaming to follow POSIX rather than + * Windows semantics + */ + cifs_unlink(target_inode, target_direntry); + rc = cifs_do_rename(xid, source_direntry, fromName, + target_direntry, toName); } cifs_rename_exit: + kfree(info_buf_source); kfree(fromName); kfree(toName); FreeXid(xid); @@ -1506,101 +1636,6 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, } static int -cifs_set_file_info(struct inode *inode, struct iattr *attrs, int xid, - char *full_path, __u32 dosattr) -{ - int rc; - int oplock = 0; - __u16 netfid; - __u32 netpid; - bool set_time = false; - struct cifsFileInfo *open_file; - struct cifsInodeInfo *cifsInode = CIFS_I(inode); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - struct cifsTconInfo *pTcon = cifs_sb->tcon; - FILE_BASIC_INFO info_buf; - - if (attrs->ia_valid & ATTR_ATIME) { - set_time = true; - info_buf.LastAccessTime = - cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_atime)); - } else - info_buf.LastAccessTime = 0; - - if (attrs->ia_valid & ATTR_MTIME) { - set_time = true; - info_buf.LastWriteTime = - cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_mtime)); - } else - info_buf.LastWriteTime = 0; - - /* - * Samba throws this field away, but windows may actually use it. - * Do not set ctime unless other time stamps are changed explicitly - * (i.e. by utimes()) since we would then have a mix of client and - * server times. - */ - if (set_time && (attrs->ia_valid & ATTR_CTIME)) { - cFYI(1, ("CIFS - CTIME changed")); - info_buf.ChangeTime = - cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime)); - } else - info_buf.ChangeTime = 0; - - info_buf.CreationTime = 0; /* don't change */ - info_buf.Attributes = cpu_to_le32(dosattr); - - /* - * If the file is already open for write, just use that fileid - */ - open_file = find_writable_file(cifsInode); - if (open_file) { - netfid = open_file->netfid; - netpid = open_file->pid; - goto set_via_filehandle; - } - - /* - * NT4 apparently returns success on this call, but it doesn't - * really work. - */ - if (!(pTcon->ses->flags & CIFS_SES_NT4)) { - rc = CIFSSMBSetPathInfo(xid, pTcon, full_path, - &info_buf, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - if (rc != -EOPNOTSUPP && rc != -EINVAL) - goto out; - } - - cFYI(1, ("calling SetFileInfo since SetPathInfo for " - "times not supported by this server")); - rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, - SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, - CREATE_NOT_DIR, &netfid, &oplock, - NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); - - if (rc != 0) { - if (rc == -EIO) - rc = -EINVAL; - goto out; - } - - netpid = current->tgid; - -set_via_filehandle: - rc = CIFSSMBSetFileInfo(xid, pTcon, &info_buf, netfid, netpid); - if (open_file == NULL) - CIFSSMBClose(xid, pTcon, netfid); - else - atomic_dec(&open_file->wrtPending); -out: - return rc; -} - -static int cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) { int rc; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 4b17f8fe315..88786ba02d2 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -150,8 +150,7 @@ cifs_buf_get(void) but it may be more efficient to always alloc same size albeit slightly larger than necessary and maxbuffersize defaults to this and can not be bigger */ - ret_buf = (struct smb_hdr *) mempool_alloc(cifs_req_poolp, - GFP_KERNEL | GFP_NOFS); + ret_buf = mempool_alloc(cifs_req_poolp, GFP_NOFS); /* clear the first few header bytes */ /* for most paths, more is cleared in header_assemble */ @@ -188,8 +187,7 @@ cifs_small_buf_get(void) but it may be more efficient to always alloc same size albeit slightly larger than necessary and maxbuffersize defaults to this and can not be bigger */ - ret_buf = (struct smb_hdr *) mempool_alloc(cifs_sm_req_poolp, - GFP_KERNEL | GFP_NOFS); + ret_buf = mempool_alloc(cifs_sm_req_poolp, GFP_NOFS); if (ret_buf) { /* No need to clear memory here, cleared in header assemble */ /* memset(ret_buf, 0, sizeof(struct smb_hdr) + 27);*/ @@ -313,8 +311,6 @@ header_assemble(struct smb_hdr *buffer, char smb_command /* command */ , buffer->Flags2 = SMBFLG2_KNOWS_LONG_NAMES; buffer->Pid = cpu_to_le16((__u16)current->tgid); buffer->PidHigh = cpu_to_le16((__u16)(current->tgid >> 16)); - spin_lock(&GlobalMid_Lock); - spin_unlock(&GlobalMid_Lock); if (treeCon) { buffer->Tid = treeCon->tid; if (treeCon->ses) { diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 5f40ed3473f..765adf12d54 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -640,6 +640,70 @@ static int is_dir_changed(struct file *file) } +static int cifs_save_resume_key(const char *current_entry, + struct cifsFileInfo *cifsFile) +{ + int rc = 0; + unsigned int len = 0; + __u16 level; + char *filename; + + if ((cifsFile == NULL) || (current_entry == NULL)) + return -EINVAL; + + level = cifsFile->srch_inf.info_level; + + if (level == SMB_FIND_FILE_UNIX) { + FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; + + filename = &pFindData->FileName[0]; + if (cifsFile->srch_inf.unicode) { + len = cifs_unicode_bytelen(filename); + } else { + /* BB should we make this strnlen of PATH_MAX? */ + len = strnlen(filename, PATH_MAX); + } + cifsFile->srch_inf.resume_key = pFindData->ResumeKey; + } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { + FILE_DIRECTORY_INFO *pFindData = + (FILE_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + cifsFile->srch_inf.resume_key = pFindData->FileIndex; + } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) { + FILE_FULL_DIRECTORY_INFO *pFindData = + (FILE_FULL_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + cifsFile->srch_inf.resume_key = pFindData->FileIndex; + } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) { + SEARCH_ID_FULL_DIR_INFO *pFindData = + (SEARCH_ID_FULL_DIR_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + cifsFile->srch_inf.resume_key = pFindData->FileIndex; + } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { + FILE_BOTH_DIRECTORY_INFO *pFindData = + (FILE_BOTH_DIRECTORY_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); + cifsFile->srch_inf.resume_key = pFindData->FileIndex; + } else if (level == SMB_FIND_FILE_INFO_STANDARD) { + FIND_FILE_STANDARD_INFO *pFindData = + (FIND_FILE_STANDARD_INFO *)current_entry; + filename = &pFindData->FileName[0]; + /* one byte length, no name conversion */ + len = (unsigned int)pFindData->FileNameLength; + cifsFile->srch_inf.resume_key = pFindData->ResumeKey; + } else { + cFYI(1, ("Unknown findfirst level %d", level)); + return -EINVAL; + } + cifsFile->srch_inf.resume_name_len = len; + cifsFile->srch_inf.presume_name = filename; + return rc; +} + /* find the corresponding entry in the search */ /* Note that the SMB server returns search entries for . and .. which complicates logic here if we choose to parse for them and we do not @@ -703,6 +767,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, while ((index_to_find >= cifsFile->srch_inf.index_of_last_entry) && (rc == 0) && !cifsFile->srch_inf.endOfSearch) { cFYI(1, ("calling findnext2")); + cifs_save_resume_key(cifsFile->srch_inf.last_entry, cifsFile); rc = CIFSFindNext(xid, pTcon, cifsFile->netfid, &cifsFile->srch_inf); if (rc) @@ -919,69 +984,6 @@ static int cifs_filldir(char *pfindEntry, struct file *file, return rc; } -static int cifs_save_resume_key(const char *current_entry, - struct cifsFileInfo *cifsFile) -{ - int rc = 0; - unsigned int len = 0; - __u16 level; - char *filename; - - if ((cifsFile == NULL) || (current_entry == NULL)) - return -EINVAL; - - level = cifsFile->srch_inf.info_level; - - if (level == SMB_FIND_FILE_UNIX) { - FILE_UNIX_INFO *pFindData = (FILE_UNIX_INFO *)current_entry; - - filename = &pFindData->FileName[0]; - if (cifsFile->srch_inf.unicode) { - len = cifs_unicode_bytelen(filename); - } else { - /* BB should we make this strnlen of PATH_MAX? */ - len = strnlen(filename, PATH_MAX); - } - cifsFile->srch_inf.resume_key = pFindData->ResumeKey; - } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { - FILE_DIRECTORY_INFO *pFindData = - (FILE_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_FULL_DIRECTORY_INFO) { - FILE_FULL_DIRECTORY_INFO *pFindData = - (FILE_FULL_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_ID_FULL_DIR_INFO) { - SEARCH_ID_FULL_DIR_INFO *pFindData = - (SEARCH_ID_FULL_DIR_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { - FILE_BOTH_DIRECTORY_INFO *pFindData = - (FILE_BOTH_DIRECTORY_INFO *)current_entry; - filename = &pFindData->FileName[0]; - len = le32_to_cpu(pFindData->FileNameLength); - cifsFile->srch_inf.resume_key = pFindData->FileIndex; - } else if (level == SMB_FIND_FILE_INFO_STANDARD) { - FIND_FILE_STANDARD_INFO *pFindData = - (FIND_FILE_STANDARD_INFO *)current_entry; - filename = &pFindData->FileName[0]; - /* one byte length, no name conversion */ - len = (unsigned int)pFindData->FileNameLength; - cifsFile->srch_inf.resume_key = pFindData->ResumeKey; - } else { - cFYI(1, ("Unknown findfirst level %d", level)); - return -EINVAL; - } - cifsFile->srch_inf.resume_name_len = len; - cifsFile->srch_inf.presume_name = filename; - return rc; -} int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) { diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index ed150efbe27..2851d5da0c8 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -409,6 +409,8 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, #ifdef CONFIG_CIFS_WEAK_PW_HASH char lnm_session_key[CIFS_SESS_KEY_SIZE]; + pSMB->req.hdr.Flags2 &= ~SMBFLG2_UNICODE; + /* no capabilities flags in old lanman negotiation */ pSMB->old_req.PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); @@ -505,7 +507,7 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); } else ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); - } else if (type == Kerberos) { + } else if (type == Kerberos || type == MSKerberos) { #ifdef CONFIG_CIFS_UPCALL struct cifs_spnego_msg *msg; spnego_key = cifs_get_spnego_key(ses); @@ -516,6 +518,15 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, } msg = spnego_key->payload.data; + /* check version field to make sure that cifs.upcall is + sending us a response in an expected form */ + if (msg->version != CIFS_SPNEGO_UPCALL_VERSION) { + cERROR(1, ("incorrect version of cifs.upcall (expected" + " %d but got %d)", + CIFS_SPNEGO_UPCALL_VERSION, msg->version)); + rc = -EKEYREJECTED; + goto ssetup_exit; + } /* bail out if key is too long */ if (msg->sesskey_len > sizeof(ses->server->mac_signing_key.data.krb5)) { @@ -613,8 +624,10 @@ CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, ses, nls_cp); ssetup_exit: - if (spnego_key) + if (spnego_key) { + key_revoke(spnego_key); key_put(spnego_key); + } kfree(str_area); if (resp_buf_type == CIFS_SMALL_BUFFER) { cFYI(1, ("ssetup freeing small buf %p", iov[0].iov_base)); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index e286db9f5ee..bf0e6d8e382 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -50,8 +50,7 @@ AllocMidQEntry(const struct smb_hdr *smb_buffer, struct cifsSesInfo *ses) return NULL; } - temp = (struct mid_q_entry *) mempool_alloc(cifs_mid_poolp, - GFP_KERNEL | GFP_NOFS); + temp = mempool_alloc(cifs_mid_poolp, GFP_NOFS); if (temp == NULL) return temp; else { diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 0d9b80ec689..cfd29da714d 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -362,9 +362,8 @@ static int init_coda_psdev(void) goto out_chrdev; } for (i = 0; i < MAX_CODADEVS; i++) - device_create_drvdata(coda_psdev_class, NULL, - MKDEV(CODA_PSDEV_MAJOR, i), - NULL, "cfs%d", i); + device_create(coda_psdev_class, NULL, + MKDEV(CODA_PSDEV_MAJOR, i), NULL, "cfs%d", i); coda_sysctl_init(); goto out; diff --git a/fs/compat.c b/fs/compat.c index c9d1472e65c..5f9ec449c79 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -137,6 +137,45 @@ asmlinkage long compat_sys_utimes(char __user *filename, struct compat_timeval _ return compat_sys_futimesat(AT_FDCWD, filename, t); } +static int cp_compat_stat(struct kstat *stat, struct compat_stat __user *ubuf) +{ + compat_ino_t ino = stat->ino; + typeof(ubuf->st_uid) uid = 0; + typeof(ubuf->st_gid) gid = 0; + int err; + + SET_UID(uid, stat->uid); + SET_GID(gid, stat->gid); + + if ((u64) stat->size > MAX_NON_LFS || + !old_valid_dev(stat->dev) || + !old_valid_dev(stat->rdev)) + return -EOVERFLOW; + if (sizeof(ino) < sizeof(stat->ino) && ino != stat->ino) + return -EOVERFLOW; + + if (clear_user(ubuf, sizeof(*ubuf))) + return -EFAULT; + + err = __put_user(old_encode_dev(stat->dev), &ubuf->st_dev); + err |= __put_user(ino, &ubuf->st_ino); + err |= __put_user(stat->mode, &ubuf->st_mode); + err |= __put_user(stat->nlink, &ubuf->st_nlink); + err |= __put_user(uid, &ubuf->st_uid); + err |= __put_user(gid, &ubuf->st_gid); + err |= __put_user(old_encode_dev(stat->rdev), &ubuf->st_rdev); + err |= __put_user(stat->size, &ubuf->st_size); + err |= __put_user(stat->atime.tv_sec, &ubuf->st_atime); + err |= __put_user(stat->atime.tv_nsec, &ubuf->st_atime_nsec); + err |= __put_user(stat->mtime.tv_sec, &ubuf->st_mtime); + err |= __put_user(stat->mtime.tv_nsec, &ubuf->st_mtime_nsec); + err |= __put_user(stat->ctime.tv_sec, &ubuf->st_ctime); + err |= __put_user(stat->ctime.tv_nsec, &ubuf->st_ctime_nsec); + err |= __put_user(stat->blksize, &ubuf->st_blksize); + err |= __put_user(stat->blocks, &ubuf->st_blocks); + return err; +} + asmlinkage long compat_sys_newstat(char __user * filename, struct compat_stat __user *statbuf) { @@ -792,8 +831,10 @@ static int compat_fillonedir(void *__buf, const char *name, int namlen, if (buf->result) return -EINVAL; d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->result = -EOVERFLOW; return -EOVERFLOW; + } buf->result++; dirent = buf->dirent; if (!access_ok(VERIFY_WRITE, dirent, @@ -862,8 +903,10 @@ static int compat_filldir(void *__buf, const char *name, int namlen, if (reclen > buf->count) return -EINVAL; d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->error = -EOVERFLOW; return -EOVERFLOW; + } dirent = buf->previous; if (dirent) { if (__put_user(offset, &dirent->d_off)) @@ -1235,7 +1278,7 @@ static int compat_count(compat_uptr_t __user *argv, int max) if (!p) break; argv++; - if(++i > max) + if (i++ >= max) return -E2BIG; } } diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 7a8db78a91d..8e93341f3e8 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1311,16 +1311,18 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) * Ensure that no racing symlink() will make detach_prep() fail while * the new link is temporarily attached */ - mutex_lock(&configfs_symlink_mutex); - spin_lock(&configfs_dirent_lock); do { struct mutex *wait_mutex; + mutex_lock(&configfs_symlink_mutex); + spin_lock(&configfs_dirent_lock); ret = configfs_detach_prep(dentry, &wait_mutex); - if (ret) { + if (ret) configfs_detach_rollback(dentry); - spin_unlock(&configfs_dirent_lock); - mutex_unlock(&configfs_symlink_mutex); + spin_unlock(&configfs_dirent_lock); + mutex_unlock(&configfs_symlink_mutex); + + if (ret) { if (ret != -EAGAIN) { config_item_put(parent_item); return ret; @@ -1329,13 +1331,8 @@ static int configfs_rmdir(struct inode *dir, struct dentry *dentry) /* Wait until the racing operation terminates */ mutex_lock(wait_mutex); mutex_unlock(wait_mutex); - - mutex_lock(&configfs_symlink_mutex); - spin_lock(&configfs_dirent_lock); } } while (ret == -EAGAIN); - spin_unlock(&configfs_dirent_lock); - mutex_unlock(&configfs_symlink_mutex); /* Get a working ref for the duration of this function */ item = configfs_get_config_item(dentry); diff --git a/fs/dcache.c b/fs/dcache.c index 101663d15e9..e7a1a99b746 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1236,7 +1236,7 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) * If no entry exists with the exact case name, allocate new dentry with * the exact case, and return the spliced entry. */ -struct dentry *d_add_ci(struct inode *inode, struct dentry *dentry, +struct dentry *d_add_ci(struct dentry *dentry, struct inode *inode, struct qstr *name) { int error; @@ -1395,6 +1395,10 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) if (dentry->d_parent != parent) goto next; + /* non-existing due to RCU? */ + if (d_unhashed(dentry)) + goto next; + /* * It is safe to compare names since d_move() cannot * change the qstr (protected by d_lock). @@ -1410,10 +1414,8 @@ struct dentry * __d_lookup(struct dentry * parent, struct qstr * name) goto next; } - if (!d_unhashed(dentry)) { - atomic_inc(&dentry->d_count); - found = dentry; - } + atomic_inc(&dentry->d_count); + found = dentry; spin_unlock(&dentry->d_lock); break; next: diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 08e28c9bb41..3dbe2169cf3 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -26,8 +26,7 @@ #include <linux/debugfs.h> #include <linux/fsnotify.h> #include <linux/string.h> - -#define DEBUGFS_MAGIC 0x64626720 +#include <linux/magic.h> static struct vfsmount *debugfs_mount; static int debugfs_mount_count; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 488eb424f66..4a714f6c1be 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -27,6 +27,7 @@ #define DEVPTS_SUPER_MAGIC 0x1cd1 #define DEVPTS_DEFAULT_MODE 0600 +#define PTMX_MINOR 2 extern int pty_limit; /* Config limit on Unix98 ptys */ static DEFINE_IDA(allocated_ptys); @@ -48,7 +49,7 @@ enum { Opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_mode, "mode=%o"}, @@ -169,15 +170,7 @@ static struct file_system_type devpts_fs_type = { * to the System V naming convention */ -static struct dentry *get_node(int num) -{ - char s[12]; - struct dentry *root = devpts_root; - mutex_lock(&root->d_inode->i_mutex); - return lookup_one_len(s, root, sprintf(s, "%d", num)); -} - -int devpts_new_index(void) +int devpts_new_index(struct inode *ptmx_inode) { int index; int ida_ret; @@ -205,20 +198,21 @@ retry: return index; } -void devpts_kill_index(int idx) +void devpts_kill_index(struct inode *ptmx_inode, int idx) { mutex_lock(&allocated_ptys_lock); ida_remove(&allocated_ptys, idx); mutex_unlock(&allocated_ptys_lock); } -int devpts_pty_new(struct tty_struct *tty) +int devpts_pty_new(struct inode *ptmx_inode, struct tty_struct *tty) { int number = tty->index; /* tty layer puts index from devpts_new_index() in here */ struct tty_driver *driver = tty->driver; dev_t device = MKDEV(driver->major, driver->minor_start+number); struct dentry *dentry; struct inode *inode = new_inode(devpts_mnt->mnt_sb); + char s[12]; /* We're supposed to be given the slave end of a pty */ BUG_ON(driver->type != TTY_DRIVER_TYPE_PTY); @@ -233,10 +227,15 @@ int devpts_pty_new(struct tty_struct *tty) inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; init_special_inode(inode, S_IFCHR|config.mode, device); inode->i_private = tty; + tty->driver_data = inode; - dentry = get_node(number); - if (!IS_ERR(dentry) && !dentry->d_inode) { - d_instantiate(dentry, inode); + sprintf(s, "%d", number); + + mutex_lock(&devpts_root->d_inode->i_mutex); + + dentry = d_alloc_name(devpts_root, s); + if (!IS_ERR(dentry)) { + d_add(dentry, inode); fsnotify_create(devpts_root->d_inode, dentry); } @@ -245,36 +244,31 @@ int devpts_pty_new(struct tty_struct *tty) return 0; } -struct tty_struct *devpts_get_tty(int number) +struct tty_struct *devpts_get_tty(struct inode *pts_inode, int number) { - struct dentry *dentry = get_node(number); - struct tty_struct *tty; - - tty = NULL; - if (!IS_ERR(dentry)) { - if (dentry->d_inode) - tty = dentry->d_inode->i_private; - dput(dentry); - } + BUG_ON(pts_inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); - mutex_unlock(&devpts_root->d_inode->i_mutex); - - return tty; + if (pts_inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC) + return (struct tty_struct *)pts_inode->i_private; + return NULL; } -void devpts_pty_kill(int number) +void devpts_pty_kill(struct tty_struct *tty) { - struct dentry *dentry = get_node(number); + struct inode *inode = tty->driver_data; + struct dentry *dentry; - if (!IS_ERR(dentry)) { - struct inode *inode = dentry->d_inode; - if (inode) { - inode->i_nlink--; - d_delete(dentry); - dput(dentry); - } + BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); + + mutex_lock(&devpts_root->d_inode->i_mutex); + + dentry = d_find_alias(inode); + if (dentry && !IS_ERR(dentry)) { + inode->i_nlink--; + d_delete(dentry); dput(dentry); } + mutex_unlock(&devpts_root->d_inode->i_mutex); } diff --git a/fs/direct-io.c b/fs/direct-io.c index 9606ee848fd..af0558dbe8b 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -5,11 +5,11 @@ * * O_DIRECT * - * 04Jul2002 akpm@zip.com.au + * 04Jul2002 Andrew Morton * Initial version * 11Sep2002 janetinc@us.ibm.com * added readv/writev support. - * 29Oct2002 akpm@zip.com.au + * 29Oct2002 Andrew Morton * rewrote bio_add_page() support. * 30Oct2002 pbadari@us.ibm.com * added support for non-aligned IO. diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 89d2fb7b991..fd9859f92fa 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -14,6 +14,9 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/configfs.h> +#include <linux/in.h> +#include <linux/in6.h> +#include <net/ipv6.h> #include <net/sock.h> #include "config.h" @@ -377,24 +380,24 @@ static struct config_item_type node_type = { .ct_owner = THIS_MODULE, }; -static struct dlm_cluster *to_cluster(struct config_item *i) +static struct dlm_cluster *config_item_to_cluster(struct config_item *i) { return i ? container_of(to_config_group(i), struct dlm_cluster, group) : NULL; } -static struct dlm_space *to_space(struct config_item *i) +static struct dlm_space *config_item_to_space(struct config_item *i) { return i ? container_of(to_config_group(i), struct dlm_space, group) : NULL; } -static struct dlm_comm *to_comm(struct config_item *i) +static struct dlm_comm *config_item_to_comm(struct config_item *i) { return i ? container_of(i, struct dlm_comm, item) : NULL; } -static struct dlm_node *to_node(struct config_item *i) +static struct dlm_node *config_item_to_node(struct config_item *i) { return i ? container_of(i, struct dlm_node, item) : NULL; } @@ -450,7 +453,7 @@ static struct config_group *make_cluster(struct config_group *g, static void drop_cluster(struct config_group *g, struct config_item *i) { - struct dlm_cluster *cl = to_cluster(i); + struct dlm_cluster *cl = config_item_to_cluster(i); struct config_item *tmp; int j; @@ -468,7 +471,7 @@ static void drop_cluster(struct config_group *g, struct config_item *i) static void release_cluster(struct config_item *i) { - struct dlm_cluster *cl = to_cluster(i); + struct dlm_cluster *cl = config_item_to_cluster(i); kfree(cl->group.default_groups); kfree(cl); } @@ -507,7 +510,7 @@ static struct config_group *make_space(struct config_group *g, const char *name) static void drop_space(struct config_group *g, struct config_item *i) { - struct dlm_space *sp = to_space(i); + struct dlm_space *sp = config_item_to_space(i); struct config_item *tmp; int j; @@ -524,7 +527,7 @@ static void drop_space(struct config_group *g, struct config_item *i) static void release_space(struct config_item *i) { - struct dlm_space *sp = to_space(i); + struct dlm_space *sp = config_item_to_space(i); kfree(sp->group.default_groups); kfree(sp); } @@ -546,7 +549,7 @@ static struct config_item *make_comm(struct config_group *g, const char *name) static void drop_comm(struct config_group *g, struct config_item *i) { - struct dlm_comm *cm = to_comm(i); + struct dlm_comm *cm = config_item_to_comm(i); if (local_comm == cm) local_comm = NULL; dlm_lowcomms_close(cm->nodeid); @@ -557,13 +560,13 @@ static void drop_comm(struct config_group *g, struct config_item *i) static void release_comm(struct config_item *i) { - struct dlm_comm *cm = to_comm(i); + struct dlm_comm *cm = config_item_to_comm(i); kfree(cm); } static struct config_item *make_node(struct config_group *g, const char *name) { - struct dlm_space *sp = to_space(g->cg_item.ci_parent); + struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); struct dlm_node *nd; nd = kzalloc(sizeof(struct dlm_node), GFP_KERNEL); @@ -585,8 +588,8 @@ static struct config_item *make_node(struct config_group *g, const char *name) static void drop_node(struct config_group *g, struct config_item *i) { - struct dlm_space *sp = to_space(g->cg_item.ci_parent); - struct dlm_node *nd = to_node(i); + struct dlm_space *sp = config_item_to_space(g->cg_item.ci_parent); + struct dlm_node *nd = config_item_to_node(i); mutex_lock(&sp->members_lock); list_del(&nd->list); @@ -598,7 +601,7 @@ static void drop_node(struct config_group *g, struct config_item *i) static void release_node(struct config_item *i) { - struct dlm_node *nd = to_node(i); + struct dlm_node *nd = config_item_to_node(i); kfree(nd); } @@ -632,7 +635,7 @@ void dlm_config_exit(void) static ssize_t show_cluster(struct config_item *i, struct configfs_attribute *a, char *buf) { - struct dlm_cluster *cl = to_cluster(i); + struct dlm_cluster *cl = config_item_to_cluster(i); struct cluster_attribute *cla = container_of(a, struct cluster_attribute, attr); return cla->show ? cla->show(cl, buf) : 0; @@ -642,7 +645,7 @@ static ssize_t store_cluster(struct config_item *i, struct configfs_attribute *a, const char *buf, size_t len) { - struct dlm_cluster *cl = to_cluster(i); + struct dlm_cluster *cl = config_item_to_cluster(i); struct cluster_attribute *cla = container_of(a, struct cluster_attribute, attr); return cla->store ? cla->store(cl, buf, len) : -EINVAL; @@ -651,7 +654,7 @@ static ssize_t store_cluster(struct config_item *i, static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, char *buf) { - struct dlm_comm *cm = to_comm(i); + struct dlm_comm *cm = config_item_to_comm(i); struct comm_attribute *cma = container_of(a, struct comm_attribute, attr); return cma->show ? cma->show(cm, buf) : 0; @@ -660,7 +663,7 @@ static ssize_t show_comm(struct config_item *i, struct configfs_attribute *a, static ssize_t store_comm(struct config_item *i, struct configfs_attribute *a, const char *buf, size_t len) { - struct dlm_comm *cm = to_comm(i); + struct dlm_comm *cm = config_item_to_comm(i); struct comm_attribute *cma = container_of(a, struct comm_attribute, attr); return cma->store ? cma->store(cm, buf, len) : -EINVAL; @@ -714,7 +717,7 @@ static ssize_t comm_addr_write(struct dlm_comm *cm, const char *buf, size_t len) static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, char *buf) { - struct dlm_node *nd = to_node(i); + struct dlm_node *nd = config_item_to_node(i); struct node_attribute *nda = container_of(a, struct node_attribute, attr); return nda->show ? nda->show(nd, buf) : 0; @@ -723,7 +726,7 @@ static ssize_t show_node(struct config_item *i, struct configfs_attribute *a, static ssize_t store_node(struct config_item *i, struct configfs_attribute *a, const char *buf, size_t len) { - struct dlm_node *nd = to_node(i); + struct dlm_node *nd = config_item_to_node(i); struct node_attribute *nda = container_of(a, struct node_attribute, attr); return nda->store ? nda->store(nd, buf, len) : -EINVAL; @@ -768,7 +771,7 @@ static struct dlm_space *get_space(char *name) i = config_group_find_item(space_list, name); mutex_unlock(&space_list->cg_subsys->su_mutex); - return to_space(i); + return config_item_to_space(i); } static void put_space(struct dlm_space *sp) @@ -776,6 +779,33 @@ static void put_space(struct dlm_space *sp) config_item_put(&sp->group.cg_item); } +static int addr_compare(struct sockaddr_storage *x, struct sockaddr_storage *y) +{ + switch (x->ss_family) { + case AF_INET: { + struct sockaddr_in *sinx = (struct sockaddr_in *)x; + struct sockaddr_in *siny = (struct sockaddr_in *)y; + if (sinx->sin_addr.s_addr != siny->sin_addr.s_addr) + return 0; + if (sinx->sin_port != siny->sin_port) + return 0; + break; + } + case AF_INET6: { + struct sockaddr_in6 *sinx = (struct sockaddr_in6 *)x; + struct sockaddr_in6 *siny = (struct sockaddr_in6 *)y; + if (!ipv6_addr_equal(&sinx->sin6_addr, &siny->sin6_addr)) + return 0; + if (sinx->sin6_port != siny->sin6_port) + return 0; + break; + } + default: + return 0; + } + return 1; +} + static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr) { struct config_item *i; @@ -788,7 +818,7 @@ static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr) mutex_lock(&clusters_root.subsys.su_mutex); list_for_each_entry(i, &comm_list->cg_children, ci_entry) { - cm = to_comm(i); + cm = config_item_to_comm(i); if (nodeid) { if (cm->nodeid != nodeid) @@ -797,8 +827,7 @@ static struct dlm_comm *get_comm(int nodeid, struct sockaddr_storage *addr) config_item_get(i); break; } else { - if (!cm->addr_count || - memcmp(cm->addr[0], addr, sizeof(*addr))) + if (!cm->addr_count || !addr_compare(cm->addr[0], addr)) continue; found = 1; config_item_get(i); diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 5a7ac33b629..868e4c9ef12 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -441,8 +441,11 @@ struct dlm_ls { uint32_t ls_global_id; /* global unique lockspace ID */ uint32_t ls_exflags; int ls_lvblen; - int ls_count; /* reference count */ + int ls_count; /* refcount of processes in + the dlm using this ls */ + int ls_create_count; /* create/release refcount */ unsigned long ls_flags; /* LSFL_ */ + unsigned long ls_scan_time; struct kobject ls_kobj; struct dlm_rsbtable *ls_rsbtbl; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 499e16759e9..d910501de6d 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -2,7 +2,7 @@ ******************************************************************************* ** ** Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. -** Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. +** Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. ** ** This copyrighted material is made available to anyone wishing to use, ** modify, copy, or redistribute it subject to the terms and conditions @@ -23,6 +23,7 @@ #include "lock.h" #include "recover.h" #include "requestqueue.h" +#include "user.h" static int ls_count; static struct mutex ls_lock; @@ -211,19 +212,41 @@ void dlm_lockspace_exit(void) kset_unregister(dlm_kset); } +static struct dlm_ls *find_ls_to_scan(void) +{ + struct dlm_ls *ls; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (time_after_eq(jiffies, ls->ls_scan_time + + dlm_config.ci_scan_secs * HZ)) { + spin_unlock(&lslist_lock); + return ls; + } + } + spin_unlock(&lslist_lock); + return NULL; +} + static int dlm_scand(void *data) { struct dlm_ls *ls; + int timeout_jiffies = dlm_config.ci_scan_secs * HZ; while (!kthread_should_stop()) { - list_for_each_entry(ls, &lslist, ls_list) { + ls = find_ls_to_scan(); + if (ls) { if (dlm_lock_recovery_try(ls)) { + ls->ls_scan_time = jiffies; dlm_scan_rsbs(ls); dlm_scan_timeout(ls); dlm_unlock_recovery(ls); + } else { + ls->ls_scan_time += HZ; } + } else { + schedule_timeout_interruptible(timeout_jiffies); } - schedule_timeout_interruptible(dlm_config.ci_scan_secs * HZ); } return 0; } @@ -246,23 +269,6 @@ static void dlm_scand_stop(void) kthread_stop(scand_task); } -static struct dlm_ls *dlm_find_lockspace_name(char *name, int namelen) -{ - struct dlm_ls *ls; - - spin_lock(&lslist_lock); - - list_for_each_entry(ls, &lslist, ls_list) { - if (ls->ls_namelen == namelen && - memcmp(ls->ls_name, name, namelen) == 0) - goto out; - } - ls = NULL; - out: - spin_unlock(&lslist_lock); - return ls; -} - struct dlm_ls *dlm_find_lockspace_global(uint32_t id) { struct dlm_ls *ls; @@ -327,6 +333,7 @@ static void remove_lockspace(struct dlm_ls *ls) for (;;) { spin_lock(&lslist_lock); if (ls->ls_count == 0) { + WARN_ON(ls->ls_create_count != 0); list_del(&ls->ls_list); spin_unlock(&lslist_lock); return; @@ -381,7 +388,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, uint32_t flags, int lvblen) { struct dlm_ls *ls; - int i, size, error = -ENOMEM; + int i, size, error; int do_unreg = 0; if (namelen > DLM_LOCKSPACE_LEN) @@ -393,12 +400,37 @@ static int new_lockspace(char *name, int namelen, void **lockspace, if (!try_module_get(THIS_MODULE)) return -EINVAL; - ls = dlm_find_lockspace_name(name, namelen); - if (ls) { - *lockspace = ls; + if (!dlm_user_daemon_available()) { + module_put(THIS_MODULE); + return -EUNATCH; + } + + error = 0; + + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + WARN_ON(ls->ls_create_count <= 0); + if (ls->ls_namelen != namelen) + continue; + if (memcmp(ls->ls_name, name, namelen)) + continue; + if (flags & DLM_LSFL_NEWEXCL) { + error = -EEXIST; + break; + } + ls->ls_create_count++; module_put(THIS_MODULE); - return -EEXIST; + error = 1; /* not an error, return 0 */ + break; } + spin_unlock(&lslist_lock); + + if (error < 0) + goto out; + if (error) + goto ret_zero; + + error = -ENOMEM; ls = kzalloc(sizeof(struct dlm_ls) + namelen, GFP_KERNEL); if (!ls) @@ -408,6 +440,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, ls->ls_lvblen = lvblen; ls->ls_count = 0; ls->ls_flags = 0; + ls->ls_scan_time = jiffies; if (flags & DLM_LSFL_TIMEWARN) set_bit(LSFL_TIMEWARN, &ls->ls_flags); @@ -418,8 +451,9 @@ static int new_lockspace(char *name, int namelen, void **lockspace, ls->ls_allocation = GFP_KERNEL; /* ls_exflags are forced to match among nodes, and we don't - need to require all nodes to have TIMEWARN or FS set */ - ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS)); + need to require all nodes to have some flags set */ + ls->ls_exflags = (flags & ~(DLM_LSFL_TIMEWARN | DLM_LSFL_FS | + DLM_LSFL_NEWEXCL)); size = dlm_config.ci_rsbtbl_size; ls->ls_rsbtbl_size = size; @@ -510,6 +544,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, down_write(&ls->ls_in_recovery); spin_lock(&lslist_lock); + ls->ls_create_count = 1; list_add(&ls->ls_list, &lslist); spin_unlock(&lslist_lock); @@ -548,7 +583,7 @@ static int new_lockspace(char *name, int namelen, void **lockspace, dlm_create_debug_file(ls); log_debug(ls, "join complete"); - + ret_zero: *lockspace = ls; return 0; @@ -635,13 +670,34 @@ static int release_lockspace(struct dlm_ls *ls, int force) struct dlm_lkb *lkb; struct dlm_rsb *rsb; struct list_head *head; - int i; - int busy = lockspace_busy(ls); + int i, busy, rv; + + busy = lockspace_busy(ls); + + spin_lock(&lslist_lock); + if (ls->ls_create_count == 1) { + if (busy > force) + rv = -EBUSY; + else { + /* remove_lockspace takes ls off lslist */ + ls->ls_create_count = 0; + rv = 0; + } + } else if (ls->ls_create_count > 1) { + rv = --ls->ls_create_count; + } else { + rv = -EINVAL; + } + spin_unlock(&lslist_lock); - if (busy > force) - return -EBUSY; + if (rv) { + log_debug(ls, "release_lockspace no remove %d", rv); + return rv; + } + + dlm_device_deregister(ls); - if (force < 3) + if (force < 3 && dlm_user_daemon_available()) do_uevent(ls, 0); dlm_recoverd_stop(ls); @@ -720,15 +776,10 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_clear_members(ls); dlm_clear_members_gone(ls); kfree(ls->ls_node_array); + log_debug(ls, "release_lockspace final free"); kobject_put(&ls->ls_kobj); /* The ls structure will be freed when the kobject is done with */ - mutex_lock(&ls_lock); - ls_count--; - if (!ls_count) - threads_stop(); - mutex_unlock(&ls_lock); - module_put(THIS_MODULE); return 0; } @@ -750,11 +801,38 @@ static int release_lockspace(struct dlm_ls *ls, int force) int dlm_release_lockspace(void *lockspace, int force) { struct dlm_ls *ls; + int error; ls = dlm_find_lockspace_local(lockspace); if (!ls) return -EINVAL; dlm_put_lockspace(ls); - return release_lockspace(ls, force); + + mutex_lock(&ls_lock); + error = release_lockspace(ls, force); + if (!error) + ls_count--; + else if (!ls_count) + threads_stop(); + mutex_unlock(&ls_lock); + + return error; +} + +void dlm_stop_lockspaces(void) +{ + struct dlm_ls *ls; + + restart: + spin_lock(&lslist_lock); + list_for_each_entry(ls, &lslist, ls_list) { + if (!test_bit(LSFL_RUNNING, &ls->ls_flags)) + continue; + spin_unlock(&lslist_lock); + log_error(ls, "no userland control daemon, stopping lockspace"); + dlm_ls_stop(ls); + goto restart; + } + spin_unlock(&lslist_lock); } diff --git a/fs/dlm/lockspace.h b/fs/dlm/lockspace.h index 891eabbdd02..f879f87901f 100644 --- a/fs/dlm/lockspace.h +++ b/fs/dlm/lockspace.h @@ -20,6 +20,7 @@ struct dlm_ls *dlm_find_lockspace_global(uint32_t id); struct dlm_ls *dlm_find_lockspace_local(void *id); struct dlm_ls *dlm_find_lockspace_device(int minor); void dlm_put_lockspace(struct dlm_ls *ls); +void dlm_stop_lockspaces(void); #endif /* __LOCKSPACE_DOT_H__ */ diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 34f14a14fb4..b3832c67194 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2006-2007 Red Hat, Inc. All rights reserved. + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -15,7 +15,6 @@ #include <linux/poll.h> #include <linux/signal.h> #include <linux/spinlock.h> -#include <linux/smp_lock.h> #include <linux/dlm.h> #include <linux/dlm_device.h> @@ -27,6 +26,8 @@ static const char name_prefix[] = "dlm"; static const struct file_operations device_fops; +static atomic_t dlm_monitor_opened; +static int dlm_monitor_unused = 1; #ifdef CONFIG_COMPAT @@ -340,10 +341,15 @@ static int device_user_deadlock(struct dlm_user_proc *proc, return error; } -static int create_misc_device(struct dlm_ls *ls, char *name) +static int dlm_device_register(struct dlm_ls *ls, char *name) { int error, len; + /* The device is already registered. This happens when the + lockspace is created multiple times from userspace. */ + if (ls->ls_device.name) + return 0; + error = -ENOMEM; len = strlen(name) + strlen(name_prefix) + 2; ls->ls_device.name = kzalloc(len, GFP_KERNEL); @@ -363,6 +369,22 @@ fail: return error; } +int dlm_device_deregister(struct dlm_ls *ls) +{ + int error; + + /* The device is not registered. This happens when the lockspace + was never used from userspace, or when device_create_lockspace() + calls dlm_release_lockspace() after the register fails. */ + if (!ls->ls_device.name) + return 0; + + error = misc_deregister(&ls->ls_device); + if (!error) + kfree(ls->ls_device.name); + return error; +} + static int device_user_purge(struct dlm_user_proc *proc, struct dlm_purge_params *params) { @@ -397,7 +419,7 @@ static int device_create_lockspace(struct dlm_lspace_params *params) if (!ls) return -ENOENT; - error = create_misc_device(ls, params->name); + error = dlm_device_register(ls, params->name); dlm_put_lockspace(ls); if (error) @@ -421,31 +443,22 @@ static int device_remove_lockspace(struct dlm_lspace_params *params) if (!ls) return -ENOENT; - /* Deregister the misc device first, so we don't have - * a device that's not attached to a lockspace. If - * dlm_release_lockspace fails then we can recreate it - */ - error = misc_deregister(&ls->ls_device); - if (error) { - dlm_put_lockspace(ls); - goto out; - } - kfree(ls->ls_device.name); - if (params->flags & DLM_USER_LSFLG_FORCEFREE) force = 2; lockspace = ls->ls_local_handle; + dlm_put_lockspace(ls); - /* dlm_release_lockspace waits for references to go to zero, - so all processes will need to close their device for the ls - before the release will procede */ + /* The final dlm_release_lockspace waits for references to go to + zero, so all processes will need to close their device for the + ls before the release will proceed. release also calls the + device_deregister above. Converting a positive return value + from release to zero means that userspace won't know when its + release was the final one, but it shouldn't need to know. */ - dlm_put_lockspace(ls); error = dlm_release_lockspace(lockspace, force); - if (error) - create_misc_device(ls, ls->ls_name); - out: + if (error > 0) + error = 0; return error; } @@ -623,17 +636,13 @@ static int device_open(struct inode *inode, struct file *file) struct dlm_user_proc *proc; struct dlm_ls *ls; - lock_kernel(); ls = dlm_find_lockspace_device(iminor(inode)); - if (!ls) { - unlock_kernel(); + if (!ls) return -ENOENT; - } proc = kzalloc(sizeof(struct dlm_user_proc), GFP_KERNEL); if (!proc) { dlm_put_lockspace(ls); - unlock_kernel(); return -ENOMEM; } @@ -645,7 +654,6 @@ static int device_open(struct inode *inode, struct file *file) spin_lock_init(&proc->locks_spin); init_waitqueue_head(&proc->wait); file->private_data = proc; - unlock_kernel(); return 0; } @@ -878,9 +886,28 @@ static unsigned int device_poll(struct file *file, poll_table *wait) return 0; } +int dlm_user_daemon_available(void) +{ + /* dlm_controld hasn't started (or, has started, but not + properly populated configfs) */ + + if (!dlm_our_nodeid()) + return 0; + + /* This is to deal with versions of dlm_controld that don't + know about the monitor device. We assume that if the + dlm_controld was started (above), but the monitor device + was never opened, that it's an old version. dlm_controld + should open the monitor device before populating configfs. */ + + if (dlm_monitor_unused) + return 1; + + return atomic_read(&dlm_monitor_opened) ? 1 : 0; +} + static int ctl_device_open(struct inode *inode, struct file *file) { - cycle_kernel_lock(); file->private_data = NULL; return 0; } @@ -890,6 +917,20 @@ static int ctl_device_close(struct inode *inode, struct file *file) return 0; } +static int monitor_device_open(struct inode *inode, struct file *file) +{ + atomic_inc(&dlm_monitor_opened); + dlm_monitor_unused = 0; + return 0; +} + +static int monitor_device_close(struct inode *inode, struct file *file) +{ + if (atomic_dec_and_test(&dlm_monitor_opened)) + dlm_stop_lockspaces(); + return 0; +} + static const struct file_operations device_fops = { .open = device_open, .release = device_close, @@ -913,19 +954,42 @@ static struct miscdevice ctl_device = { .minor = MISC_DYNAMIC_MINOR, }; +static const struct file_operations monitor_device_fops = { + .open = monitor_device_open, + .release = monitor_device_close, + .owner = THIS_MODULE, +}; + +static struct miscdevice monitor_device = { + .name = "dlm-monitor", + .fops = &monitor_device_fops, + .minor = MISC_DYNAMIC_MINOR, +}; + int __init dlm_user_init(void) { int error; + atomic_set(&dlm_monitor_opened, 0); + error = misc_register(&ctl_device); - if (error) + if (error) { log_print("misc_register failed for control device"); + goto out; + } + error = misc_register(&monitor_device); + if (error) { + log_print("misc_register failed for monitor device"); + misc_deregister(&ctl_device); + } + out: return error; } void dlm_user_exit(void) { misc_deregister(&ctl_device); + misc_deregister(&monitor_device); } diff --git a/fs/dlm/user.h b/fs/dlm/user.h index d38e9f3e415..35eb6a13d61 100644 --- a/fs/dlm/user.h +++ b/fs/dlm/user.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2006 Red Hat, Inc. All rights reserved. + * Copyright (C) 2006-2008 Red Hat, Inc. All rights reserved. * * This copyrighted material is made available to anyone wishing to use, * modify, copy, or redistribute it subject to the terms and conditions @@ -12,5 +12,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type); int dlm_user_init(void); void dlm_user_exit(void); +int dlm_device_deregister(struct dlm_ls *ls); +int dlm_user_daemon_available(void); #endif diff --git a/fs/dquot.c b/fs/dquot.c index 8ec4d6cc763..da30a27f224 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -9,8 +9,6 @@ * implementation is based on one of the several variants of the LINUX * inode-subsystem with added complexity of the diskquota system. * - * Version: $Id: dquot.c,v 6.3 1996/11/17 18:35:34 mvw Exp mvw $ - * * Author: Marco van Wieringen <mvw@planets.elm.net> * * Fixes: Dmitry Gorodchanin <pgmdsg@ibi.com>, 11 Feb 96 @@ -895,10 +893,9 @@ static void print_warning(struct dquot *dquot, const int warntype) warntype == QUOTA_NL_BSOFTBELOW || !need_print_warning(dquot)) return; - mutex_lock(&tty_mutex); tty = get_current_tty(); if (!tty) - goto out_lock; + return; tty_write_message(tty, dquot->dq_sb->s_id); if (warntype == QUOTA_NL_ISOFTWARN || warntype == QUOTA_NL_BSOFTWARN) tty_write_message(tty, ": warning, "); @@ -926,8 +923,7 @@ static void print_warning(struct dquot *dquot, const int warntype) break; } tty_write_message(tty, msg); -out_lock: - mutex_unlock(&tty_mutex); + tty_kref_put(tty); } #endif diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile index b4755a85996..2cc9ee4ad2e 100644 --- a/fs/ecryptfs/Makefile +++ b/fs/ecryptfs/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o -ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o miscdev.o kthread.o debug.o +ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o miscdev.o kthread.o debug.o diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index b73fb752c5f..3504cf9df35 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -79,11 +79,6 @@ #define ECRYPTFS_MAX_PKI_NAME_BYTES 16 #define ECRYPTFS_DEFAULT_NUM_USERS 4 #define ECRYPTFS_MAX_NUM_USERS 32768 -#define ECRYPTFS_TRANSPORT_NETLINK 0 -#define ECRYPTFS_TRANSPORT_CONNECTOR 1 -#define ECRYPTFS_TRANSPORT_RELAYFS 2 -#define ECRYPTFS_TRANSPORT_MISCDEV 3 -#define ECRYPTFS_DEFAULT_TRANSPORT ECRYPTFS_TRANSPORT_MISCDEV #define ECRYPTFS_XATTR_NAME "user.ecryptfs" #define RFC2440_CIPHER_DES3_EDE 0x02 @@ -400,8 +395,6 @@ struct ecryptfs_msg_ctx { struct mutex mux; }; -extern unsigned int ecryptfs_transport; - struct ecryptfs_daemon; struct ecryptfs_daemon { @@ -627,31 +620,20 @@ int ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); -int ecryptfs_process_helo(unsigned int transport, uid_t euid, - struct user_namespace *user_ns, struct pid *pid); +int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns, + struct pid *pid); int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns, struct pid *pid); int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, struct user_namespace *user_ns, struct pid *pid, u32 seq); -int ecryptfs_send_message(unsigned int transport, char *data, int data_len, +int ecryptfs_send_message(char *data, int data_len, struct ecryptfs_msg_ctx **msg_ctx); int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, struct ecryptfs_message **emsg); -int ecryptfs_init_messaging(unsigned int transport); -void ecryptfs_release_messaging(unsigned int transport); +int ecryptfs_init_messaging(void); +void ecryptfs_release_messaging(void); -int ecryptfs_send_netlink(char *data, int data_len, - struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, - u16 msg_flags, struct pid *daemon_pid); -int ecryptfs_init_netlink(void); -void ecryptfs_release_netlink(void); - -int ecryptfs_send_connector(char *data, int data_len, - struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, - u16 msg_flags, struct pid *daemon_pid); -int ecryptfs_init_connector(void); -void ecryptfs_release_connector(void); void ecryptfs_write_header_metadata(char *virt, struct ecryptfs_crypt_stat *crypt_stat, diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 9244d653743..eb3dc4c7ac0 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -71,12 +71,11 @@ struct ecryptfs_getdents_callback { void *dirent; struct dentry *dentry; filldir_t filldir; - int err; int filldir_called; int entries_written; }; -/* Inspired by generic filldir in fs/readir.c */ +/* Inspired by generic filldir in fs/readdir.c */ static int ecryptfs_filldir(void *dirent, const char *name, int namelen, loff_t offset, u64 ino, unsigned int d_type) @@ -125,18 +124,18 @@ static int ecryptfs_readdir(struct file *file, void *dirent, filldir_t filldir) buf.dirent = dirent; buf.dentry = file->f_path.dentry; buf.filldir = filldir; -retry: buf.filldir_called = 0; buf.entries_written = 0; - buf.err = 0; rc = vfs_readdir(lower_file, ecryptfs_filldir, (void *)&buf); - if (buf.err) - rc = buf.err; - if (buf.filldir_called && !buf.entries_written) - goto retry; file->f_pos = lower_file->f_pos; + if (rc < 0) + goto out; + if (buf.filldir_called && !buf.entries_written) + goto out; if (rc >= 0) - fsstack_copy_attr_atime(inode, lower_file->f_path.dentry->d_inode); + fsstack_copy_attr_atime(inode, + lower_file->f_path.dentry->d_inode); +out: return rc; } diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index f5b76a331b9..e22bc396134 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -234,8 +234,8 @@ parse_tag_65_packet(struct ecryptfs_session_key *session_key, u8 *cipher_code, } i += data_len; if (message_len < (i + m_size)) { - ecryptfs_printk(KERN_ERR, "The received netlink message is " - "shorter than expected\n"); + ecryptfs_printk(KERN_ERR, "The message received from ecryptfsd " + "is shorter than expected\n"); rc = -EIO; goto out; } @@ -438,8 +438,8 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_msg_ctx *msg_ctx; struct ecryptfs_message *msg = NULL; char *auth_tok_sig; - char *netlink_message; - size_t netlink_message_length; + char *payload; + size_t payload_len; int rc; rc = ecryptfs_get_auth_tok_sig(&auth_tok_sig, auth_tok); @@ -449,15 +449,15 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, goto out; } rc = write_tag_64_packet(auth_tok_sig, &(auth_tok->session_key), - &netlink_message, &netlink_message_length); + &payload, &payload_len); if (rc) { ecryptfs_printk(KERN_ERR, "Failed to write tag 64 packet\n"); goto out; } - rc = ecryptfs_send_message(ecryptfs_transport, netlink_message, - netlink_message_length, &msg_ctx); + rc = ecryptfs_send_message(payload, payload_len, &msg_ctx); if (rc) { - ecryptfs_printk(KERN_ERR, "Error sending netlink message\n"); + ecryptfs_printk(KERN_ERR, "Error sending message to " + "ecryptfsd\n"); goto out; } rc = ecryptfs_wait_for_response(msg_ctx, &msg); @@ -1333,23 +1333,22 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, struct ecryptfs_key_record *key_rec) { struct ecryptfs_msg_ctx *msg_ctx = NULL; - char *netlink_payload; - size_t netlink_payload_length; + char *payload = NULL; + size_t payload_len; struct ecryptfs_message *msg; int rc; rc = write_tag_66_packet(auth_tok->token.private_key.signature, ecryptfs_code_for_cipher_string(crypt_stat), - crypt_stat, &netlink_payload, - &netlink_payload_length); + crypt_stat, &payload, &payload_len); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet\n"); goto out; } - rc = ecryptfs_send_message(ecryptfs_transport, netlink_payload, - netlink_payload_length, &msg_ctx); + rc = ecryptfs_send_message(payload, payload_len, &msg_ctx); if (rc) { - ecryptfs_printk(KERN_ERR, "Error sending netlink message\n"); + ecryptfs_printk(KERN_ERR, "Error sending message to " + "ecryptfsd\n"); goto out; } rc = ecryptfs_wait_for_response(msg_ctx, &msg); @@ -1364,8 +1363,7 @@ pki_encrypt_session_key(struct ecryptfs_auth_tok *auth_tok, ecryptfs_printk(KERN_ERR, "Error parsing tag 67 packet\n"); kfree(msg); out: - if (netlink_payload) - kfree(netlink_payload); + kfree(payload); return rc; } /** diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 448dfd597b5..046e027a4cb 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -30,7 +30,6 @@ #include <linux/namei.h> #include <linux/skbuff.h> #include <linux/crypto.h> -#include <linux/netlink.h> #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/key.h> @@ -49,8 +48,7 @@ MODULE_PARM_DESC(ecryptfs_verbosity, "0, which is Quiet)"); /** - * Module parameter that defines the number of netlink message buffer - * elements + * Module parameter that defines the number of message buffer elements */ unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS; @@ -60,9 +58,9 @@ MODULE_PARM_DESC(ecryptfs_message_buf_len, /** * Module parameter that defines the maximum guaranteed amount of time to wait - * for a response through netlink. The actual sleep time will be, more than + * for a response from ecryptfsd. The actual sleep time will be, more than * likely, a small amount greater than this specified value, but only less if - * the netlink message successfully arrives. + * the message successfully arrives. */ signed long ecryptfs_message_wait_timeout = ECRYPTFS_MAX_MSG_CTX_TTL / HZ; @@ -83,8 +81,6 @@ module_param(ecryptfs_number_of_users, uint, 0); MODULE_PARM_DESC(ecryptfs_number_of_users, "An estimate of the number of " "concurrent users of eCryptfs"); -unsigned int ecryptfs_transport = ECRYPTFS_DEFAULT_TRANSPORT; - void __ecryptfs_printk(const char *fmt, ...) { va_list args; @@ -211,7 +207,7 @@ enum { ecryptfs_opt_sig, ecryptfs_opt_ecryptfs_sig, ecryptfs_opt_passthrough, ecryptfs_opt_xattr_metadata, ecryptfs_opt_encrypted_view, ecryptfs_opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { {ecryptfs_opt_sig, "sig=%s"}, {ecryptfs_opt_ecryptfs_sig, "ecryptfs_sig=%s"}, {ecryptfs_opt_cipher, "cipher=%s"}, @@ -779,10 +775,11 @@ static int __init ecryptfs_init(void) "rc = [%d]\n", __func__, rc); goto out_do_sysfs_unregistration; } - rc = ecryptfs_init_messaging(ecryptfs_transport); + rc = ecryptfs_init_messaging(); if (rc) { printk(KERN_ERR "Failure occured while attempting to " - "initialize the eCryptfs netlink socket\n"); + "initialize the communications channel to " + "ecryptfsd\n"); goto out_destroy_kthread; } rc = ecryptfs_init_crypto(); @@ -797,7 +794,7 @@ static int __init ecryptfs_init(void) goto out; out_release_messaging: - ecryptfs_release_messaging(ecryptfs_transport); + ecryptfs_release_messaging(); out_destroy_kthread: ecryptfs_destroy_kthread(); out_do_sysfs_unregistration: @@ -818,7 +815,7 @@ static void __exit ecryptfs_exit(void) if (rc) printk(KERN_ERR "Failure whilst attempting to destroy crypto; " "rc = [%d]\n", rc); - ecryptfs_release_messaging(ecryptfs_transport); + ecryptfs_release_messaging(); ecryptfs_destroy_kthread(); do_sysfs_unregistration(); unregister_filesystem(&ecryptfs_fs_type); diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 1b5c20058ac..c6983978a31 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -134,12 +134,11 @@ out: } static int -ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len, - u8 msg_type, struct ecryptfs_msg_ctx **msg_ctx); +ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type, + struct ecryptfs_msg_ctx **msg_ctx); /** * ecryptfs_send_raw_message - * @transport: Transport type * @msg_type: Message type * @daemon: Daemon struct for recipient of message * @@ -150,38 +149,25 @@ ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len, * * Returns zero on success; non-zero otherwise */ -static int ecryptfs_send_raw_message(unsigned int transport, u8 msg_type, +static int ecryptfs_send_raw_message(u8 msg_type, struct ecryptfs_daemon *daemon) { struct ecryptfs_msg_ctx *msg_ctx; int rc; - switch(transport) { - case ECRYPTFS_TRANSPORT_NETLINK: - rc = ecryptfs_send_netlink(NULL, 0, NULL, msg_type, 0, - daemon->pid); - break; - case ECRYPTFS_TRANSPORT_MISCDEV: - rc = ecryptfs_send_message_locked(transport, NULL, 0, msg_type, - &msg_ctx); - if (rc) { - printk(KERN_ERR "%s: Error whilst attempting to send " - "message via procfs; rc = [%d]\n", __func__, rc); - goto out; - } - /* Raw messages are logically context-free (e.g., no - * reply is expected), so we set the state of the - * ecryptfs_msg_ctx object to indicate that it should - * be freed as soon as the transport sends out the message. */ - mutex_lock(&msg_ctx->mux); - msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY; - mutex_unlock(&msg_ctx->mux); - break; - case ECRYPTFS_TRANSPORT_CONNECTOR: - case ECRYPTFS_TRANSPORT_RELAYFS: - default: - rc = -ENOSYS; + rc = ecryptfs_send_message_locked(NULL, 0, msg_type, &msg_ctx); + if (rc) { + printk(KERN_ERR "%s: Error whilst attempting to send " + "message to ecryptfsd; rc = [%d]\n", __func__, rc); + goto out; } + /* Raw messages are logically context-free (e.g., no + * reply is expected), so we set the state of the + * ecryptfs_msg_ctx object to indicate that it should + * be freed as soon as the message is sent. */ + mutex_lock(&msg_ctx->mux); + msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY; + mutex_unlock(&msg_ctx->mux); out: return rc; } @@ -227,7 +213,6 @@ out: /** * ecryptfs_process_helo - * @transport: The underlying transport (netlink, etc.) * @euid: The user ID owner of the message * @user_ns: The namespace in which @euid applies * @pid: The process ID for the userspace program that sent the @@ -239,8 +224,8 @@ out: * Returns zero after adding a new daemon to the hash list; * non-zero otherwise. */ -int ecryptfs_process_helo(unsigned int transport, uid_t euid, - struct user_namespace *user_ns, struct pid *pid) +int ecryptfs_process_helo(uid_t euid, struct user_namespace *user_ns, + struct pid *pid) { struct ecryptfs_daemon *new_daemon; struct ecryptfs_daemon *old_daemon; @@ -252,8 +237,7 @@ int ecryptfs_process_helo(unsigned int transport, uid_t euid, printk(KERN_WARNING "Received request from user [%d] " "to register daemon [0x%p]; unregistering daemon " "[0x%p]\n", euid, pid, old_daemon->pid); - rc = ecryptfs_send_raw_message(transport, ECRYPTFS_MSG_QUIT, - old_daemon); + rc = ecryptfs_send_raw_message(ECRYPTFS_MSG_QUIT, old_daemon); if (rc) printk(KERN_WARNING "Failed to send QUIT " "message to daemon [0x%p]; rc = [%d]\n", @@ -467,8 +451,6 @@ out: /** * ecryptfs_send_message_locked - * @transport: The transport over which to send the message (i.e., - * netlink) * @data: The data to send * @data_len: The length of data * @msg_ctx: The message context allocated for the send @@ -478,8 +460,8 @@ out: * Returns zero on success; non-zero otherwise */ static int -ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len, - u8 msg_type, struct ecryptfs_msg_ctx **msg_ctx) +ecryptfs_send_message_locked(char *data, int data_len, u8 msg_type, + struct ecryptfs_msg_ctx **msg_ctx) { struct ecryptfs_daemon *daemon; int rc; @@ -503,20 +485,8 @@ ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len, ecryptfs_msg_ctx_free_to_alloc(*msg_ctx); mutex_unlock(&(*msg_ctx)->mux); mutex_unlock(&ecryptfs_msg_ctx_lists_mux); - switch (transport) { - case ECRYPTFS_TRANSPORT_NETLINK: - rc = ecryptfs_send_netlink(data, data_len, *msg_ctx, msg_type, - 0, daemon->pid); - break; - case ECRYPTFS_TRANSPORT_MISCDEV: - rc = ecryptfs_send_miscdev(data, data_len, *msg_ctx, msg_type, - 0, daemon); - break; - case ECRYPTFS_TRANSPORT_CONNECTOR: - case ECRYPTFS_TRANSPORT_RELAYFS: - default: - rc = -ENOSYS; - } + rc = ecryptfs_send_miscdev(data, data_len, *msg_ctx, msg_type, 0, + daemon); if (rc) printk(KERN_ERR "%s: Error attempting to send message to " "userspace daemon; rc = [%d]\n", __func__, rc); @@ -526,8 +496,6 @@ out: /** * ecryptfs_send_message - * @transport: The transport over which to send the message (i.e., - * netlink) * @data: The data to send * @data_len: The length of data * @msg_ctx: The message context allocated for the send @@ -536,14 +504,14 @@ out: * * Returns zero on success; non-zero otherwise */ -int ecryptfs_send_message(unsigned int transport, char *data, int data_len, +int ecryptfs_send_message(char *data, int data_len, struct ecryptfs_msg_ctx **msg_ctx) { int rc; mutex_lock(&ecryptfs_daemon_hash_mux); - rc = ecryptfs_send_message_locked(transport, data, data_len, - ECRYPTFS_MSG_REQUEST, msg_ctx); + rc = ecryptfs_send_message_locked(data, data_len, ECRYPTFS_MSG_REQUEST, + msg_ctx); mutex_unlock(&ecryptfs_daemon_hash_mux); return rc; } @@ -586,7 +554,7 @@ sleep: return rc; } -int ecryptfs_init_messaging(unsigned int transport) +int ecryptfs_init_messaging(void) { int i; int rc = 0; @@ -639,27 +607,14 @@ int ecryptfs_init_messaging(unsigned int transport) mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux); } mutex_unlock(&ecryptfs_msg_ctx_lists_mux); - switch(transport) { - case ECRYPTFS_TRANSPORT_NETLINK: - rc = ecryptfs_init_netlink(); - if (rc) - ecryptfs_release_messaging(transport); - break; - case ECRYPTFS_TRANSPORT_MISCDEV: - rc = ecryptfs_init_ecryptfs_miscdev(); - if (rc) - ecryptfs_release_messaging(transport); - break; - case ECRYPTFS_TRANSPORT_CONNECTOR: - case ECRYPTFS_TRANSPORT_RELAYFS: - default: - rc = -ENOSYS; - } + rc = ecryptfs_init_ecryptfs_miscdev(); + if (rc) + ecryptfs_release_messaging(); out: return rc; } -void ecryptfs_release_messaging(unsigned int transport) +void ecryptfs_release_messaging(void) { if (ecryptfs_msg_ctx_arr) { int i; @@ -698,17 +653,6 @@ void ecryptfs_release_messaging(unsigned int transport) kfree(ecryptfs_daemon_hash); mutex_unlock(&ecryptfs_daemon_hash_mux); } - switch(transport) { - case ECRYPTFS_TRANSPORT_NETLINK: - ecryptfs_release_netlink(); - break; - case ECRYPTFS_TRANSPORT_MISCDEV: - ecryptfs_destroy_ecryptfs_miscdev(); - break; - case ECRYPTFS_TRANSPORT_CONNECTOR: - case ECRYPTFS_TRANSPORT_RELAYFS: - default: - break; - } + ecryptfs_destroy_ecryptfs_miscdev(); return; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 245c2dc02d5..04d7b3fa1ac 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -265,22 +265,34 @@ out: } /** - * ecryptfs_prepare_write + * ecryptfs_write_begin * @file: The eCryptfs file - * @page: The eCryptfs page - * @from: The start byte from which we will write - * @to: The end byte to which we will write + * @mapping: The eCryptfs object + * @pos: The file offset at which to start writing + * @len: Length of the write + * @flags: Various flags + * @pagep: Pointer to return the page + * @fsdata: Pointer to return fs data (unused) * * This function must zero any hole we create * * Returns zero on success; non-zero otherwise */ -static int ecryptfs_prepare_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ecryptfs_write_begin(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned flags, + struct page **pagep, void **fsdata) { + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + struct page *page; loff_t prev_page_end_size; int rc = 0; + page = __grab_cache_page(mapping, index); + if (!page) + return -ENOMEM; + *pagep = page; + if (!PageUptodate(page)) { struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private( @@ -289,8 +301,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, if (!(crypt_stat->flags & ECRYPTFS_ENCRYPTED) || (crypt_stat->flags & ECRYPTFS_NEW_FILE)) { rc = ecryptfs_read_lower_page_segment( - page, page->index, 0, PAGE_CACHE_SIZE, - page->mapping->host); + page, index, 0, PAGE_CACHE_SIZE, mapping->host); if (rc) { printk(KERN_ERR "%s: Error attemping to read " "lower page segment; rc = [%d]\n", @@ -316,8 +327,8 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, SetPageUptodate(page); } else { rc = ecryptfs_read_lower_page_segment( - page, page->index, 0, PAGE_CACHE_SIZE, - page->mapping->host); + page, index, 0, PAGE_CACHE_SIZE, + mapping->host); if (rc) { printk(KERN_ERR "%s: Error reading " "page; rc = [%d]\n", @@ -339,10 +350,10 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, SetPageUptodate(page); } } - prev_page_end_size = ((loff_t)page->index << PAGE_CACHE_SHIFT); + prev_page_end_size = ((loff_t)index << PAGE_CACHE_SHIFT); /* If creating a page or more of holes, zero them out via truncate. * Note, this will increase i_size. */ - if (page->index != 0) { + if (index != 0) { if (prev_page_end_size > i_size_read(page->mapping->host)) { rc = ecryptfs_truncate(file->f_path.dentry, prev_page_end_size); @@ -357,8 +368,8 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, } /* Writing to a new page, and creating a small hole from start * of page? Zero it out. */ - if ((i_size_read(page->mapping->host) == prev_page_end_size) - && (from != 0)) + if ((i_size_read(mapping->host) == prev_page_end_size) + && (pos != 0)) zero_user(page, 0, PAGE_CACHE_SIZE); out: return rc; @@ -445,21 +456,28 @@ int ecryptfs_write_inode_size_to_metadata(struct inode *ecryptfs_inode) } /** - * ecryptfs_commit_write + * ecryptfs_write_end * @file: The eCryptfs file object + * @mapping: The eCryptfs object + * @pos: The file position + * @len: The length of the data (unused) + * @copied: The amount of data copied * @page: The eCryptfs page - * @from: Ignored (we rotate the page IV on each write) - * @to: Ignored + * @fsdata: The fsdata (unused) * * This is where we encrypt the data and pass the encrypted data to * the lower filesystem. In OpenPGP-compatible mode, we operate on * entire underlying packets. */ -static int ecryptfs_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) +static int ecryptfs_write_end(struct file *file, + struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) { - loff_t pos; - struct inode *ecryptfs_inode = page->mapping->host; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + unsigned from = pos & (PAGE_CACHE_SIZE - 1); + unsigned to = from + copied; + struct inode *ecryptfs_inode = mapping->host; struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(file->f_path.dentry->d_inode)->crypt_stat; int rc; @@ -471,25 +489,22 @@ static int ecryptfs_commit_write(struct file *file, struct page *page, } else ecryptfs_printk(KERN_DEBUG, "Not a new file\n"); ecryptfs_printk(KERN_DEBUG, "Calling fill_zeros_to_end_of_page" - "(page w/ index = [0x%.16x], to = [%d])\n", page->index, - to); + "(page w/ index = [0x%.16x], to = [%d])\n", index, to); /* Fills in zeros if 'to' goes beyond inode size */ rc = fill_zeros_to_end_of_page(page, to); if (rc) { ecryptfs_printk(KERN_WARNING, "Error attempting to fill " - "zeros in page with index = [0x%.16x]\n", - page->index); + "zeros in page with index = [0x%.16x]\n", index); goto out; } rc = ecryptfs_encrypt_page(page); if (rc) { ecryptfs_printk(KERN_WARNING, "Error encrypting page (upper " - "index [0x%.16x])\n", page->index); + "index [0x%.16x])\n", index); goto out; } - pos = (((loff_t)page->index) << PAGE_CACHE_SHIFT) + to; - if (pos > i_size_read(ecryptfs_inode)) { - i_size_write(ecryptfs_inode, pos); + if (pos + copied > i_size_read(ecryptfs_inode)) { + i_size_write(ecryptfs_inode, pos + copied); ecryptfs_printk(KERN_DEBUG, "Expanded file size to " "[0x%.16x]\n", i_size_read(ecryptfs_inode)); } @@ -497,7 +512,11 @@ static int ecryptfs_commit_write(struct file *file, struct page *page, if (rc) printk(KERN_ERR "Error writing inode size to metadata; " "rc = [%d]\n", rc); + else + rc = copied; out: + unlock_page(page); + page_cache_release(page); return rc; } @@ -518,7 +537,7 @@ static sector_t ecryptfs_bmap(struct address_space *mapping, sector_t block) struct address_space_operations ecryptfs_aops = { .writepage = ecryptfs_writepage, .readpage = ecryptfs_readpage, - .prepare_write = ecryptfs_prepare_write, - .commit_write = ecryptfs_commit_write, + .write_begin = ecryptfs_write_begin, + .write_end = ecryptfs_write_end, .bmap = ecryptfs_bmap, }; diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c deleted file mode 100644 index e0abad62b39..00000000000 --- a/fs/ecryptfs/netlink.c +++ /dev/null @@ -1,249 +0,0 @@ -/** - * eCryptfs: Linux filesystem encryption layer - * - * Copyright (C) 2004-2006 International Business Machines Corp. - * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com> - * Tyler Hicks <tyhicks@ou.edu> - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License version - * 2 as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA - * 02111-1307, USA. - */ - -#include <net/sock.h> -#include <linux/hash.h> -#include <linux/random.h> -#include "ecryptfs_kernel.h" - -static struct sock *ecryptfs_nl_sock; - -/** - * ecryptfs_send_netlink - * @data: The data to include as the payload - * @data_len: The byte count of the data - * @msg_ctx: The netlink context that will be used to handle the - * response message - * @msg_type: The type of netlink message to send - * @msg_flags: The flags to include in the netlink header - * @daemon_pid: The process id of the daemon to send the message to - * - * Sends the data to the specified daemon pid and uses the netlink - * context element to store the data needed for validation upon - * receiving the response. The data and the netlink context can be - * null if just sending a netlink header is sufficient. Returns zero - * upon sending the message; non-zero upon error. - */ -int ecryptfs_send_netlink(char *data, int data_len, - struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, - u16 msg_flags, struct pid *daemon_pid) -{ - struct sk_buff *skb; - struct nlmsghdr *nlh; - struct ecryptfs_message *msg; - size_t payload_len; - int rc; - - payload_len = ((data && data_len) ? (sizeof(*msg) + data_len) : 0); - skb = alloc_skb(NLMSG_SPACE(payload_len), GFP_KERNEL); - if (!skb) { - rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, "Failed to allocate socket buffer\n"); - goto out; - } - nlh = NLMSG_PUT(skb, pid_nr(daemon_pid), msg_ctx ? msg_ctx->counter : 0, - msg_type, payload_len); - nlh->nlmsg_flags = msg_flags; - if (msg_ctx && payload_len) { - msg = (struct ecryptfs_message *)NLMSG_DATA(nlh); - msg->index = msg_ctx->index; - msg->data_len = data_len; - memcpy(msg->data, data, data_len); - } - rc = netlink_unicast(ecryptfs_nl_sock, skb, pid_nr(daemon_pid), 0); - if (rc < 0) { - ecryptfs_printk(KERN_ERR, "Failed to send eCryptfs netlink " - "message; rc = [%d]\n", rc); - goto out; - } - rc = 0; - goto out; -nlmsg_failure: - rc = -EMSGSIZE; - kfree_skb(skb); -out: - return rc; -} - -/** - * ecryptfs_process_nl_reponse - * @skb: The socket buffer containing the netlink message of state - * RESPONSE - * - * Processes a response message after sending a operation request to - * userspace. Attempts to assign the msg to a netlink context element - * at the index specified in the msg. The sk_buff and nlmsghdr must - * be validated before this function. Returns zero upon delivery to - * desired context element; non-zero upon delivery failure or error. - */ -static int ecryptfs_process_nl_response(struct sk_buff *skb) -{ - struct nlmsghdr *nlh = nlmsg_hdr(skb); - struct ecryptfs_message *msg = NLMSG_DATA(nlh); - struct pid *pid; - int rc; - - if (skb->len - NLMSG_HDRLEN - sizeof(*msg) != msg->data_len) { - rc = -EINVAL; - ecryptfs_printk(KERN_ERR, "Received netlink message with " - "incorrectly specified data length\n"); - goto out; - } - pid = find_get_pid(NETLINK_CREDS(skb)->pid); - rc = ecryptfs_process_response(msg, NETLINK_CREDS(skb)->uid, NULL, - pid, nlh->nlmsg_seq); - put_pid(pid); - if (rc) - printk(KERN_ERR - "Error processing response message; rc = [%d]\n", rc); -out: - return rc; -} - -/** - * ecryptfs_process_nl_helo - * @skb: The socket buffer containing the nlmsghdr in HELO state - * - * Gets uid and pid of the skb and adds the values to the daemon id - * hash. Returns zero after adding a new daemon id to the hash list; - * non-zero otherwise. - */ -static int ecryptfs_process_nl_helo(struct sk_buff *skb) -{ - struct pid *pid; - int rc; - - pid = find_get_pid(NETLINK_CREDS(skb)->pid); - rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_NETLINK, - NETLINK_CREDS(skb)->uid, NULL, pid); - put_pid(pid); - if (rc) - printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc); - return rc; -} - -/** - * ecryptfs_process_nl_quit - * @skb: The socket buffer containing the nlmsghdr in QUIT state - * - * Gets uid and pid of the skb and deletes the corresponding daemon - * id, if it is the registered that is requesting the - * deletion. Returns zero after deleting the desired daemon id; - * non-zero otherwise. - */ -static int ecryptfs_process_nl_quit(struct sk_buff *skb) -{ - struct pid *pid; - int rc; - - pid = find_get_pid(NETLINK_CREDS(skb)->pid); - rc = ecryptfs_process_quit(NETLINK_CREDS(skb)->uid, NULL, pid); - put_pid(pid); - if (rc) - printk(KERN_WARNING - "Error processing QUIT message; rc = [%d]\n", rc); - return rc; -} - -/** - * ecryptfs_receive_nl_message - * - * Callback function called by netlink system when a message arrives. - * If the message looks to be valid, then an attempt is made to assign - * it to its desired netlink context element and wake up the process - * that is waiting for a response. - */ -static void ecryptfs_receive_nl_message(struct sk_buff *skb) -{ - struct nlmsghdr *nlh; - - nlh = nlmsg_hdr(skb); - if (!NLMSG_OK(nlh, skb->len)) { - ecryptfs_printk(KERN_ERR, "Received corrupt netlink " - "message\n"); - goto free; - } - switch (nlh->nlmsg_type) { - case ECRYPTFS_MSG_RESPONSE: - if (ecryptfs_process_nl_response(skb)) { - ecryptfs_printk(KERN_WARNING, "Failed to " - "deliver netlink response to " - "requesting operation\n"); - } - break; - case ECRYPTFS_MSG_HELO: - if (ecryptfs_process_nl_helo(skb)) { - ecryptfs_printk(KERN_WARNING, "Failed to " - "fulfill HELO request\n"); - } - break; - case ECRYPTFS_MSG_QUIT: - if (ecryptfs_process_nl_quit(skb)) { - ecryptfs_printk(KERN_WARNING, "Failed to " - "fulfill QUIT request\n"); - } - break; - default: - ecryptfs_printk(KERN_WARNING, "Dropping netlink " - "message of unrecognized type [%d]\n", - nlh->nlmsg_type); - break; - } -free: - kfree_skb(skb); -} - -/** - * ecryptfs_init_netlink - * - * Initializes the daemon id hash list, netlink context array, and - * necessary locks. Returns zero upon success; non-zero upon error. - */ -int ecryptfs_init_netlink(void) -{ - int rc; - - ecryptfs_nl_sock = netlink_kernel_create(&init_net, NETLINK_ECRYPTFS, 0, - ecryptfs_receive_nl_message, - NULL, THIS_MODULE); - if (!ecryptfs_nl_sock) { - rc = -EIO; - ecryptfs_printk(KERN_ERR, "Failed to create netlink socket\n"); - goto out; - } - ecryptfs_nl_sock->sk_sndtimeo = ECRYPTFS_DEFAULT_SEND_TIMEOUT; - rc = 0; -out: - return rc; -} - -/** - * ecryptfs_release_netlink - * - * Frees all memory used by the netlink context array and releases the - * netlink socket. - */ -void ecryptfs_release_netlink(void) -{ - netlink_kernel_release(ecryptfs_nl_sock); - ecryptfs_nl_sock = NULL; -} diff --git a/fs/efs/namei.c b/fs/efs/namei.c index 3a404e7fad5..291abb11e20 100644 --- a/fs/efs/namei.c +++ b/fs/efs/namei.c @@ -74,8 +74,7 @@ struct dentry *efs_lookup(struct inode *dir, struct dentry *dentry, struct namei } unlock_kernel(); - d_add(dentry, inode); - return NULL; + return d_splice_alias(inode, dentry); } static struct inode *efs_nfs_get_inode(struct super_block *sb, u64 ino, diff --git a/fs/efs/super.c b/fs/efs/super.c index 567b134fa1f..73b19cfc91f 100644 --- a/fs/efs/super.c +++ b/fs/efs/super.c @@ -341,8 +341,6 @@ static int efs_statfs(struct dentry *dentry, struct kstatfs *buf) { sb->inode_blocks * (EFS_BLOCKSIZE / sizeof(struct efs_dinode)); buf->f_ffree = sb->inode_free; /* free inodes */ - buf->f_fsid.val[0] = (sb->fs_magic >> 16) & 0xffff; /* fs ID */ - buf->f_fsid.val[1] = sb->fs_magic & 0xffff; /* fs ID */ buf->f_namelen = EFS_MAXNAMELEN; /* max filename length */ return 0; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 7cc0eb756b5..99368bda026 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -927,14 +927,11 @@ errxit: /* * During the time we spent in the loop above, some other events * might have been queued by the poll callback. We re-insert them - * here (in case they are not already queued, or they're one-shot). + * inside the main ready-list here. */ for (nepi = ep->ovflist; (epi = nepi) != NULL; - nepi = epi->next, epi->next = EP_UNACTIVE_PTR) { - if (!ep_is_linked(&epi->rdllink) && - (epi->event.events & ~EP_PRIVATE_BITS)) - list_add_tail(&epi->rdllink, &ep->rdllist); - } + nepi = epi->next, epi->next = EP_UNACTIVE_PTR) + list_add_tail(&epi->rdllink, &ep->rdllist); /* * We need to set back ep->ovflist to EP_UNACTIVE_PTR, so that after * releasing the lock, events will be queued in the normal way inside diff --git a/fs/exec.c b/fs/exec.c index 32993beecbe..a41e7902ed0 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -50,15 +50,12 @@ #include <linux/cn_proc.h> #include <linux/audit.h> #include <linux/tracehook.h> +#include <linux/kmod.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> -#ifdef CONFIG_KMOD -#include <linux/kmod.h> -#endif - #ifdef __alpha__ /* for /sbin/loader handling in search_binary_handler() */ #include <linux/a.out.h> @@ -391,7 +388,7 @@ static int count(char __user * __user * argv, int max) if (!p) break; argv++; - if(++i > max) + if (i++ >= max) return -E2BIG; cond_resched(); } @@ -752,11 +749,11 @@ static int exec_mmap(struct mm_struct *mm) tsk->active_mm = mm; activate_mm(active_mm, mm); task_unlock(tsk); - mm_update_next_owner(old_mm); arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); BUG_ON(active_mm != old_mm); + mm_update_next_owner(old_mm); mmput(old_mm); return 0; } @@ -825,8 +822,6 @@ static int de_thread(struct task_struct *tsk) schedule(); } - if (unlikely(task_child_reaper(tsk) == leader)) - task_active_pid_ns(tsk)->child_reaper = tsk; /* * The only record we have of the real-time age of a * process, regardless of execs it's done, is start_time. @@ -1189,7 +1184,7 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) return retval; /* Remember if the application is TASO. */ - bprm->sh_bang = eh->ah.entry < 0x100000000UL; + bprm->taso = eh->ah.entry < 0x100000000UL; bprm->file = file; bprm->loader = loader; @@ -1247,8 +1242,8 @@ int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs) read_unlock(&binfmt_lock); if (retval != -ENOEXEC || bprm->mm == NULL) { break; -#ifdef CONFIG_KMOD - }else{ +#ifdef CONFIG_MODULES + } else { #define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e)) if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index 10bb02c3f25..6dac7ba2d22 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -1295,6 +1295,7 @@ retry_alloc: * turn off reservation for this allocation */ if (my_rsv && (free_blocks < windowsz) + && (free_blocks > 0) && (rsv_is_empty(&my_rsv->rsv_window))) my_rsv = NULL; @@ -1332,7 +1333,7 @@ retry_alloc: * free blocks is less than half of the reservation * window size. */ - if (free_blocks <= (windowsz/2)) + if (my_rsv && (free_blocks <= (windowsz/2))) continue; brelse(bitmap_bh); diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index a78c6b4af06..11a49ce8439 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -103,7 +103,7 @@ static int ext2_commit_chunk(struct page *page, loff_t pos, unsigned len) return err; } -static void ext2_check_page(struct page *page) +static void ext2_check_page(struct page *page, int quiet) { struct inode *dir = page->mapping->host; struct super_block *sb = dir->i_sb; @@ -146,10 +146,10 @@ out: /* Too bad, we had an error */ Ebadsize: - ext2_error(sb, "ext2_check_page", - "size of directory #%lu is not a multiple of chunk size", - dir->i_ino - ); + if (!quiet) + ext2_error(sb, __func__, + "size of directory #%lu is not a multiple " + "of chunk size", dir->i_ino); goto fail; Eshort: error = "rec_len is smaller than minimal"; @@ -166,32 +166,36 @@ Espan: Einumber: error = "inode out of bounds"; bad_entry: - ext2_error (sb, "ext2_check_page", "bad entry in directory #%lu: %s - " - "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", - dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, - (unsigned long) le32_to_cpu(p->inode), - rec_len, p->name_len); + if (!quiet) + ext2_error(sb, __func__, "bad entry in directory #%lu: : %s - " + "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", + dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs, + (unsigned long) le32_to_cpu(p->inode), + rec_len, p->name_len); goto fail; Eend: - p = (ext2_dirent *)(kaddr + offs); - ext2_error (sb, "ext2_check_page", - "entry in directory #%lu spans the page boundary" - "offset=%lu, inode=%lu", - dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, - (unsigned long) le32_to_cpu(p->inode)); + if (!quiet) { + p = (ext2_dirent *)(kaddr + offs); + ext2_error(sb, "ext2_check_page", + "entry in directory #%lu spans the page boundary" + "offset=%lu, inode=%lu", + dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs, + (unsigned long) le32_to_cpu(p->inode)); + } fail: SetPageChecked(page); SetPageError(page); } -static struct page * ext2_get_page(struct inode *dir, unsigned long n) +static struct page * ext2_get_page(struct inode *dir, unsigned long n, + int quiet) { struct address_space *mapping = dir->i_mapping; struct page *page = read_mapping_page(mapping, n, NULL); if (!IS_ERR(page)) { kmap(page); if (!PageChecked(page)) - ext2_check_page(page); + ext2_check_page(page, quiet); if (PageError(page)) goto fail; } @@ -292,7 +296,7 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) for ( ; n < npages; n++, offset = 0) { char *kaddr, *limit; ext2_dirent *de; - struct page *page = ext2_get_page(inode, n); + struct page *page = ext2_get_page(inode, n, 0); if (IS_ERR(page)) { ext2_error(sb, __func__, @@ -361,6 +365,7 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir, struct page *page = NULL; struct ext2_inode_info *ei = EXT2_I(dir); ext2_dirent * de; + int dir_has_error = 0; if (npages == 0) goto out; @@ -374,7 +379,7 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir, n = start; do { char *kaddr; - page = ext2_get_page(dir, n); + page = ext2_get_page(dir, n, dir_has_error); if (!IS_ERR(page)) { kaddr = page_address(page); de = (ext2_dirent *) kaddr; @@ -391,7 +396,9 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir, de = ext2_next_entry(de); } ext2_put_page(page); - } + } else + dir_has_error = 1; + if (++n >= npages) n = 0; /* next page is past the blocks we've got */ @@ -414,7 +421,7 @@ found: struct ext2_dir_entry_2 * ext2_dotdot (struct inode *dir, struct page **p) { - struct page *page = ext2_get_page(dir, 0); + struct page *page = ext2_get_page(dir, 0, 0); ext2_dirent *de = NULL; if (!IS_ERR(page)) { @@ -487,7 +494,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode) for (n = 0; n <= npages; n++) { char *dir_end; - page = ext2_get_page(dir, n); + page = ext2_get_page(dir, n, 0); err = PTR_ERR(page); if (IS_ERR(page)) goto out; @@ -655,14 +662,17 @@ int ext2_empty_dir (struct inode * inode) { struct page *page = NULL; unsigned long i, npages = dir_pages(inode); + int dir_has_error = 0; for (i = 0; i < npages; i++) { char *kaddr; ext2_dirent * de; - page = ext2_get_page(inode, i); + page = ext2_get_page(inode, i, dir_has_error); - if (IS_ERR(page)) + if (IS_ERR(page)) { + dir_has_error = 1; continue; + } kaddr = page_address(page); de = (ext2_dirent *)kaddr; diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index 47d88da2d33..bae998c1e44 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -133,6 +133,8 @@ extern void ext2_truncate (struct inode *); extern int ext2_setattr (struct dentry *, struct iattr *); extern void ext2_set_inode_flags(struct inode *inode); extern void ext2_get_inode_flags(struct ext2_inode_info *); +extern int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); int __ext2_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 5f2fa9c3629..45ed0712218 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -86,4 +86,5 @@ const struct inode_operations ext2_file_inode_operations = { #endif .setattr = ext2_setattr, .permission = ext2_permission, + .fiemap = ext2_fiemap, }; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 991d6dfeb51..7658b33e265 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -31,6 +31,7 @@ #include <linux/writeback.h> #include <linux/buffer_head.h> #include <linux/mpage.h> +#include <linux/fiemap.h> #include "ext2.h" #include "acl.h" #include "xip.h" @@ -704,6 +705,13 @@ int ext2_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_ } +int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + return generic_block_fiemap(inode, fieinfo, start, len, + ext2_get_block); +} + static int ext2_writepage(struct page *page, struct writeback_control *wbc) { return block_write_full_page(page, ext2_get_block, wbc); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index fd88c7b43e6..647cd888ac8 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -393,7 +393,7 @@ enum { Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_bsd_df, "bsddf"}, {Opt_minix_df, "minixdf"}, {Opt_grpid, "grpid"}, diff --git a/fs/ext3/file.c b/fs/ext3/file.c index acc4913d301..3be1e0689c9 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -134,5 +134,6 @@ const struct inode_operations ext3_file_inode_operations = { .removexattr = generic_removexattr, #endif .permission = ext3_permission, + .fiemap = ext3_fiemap, }; diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 507d8689b11..ebfec4d0148 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -36,6 +36,7 @@ #include <linux/mpage.h> #include <linux/uio.h> #include <linux/bio.h> +#include <linux/fiemap.h> #include "xattr.h" #include "acl.h" @@ -981,6 +982,13 @@ out: return ret; } +int ext3_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) +{ + return generic_block_fiemap(inode, fieinfo, start, len, + ext3_get_block); +} + /* * `handle' can be NULL if create is zero */ diff --git a/fs/ext3/super.c b/fs/ext3/super.c index f38a5afc39a..399a96a6c55 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -760,7 +760,7 @@ enum { Opt_grpquota }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_bsd_df, "bsddf"}, {Opt_minix_df, "minixdf"}, {Opt_grpid, "grpid"}, diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index ac6fa8ca0a2..a8ff003a00f 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -2,12 +2,12 @@ # Makefile for the linux ext4-filesystem routines. # -obj-$(CONFIG_EXT4DEV_FS) += ext4dev.o +obj-$(CONFIG_EXT4_FS) += ext4.o -ext4dev-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ +ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ ext4_jbd2.o migrate.o mballoc.o -ext4dev-$(CONFIG_EXT4DEV_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o -ext4dev-$(CONFIG_EXT4DEV_FS_POSIX_ACL) += acl.o -ext4dev-$(CONFIG_EXT4DEV_FS_SECURITY) += xattr_security.o +ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o +ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o +ext4-$(CONFIG_EXT4_FS_SECURITY) += xattr_security.o diff --git a/fs/ext4/acl.h b/fs/ext4/acl.h index cd2b855a07d..cb45257a246 100644 --- a/fs/ext4/acl.h +++ b/fs/ext4/acl.h @@ -51,18 +51,18 @@ static inline int ext4_acl_count(size_t size) } } -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL /* Value for inode->u.ext4_i.i_acl and inode->u.ext4_i.i_default_acl if the ACL has not been cached */ #define EXT4_ACL_NOT_CACHED ((void *)-1) /* acl.c */ -extern int ext4_permission (struct inode *, int); -extern int ext4_acl_chmod (struct inode *); -extern int ext4_init_acl (handle_t *, struct inode *, struct inode *); +extern int ext4_permission(struct inode *, int); +extern int ext4_acl_chmod(struct inode *); +extern int ext4_init_acl(handle_t *, struct inode *, struct inode *); -#else /* CONFIG_EXT4DEV_FS_POSIX_ACL */ +#else /* CONFIG_EXT4_FS_POSIX_ACL */ #include <linux/sched.h> #define ext4_permission NULL @@ -77,5 +77,5 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) { return 0; } -#endif /* CONFIG_EXT4DEV_FS_POSIX_ACL */ +#endif /* CONFIG_EXT4_FS_POSIX_ACL */ diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 1ae5004e93f..b9821be709b 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -83,6 +83,7 @@ static int ext4_group_used_meta_blocks(struct super_block *sb, } return used_blocks; } + /* Initializes an uninitialized block bitmap if given, and returns the * number of blocks free in the group. */ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, @@ -132,7 +133,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, */ group_blocks = ext4_blocks_count(sbi->s_es) - le32_to_cpu(sbi->s_es->s_first_data_block) - - (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count -1)); + (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1)); } else { group_blocks = EXT4_BLOCKS_PER_GROUP(sb); } @@ -200,20 +201,20 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, * @bh: pointer to the buffer head to store the block * group descriptor */ -struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, +struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, ext4_group_t block_group, - struct buffer_head ** bh) + struct buffer_head **bh) { unsigned long group_desc; unsigned long offset; - struct ext4_group_desc * desc; + struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); if (block_group >= sbi->s_groups_count) { - ext4_error (sb, "ext4_get_group_desc", - "block_group >= groups_count - " - "block_group = %lu, groups_count = %lu", - block_group, sbi->s_groups_count); + ext4_error(sb, "ext4_get_group_desc", + "block_group >= groups_count - " + "block_group = %lu, groups_count = %lu", + block_group, sbi->s_groups_count); return NULL; } @@ -222,10 +223,10 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); if (!sbi->s_group_desc[group_desc]) { - ext4_error (sb, "ext4_get_group_desc", - "Group descriptor not loaded - " - "block_group = %lu, group_desc = %lu, desc = %lu", - block_group, group_desc, offset); + ext4_error(sb, "ext4_get_group_desc", + "Group descriptor not loaded - " + "block_group = %lu, group_desc = %lu, desc = %lu", + block_group, group_desc, offset); return NULL; } @@ -302,8 +303,8 @@ err_out: struct buffer_head * ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) { - struct ext4_group_desc * desc; - struct buffer_head * bh = NULL; + struct ext4_group_desc *desc; + struct buffer_head *bh = NULL; ext4_fsblk_t bitmap_blk; desc = ext4_get_group_desc(sb, block_group, NULL); @@ -318,9 +319,11 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) block_group, bitmap_blk); return NULL; } - if (bh_uptodate_or_lock(bh)) + if (buffer_uptodate(bh) && + !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) return bh; + lock_buffer(bh); spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh, block_group, desc); @@ -345,301 +348,6 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) */ return bh; } -/* - * The reservation window structure operations - * -------------------------------------------- - * Operations include: - * dump, find, add, remove, is_empty, find_next_reservable_window, etc. - * - * We use a red-black tree to represent per-filesystem reservation - * windows. - * - */ - -/** - * __rsv_window_dump() -- Dump the filesystem block allocation reservation map - * @rb_root: root of per-filesystem reservation rb tree - * @verbose: verbose mode - * @fn: function which wishes to dump the reservation map - * - * If verbose is turned on, it will print the whole block reservation - * windows(start, end). Otherwise, it will only print out the "bad" windows, - * those windows that overlap with their immediate neighbors. - */ -#if 1 -static void __rsv_window_dump(struct rb_root *root, int verbose, - const char *fn) -{ - struct rb_node *n; - struct ext4_reserve_window_node *rsv, *prev; - int bad; - -restart: - n = rb_first(root); - bad = 0; - prev = NULL; - - printk("Block Allocation Reservation Windows Map (%s):\n", fn); - while (n) { - rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node); - if (verbose) - printk("reservation window 0x%p " - "start: %llu, end: %llu\n", - rsv, rsv->rsv_start, rsv->rsv_end); - if (rsv->rsv_start && rsv->rsv_start >= rsv->rsv_end) { - printk("Bad reservation %p (start >= end)\n", - rsv); - bad = 1; - } - if (prev && prev->rsv_end >= rsv->rsv_start) { - printk("Bad reservation %p (prev->end >= start)\n", - rsv); - bad = 1; - } - if (bad) { - if (!verbose) { - printk("Restarting reservation walk in verbose mode\n"); - verbose = 1; - goto restart; - } - } - n = rb_next(n); - prev = rsv; - } - printk("Window map complete.\n"); - BUG_ON(bad); -} -#define rsv_window_dump(root, verbose) \ - __rsv_window_dump((root), (verbose), __func__) -#else -#define rsv_window_dump(root, verbose) do {} while (0) -#endif - -/** - * goal_in_my_reservation() - * @rsv: inode's reservation window - * @grp_goal: given goal block relative to the allocation block group - * @group: the current allocation block group - * @sb: filesystem super block - * - * Test if the given goal block (group relative) is within the file's - * own block reservation window range. - * - * If the reservation window is outside the goal allocation group, return 0; - * grp_goal (given goal block) could be -1, which means no specific - * goal block. In this case, always return 1. - * If the goal block is within the reservation window, return 1; - * otherwise, return 0; - */ -static int -goal_in_my_reservation(struct ext4_reserve_window *rsv, ext4_grpblk_t grp_goal, - ext4_group_t group, struct super_block *sb) -{ - ext4_fsblk_t group_first_block, group_last_block; - - group_first_block = ext4_group_first_block_no(sb, group); - group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); - - if ((rsv->_rsv_start > group_last_block) || - (rsv->_rsv_end < group_first_block)) - return 0; - if ((grp_goal >= 0) && ((grp_goal + group_first_block < rsv->_rsv_start) - || (grp_goal + group_first_block > rsv->_rsv_end))) - return 0; - return 1; -} - -/** - * search_reserve_window() - * @rb_root: root of reservation tree - * @goal: target allocation block - * - * Find the reserved window which includes the goal, or the previous one - * if the goal is not in any window. - * Returns NULL if there are no windows or if all windows start after the goal. - */ -static struct ext4_reserve_window_node * -search_reserve_window(struct rb_root *root, ext4_fsblk_t goal) -{ - struct rb_node *n = root->rb_node; - struct ext4_reserve_window_node *rsv; - - if (!n) - return NULL; - - do { - rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node); - - if (goal < rsv->rsv_start) - n = n->rb_left; - else if (goal > rsv->rsv_end) - n = n->rb_right; - else - return rsv; - } while (n); - /* - * We've fallen off the end of the tree: the goal wasn't inside - * any particular node. OK, the previous node must be to one - * side of the interval containing the goal. If it's the RHS, - * we need to back up one. - */ - if (rsv->rsv_start > goal) { - n = rb_prev(&rsv->rsv_node); - rsv = rb_entry(n, struct ext4_reserve_window_node, rsv_node); - } - return rsv; -} - -/** - * ext4_rsv_window_add() -- Insert a window to the block reservation rb tree. - * @sb: super block - * @rsv: reservation window to add - * - * Must be called with rsv_lock hold. - */ -void ext4_rsv_window_add(struct super_block *sb, - struct ext4_reserve_window_node *rsv) -{ - struct rb_root *root = &EXT4_SB(sb)->s_rsv_window_root; - struct rb_node *node = &rsv->rsv_node; - ext4_fsblk_t start = rsv->rsv_start; - - struct rb_node ** p = &root->rb_node; - struct rb_node * parent = NULL; - struct ext4_reserve_window_node *this; - - while (*p) - { - parent = *p; - this = rb_entry(parent, struct ext4_reserve_window_node, rsv_node); - - if (start < this->rsv_start) - p = &(*p)->rb_left; - else if (start > this->rsv_end) - p = &(*p)->rb_right; - else { - rsv_window_dump(root, 1); - BUG(); - } - } - - rb_link_node(node, parent, p); - rb_insert_color(node, root); -} - -/** - * ext4_rsv_window_remove() -- unlink a window from the reservation rb tree - * @sb: super block - * @rsv: reservation window to remove - * - * Mark the block reservation window as not allocated, and unlink it - * from the filesystem reservation window rb tree. Must be called with - * rsv_lock hold. - */ -static void rsv_window_remove(struct super_block *sb, - struct ext4_reserve_window_node *rsv) -{ - rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; - rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; - rsv->rsv_alloc_hit = 0; - rb_erase(&rsv->rsv_node, &EXT4_SB(sb)->s_rsv_window_root); -} - -/* - * rsv_is_empty() -- Check if the reservation window is allocated. - * @rsv: given reservation window to check - * - * returns 1 if the end block is EXT4_RESERVE_WINDOW_NOT_ALLOCATED. - */ -static inline int rsv_is_empty(struct ext4_reserve_window *rsv) -{ - /* a valid reservation end block could not be 0 */ - return rsv->_rsv_end == EXT4_RESERVE_WINDOW_NOT_ALLOCATED; -} - -/** - * ext4_init_block_alloc_info() - * @inode: file inode structure - * - * Allocate and initialize the reservation window structure, and - * link the window to the ext4 inode structure at last - * - * The reservation window structure is only dynamically allocated - * and linked to ext4 inode the first time the open file - * needs a new block. So, before every ext4_new_block(s) call, for - * regular files, we should check whether the reservation window - * structure exists or not. In the latter case, this function is called. - * Fail to do so will result in block reservation being turned off for that - * open file. - * - * This function is called from ext4_get_blocks_handle(), also called - * when setting the reservation window size through ioctl before the file - * is open for write (needs block allocation). - * - * Needs down_write(i_data_sem) protection prior to call this function. - */ -void ext4_init_block_alloc_info(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info; - struct super_block *sb = inode->i_sb; - - block_i = kmalloc(sizeof(*block_i), GFP_NOFS); - if (block_i) { - struct ext4_reserve_window_node *rsv = &block_i->rsv_window_node; - - rsv->rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; - rsv->rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; - - /* - * if filesystem is mounted with NORESERVATION, the goal - * reservation window size is set to zero to indicate - * block reservation is off - */ - if (!test_opt(sb, RESERVATION)) - rsv->rsv_goal_size = 0; - else - rsv->rsv_goal_size = EXT4_DEFAULT_RESERVE_BLOCKS; - rsv->rsv_alloc_hit = 0; - block_i->last_alloc_logical_block = 0; - block_i->last_alloc_physical_block = 0; - } - ei->i_block_alloc_info = block_i; -} - -/** - * ext4_discard_reservation() - * @inode: inode - * - * Discard(free) block reservation window on last file close, or truncate - * or at last iput(). - * - * It is being called in three cases: - * ext4_release_file(): last writer close the file - * ext4_clear_inode(): last iput(), when nobody link to this file. - * ext4_truncate(): when the block indirect map is about to change. - * - */ -void ext4_discard_reservation(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_block_alloc_info *block_i = ei->i_block_alloc_info; - struct ext4_reserve_window_node *rsv; - spinlock_t *rsv_lock = &EXT4_SB(inode->i_sb)->s_rsv_window_lock; - - ext4_mb_discard_inode_preallocations(inode); - - if (!block_i) - return; - - rsv = &block_i->rsv_window_node; - if (!rsv_is_empty(&rsv->rsv_window)) { - spin_lock(rsv_lock); - if (!rsv_is_empty(&rsv->rsv_window)) - rsv_window_remove(inode->i_sb, rsv); - spin_unlock(rsv_lock); - } -} /** * ext4_free_blocks_sb() -- Free given blocks and update quota @@ -648,6 +356,13 @@ void ext4_discard_reservation(struct inode *inode) * @block: start physcial block to free * @count: number of blocks to free * @pdquot_freed_blocks: pointer to quota + * + * XXX This function is only used by the on-line resizing code, which + * should probably be fixed up to call the mballoc variant. There + * this needs to be cleaned up later; in fact, I'm not convinced this + * is 100% correct in the face of the mballoc code. The online resizing + * code needs to be fixed up to more tightly (and correctly) interlock + * with the mballoc code. */ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, ext4_fsblk_t block, unsigned long count, @@ -659,8 +374,8 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, ext4_grpblk_t bit; unsigned long i; unsigned long overflow; - struct ext4_group_desc * desc; - struct ext4_super_block * es; + struct ext4_group_desc *desc; + struct ext4_super_block *es; struct ext4_sb_info *sbi; int err = 0, ret; ext4_grpblk_t group_freed; @@ -671,13 +386,13 @@ void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, if (block < le32_to_cpu(es->s_first_data_block) || block + count < block || block + count > ext4_blocks_count(es)) { - ext4_error (sb, "ext4_free_blocks", - "Freeing blocks not in datazone - " - "block = %llu, count = %lu", block, count); + ext4_error(sb, "ext4_free_blocks", + "Freeing blocks not in datazone - " + "block = %llu, count = %lu", block, count); goto error_return; } - ext4_debug ("freeing block(s) %llu-%llu\n", block, block + count - 1); + ext4_debug("freeing block(s) %llu-%llu\n", block, block + count - 1); do_more: overflow = 0; @@ -694,7 +409,7 @@ do_more: bitmap_bh = ext4_read_block_bitmap(sb, block_group); if (!bitmap_bh) goto error_return; - desc = ext4_get_group_desc (sb, block_group, &gd_bh); + desc = ext4_get_group_desc(sb, block_group, &gd_bh); if (!desc) goto error_return; @@ -703,10 +418,10 @@ do_more: in_range(block, ext4_inode_table(sb, desc), sbi->s_itb_per_group) || in_range(block + count - 1, ext4_inode_table(sb, desc), sbi->s_itb_per_group)) { - ext4_error (sb, "ext4_free_blocks", - "Freeing blocks in system zones - " - "Block = %llu, count = %lu", - block, count); + ext4_error(sb, "ext4_free_blocks", + "Freeing blocks in system zones - " + "Block = %llu, count = %lu", + block, count); goto error_return; } @@ -848,759 +563,71 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int metadata) { - struct super_block * sb; + struct super_block *sb; unsigned long dquot_freed_blocks; /* this isn't the right place to decide whether block is metadata * inode.c/extents.c knows better, but for safety ... */ - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode) || - ext4_should_journal_data(inode)) + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + metadata = 1; + + /* We need to make sure we don't reuse + * block released untill the transaction commit. + * writeback mode have weak data consistency so + * don't force data as metadata when freeing block + * for writeback mode. + */ + if (metadata == 0 && !ext4_should_writeback_data(inode)) metadata = 1; sb = inode->i_sb; - if (!test_opt(sb, MBALLOC) || !EXT4_SB(sb)->s_group_info) - ext4_free_blocks_sb(handle, sb, block, count, - &dquot_freed_blocks); - else - ext4_mb_free_blocks(handle, inode, block, count, - metadata, &dquot_freed_blocks); + ext4_mb_free_blocks(handle, inode, block, count, + metadata, &dquot_freed_blocks); if (dquot_freed_blocks) DQUOT_FREE_BLOCK(inode, dquot_freed_blocks); return; } -/** - * ext4_test_allocatable() - * @nr: given allocation block group - * @bh: bufferhead contains the bitmap of the given block group - * - * For ext4 allocations, we must not reuse any blocks which are - * allocated in the bitmap buffer's "last committed data" copy. This - * prevents deletes from freeing up the page for reuse until we have - * committed the delete transaction. - * - * If we didn't do this, then deleting something and reallocating it as - * data would allow the old block to be overwritten before the - * transaction committed (because we force data to disk before commit). - * This would lead to corruption if we crashed between overwriting the - * data and committing the delete. - * - * @@@ We may want to make this allocation behaviour conditional on - * data-writes at some point, and disable it for metadata allocations or - * sync-data inodes. - */ -static int ext4_test_allocatable(ext4_grpblk_t nr, struct buffer_head *bh) +int ext4_claim_free_blocks(struct ext4_sb_info *sbi, + s64 nblocks) { - int ret; - struct journal_head *jh = bh2jh(bh); - - if (ext4_test_bit(nr, bh->b_data)) - return 0; - - jbd_lock_bh_state(bh); - if (!jh->b_committed_data) - ret = 1; - else - ret = !ext4_test_bit(nr, jh->b_committed_data); - jbd_unlock_bh_state(bh); - return ret; -} + s64 free_blocks, dirty_blocks; + s64 root_blocks = 0; + struct percpu_counter *fbc = &sbi->s_freeblocks_counter; + struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter; -/** - * bitmap_search_next_usable_block() - * @start: the starting block (group relative) of the search - * @bh: bufferhead contains the block group bitmap - * @maxblocks: the ending block (group relative) of the reservation - * - * The bitmap search --- search forward alternately through the actual - * bitmap on disk and the last-committed copy in journal, until we find a - * bit free in both bitmaps. - */ -static ext4_grpblk_t -bitmap_search_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh, - ext4_grpblk_t maxblocks) -{ - ext4_grpblk_t next; - struct journal_head *jh = bh2jh(bh); - - while (start < maxblocks) { - next = ext4_find_next_zero_bit(bh->b_data, maxblocks, start); - if (next >= maxblocks) - return -1; - if (ext4_test_allocatable(next, bh)) - return next; - jbd_lock_bh_state(bh); - if (jh->b_committed_data) - start = ext4_find_next_zero_bit(jh->b_committed_data, - maxblocks, next); - jbd_unlock_bh_state(bh); - } - return -1; -} + free_blocks = percpu_counter_read_positive(fbc); + dirty_blocks = percpu_counter_read_positive(dbc); -/** - * find_next_usable_block() - * @start: the starting block (group relative) to find next - * allocatable block in bitmap. - * @bh: bufferhead contains the block group bitmap - * @maxblocks: the ending block (group relative) for the search - * - * Find an allocatable block in a bitmap. We honor both the bitmap and - * its last-committed copy (if that exists), and perform the "most - * appropriate allocation" algorithm of looking for a free block near - * the initial goal; then for a free byte somewhere in the bitmap; then - * for any free bit in the bitmap. - */ -static ext4_grpblk_t -find_next_usable_block(ext4_grpblk_t start, struct buffer_head *bh, - ext4_grpblk_t maxblocks) -{ - ext4_grpblk_t here, next; - char *p, *r; - - if (start > 0) { - /* - * The goal was occupied; search forward for a free - * block within the next XX blocks. - * - * end_goal is more or less random, but it has to be - * less than EXT4_BLOCKS_PER_GROUP. Aligning up to the - * next 64-bit boundary is simple.. - */ - ext4_grpblk_t end_goal = (start + 63) & ~63; - if (end_goal > maxblocks) - end_goal = maxblocks; - here = ext4_find_next_zero_bit(bh->b_data, end_goal, start); - if (here < end_goal && ext4_test_allocatable(here, bh)) - return here; - ext4_debug("Bit not found near goal\n"); - } - - here = start; - if (here < 0) - here = 0; - - p = ((char *)bh->b_data) + (here >> 3); - r = memscan(p, 0, ((maxblocks + 7) >> 3) - (here >> 3)); - next = (r - ((char *)bh->b_data)) << 3; - - if (next < maxblocks && next >= start && ext4_test_allocatable(next, bh)) - return next; - - /* - * The bitmap search --- search forward alternately through the actual - * bitmap and the last-committed copy until we find a bit free in - * both - */ - here = bitmap_search_next_usable_block(here, bh, maxblocks); - return here; -} - -/** - * claim_block() - * @block: the free block (group relative) to allocate - * @bh: the bufferhead containts the block group bitmap - * - * We think we can allocate this block in this bitmap. Try to set the bit. - * If that succeeds then check that nobody has allocated and then freed the - * block since we saw that is was not marked in b_committed_data. If it _was_ - * allocated and freed then clear the bit in the bitmap again and return - * zero (failure). - */ -static inline int -claim_block(spinlock_t *lock, ext4_grpblk_t block, struct buffer_head *bh) -{ - struct journal_head *jh = bh2jh(bh); - int ret; - - if (ext4_set_bit_atomic(lock, block, bh->b_data)) - return 0; - jbd_lock_bh_state(bh); - if (jh->b_committed_data && ext4_test_bit(block,jh->b_committed_data)) { - ext4_clear_bit_atomic(lock, block, bh->b_data); - ret = 0; - } else { - ret = 1; - } - jbd_unlock_bh_state(bh); - return ret; -} - -/** - * ext4_try_to_allocate() - * @sb: superblock - * @handle: handle to this transaction - * @group: given allocation block group - * @bitmap_bh: bufferhead holds the block bitmap - * @grp_goal: given target block within the group - * @count: target number of blocks to allocate - * @my_rsv: reservation window - * - * Attempt to allocate blocks within a give range. Set the range of allocation - * first, then find the first free bit(s) from the bitmap (within the range), - * and at last, allocate the blocks by claiming the found free bit as allocated. - * - * To set the range of this allocation: - * if there is a reservation window, only try to allocate block(s) from the - * file's own reservation window; - * Otherwise, the allocation range starts from the give goal block, ends at - * the block group's last block. - * - * If we failed to allocate the desired block then we may end up crossing to a - * new bitmap. In that case we must release write access to the old one via - * ext4_journal_release_buffer(), else we'll run out of credits. - */ -static ext4_grpblk_t -ext4_try_to_allocate(struct super_block *sb, handle_t *handle, - ext4_group_t group, struct buffer_head *bitmap_bh, - ext4_grpblk_t grp_goal, unsigned long *count, - struct ext4_reserve_window *my_rsv) -{ - ext4_fsblk_t group_first_block; - ext4_grpblk_t start, end; - unsigned long num = 0; - - /* we do allocation within the reservation window if we have a window */ - if (my_rsv) { - group_first_block = ext4_group_first_block_no(sb, group); - if (my_rsv->_rsv_start >= group_first_block) - start = my_rsv->_rsv_start - group_first_block; - else - /* reservation window cross group boundary */ - start = 0; - end = my_rsv->_rsv_end - group_first_block + 1; - if (end > EXT4_BLOCKS_PER_GROUP(sb)) - /* reservation window crosses group boundary */ - end = EXT4_BLOCKS_PER_GROUP(sb); - if ((start <= grp_goal) && (grp_goal < end)) - start = grp_goal; - else - grp_goal = -1; - } else { - if (grp_goal > 0) - start = grp_goal; - else - start = 0; - end = EXT4_BLOCKS_PER_GROUP(sb); - } - - BUG_ON(start > EXT4_BLOCKS_PER_GROUP(sb)); - -repeat: - if (grp_goal < 0 || !ext4_test_allocatable(grp_goal, bitmap_bh)) { - grp_goal = find_next_usable_block(start, bitmap_bh, end); - if (grp_goal < 0) - goto fail_access; - if (!my_rsv) { - int i; - - for (i = 0; i < 7 && grp_goal > start && - ext4_test_allocatable(grp_goal - 1, - bitmap_bh); - i++, grp_goal--) - ; - } - } - start = grp_goal; - - if (!claim_block(sb_bgl_lock(EXT4_SB(sb), group), - grp_goal, bitmap_bh)) { - /* - * The block was allocated by another thread, or it was - * allocated and then freed by another thread - */ - start++; - grp_goal++; - if (start >= end) - goto fail_access; - goto repeat; - } - num++; - grp_goal++; - while (num < *count && grp_goal < end - && ext4_test_allocatable(grp_goal, bitmap_bh) - && claim_block(sb_bgl_lock(EXT4_SB(sb), group), - grp_goal, bitmap_bh)) { - num++; - grp_goal++; - } - *count = num; - return grp_goal - num; -fail_access: - *count = num; - return -1; -} - -/** - * find_next_reservable_window(): - * find a reservable space within the given range. - * It does not allocate the reservation window for now: - * alloc_new_reservation() will do the work later. - * - * @search_head: the head of the searching list; - * This is not necessarily the list head of the whole filesystem - * - * We have both head and start_block to assist the search - * for the reservable space. The list starts from head, - * but we will shift to the place where start_block is, - * then start from there, when looking for a reservable space. - * - * @size: the target new reservation window size - * - * @group_first_block: the first block we consider to start - * the real search from - * - * @last_block: - * the maximum block number that our goal reservable space - * could start from. This is normally the last block in this - * group. The search will end when we found the start of next - * possible reservable space is out of this boundary. - * This could handle the cross boundary reservation window - * request. - * - * basically we search from the given range, rather than the whole - * reservation double linked list, (start_block, last_block) - * to find a free region that is of my size and has not - * been reserved. - * - */ -static int find_next_reservable_window( - struct ext4_reserve_window_node *search_head, - struct ext4_reserve_window_node *my_rsv, - struct super_block * sb, - ext4_fsblk_t start_block, - ext4_fsblk_t last_block) -{ - struct rb_node *next; - struct ext4_reserve_window_node *rsv, *prev; - ext4_fsblk_t cur; - int size = my_rsv->rsv_goal_size; - - /* TODO: make the start of the reservation window byte-aligned */ - /* cur = *start_block & ~7;*/ - cur = start_block; - rsv = search_head; - if (!rsv) - return -1; - - while (1) { - if (cur <= rsv->rsv_end) - cur = rsv->rsv_end + 1; - - /* TODO? - * in the case we could not find a reservable space - * that is what is expected, during the re-search, we could - * remember what's the largest reservable space we could have - * and return that one. - * - * For now it will fail if we could not find the reservable - * space with expected-size (or more)... - */ - if (cur > last_block) - return -1; /* fail */ - - prev = rsv; - next = rb_next(&rsv->rsv_node); - rsv = rb_entry(next,struct ext4_reserve_window_node,rsv_node); + if (!capable(CAP_SYS_RESOURCE) && + sbi->s_resuid != current->fsuid && + (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid))) + root_blocks = ext4_r_blocks_count(sbi->s_es); - /* - * Reached the last reservation, we can just append to the - * previous one. - */ - if (!next) - break; - - if (cur + size <= rsv->rsv_start) { - /* - * Found a reserveable space big enough. We could - * have a reservation across the group boundary here - */ - break; + if (free_blocks - (nblocks + root_blocks + dirty_blocks) < + EXT4_FREEBLOCKS_WATERMARK) { + free_blocks = percpu_counter_sum(fbc); + dirty_blocks = percpu_counter_sum(dbc); + if (dirty_blocks < 0) { + printk(KERN_CRIT "Dirty block accounting " + "went wrong %lld\n", + dirty_blocks); } } - /* - * we come here either : - * when we reach the end of the whole list, - * and there is empty reservable space after last entry in the list. - * append it to the end of the list. - * - * or we found one reservable space in the middle of the list, - * return the reservation window that we could append to. - * succeed. + /* Check whether we have space after + * accounting for current dirty blocks */ + if (free_blocks < ((root_blocks + nblocks) + dirty_blocks)) + /* we don't have free space */ + return -ENOSPC; - if ((prev != my_rsv) && (!rsv_is_empty(&my_rsv->rsv_window))) - rsv_window_remove(sb, my_rsv); - - /* - * Let's book the whole avaliable window for now. We will check the - * disk bitmap later and then, if there are free blocks then we adjust - * the window size if it's larger than requested. - * Otherwise, we will remove this node from the tree next time - * call find_next_reservable_window. - */ - my_rsv->rsv_start = cur; - my_rsv->rsv_end = cur + size - 1; - my_rsv->rsv_alloc_hit = 0; - - if (prev != my_rsv) - ext4_rsv_window_add(sb, my_rsv); - + /* Add the blocks to nblocks */ + percpu_counter_add(dbc, nblocks); return 0; } /** - * alloc_new_reservation()--allocate a new reservation window - * - * To make a new reservation, we search part of the filesystem - * reservation list (the list that inside the group). We try to - * allocate a new reservation window near the allocation goal, - * or the beginning of the group, if there is no goal. - * - * We first find a reservable space after the goal, then from - * there, we check the bitmap for the first free block after - * it. If there is no free block until the end of group, then the - * whole group is full, we failed. Otherwise, check if the free - * block is inside the expected reservable space, if so, we - * succeed. - * If the first free block is outside the reservable space, then - * start from the first free block, we search for next available - * space, and go on. - * - * on succeed, a new reservation will be found and inserted into the list - * It contains at least one free block, and it does not overlap with other - * reservation windows. - * - * failed: we failed to find a reservation window in this group - * - * @rsv: the reservation - * - * @grp_goal: The goal (group-relative). It is where the search for a - * free reservable space should start from. - * if we have a grp_goal(grp_goal >0 ), then start from there, - * no grp_goal(grp_goal = -1), we start from the first block - * of the group. - * - * @sb: the super block - * @group: the group we are trying to allocate in - * @bitmap_bh: the block group block bitmap - * - */ -static int alloc_new_reservation(struct ext4_reserve_window_node *my_rsv, - ext4_grpblk_t grp_goal, struct super_block *sb, - ext4_group_t group, struct buffer_head *bitmap_bh) -{ - struct ext4_reserve_window_node *search_head; - ext4_fsblk_t group_first_block, group_end_block, start_block; - ext4_grpblk_t first_free_block; - struct rb_root *fs_rsv_root = &EXT4_SB(sb)->s_rsv_window_root; - unsigned long size; - int ret; - spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock; - - group_first_block = ext4_group_first_block_no(sb, group); - group_end_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); - - if (grp_goal < 0) - start_block = group_first_block; - else - start_block = grp_goal + group_first_block; - - size = my_rsv->rsv_goal_size; - - if (!rsv_is_empty(&my_rsv->rsv_window)) { - /* - * if the old reservation is cross group boundary - * and if the goal is inside the old reservation window, - * we will come here when we just failed to allocate from - * the first part of the window. We still have another part - * that belongs to the next group. In this case, there is no - * point to discard our window and try to allocate a new one - * in this group(which will fail). we should - * keep the reservation window, just simply move on. - * - * Maybe we could shift the start block of the reservation - * window to the first block of next group. - */ - - if ((my_rsv->rsv_start <= group_end_block) && - (my_rsv->rsv_end > group_end_block) && - (start_block >= my_rsv->rsv_start)) - return -1; - - if ((my_rsv->rsv_alloc_hit > - (my_rsv->rsv_end - my_rsv->rsv_start + 1) / 2)) { - /* - * if the previously allocation hit ratio is - * greater than 1/2, then we double the size of - * the reservation window the next time, - * otherwise we keep the same size window - */ - size = size * 2; - if (size > EXT4_MAX_RESERVE_BLOCKS) - size = EXT4_MAX_RESERVE_BLOCKS; - my_rsv->rsv_goal_size= size; - } - } - - spin_lock(rsv_lock); - /* - * shift the search start to the window near the goal block - */ - search_head = search_reserve_window(fs_rsv_root, start_block); - - /* - * find_next_reservable_window() simply finds a reservable window - * inside the given range(start_block, group_end_block). - * - * To make sure the reservation window has a free bit inside it, we - * need to check the bitmap after we found a reservable window. - */ -retry: - ret = find_next_reservable_window(search_head, my_rsv, sb, - start_block, group_end_block); - - if (ret == -1) { - if (!rsv_is_empty(&my_rsv->rsv_window)) - rsv_window_remove(sb, my_rsv); - spin_unlock(rsv_lock); - return -1; - } - - /* - * On success, find_next_reservable_window() returns the - * reservation window where there is a reservable space after it. - * Before we reserve this reservable space, we need - * to make sure there is at least a free block inside this region. - * - * searching the first free bit on the block bitmap and copy of - * last committed bitmap alternatively, until we found a allocatable - * block. Search start from the start block of the reservable space - * we just found. - */ - spin_unlock(rsv_lock); - first_free_block = bitmap_search_next_usable_block( - my_rsv->rsv_start - group_first_block, - bitmap_bh, group_end_block - group_first_block + 1); - - if (first_free_block < 0) { - /* - * no free block left on the bitmap, no point - * to reserve the space. return failed. - */ - spin_lock(rsv_lock); - if (!rsv_is_empty(&my_rsv->rsv_window)) - rsv_window_remove(sb, my_rsv); - spin_unlock(rsv_lock); - return -1; /* failed */ - } - - start_block = first_free_block + group_first_block; - /* - * check if the first free block is within the - * free space we just reserved - */ - if (start_block >= my_rsv->rsv_start && start_block <= my_rsv->rsv_end) - return 0; /* success */ - /* - * if the first free bit we found is out of the reservable space - * continue search for next reservable space, - * start from where the free block is, - * we also shift the list head to where we stopped last time - */ - search_head = my_rsv; - spin_lock(rsv_lock); - goto retry; -} - -/** - * try_to_extend_reservation() - * @my_rsv: given reservation window - * @sb: super block - * @size: the delta to extend - * - * Attempt to expand the reservation window large enough to have - * required number of free blocks - * - * Since ext4_try_to_allocate() will always allocate blocks within - * the reservation window range, if the window size is too small, - * multiple blocks allocation has to stop at the end of the reservation - * window. To make this more efficient, given the total number of - * blocks needed and the current size of the window, we try to - * expand the reservation window size if necessary on a best-effort - * basis before ext4_new_blocks() tries to allocate blocks, - */ -static void try_to_extend_reservation(struct ext4_reserve_window_node *my_rsv, - struct super_block *sb, int size) -{ - struct ext4_reserve_window_node *next_rsv; - struct rb_node *next; - spinlock_t *rsv_lock = &EXT4_SB(sb)->s_rsv_window_lock; - - if (!spin_trylock(rsv_lock)) - return; - - next = rb_next(&my_rsv->rsv_node); - - if (!next) - my_rsv->rsv_end += size; - else { - next_rsv = rb_entry(next, struct ext4_reserve_window_node, rsv_node); - - if ((next_rsv->rsv_start - my_rsv->rsv_end - 1) >= size) - my_rsv->rsv_end += size; - else - my_rsv->rsv_end = next_rsv->rsv_start - 1; - } - spin_unlock(rsv_lock); -} - -/** - * ext4_try_to_allocate_with_rsv() - * @sb: superblock - * @handle: handle to this transaction - * @group: given allocation block group - * @bitmap_bh: bufferhead holds the block bitmap - * @grp_goal: given target block within the group - * @count: target number of blocks to allocate - * @my_rsv: reservation window - * @errp: pointer to store the error code - * - * This is the main function used to allocate a new block and its reservation - * window. - * - * Each time when a new block allocation is need, first try to allocate from - * its own reservation. If it does not have a reservation window, instead of - * looking for a free bit on bitmap first, then look up the reservation list to - * see if it is inside somebody else's reservation window, we try to allocate a - * reservation window for it starting from the goal first. Then do the block - * allocation within the reservation window. - * - * This will avoid keeping on searching the reservation list again and - * again when somebody is looking for a free block (without - * reservation), and there are lots of free blocks, but they are all - * being reserved. - * - * We use a red-black tree for the per-filesystem reservation list. - * - */ -static ext4_grpblk_t -ext4_try_to_allocate_with_rsv(struct super_block *sb, handle_t *handle, - ext4_group_t group, struct buffer_head *bitmap_bh, - ext4_grpblk_t grp_goal, - struct ext4_reserve_window_node * my_rsv, - unsigned long *count, int *errp) -{ - ext4_fsblk_t group_first_block, group_last_block; - ext4_grpblk_t ret = 0; - int fatal; - unsigned long num = *count; - - *errp = 0; - - /* - * Make sure we use undo access for the bitmap, because it is critical - * that we do the frozen_data COW on bitmap buffers in all cases even - * if the buffer is in BJ_Forget state in the committing transaction. - */ - BUFFER_TRACE(bitmap_bh, "get undo access for new block"); - fatal = ext4_journal_get_undo_access(handle, bitmap_bh); - if (fatal) { - *errp = fatal; - return -1; - } - - /* - * we don't deal with reservation when - * filesystem is mounted without reservation - * or the file is not a regular file - * or last attempt to allocate a block with reservation turned on failed - */ - if (my_rsv == NULL ) { - ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh, - grp_goal, count, NULL); - goto out; - } - /* - * grp_goal is a group relative block number (if there is a goal) - * 0 <= grp_goal < EXT4_BLOCKS_PER_GROUP(sb) - * first block is a filesystem wide block number - * first block is the block number of the first block in this group - */ - group_first_block = ext4_group_first_block_no(sb, group); - group_last_block = group_first_block + (EXT4_BLOCKS_PER_GROUP(sb) - 1); - - /* - * Basically we will allocate a new block from inode's reservation - * window. - * - * We need to allocate a new reservation window, if: - * a) inode does not have a reservation window; or - * b) last attempt to allocate a block from existing reservation - * failed; or - * c) we come here with a goal and with a reservation window - * - * We do not need to allocate a new reservation window if we come here - * at the beginning with a goal and the goal is inside the window, or - * we don't have a goal but already have a reservation window. - * then we could go to allocate from the reservation window directly. - */ - while (1) { - if (rsv_is_empty(&my_rsv->rsv_window) || (ret < 0) || - !goal_in_my_reservation(&my_rsv->rsv_window, - grp_goal, group, sb)) { - if (my_rsv->rsv_goal_size < *count) - my_rsv->rsv_goal_size = *count; - ret = alloc_new_reservation(my_rsv, grp_goal, sb, - group, bitmap_bh); - if (ret < 0) - break; /* failed */ - - if (!goal_in_my_reservation(&my_rsv->rsv_window, - grp_goal, group, sb)) - grp_goal = -1; - } else if (grp_goal >= 0) { - int curr = my_rsv->rsv_end - - (grp_goal + group_first_block) + 1; - - if (curr < *count) - try_to_extend_reservation(my_rsv, sb, - *count - curr); - } - - if ((my_rsv->rsv_start > group_last_block) || - (my_rsv->rsv_end < group_first_block)) { - rsv_window_dump(&EXT4_SB(sb)->s_rsv_window_root, 1); - BUG(); - } - ret = ext4_try_to_allocate(sb, handle, group, bitmap_bh, - grp_goal, &num, &my_rsv->rsv_window); - if (ret >= 0) { - my_rsv->rsv_alloc_hit += num; - *count = num; - break; /* succeed */ - } - num = *count; - } -out: - if (ret >= 0) { - BUFFER_TRACE(bitmap_bh, "journal_dirty_metadata for " - "bitmap block"); - fatal = ext4_journal_dirty_metadata(handle, bitmap_bh); - if (fatal) { - *errp = fatal; - return -1; - } - return ret; - } - - BUFFER_TRACE(bitmap_bh, "journal_release_buffer"); - ext4_journal_release_buffer(handle, bitmap_bh); - return ret; -} - -/** * ext4_has_free_blocks() * @sbi: in-core super block structure. * @nblocks: number of neeed blocks @@ -1610,26 +637,34 @@ out: * On success, return nblocks */ ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, - ext4_fsblk_t nblocks) + s64 nblocks) { - ext4_fsblk_t free_blocks; - ext4_fsblk_t root_blocks = 0; + s64 free_blocks, dirty_blocks; + s64 root_blocks = 0; + struct percpu_counter *fbc = &sbi->s_freeblocks_counter; + struct percpu_counter *dbc = &sbi->s_dirtyblocks_counter; - free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + free_blocks = percpu_counter_read_positive(fbc); + dirty_blocks = percpu_counter_read_positive(dbc); if (!capable(CAP_SYS_RESOURCE) && sbi->s_resuid != current->fsuid && (sbi->s_resgid == 0 || !in_group_p(sbi->s_resgid))) root_blocks = ext4_r_blocks_count(sbi->s_es); -#ifdef CONFIG_SMP - if (free_blocks - root_blocks < FBC_BATCH) - free_blocks = - percpu_counter_sum_and_set(&sbi->s_freeblocks_counter); -#endif - if (free_blocks - root_blocks < nblocks) - return free_blocks - root_blocks; + + if (free_blocks - (nblocks + root_blocks + dirty_blocks) < + EXT4_FREEBLOCKS_WATERMARK) { + free_blocks = percpu_counter_sum(fbc); + dirty_blocks = percpu_counter_sum(dbc); + } + if (free_blocks <= (root_blocks + dirty_blocks)) + /* we don't have free space */ + return 0; + + if (free_blocks - (root_blocks + dirty_blocks) < nblocks) + return free_blocks - (root_blocks + dirty_blocks); return nblocks; - } +} /** @@ -1654,303 +689,6 @@ int ext4_should_retry_alloc(struct super_block *sb, int *retries) return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal); } -/** - * ext4_old_new_blocks() -- core block bitmap based block allocation function - * - * @handle: handle to this transaction - * @inode: file inode - * @goal: given target block(filesystem wide) - * @count: target number of blocks to allocate - * @errp: error code - * - * ext4_old_new_blocks uses a goal block to assist allocation and look up - * the block bitmap directly to do block allocation. It tries to - * allocate block(s) from the block group contains the goal block first. If - * that fails, it will try to allocate block(s) from other block groups - * without any specific goal block. - * - * This function is called when -o nomballoc mount option is enabled - * - */ -ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned long *count, int *errp) -{ - struct buffer_head *bitmap_bh = NULL; - struct buffer_head *gdp_bh; - ext4_group_t group_no; - ext4_group_t goal_group; - ext4_grpblk_t grp_target_blk; /* blockgroup relative goal block */ - ext4_grpblk_t grp_alloc_blk; /* blockgroup-relative allocated block*/ - ext4_fsblk_t ret_block; /* filesyetem-wide allocated block */ - ext4_group_t bgi; /* blockgroup iteration index */ - int fatal = 0, err; - int performed_allocation = 0; - ext4_grpblk_t free_blocks; /* number of free blocks in a group */ - struct super_block *sb; - struct ext4_group_desc *gdp; - struct ext4_super_block *es; - struct ext4_sb_info *sbi; - struct ext4_reserve_window_node *my_rsv = NULL; - struct ext4_block_alloc_info *block_i; - unsigned short windowsz = 0; - ext4_group_t ngroups; - unsigned long num = *count; - - sb = inode->i_sb; - if (!sb) { - *errp = -ENODEV; - printk("ext4_new_block: nonexistent device"); - return 0; - } - - sbi = EXT4_SB(sb); - if (!EXT4_I(inode)->i_delalloc_reserved_flag) { - /* - * With delalloc we already reserved the blocks - */ - *count = ext4_has_free_blocks(sbi, *count); - } - if (*count == 0) { - *errp = -ENOSPC; - return 0; /*return with ENOSPC error */ - } - num = *count; - - /* - * Check quota for allocation of this block. - */ - if (DQUOT_ALLOC_BLOCK(inode, num)) { - *errp = -EDQUOT; - return 0; - } - - sbi = EXT4_SB(sb); - es = EXT4_SB(sb)->s_es; - ext4_debug("goal=%llu.\n", goal); - /* - * Allocate a block from reservation only when - * filesystem is mounted with reservation(default,-o reservation), and - * it's a regular file, and - * the desired window size is greater than 0 (One could use ioctl - * command EXT4_IOC_SETRSVSZ to set the window size to 0 to turn off - * reservation on that particular file) - */ - block_i = EXT4_I(inode)->i_block_alloc_info; - if (block_i && ((windowsz = block_i->rsv_window_node.rsv_goal_size) > 0)) - my_rsv = &block_i->rsv_window_node; - - /* - * First, test whether the goal block is free. - */ - if (goal < le32_to_cpu(es->s_first_data_block) || - goal >= ext4_blocks_count(es)) - goal = le32_to_cpu(es->s_first_data_block); - ext4_get_group_no_and_offset(sb, goal, &group_no, &grp_target_blk); - goal_group = group_no; -retry_alloc: - gdp = ext4_get_group_desc(sb, group_no, &gdp_bh); - if (!gdp) - goto io_error; - - free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); - /* - * if there is not enough free blocks to make a new resevation - * turn off reservation for this allocation - */ - if (my_rsv && (free_blocks < windowsz) - && (rsv_is_empty(&my_rsv->rsv_window))) - my_rsv = NULL; - - if (free_blocks > 0) { - bitmap_bh = ext4_read_block_bitmap(sb, group_no); - if (!bitmap_bh) - goto io_error; - grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle, - group_no, bitmap_bh, grp_target_blk, - my_rsv, &num, &fatal); - if (fatal) - goto out; - if (grp_alloc_blk >= 0) - goto allocated; - } - - ngroups = EXT4_SB(sb)->s_groups_count; - smp_rmb(); - - /* - * Now search the rest of the groups. We assume that - * group_no and gdp correctly point to the last group visited. - */ - for (bgi = 0; bgi < ngroups; bgi++) { - group_no++; - if (group_no >= ngroups) - group_no = 0; - gdp = ext4_get_group_desc(sb, group_no, &gdp_bh); - if (!gdp) - goto io_error; - free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); - /* - * skip this group if the number of - * free blocks is less than half of the reservation - * window size. - */ - if (free_blocks <= (windowsz/2)) - continue; - - brelse(bitmap_bh); - bitmap_bh = ext4_read_block_bitmap(sb, group_no); - if (!bitmap_bh) - goto io_error; - /* - * try to allocate block(s) from this group, without a goal(-1). - */ - grp_alloc_blk = ext4_try_to_allocate_with_rsv(sb, handle, - group_no, bitmap_bh, -1, my_rsv, - &num, &fatal); - if (fatal) - goto out; - if (grp_alloc_blk >= 0) - goto allocated; - } - /* - * We may end up a bogus ealier ENOSPC error due to - * filesystem is "full" of reservations, but - * there maybe indeed free blocks avaliable on disk - * In this case, we just forget about the reservations - * just do block allocation as without reservations. - */ - if (my_rsv) { - my_rsv = NULL; - windowsz = 0; - group_no = goal_group; - goto retry_alloc; - } - /* No space left on the device */ - *errp = -ENOSPC; - goto out; - -allocated: - - ext4_debug("using block group %lu(%d)\n", - group_no, gdp->bg_free_blocks_count); - - BUFFER_TRACE(gdp_bh, "get_write_access"); - fatal = ext4_journal_get_write_access(handle, gdp_bh); - if (fatal) - goto out; - - ret_block = grp_alloc_blk + ext4_group_first_block_no(sb, group_no); - - if (in_range(ext4_block_bitmap(sb, gdp), ret_block, num) || - in_range(ext4_inode_bitmap(sb, gdp), ret_block, num) || - in_range(ret_block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group) || - in_range(ret_block + num - 1, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { - ext4_error(sb, "ext4_new_block", - "Allocating block in system zone - " - "blocks from %llu, length %lu", - ret_block, num); - /* - * claim_block marked the blocks we allocated - * as in use. So we may want to selectively - * mark some of the blocks as free - */ - goto retry_alloc; - } - - performed_allocation = 1; - -#ifdef CONFIG_JBD2_DEBUG - { - struct buffer_head *debug_bh; - - /* Record bitmap buffer state in the newly allocated block */ - debug_bh = sb_find_get_block(sb, ret_block); - if (debug_bh) { - BUFFER_TRACE(debug_bh, "state when allocated"); - BUFFER_TRACE2(debug_bh, bitmap_bh, "bitmap state"); - brelse(debug_bh); - } - } - jbd_lock_bh_state(bitmap_bh); - spin_lock(sb_bgl_lock(sbi, group_no)); - if (buffer_jbd(bitmap_bh) && bh2jh(bitmap_bh)->b_committed_data) { - int i; - - for (i = 0; i < num; i++) { - if (ext4_test_bit(grp_alloc_blk+i, - bh2jh(bitmap_bh)->b_committed_data)) { - printk("%s: block was unexpectedly set in " - "b_committed_data\n", __func__); - } - } - } - ext4_debug("found bit %d\n", grp_alloc_blk); - spin_unlock(sb_bgl_lock(sbi, group_no)); - jbd_unlock_bh_state(bitmap_bh); -#endif - - if (ret_block + num - 1 >= ext4_blocks_count(es)) { - ext4_error(sb, "ext4_new_block", - "block(%llu) >= blocks count(%llu) - " - "block_group = %lu, es == %p ", ret_block, - ext4_blocks_count(es), group_no, es); - goto out; - } - - /* - * It is up to the caller to add the new buffer to a journal - * list of some description. We don't know in advance whether - * the caller wants to use it as metadata or data. - */ - spin_lock(sb_bgl_lock(sbi, group_no)); - if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) - gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - le16_add_cpu(&gdp->bg_free_blocks_count, -num); - gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp); - spin_unlock(sb_bgl_lock(sbi, group_no)); - if (!EXT4_I(inode)->i_delalloc_reserved_flag) - percpu_counter_sub(&sbi->s_freeblocks_counter, num); - - if (sbi->s_log_groups_per_flex) { - ext4_group_t flex_group = ext4_flex_group(sbi, group_no); - spin_lock(sb_bgl_lock(sbi, flex_group)); - sbi->s_flex_groups[flex_group].free_blocks -= num; - spin_unlock(sb_bgl_lock(sbi, flex_group)); - } - - BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); - err = ext4_journal_dirty_metadata(handle, gdp_bh); - if (!fatal) - fatal = err; - - sb->s_dirt = 1; - if (fatal) - goto out; - - *errp = 0; - brelse(bitmap_bh); - DQUOT_FREE_BLOCK(inode, *count-num); - *count = num; - return ret_block; - -io_error: - *errp = -EIO; -out: - if (fatal) { - *errp = fatal; - ext4_std_error(sb, fatal); - } - /* - * Undo the block allocation - */ - if (!performed_allocation) - DQUOT_FREE_BLOCK(inode, *count); - brelse(bitmap_bh); - return 0; -} - #define EXT4_META_BLOCK 0x1 static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, @@ -1960,10 +698,6 @@ static ext4_fsblk_t do_blk_alloc(handle_t *handle, struct inode *inode, struct ext4_allocation_request ar; ext4_fsblk_t ret; - if (!test_opt(inode->i_sb, MBALLOC)) { - return ext4_old_new_blocks(handle, inode, goal, count, errp); - } - memset(&ar, 0, sizeof(ar)); /* Fill with neighbour allocated blocks */ @@ -2005,7 +739,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, /* * Account for the allocated meta blocks */ - if (!(*errp)) { + if (!(*errp) && EXT4_I(inode)->i_delalloc_reserved_flag) { spin_lock(&EXT4_I(inode)->i_block_reservation_lock); EXT4_I(inode)->i_allocated_meta_blocks += *count; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); @@ -2090,10 +824,9 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) bitmap_count += x; } brelse(bitmap_bh); - printk("ext4_count_free_blocks: stored = %llu" - ", computed = %llu, %llu\n", - ext4_free_blocks_count(es), - desc_count, bitmap_count); + printk(KERN_DEBUG "ext4_count_free_blocks: stored = %llu" + ", computed = %llu, %llu\n", ext4_free_blocks_count(es), + desc_count, bitmap_count); return bitmap_count; #else desc_count = 0; @@ -2180,8 +913,9 @@ unsigned long ext4_bg_num_gdb(struct super_block *sb, ext4_group_t group) if (!EXT4_HAS_INCOMPAT_FEATURE(sb,EXT4_FEATURE_INCOMPAT_META_BG) || metagroup < first_meta_bg) - return ext4_bg_num_gdb_nometa(sb,group); + return ext4_bg_num_gdb_nometa(sb, group); return ext4_bg_num_gdb_meta(sb,group); } + diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index d37ea675045..0a7a6663c19 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -15,17 +15,17 @@ static const int nibblemap[] = {4, 3, 3, 2, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0}; -unsigned long ext4_count_free (struct buffer_head * map, unsigned int numchars) +unsigned long ext4_count_free(struct buffer_head *map, unsigned int numchars) { unsigned int i; unsigned long sum = 0; if (!map) - return (0); + return 0; for (i = 0; i < numchars; i++) sum += nibblemap[map->b_data[i] & 0xf] + nibblemap[(map->b_data[i] >> 4) & 0xf]; - return (sum); + return sum; } #endif /* EXT4FS_DEBUG */ diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index d3d23d73c08..3ca6a2b7632 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -33,10 +33,10 @@ static unsigned char ext4_filetype_table[] = { }; static int ext4_readdir(struct file *, void *, filldir_t); -static int ext4_dx_readdir(struct file * filp, - void * dirent, filldir_t filldir); -static int ext4_release_dir (struct inode * inode, - struct file * filp); +static int ext4_dx_readdir(struct file *filp, + void *dirent, filldir_t filldir); +static int ext4_release_dir(struct inode *inode, + struct file *filp); const struct file_operations ext4_dir_operations = { .llseek = generic_file_llseek, @@ -61,12 +61,12 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) } -int ext4_check_dir_entry (const char * function, struct inode * dir, - struct ext4_dir_entry_2 * de, - struct buffer_head * bh, - unsigned long offset) +int ext4_check_dir_entry(const char *function, struct inode *dir, + struct ext4_dir_entry_2 *de, + struct buffer_head *bh, + unsigned long offset) { - const char * error_msg = NULL; + const char *error_msg = NULL; const int rlen = ext4_rec_len_from_disk(de->rec_len); if (rlen < EXT4_DIR_REC_LEN(1)) @@ -82,7 +82,7 @@ int ext4_check_dir_entry (const char * function, struct inode * dir, error_msg = "inode out of bounds"; if (error_msg != NULL) - ext4_error (dir->i_sb, function, + ext4_error(dir->i_sb, function, "bad entry in directory #%lu: %s - " "offset=%lu, inode=%lu, rec_len=%d, name_len=%d", dir->i_ino, error_msg, offset, @@ -91,8 +91,8 @@ int ext4_check_dir_entry (const char * function, struct inode * dir, return error_msg == NULL ? 1 : 0; } -static int ext4_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int ext4_readdir(struct file *filp, + void *dirent, filldir_t filldir) { int error = 0; unsigned long offset; @@ -102,6 +102,7 @@ static int ext4_readdir(struct file * filp, int err; struct inode *inode = filp->f_path.dentry->d_inode; int ret = 0; + int dir_has_error = 0; sb = inode->i_sb; @@ -148,9 +149,13 @@ static int ext4_readdir(struct file * filp, * of recovering data when there's a bad sector */ if (!bh) { - ext4_error (sb, "ext4_readdir", - "directory #%lu contains a hole at offset %lu", - inode->i_ino, (unsigned long)filp->f_pos); + if (!dir_has_error) { + ext4_error(sb, __func__, "directory #%lu " + "contains a hole at offset %Lu", + inode->i_ino, + (unsigned long long) filp->f_pos); + dir_has_error = 1; + } /* corrupt size? Maybe no more blocks to read */ if (filp->f_pos > inode->i_blocks << 9) break; @@ -187,14 +192,14 @@ revalidate: while (!error && filp->f_pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (!ext4_check_dir_entry ("ext4_readdir", inode, de, - bh, offset)) { + if (!ext4_check_dir_entry("ext4_readdir", inode, de, + bh, offset)) { /* * On error, skip the f_pos to the next block */ filp->f_pos = (filp->f_pos | (sb->s_blocksize - 1)) + 1; - brelse (bh); + brelse(bh); ret = stored; goto out; } @@ -218,12 +223,12 @@ revalidate: break; if (version != filp->f_version) goto revalidate; - stored ++; + stored++; } filp->f_pos += ext4_rec_len_from_disk(de->rec_len); } offset = 0; - brelse (bh); + brelse(bh); } out: return ret; @@ -290,9 +295,9 @@ static void free_rb_tree_fname(struct rb_root *root) parent = rb_parent(n); fname = rb_entry(n, struct fname, rb_hash); while (fname) { - struct fname * old = fname; + struct fname *old = fname; fname = fname->next; - kfree (old); + kfree(old); } if (!parent) root->rb_node = NULL; @@ -331,7 +336,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, struct ext4_dir_entry_2 *dirent) { struct rb_node **p, *parent = NULL; - struct fname * fname, *new_fn; + struct fname *fname, *new_fn; struct dir_private_info *info; int len; @@ -388,19 +393,20 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, * for all entres on the fname linked list. (Normally there is only * one entry on the linked list, unless there are 62 bit hash collisions.) */ -static int call_filldir(struct file * filp, void * dirent, +static int call_filldir(struct file *filp, void *dirent, filldir_t filldir, struct fname *fname) { struct dir_private_info *info = filp->private_data; loff_t curr_pos; struct inode *inode = filp->f_path.dentry->d_inode; - struct super_block * sb; + struct super_block *sb; int error; sb = inode->i_sb; if (!fname) { - printk("call_filldir: called with null fname?!?\n"); + printk(KERN_ERR "ext4: call_filldir: called with " + "null fname?!?\n"); return 0; } curr_pos = hash2pos(fname->hash, fname->minor_hash); @@ -411,7 +417,7 @@ static int call_filldir(struct file * filp, void * dirent, get_dtype(sb, fname->file_type)); if (error) { filp->f_pos = curr_pos; - info->extra_fname = fname->next; + info->extra_fname = fname; return error; } fname = fname->next; @@ -419,8 +425,8 @@ static int call_filldir(struct file * filp, void * dirent, return 0; } -static int ext4_dx_readdir(struct file * filp, - void * dirent, filldir_t filldir) +static int ext4_dx_readdir(struct file *filp, + void *dirent, filldir_t filldir) { struct dir_private_info *info = filp->private_data; struct inode *inode = filp->f_path.dentry->d_inode; @@ -450,11 +456,21 @@ static int ext4_dx_readdir(struct file * filp, * If there are any leftover names on the hash collision * chain, return them first. */ - if (info->extra_fname && - call_filldir(filp, dirent, filldir, info->extra_fname)) - goto finished; + if (info->extra_fname) { + if (call_filldir(filp, dirent, filldir, info->extra_fname)) + goto finished; - if (!info->curr_node) + info->extra_fname = NULL; + info->curr_node = rb_next(info->curr_node); + if (!info->curr_node) { + if (info->next_hash == ~0) { + filp->f_pos = EXT4_HTREE_EOF; + goto finished; + } + info->curr_hash = info->next_hash; + info->curr_minor_hash = 0; + } + } else if (!info->curr_node) info->curr_node = rb_first(&info->root); while (1) { @@ -501,7 +517,7 @@ finished: return 0; } -static int ext4_release_dir (struct inode * inode, struct file * filp) +static int ext4_release_dir(struct inode *inode, struct file *filp) { if (filp->private_data) ext4_htree_free_dir_info(filp->private_data); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6c7924d9e35..4880cc3e672 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -44,9 +44,9 @@ #ifdef EXT4FS_DEBUG #define ext4_debug(f, a...) \ do { \ - printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + printk(KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ __FILE__, __LINE__, __func__); \ - printk (KERN_DEBUG f, ## a); \ + printk(KERN_DEBUG f, ## a); \ } while (0) #else #define ext4_debug(f, a...) do {} while (0) @@ -128,7 +128,7 @@ struct ext4_allocation_request { #else # define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) #endif -#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof (__u32)) +#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof(__u32)) #ifdef __KERNEL__ # define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) #else @@ -245,7 +245,7 @@ struct flex_groups { #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ +#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */ /* * Inode dynamic state flags @@ -291,8 +291,6 @@ struct ext4_new_group_data { #define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS #define EXT4_IOC_GETVERSION _IOR('f', 3, long) #define EXT4_IOC_SETVERSION _IOW('f', 4, long) -#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) -#define EXT4_IOC_GROUP_ADD _IOW('f', 8,struct ext4_new_group_input) #define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION #define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION #ifdef CONFIG_JBD2_DEBUG @@ -300,7 +298,10 @@ struct ext4_new_group_data { #endif #define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) #define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) -#define EXT4_IOC_MIGRATE _IO('f', 7) +#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input) +#define EXT4_IOC_MIGRATE _IO('f', 9) + /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */ /* * ioctl commands in 32 bit emulation @@ -510,7 +511,6 @@ do { \ /* * Mount flags */ -#define EXT4_MOUNT_CHECK 0x00001 /* Do mount-time checks */ #define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ #define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ #define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ @@ -538,8 +538,9 @@ do { \ #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ -#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ +#define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ + /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H #define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt @@ -667,7 +668,7 @@ struct ext4_super_block { }; #ifdef __KERNEL__ -static inline struct ext4_sb_info * EXT4_SB(struct super_block *sb) +static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) { return sb->s_fs_info; } @@ -725,11 +726,11 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) */ #define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ - ( EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) + (EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask)) #define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ - ( EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) + (EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask)) #define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ - ( EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) + (EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask)) #define EXT4_SET_COMPAT_FEATURE(sb,mask) \ EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) #define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ @@ -789,6 +790,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) #define EXT4_DEF_RESUID 0 #define EXT4_DEF_RESGID 0 +#define EXT4_DEF_INODE_READAHEAD_BLKS 32 + /* * Default mount options */ @@ -954,6 +957,24 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, unsigned long *blockgrpp, ext4_grpblk_t *offsetp); +extern struct proc_dir_entry *ext4_proc_root; + +#ifdef CONFIG_PROC_FS +extern const struct file_operations ext4_ui_proc_fops; + +#define EXT4_PROC_HANDLER(name, var) \ +do { \ + proc = proc_create_data(name, mode, sbi->s_proc, \ + &ext4_ui_proc_fops, &sbi->s_##var); \ + if (proc == NULL) { \ + printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \ + goto err_out; \ + } \ +} while (0) +#else +#define EXT4_PROC_HANDLER(name, var) +#endif + /* * Function prototypes */ @@ -981,23 +1002,20 @@ extern ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, extern ext4_fsblk_t ext4_new_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, ext4_fsblk_t goal, unsigned long *count, int *errp); -extern ext4_fsblk_t ext4_old_new_blocks(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned long *count, int *errp); +extern int ext4_claim_free_blocks(struct ext4_sb_info *sbi, s64 nblocks); extern ext4_fsblk_t ext4_has_free_blocks(struct ext4_sb_info *sbi, - ext4_fsblk_t nblocks); -extern void ext4_free_blocks (handle_t *handle, struct inode *inode, + s64 nblocks); +extern void ext4_free_blocks(handle_t *handle, struct inode *inode, ext4_fsblk_t block, unsigned long count, int metadata); -extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, - ext4_fsblk_t block, unsigned long count, +extern void ext4_free_blocks_sb(handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count, unsigned long *pdquot_freed_blocks); -extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *); -extern void ext4_check_blocks_bitmap (struct super_block *); +extern ext4_fsblk_t ext4_count_free_blocks(struct super_block *); +extern void ext4_check_blocks_bitmap(struct super_block *); extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); -extern void ext4_init_block_alloc_info(struct inode *); -extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv); /* dir.c */ extern int ext4_check_dir_entry(const char *, struct inode *, @@ -1009,20 +1027,20 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, extern void ext4_htree_free_dir_info(struct dir_private_info *p); /* fsync.c */ -extern int ext4_sync_file (struct file *, struct dentry *, int); +extern int ext4_sync_file(struct file *, struct dentry *, int); /* hash.c */ extern int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo); /* ialloc.c */ -extern struct inode * ext4_new_inode (handle_t *, struct inode *, int); -extern void ext4_free_inode (handle_t *, struct inode *); -extern struct inode * ext4_orphan_get (struct super_block *, unsigned long); -extern unsigned long ext4_count_free_inodes (struct super_block *); -extern unsigned long ext4_count_dirs (struct super_block *); -extern void ext4_check_inodes_bitmap (struct super_block *); -extern unsigned long ext4_count_free (struct buffer_head *, unsigned); +extern struct inode * ext4_new_inode(handle_t *, struct inode *, int); +extern void ext4_free_inode(handle_t *, struct inode *); +extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes(struct super_block *); +extern unsigned long ext4_count_dirs(struct super_block *); +extern void ext4_check_inodes_bitmap(struct super_block *); +extern unsigned long ext4_count_free(struct buffer_head *, unsigned); /* mballoc.c */ extern long ext4_mb_stats; @@ -1032,7 +1050,7 @@ extern int ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, struct ext4_allocation_request *, int *); extern int ext4_mb_reserve_blocks(struct super_block *, int); -extern void ext4_mb_discard_inode_preallocations(struct inode *); +extern void ext4_discard_preallocations(struct inode *); extern int __init init_ext4_mballoc(void); extern void exit_ext4_mballoc(void); extern void ext4_mb_free_blocks(handle_t *, struct inode *, @@ -1050,39 +1068,41 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int, int *); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int, int *); +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned long maxblocks, struct buffer_head *bh_result, int create, int extend_disksize); extern struct inode *ext4_iget(struct super_block *, unsigned long); -extern int ext4_write_inode (struct inode *, int); -extern int ext4_setattr (struct dentry *, struct iattr *); +extern int ext4_write_inode(struct inode *, int); +extern int ext4_setattr(struct dentry *, struct iattr *); extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); -extern void ext4_delete_inode (struct inode *); -extern int ext4_sync_inode (handle_t *, struct inode *); -extern void ext4_discard_reservation (struct inode *); +extern void ext4_delete_inode(struct inode *); +extern int ext4_sync_inode(handle_t *, struct inode *); extern void ext4_dirty_inode(struct inode *); extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_can_truncate(struct inode *inode); -extern void ext4_truncate (struct inode *); +extern void ext4_truncate(struct inode *); extern void ext4_set_inode_flags(struct inode *); extern void ext4_get_inode_flags(struct ext4_inode_info *); extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks); +extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_block_truncate_page(handle_t *handle, struct address_space *mapping, loff_t from); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page); /* ioctl.c */ extern long ext4_ioctl(struct file *, unsigned int, unsigned long); -extern long ext4_compat_ioctl (struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); /* migrate.c */ -extern int ext4_ext_migrate(struct inode *, struct file *, unsigned int, - unsigned long); +extern int ext4_ext_migrate(struct inode *); /* namei.c */ extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); @@ -1097,14 +1117,14 @@ extern int ext4_group_extend(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ -extern void ext4_error (struct super_block *, const char *, const char *, ...) +extern void ext4_error(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void __ext4_std_error (struct super_block *, const char *, int); -extern void ext4_abort (struct super_block *, const char *, const char *, ...) +extern void __ext4_std_error(struct super_block *, const char *, int); +extern void ext4_abort(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ext4_warning (struct super_block *, const char *, const char *, ...) +extern void ext4_warning(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ext4_update_dynamic_rev (struct super_block *sb); +extern void ext4_update_dynamic_rev(struct super_block *sb); extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, __u32 compat); extern int ext4_update_rocompat_feature(handle_t *handle, @@ -1177,7 +1197,7 @@ static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) static inline struct ext4_group_info *ext4_get_group_info(struct super_block *sb, - ext4_group_t group) + ext4_group_t group) { struct ext4_group_info ***grp_info; long indexv, indexh; @@ -1205,6 +1225,28 @@ do { \ __ext4_std_error((sb), __func__, (errno)); \ } while (0) +#ifdef CONFIG_SMP +/* Each CPU can accumulate FBC_BATCH blocks in their local + * counters. So we need to make sure we have free blocks more + * than FBC_BATCH * nr_cpu_ids. Also add a window of 4 times. + */ +#define EXT4_FREEBLOCKS_WATERMARK (4 * (FBC_BATCH * nr_cpu_ids)) +#else +#define EXT4_FREEBLOCKS_WATERMARK 0 +#endif + +static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) +{ + /* + * XXX: replace with spinlock if seen contended -bzzz + */ + down_write(&EXT4_I(inode)->i_data_sem); + if (newsize > EXT4_I(inode)->i_disksize) + EXT4_I(inode)->i_disksize = newsize; + up_write(&EXT4_I(inode)->i_data_sem); + return ; +} + /* * Inodes and files operations */ @@ -1227,6 +1269,8 @@ extern const struct inode_operations ext4_fast_symlink_inode_operations; /* extents.c */ extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, + int chunk); extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned long max_blocks, struct buffer_head *bh_result, diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 6c166c0a54b..bec7ce59fc0 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -124,6 +124,19 @@ struct ext4_ext_path { #define EXT4_EXT_CACHE_GAP 1 #define EXT4_EXT_CACHE_EXTENT 2 +/* + * to be called by ext4_ext_walk_space() + * negative retcode - error + * positive retcode - signal for ext4_ext_walk_space(), see below + * callback must return valid extent (passed or newly created) + */ +typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *, + struct ext4_ext_cache *, + struct ext4_extent *, void *); + +#define EXT_CONTINUE 0 +#define EXT_BREAK 1 +#define EXT_REPEAT 2 #define EXT_MAX_BLOCK 0xffffffff @@ -216,12 +229,16 @@ extern int ext4_ext_calc_metadata_amount(struct inode *inode, int blocks); extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); extern int ext4_extent_tree_init(handle_t *, struct inode *); -extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *); +extern int ext4_ext_calc_credits_for_single_extent(struct inode *inode, + int num, + struct ext4_ext_path *path); extern int ext4_ext_try_to_merge(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *); extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); +extern int ext4_ext_walk_space(struct inode *, ext4_lblk_t, ext4_lblk_t, + ext_prepare_callback, void *); extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, struct ext4_ext_path *); extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h index ef7409f0e7e..5c124c0ac6d 100644 --- a/fs/ext4/ext4_i.h +++ b/fs/ext4/ext4_i.h @@ -33,38 +33,6 @@ typedef __u32 ext4_lblk_t; /* data type for block group number */ typedef unsigned long ext4_group_t; -struct ext4_reserve_window { - ext4_fsblk_t _rsv_start; /* First byte reserved */ - ext4_fsblk_t _rsv_end; /* Last byte reserved or 0 */ -}; - -struct ext4_reserve_window_node { - struct rb_node rsv_node; - __u32 rsv_goal_size; - __u32 rsv_alloc_hit; - struct ext4_reserve_window rsv_window; -}; - -struct ext4_block_alloc_info { - /* information about reservation window */ - struct ext4_reserve_window_node rsv_window_node; - /* - * was i_next_alloc_block in ext4_inode_info - * is the logical (file-relative) number of the - * most-recently-allocated block in this file. - * We use this for detecting linearly ascending allocation requests. - */ - ext4_lblk_t last_alloc_logical_block; - /* - * Was i_next_alloc_goal in ext4_inode_info - * is the *physical* companion to i_next_alloc_block. - * it the physical block number of the block which was most-recentl - * allocated to this file. This give us the goal (target) for the next - * allocation when we detect linearly ascending requests. - */ - ext4_fsblk_t last_alloc_physical_block; -}; - #define rsv_start rsv_window._rsv_start #define rsv_end rsv_window._rsv_end @@ -97,11 +65,8 @@ struct ext4_inode_info { ext4_group_t i_block_group; __u32 i_state; /* Dynamic state flags for ext4 */ - /* block reservation info */ - struct ext4_block_alloc_info *i_block_alloc_info; - ext4_lblk_t i_dir_start_lookup; -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR /* * Extended attributes can be read independently of the main file * data. Taking i_mutex even when reading would cause contention @@ -111,7 +76,7 @@ struct ext4_inode_info { */ struct rw_semaphore xattr_sem; #endif -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index eb8bc3afe6e..b455c685a98 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -51,6 +51,14 @@ EXT4_XATTR_TRANS_BLOCKS - 2 + \ 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) +/* + * Define the number of metadata blocks we need to account to modify data. + * + * This include super block, inode block, quota blocks and xattr blocks + */ +#define EXT4_META_TRANS_BLOCKS(sb) (EXT4_XATTR_TRANS_BLOCKS + \ + 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + /* Delete operations potentially hit one directory's namespace plus an * entire inode, plus arbitrary amounts of bitmap/indirection data. Be * generous. We can grow the delete transaction later if necessary. */ diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 6300226d553..445fde603df 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -40,8 +40,8 @@ struct ext4_sb_info { unsigned long s_blocks_last; /* Last seen block count */ loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ struct buffer_head * s_sbh; /* Buffer containing the super block */ - struct ext4_super_block * s_es; /* Pointer to the super block in the buffer */ - struct buffer_head ** s_group_desc; + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head **s_group_desc; unsigned long s_mount_opt; ext4_fsblk_t s_sb_block; uid_t s_resuid; @@ -52,6 +52,7 @@ struct ext4_sb_info { int s_desc_per_block_bits; int s_inode_size; int s_first_ino; + unsigned int s_inode_readahead_blks; spinlock_t s_next_gen_lock; u32 s_next_generation; u32 s_hash_seed[4]; @@ -59,16 +60,17 @@ struct ext4_sb_info { struct percpu_counter s_freeblocks_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyblocks_counter; struct blockgroup_lock s_blockgroup_lock; + struct proc_dir_entry *s_proc; /* root of the per fs reservation window tree */ spinlock_t s_rsv_window_lock; struct rb_root s_rsv_window_root; - struct ext4_reserve_window_node s_rsv_window_head; /* Journaling */ - struct inode * s_journal_inode; - struct journal_s * s_journal; + struct inode *s_journal_inode; + struct journal_s *s_journal; struct list_head s_orphan; unsigned long s_commit_interval; struct block_device *journal_bdev; @@ -97,21 +99,18 @@ struct ext4_sb_info { struct inode *s_buddy_cache; long s_blocks_reserved; spinlock_t s_reserve_lock; - struct list_head s_active_transaction; - struct list_head s_closed_transaction; - struct list_head s_committed_transaction; spinlock_t s_md_lock; tid_t s_last_transaction; unsigned short *s_mb_offsets, *s_mb_maxs; /* tunables */ unsigned long s_stripe; - unsigned long s_mb_stream_request; - unsigned long s_mb_max_to_scan; - unsigned long s_mb_min_to_scan; - unsigned long s_mb_stats; - unsigned long s_mb_order2_reqs; - unsigned long s_mb_group_prealloc; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; /* where last allocation was done - for stream allocation */ unsigned long s_mb_last_group; unsigned long s_mb_last_start; @@ -121,7 +120,6 @@ struct ext4_sb_info { int s_mb_history_cur; int s_mb_history_max; int s_mb_history_num; - struct proc_dir_entry *s_mb_proc; spinlock_t s_mb_history_lock; int s_mb_history_filter; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 612c3d2c382..ea2ce3c0ae6 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -40,6 +40,7 @@ #include <linux/slab.h> #include <linux/falloc.h> #include <asm/uaccess.h> +#include <linux/fiemap.h> #include "ext4_jbd2.h" #include "ext4_extents.h" @@ -383,8 +384,8 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) ext_debug("\n"); } #else -#define ext4_ext_show_path(inode,path) -#define ext4_ext_show_leaf(inode,path) +#define ext4_ext_show_path(inode, path) +#define ext4_ext_show_leaf(inode, path) #endif void ext4_ext_drop_refs(struct ext4_ext_path *path) @@ -440,9 +441,10 @@ ext4_ext_binsearch_idx(struct inode *inode, for (k = 0; k < le16_to_cpu(eh->eh_entries); k++, ix++) { if (k != 0 && le32_to_cpu(ix->ei_block) <= le32_to_cpu(ix[-1].ei_block)) { - printk("k=%d, ix=0x%p, first=0x%p\n", k, - ix, EXT_FIRST_INDEX(eh)); - printk("%u <= %u\n", + printk(KERN_DEBUG "k=%d, ix=0x%p, " + "first=0x%p\n", k, + ix, EXT_FIRST_INDEX(eh)); + printk(KERN_DEBUG "%u <= %u\n", le32_to_cpu(ix->ei_block), le32_to_cpu(ix[-1].ei_block)); } @@ -1475,7 +1477,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, struct ext4_ext_path *path, struct ext4_extent *newext) { - struct ext4_extent_header * eh; + struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ struct ext4_ext_path *npath = NULL; @@ -1625,6 +1627,113 @@ cleanup: return err; } +int ext4_ext_walk_space(struct inode *inode, ext4_lblk_t block, + ext4_lblk_t num, ext_prepare_callback func, + void *cbdata) +{ + struct ext4_ext_path *path = NULL; + struct ext4_ext_cache cbex; + struct ext4_extent *ex; + ext4_lblk_t next, start = 0, end = 0; + ext4_lblk_t last = block + num; + int depth, exists, err = 0; + + BUG_ON(func == NULL); + BUG_ON(inode == NULL); + + while (block < last && block != EXT_MAX_BLOCK) { + num = last - block; + /* find extent for this block */ + path = ext4_ext_find_extent(inode, block, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + path = NULL; + break; + } + + depth = ext_depth(inode); + BUG_ON(path[depth].p_hdr == NULL); + ex = path[depth].p_ext; + next = ext4_ext_next_allocated_block(path); + + exists = 0; + if (!ex) { + /* there is no extent yet, so try to allocate + * all requested space */ + start = block; + end = block + num; + } else if (le32_to_cpu(ex->ee_block) > block) { + /* need to allocate space before found extent */ + start = block; + end = le32_to_cpu(ex->ee_block); + if (block + num < end) + end = block + num; + } else if (block >= le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex)) { + /* need to allocate space after found extent */ + start = block; + end = block + num; + if (end >= next) + end = next; + } else if (block >= le32_to_cpu(ex->ee_block)) { + /* + * some part of requested space is covered + * by found extent + */ + start = block; + end = le32_to_cpu(ex->ee_block) + + ext4_ext_get_actual_len(ex); + if (block + num < end) + end = block + num; + exists = 1; + } else { + BUG(); + } + BUG_ON(end <= start); + + if (!exists) { + cbex.ec_block = start; + cbex.ec_len = end - start; + cbex.ec_start = 0; + cbex.ec_type = EXT4_EXT_CACHE_GAP; + } else { + cbex.ec_block = le32_to_cpu(ex->ee_block); + cbex.ec_len = ext4_ext_get_actual_len(ex); + cbex.ec_start = ext_pblock(ex); + cbex.ec_type = EXT4_EXT_CACHE_EXTENT; + } + + BUG_ON(cbex.ec_len == 0); + err = func(inode, path, &cbex, ex, cbdata); + ext4_ext_drop_refs(path); + + if (err < 0) + break; + + if (err == EXT_REPEAT) + continue; + else if (err == EXT_BREAK) { + err = 0; + break; + } + + if (ext_depth(inode) != depth) { + /* depth was changed. we have to realloc path */ + kfree(path); + path = NULL; + } + + block = cbex.ec_block + cbex.ec_len; + } + + if (path) { + ext4_ext_drop_refs(path); + kfree(path); + } + + return err; +} + static void ext4_ext_put_in_cache(struct inode *inode, ext4_lblk_t block, __u32 len, ext4_fsblk_t start, int type) @@ -1747,54 +1856,61 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, } /* - * ext4_ext_calc_credits_for_insert: - * This routine returns max. credits that the extent tree can consume. - * It should be OK for low-performance paths like ->writepage() - * To allow many writing processes to fit into a single transaction, - * the caller should calculate credits under i_data_sem and - * pass the actual path. + * ext4_ext_calc_credits_for_single_extent: + * This routine returns max. credits that needed to insert an extent + * to the extent tree. + * When pass the actual path, the caller should calculate credits + * under i_data_sem. */ -int ext4_ext_calc_credits_for_insert(struct inode *inode, +int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, struct ext4_ext_path *path) { - int depth, needed; - if (path) { + int depth = ext_depth(inode); + int ret = 0; + /* probably there is space in leaf? */ - depth = ext_depth(inode); if (le16_to_cpu(path[depth].p_hdr->eh_entries) - < le16_to_cpu(path[depth].p_hdr->eh_max)) - return 1; - } - - /* - * given 32-bit logical block (4294967296 blocks), max. tree - * can be 4 levels in depth -- 4 * 340^4 == 53453440000. - * Let's also add one more level for imbalance. - */ - depth = 5; + < le16_to_cpu(path[depth].p_hdr->eh_max)) { - /* allocation of new data block(s) */ - needed = 2; + /* + * There are some space in the leaf tree, no + * need to account for leaf block credit + * + * bitmaps and block group descriptor blocks + * and other metadat blocks still need to be + * accounted. + */ + /* 1 bitmap, 1 block group descriptor */ + ret = 2 + EXT4_META_TRANS_BLOCKS(inode->i_sb); + } + } - /* - * tree can be full, so it would need to grow in depth: - * we need one credit to modify old root, credits for - * new root will be added in split accounting - */ - needed += 1; + return ext4_chunk_trans_blocks(inode, nrblocks); +} - /* - * Index split can happen, we would need: - * allocate intermediate indexes (bitmap + group) - * + change two blocks at each level, but root (already included) - */ - needed += (depth * 2) + (depth * 2); +/* + * How many index/leaf blocks need to change/allocate to modify nrblocks? + * + * if nrblocks are fit in a single extent (chunk flag is 1), then + * in the worse case, each tree level index/leaf need to be changed + * if the tree split due to insert a new extent, then the old tree + * index/leaf need to be updated too + * + * If the nrblocks are discontiguous, they could cause + * the whole tree split more than once, but this is really rare. + */ +int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int index; + int depth = ext_depth(inode); - /* any allocation modifies superblock */ - needed += 1; + if (chunk) + index = depth * 2; + else + index = depth * 3; - return needed; + return index; } static int ext4_remove_blocks(handle_t *handle, struct inode *inode, @@ -1921,9 +2037,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, correct_index = 1; credits += (ext_depth(inode)) + 1; } -#ifdef CONFIG_QUOTA credits += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif err = ext4_ext_journal_restart(handle, credits); if (err) @@ -2137,7 +2251,7 @@ void ext4_ext_init(struct super_block *sb) */ if (test_opt(sb, EXTENTS)) { - printk("EXT4-fs: file extents enabled"); + printk(KERN_INFO "EXT4-fs: file extents enabled"); #ifdef AGGRESSIVE_TEST printk(", aggressive tests"); #endif @@ -2691,11 +2805,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, goto out2; } /* - * Okay, we need to do block allocation. Lazily initialize the block - * allocation info here if necessary. + * Okay, we need to do block allocation. */ - if (S_ISREG(inode->i_mode) && (!EXT4_I(inode)->i_block_alloc_info)) - ext4_init_block_alloc_info(inode); /* find neighbour allocated blocks */ ar.lleft = iblock; @@ -2755,7 +2866,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* free data blocks we just allocated */ /* not a good idea to call discard here directly, * but otherwise we'd need to call it every free() */ - ext4_mb_discard_inode_preallocations(inode); + ext4_discard_preallocations(inode); ext4_free_blocks(handle, inode, ext_pblock(&newex), ext4_ext_get_actual_len(&newex), 0); goto out2; @@ -2805,7 +2916,7 @@ void ext4_ext_truncate(struct inode *inode) /* * probably first extent we're gonna free will be last in block */ - err = ext4_writepage_trans_blocks(inode) + 3; + err = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, err); if (IS_ERR(handle)) return; @@ -2819,7 +2930,7 @@ void ext4_ext_truncate(struct inode *inode) down_write(&EXT4_I(inode)->i_data_sem); ext4_ext_invalidate_cache(inode); - ext4_mb_discard_inode_preallocations(inode); + ext4_discard_preallocations(inode); /* * TODO: optimization is possible here. @@ -2858,27 +2969,6 @@ out_stop: ext4_journal_stop(handle); } -/* - * ext4_ext_writepage_trans_blocks: - * calculate max number of blocks we could modify - * in order to allocate new block for an inode - */ -int ext4_ext_writepage_trans_blocks(struct inode *inode, int num) -{ - int needed; - - needed = ext4_ext_calc_credits_for_insert(inode, NULL); - - /* caller wants to allocate num blocks, but note it includes sb */ - needed = needed * num - (num - 1); - -#ifdef CONFIG_QUOTA - needed += 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif - - return needed; -} - static void ext4_falloc_update_inode(struct inode *inode, int mode, loff_t new_size, int update_ctime) { @@ -2893,10 +2983,11 @@ static void ext4_falloc_update_inode(struct inode *inode, * Update only when preallocation was requested beyond * the file size. */ - if (!(mode & FALLOC_FL_KEEP_SIZE) && - new_size > i_size_read(inode)) { - i_size_write(inode, new_size); - EXT4_I(inode)->i_disksize = new_size; + if (!(mode & FALLOC_FL_KEEP_SIZE)) { + if (new_size > i_size_read(inode)) + i_size_write(inode, new_size); + if (new_size > EXT4_I(inode)->i_disksize) + ext4_update_i_disksize(inode, new_size); } } @@ -2939,10 +3030,9 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - block; /* - * credits to insert 1 extent into extent tree + buffers to be able to - * modify 1 super block, 1 block bitmap and 1 group descriptor. + * credits to insert 1 extent into extent tree */ - credits = EXT4_DATA_TRANS_BLOCKS(inode->i_sb) + 3; + credits = ext4_chunk_trans_blocks(inode, max_blocks); mutex_lock(&inode->i_mutex); retry: while (ret >= 0 && ret < max_blocks) { @@ -2989,3 +3079,143 @@ retry: mutex_unlock(&inode->i_mutex); return ret > 0 ? ret2 : ret; } + +/* + * Callback function called for each extent to gather FIEMAP information. + */ +int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, + struct ext4_ext_cache *newex, struct ext4_extent *ex, + void *data) +{ + struct fiemap_extent_info *fieinfo = data; + unsigned long blksize_bits = inode->i_sb->s_blocksize_bits; + __u64 logical; + __u64 physical; + __u64 length; + __u32 flags = 0; + int error; + + logical = (__u64)newex->ec_block << blksize_bits; + + if (newex->ec_type == EXT4_EXT_CACHE_GAP) { + pgoff_t offset; + struct page *page; + struct buffer_head *bh = NULL; + + offset = logical >> PAGE_SHIFT; + page = find_get_page(inode->i_mapping, offset); + if (!page || !page_has_buffers(page)) + return EXT_CONTINUE; + + bh = page_buffers(page); + + if (!bh) + return EXT_CONTINUE; + + if (buffer_delay(bh)) { + flags |= FIEMAP_EXTENT_DELALLOC; + page_cache_release(page); + } else { + page_cache_release(page); + return EXT_CONTINUE; + } + } + + physical = (__u64)newex->ec_start << blksize_bits; + length = (__u64)newex->ec_len << blksize_bits; + + if (ex && ext4_ext_is_uninitialized(ex)) + flags |= FIEMAP_EXTENT_UNWRITTEN; + + /* + * If this extent reaches EXT_MAX_BLOCK, it must be last. + * + * Or if ext4_ext_next_allocated_block is EXT_MAX_BLOCK, + * this also indicates no more allocated blocks. + * + * XXX this might miss a single-block extent at EXT_MAX_BLOCK + */ + if (logical + length - 1 == EXT_MAX_BLOCK || + ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK) + flags |= FIEMAP_EXTENT_LAST; + + error = fiemap_fill_next_extent(fieinfo, logical, physical, + length, flags); + if (error < 0) + return error; + if (error == 1) + return EXT_BREAK; + + return EXT_CONTINUE; +} + +/* fiemap flags we can handle specified here */ +#define EXT4_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC|FIEMAP_FLAG_XATTR) + +int ext4_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) +{ + __u64 physical = 0; + __u64 length; + __u32 flags = FIEMAP_EXTENT_LAST; + int blockbits = inode->i_sb->s_blocksize_bits; + int error = 0; + + /* in-inode? */ + if (EXT4_I(inode)->i_state & EXT4_STATE_XATTR) { + struct ext4_iloc iloc; + int offset; /* offset of xattr in inode */ + + error = ext4_get_inode_loc(inode, &iloc); + if (error) + return error; + physical = iloc.bh->b_blocknr << blockbits; + offset = EXT4_GOOD_OLD_INODE_SIZE + + EXT4_I(inode)->i_extra_isize; + physical += offset; + length = EXT4_SB(inode->i_sb)->s_inode_size - offset; + flags |= FIEMAP_EXTENT_DATA_INLINE; + } else { /* external block */ + physical = EXT4_I(inode)->i_file_acl << blockbits; + length = inode->i_sb->s_blocksize; + } + + if (physical) + error = fiemap_fill_next_extent(fieinfo, 0, physical, + length, flags); + return (error < 0 ? error : 0); +} + +int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len) +{ + ext4_lblk_t start_blk; + ext4_lblk_t len_blks; + int error = 0; + + /* fallback to generic here if not in extents fmt */ + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return generic_block_fiemap(inode, fieinfo, start, len, + ext4_get_block); + + if (fiemap_check_flags(fieinfo, EXT4_FIEMAP_FLAGS)) + return -EBADR; + + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { + error = ext4_xattr_fiemap(inode, fieinfo); + } else { + start_blk = start >> inode->i_sb->s_blocksize_bits; + len_blks = len >> inode->i_sb->s_blocksize_bits; + + /* + * Walk the extent tree gathering extent information. + * ext4_ext_fiemap_cb will push extents back to user. + */ + down_write(&EXT4_I(inode)->i_data_sem); + error = ext4_ext_walk_space(inode, start_blk, len_blks, + ext4_ext_fiemap_cb, fieinfo); + up_write(&EXT4_I(inode)->i_data_sem); + } + + return error; +} + diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 430eb7978db..6bd11fba71f 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -31,14 +31,14 @@ * from ext4_file_open: open gets called at every open, but release * gets called only when /all/ the files are closed. */ -static int ext4_release_file (struct inode * inode, struct file * filp) +static int ext4_release_file(struct inode *inode, struct file *filp) { /* if we are the last writer on the inode, drop the block reservation */ if ((filp->f_mode & FMODE_WRITE) && (atomic_read(&inode->i_writecount) == 1)) { down_write(&EXT4_I(inode)->i_data_sem); - ext4_discard_reservation(inode); + ext4_discard_preallocations(inode); up_write(&EXT4_I(inode)->i_data_sem); } if (is_dx(inode) && filp->private_data) @@ -140,6 +140,9 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) return 0; } +extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); + const struct file_operations ext4_file_operations = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -162,7 +165,7 @@ const struct inode_operations ext4_file_inode_operations = { .truncate = ext4_truncate, .setattr = ext4_setattr, .getattr = ext4_getattr, -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, @@ -170,5 +173,6 @@ const struct inode_operations ext4_file_inode_operations = { #endif .permission = ext4_permission, .fallocate = ext4_fallocate, + .fiemap = ext4_fiemap, }; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index a45c3737ad3..5afe4370840 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -28,6 +28,7 @@ #include <linux/writeback.h> #include <linux/jbd2.h> #include <linux/blkdev.h> +#include <linux/marker.h> #include "ext4.h" #include "ext4_jbd2.h" @@ -43,7 +44,7 @@ * inode to disk. */ -int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) +int ext4_sync_file(struct file *file, struct dentry *dentry, int datasync) { struct inode *inode = dentry->d_inode; journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; @@ -51,6 +52,10 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) J_ASSERT(ext4_journal_current_handle() == NULL); + trace_mark(ext4_sync_file, "dev %s datasync %d ino %ld parent %ld", + inode->i_sb->s_id, datasync, inode->i_ino, + dentry->d_parent->d_inode->i_ino); + /* * data=writeback: * The caller's filemap_fdatawrite()/wait will sync the data. diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 1d6329dbe39..556ca8eba3d 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -27,7 +27,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) sum += DELTA; b0 += ((b1 << 4)+a) ^ (b1+sum) ^ ((b1 >> 5)+b); b1 += ((b0 << 4)+c) ^ (b0+sum) ^ ((b0 >> 5)+d); - } while(--n); + } while (--n); buf[0] += b0; buf[1] += b1; @@ -35,7 +35,7 @@ static void TEA_transform(__u32 buf[4], __u32 const in[]) /* The old legacy hash */ -static __u32 dx_hack_hash (const char *name, int len) +static __u32 dx_hack_hash(const char *name, int len) { __u32 hash0 = 0x12a3fe2d, hash1 = 0x37abe8f9; while (len--) { @@ -59,7 +59,7 @@ static void str2hashbuf(const char *msg, int len, __u32 *buf, int num) val = pad; if (len > num*4) len = num * 4; - for (i=0; i < len; i++) { + for (i = 0; i < len; i++) { if ((i % 4) == 0) val = pad; val = msg[i] + (val << 8); @@ -104,7 +104,7 @@ int ext4fs_dirhash(const char *name, int len, struct dx_hash_info *hinfo) /* Check to see if the seed is all zero's */ if (hinfo->seed) { - for (i=0; i < 4; i++) { + for (i = 0; i < 4; i++) { if (hinfo->seed[i]) break; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 655e760212b..fe34d74cfb1 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -115,9 +115,11 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) block_group, bitmap_blk); return NULL; } - if (bh_uptodate_or_lock(bh)) + if (buffer_uptodate(bh) && + !(desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) return bh; + lock_buffer(bh); spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { ext4_init_inode_bitmap(sb, bh, block_group, desc); @@ -154,39 +156,40 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) * though), and then we'd have two inodes sharing the * same inode number and space on the harddisk. */ -void ext4_free_inode (handle_t *handle, struct inode * inode) +void ext4_free_inode(handle_t *handle, struct inode *inode) { - struct super_block * sb = inode->i_sb; + struct super_block *sb = inode->i_sb; int is_directory; unsigned long ino; struct buffer_head *bitmap_bh = NULL; struct buffer_head *bh2; ext4_group_t block_group; unsigned long bit; - struct ext4_group_desc * gdp; - struct ext4_super_block * es; + struct ext4_group_desc *gdp; + struct ext4_super_block *es; struct ext4_sb_info *sbi; int fatal = 0, err; ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { - printk ("ext4_free_inode: inode has count=%d\n", - atomic_read(&inode->i_count)); + printk(KERN_ERR "ext4_free_inode: inode has count=%d\n", + atomic_read(&inode->i_count)); return; } if (inode->i_nlink) { - printk ("ext4_free_inode: inode has nlink=%d\n", - inode->i_nlink); + printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n", + inode->i_nlink); return; } if (!sb) { - printk("ext4_free_inode: inode on nonexistent device\n"); + printk(KERN_ERR "ext4_free_inode: inode on " + "nonexistent device\n"); return; } sbi = EXT4_SB(sb); ino = inode->i_ino; - ext4_debug ("freeing inode %lu\n", ino); + ext4_debug("freeing inode %lu\n", ino); /* * Note: we must free any quota before locking the superblock, @@ -200,12 +203,12 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) is_directory = S_ISDIR(inode->i_mode); /* Do this BEFORE marking the inode not in use or returning an error */ - clear_inode (inode); + clear_inode(inode); es = EXT4_SB(sb)->s_es; if (ino < EXT4_FIRST_INO(sb) || ino > le32_to_cpu(es->s_inodes_count)) { - ext4_error (sb, "ext4_free_inode", - "reserved or nonexistent inode %lu", ino); + ext4_error(sb, "ext4_free_inode", + "reserved or nonexistent inode %lu", ino); goto error_return; } block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); @@ -222,10 +225,10 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) /* Ok, now we can actually update the inode bitmaps.. */ if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), bit, bitmap_bh->b_data)) - ext4_error (sb, "ext4_free_inode", - "bit already cleared for inode %lu", ino); + ext4_error(sb, "ext4_free_inode", + "bit already cleared for inode %lu", ino); else { - gdp = ext4_get_group_desc (sb, block_group, &bh2); + gdp = ext4_get_group_desc(sb, block_group, &bh2); BUFFER_TRACE(bh2, "get_write_access"); fatal = ext4_journal_get_write_access(handle, bh2); @@ -287,7 +290,7 @@ static int find_group_dir(struct super_block *sb, struct inode *parent, avefreei = freei / ngroups; for (group = 0; group < ngroups; group++) { - desc = ext4_get_group_desc (sb, group, NULL); + desc = ext4_get_group_desc(sb, group, NULL); if (!desc || !desc->bg_free_inodes_count) continue; if (le16_to_cpu(desc->bg_free_inodes_count) < avefreei) @@ -351,7 +354,7 @@ find_close_to_parent: goto found_flexbg; } - if (best_flex < 0 || + if (flex_group[best_flex].free_inodes == 0 || (flex_group[i].free_blocks > flex_group[best_flex].free_blocks && flex_group[i].free_inodes)) @@ -576,16 +579,16 @@ static int find_group_other(struct super_block *sb, struct inode *parent, * For other inodes, search forward from the parent directory's block * group to find a free inode. */ -struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) +struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) { struct super_block *sb; struct buffer_head *bitmap_bh = NULL; struct buffer_head *bh2; ext4_group_t group = 0; unsigned long ino = 0; - struct inode * inode; - struct ext4_group_desc * gdp = NULL; - struct ext4_super_block * es; + struct inode *inode; + struct ext4_group_desc *gdp = NULL; + struct ext4_super_block *es; struct ext4_inode_info *ei; struct ext4_sb_info *sbi; int ret2, err = 0; @@ -613,7 +616,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode * dir, int mode) } if (S_ISDIR(mode)) { - if (test_opt (sb, OLDALLOC)) + if (test_opt(sb, OLDALLOC)) ret2 = find_group_dir(sb, dir, &group); else ret2 = find_group_orlov(sb, dir, &group); @@ -783,7 +786,7 @@ got: } inode->i_uid = current->fsuid; - if (test_opt (sb, GRPID)) + if (test_opt(sb, GRPID)) inode->i_gid = dir->i_gid; else if (dir->i_mode & S_ISGID) { inode->i_gid = dir->i_gid; @@ -816,7 +819,6 @@ got: ei->i_flags &= ~EXT4_DIRSYNC_FL; ei->i_file_acl = 0; ei->i_dtime = 0; - ei->i_block_alloc_info = NULL; ei->i_block_group = group; ext4_set_inode_flags(inode); @@ -832,7 +834,7 @@ got: ei->i_extra_isize = EXT4_SB(sb)->s_want_extra_isize; ret = inode; - if(DQUOT_ALLOC_INODE(inode)) { + if (DQUOT_ALLOC_INODE(inode)) { err = -EDQUOT; goto fail_drop; } @@ -841,7 +843,7 @@ got: if (err) goto fail_free_drop; - err = ext4_init_security(handle,inode, dir); + err = ext4_init_security(handle, inode, dir); if (err) goto fail_free_drop; @@ -959,7 +961,7 @@ error: return ERR_PTR(err); } -unsigned long ext4_count_free_inodes (struct super_block * sb) +unsigned long ext4_count_free_inodes(struct super_block *sb) { unsigned long desc_count; struct ext4_group_desc *gdp; @@ -974,7 +976,7 @@ unsigned long ext4_count_free_inodes (struct super_block * sb) bitmap_count = 0; gdp = NULL; for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { - gdp = ext4_get_group_desc (sb, i, NULL); + gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; desc_count += le16_to_cpu(gdp->bg_free_inodes_count); @@ -989,13 +991,14 @@ unsigned long ext4_count_free_inodes (struct super_block * sb) bitmap_count += x; } brelse(bitmap_bh); - printk("ext4_count_free_inodes: stored = %u, computed = %lu, %lu\n", - le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); + printk(KERN_DEBUG "ext4_count_free_inodes: " + "stored = %u, computed = %lu, %lu\n", + le32_to_cpu(es->s_free_inodes_count), desc_count, bitmap_count); return desc_count; #else desc_count = 0; for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { - gdp = ext4_get_group_desc (sb, i, NULL); + gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; desc_count += le16_to_cpu(gdp->bg_free_inodes_count); @@ -1006,13 +1009,13 @@ unsigned long ext4_count_free_inodes (struct super_block * sb) } /* Called at mount-time, super-block is locked */ -unsigned long ext4_count_dirs (struct super_block * sb) +unsigned long ext4_count_dirs(struct super_block * sb) { unsigned long count = 0; ext4_group_t i; for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { - struct ext4_group_desc *gdp = ext4_get_group_desc (sb, i, NULL); + struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; count += le16_to_cpu(gdp->bg_used_dirs_count); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 59fbbe899ac..8dbf6953845 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -41,6 +41,8 @@ #include "acl.h" #include "ext4_extents.h" +#define MPAGE_DA_EXTENT_TAIL 0x01 + static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { @@ -188,7 +190,7 @@ static int ext4_journal_test_restart(handle_t *handle, struct inode *inode) /* * Called at the last iput() if i_nlink is zero. */ -void ext4_delete_inode (struct inode * inode) +void ext4_delete_inode(struct inode *inode) { handle_t *handle; int err; @@ -328,11 +330,11 @@ static int ext4_block_to_path(struct inode *inode, int final = 0; if (i_block < 0) { - ext4_warning (inode->i_sb, "ext4_block_to_path", "block < 0"); + ext4_warning(inode->i_sb, "ext4_block_to_path", "block < 0"); } else if (i_block < direct_blocks) { offsets[n++] = i_block; final = direct_blocks; - } else if ( (i_block -= direct_blocks) < indirect_blocks) { + } else if ((i_block -= direct_blocks) < indirect_blocks) { offsets[n++] = EXT4_IND_BLOCK; offsets[n++] = i_block; final = ptrs; @@ -398,14 +400,14 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth, *err = 0; /* i_data is not going away, no lock needed */ - add_chain (chain, NULL, EXT4_I(inode)->i_data + *offsets); + add_chain(chain, NULL, EXT4_I(inode)->i_data + *offsets); if (!p->key) goto no_block; while (--depth) { bh = sb_bread(sb, le32_to_cpu(p->key)); if (!bh) goto failure; - add_chain(++p, bh, (__le32*)bh->b_data + *++offsets); + add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets); /* Reader: end */ if (!p->key) goto no_block; @@ -441,7 +443,7 @@ no_block: static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) { struct ext4_inode_info *ei = EXT4_I(inode); - __le32 *start = ind->bh ? (__le32*) ind->bh->b_data : ei->i_data; + __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; __le32 *p; ext4_fsblk_t bg_start; ext4_fsblk_t last_block; @@ -484,18 +486,9 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind) static ext4_fsblk_t ext4_find_goal(struct inode *inode, ext4_lblk_t block, Indirect *partial) { - struct ext4_block_alloc_info *block_i; - - block_i = EXT4_I(inode)->i_block_alloc_info; - /* - * try the heuristic for sequential allocation, - * failing that at least try to get decent locality. + * XXX need to get goal block from mballoc's data structures */ - if (block_i && (block == block_i->last_alloc_logical_block + 1) - && (block_i->last_alloc_physical_block != 0)) { - return block_i->last_alloc_physical_block + 1; - } return ext4_find_near(inode, partial); } @@ -628,7 +621,7 @@ allocated: *err = 0; return ret; failed_out: - for (i = 0; i <index; i++) + for (i = 0; i < index; i++) ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); return ret; } @@ -701,7 +694,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, branch[n].p = (__le32 *) bh->b_data + offsets[n]; branch[n].key = cpu_to_le32(new_blocks[n]); *branch[n].p = branch[n].key; - if ( n == indirect_blks) { + if (n == indirect_blks) { current_block = new_blocks[n]; /* * End of chain, update the last new metablock of @@ -728,7 +721,7 @@ failed: BUFFER_TRACE(branch[i].bh, "call jbd2_journal_forget"); ext4_journal_forget(handle, branch[i].bh); } - for (i = 0; i <indirect_blks; i++) + for (i = 0; i < indirect_blks; i++) ext4_free_blocks(handle, inode, new_blocks[i], 1, 0); ext4_free_blocks(handle, inode, new_blocks[i], num, 0); @@ -755,10 +748,8 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, { int i; int err = 0; - struct ext4_block_alloc_info *block_i; ext4_fsblk_t current_block; - block_i = EXT4_I(inode)->i_block_alloc_info; /* * If we're splicing into a [td]indirect block (as opposed to the * inode) then we need to get write access to the [td]indirect block @@ -781,18 +772,7 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, if (num == 0 && blks > 1) { current_block = le32_to_cpu(where->key) + 1; for (i = 1; i < blks; i++) - *(where->p + i ) = cpu_to_le32(current_block++); - } - - /* - * update the most recently allocated logical & physical block - * in i_block_alloc_info, to assist find the proper goal block for next - * allocation - */ - if (block_i) { - block_i->last_alloc_logical_block = block + blks - 1; - block_i->last_alloc_physical_block = - le32_to_cpu(where[num].key) + blks - 1; + *(where->p + i) = cpu_to_le32(current_block++); } /* We are done with atomic stuff, now do the rest of housekeeping */ @@ -912,12 +892,8 @@ int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, goto cleanup; /* - * Okay, we need to do block allocation. Lazily initialize the block - * allocation info here if necessary + * Okay, we need to do block allocation. */ - if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info)) - ext4_init_block_alloc_info(inode); - goal = ext4_find_goal(inode, iblock, partial); /* the number of blocks need to allocate for [d,t]indirect blocks */ @@ -1005,6 +981,9 @@ static int ext4_indirect_calc_metadata_amount(struct inode *inode, int blocks) */ static int ext4_calc_metadata_amount(struct inode *inode, int blocks) { + if (!blocks) + return 0; + if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) return ext4_ext_calc_metadata_amount(inode, blocks); @@ -1025,34 +1004,23 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); mdb_free = EXT4_I(inode)->i_reserved_meta_blocks - mdb; - /* Account for allocated meta_blocks */ - mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; + if (mdb_free) { + /* Account for allocated meta_blocks */ + mdb_free -= EXT4_I(inode)->i_allocated_meta_blocks; - /* update fs free blocks counter for truncate case */ - percpu_counter_add(&sbi->s_freeblocks_counter, mdb_free); + /* update fs dirty blocks counter */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, mdb_free); + EXT4_I(inode)->i_allocated_meta_blocks = 0; + EXT4_I(inode)->i_reserved_meta_blocks = mdb; + } /* update per-inode reservations */ BUG_ON(used > EXT4_I(inode)->i_reserved_data_blocks); EXT4_I(inode)->i_reserved_data_blocks -= used; - BUG_ON(mdb > EXT4_I(inode)->i_reserved_meta_blocks); - EXT4_I(inode)->i_reserved_meta_blocks = mdb; - EXT4_I(inode)->i_allocated_meta_blocks = 0; spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); } -/* Maximum number of blocks we map for direct IO at once. */ -#define DIO_MAX_BLOCKS 4096 -/* - * Number of credits we need for writing DIO_MAX_BLOCKS: - * We need sb + group descriptor + bitmap + inode -> 4 - * For B blocks with A block pointers per block we need: - * 1 (triple ind.) + (B/A/A + 2) (doubly ind.) + (B/A + 2) (indirect). - * If we plug in 4096 for B and 256 for A (for 1KB block size), we get 25. - */ -#define DIO_CREDITS 25 - - /* * The ext4_get_blocks_wrap() function try to look up the requested blocks, * and returns if the blocks are already mapped. @@ -1164,19 +1132,23 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, return retval; } -static int ext4_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +/* Maximum number of blocks we map for direct IO at once. */ +#define DIO_MAX_BLOCKS 4096 + +int ext4_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { handle_t *handle = ext4_journal_current_handle(); int ret = 0, started = 0; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + int dio_credits; if (create && !handle) { /* Direct IO write... */ if (max_blocks > DIO_MAX_BLOCKS) max_blocks = DIO_MAX_BLOCKS; - handle = ext4_journal_start(inode, DIO_CREDITS + - 2 * EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb)); + dio_credits = ext4_chunk_trans_blocks(inode, max_blocks); + handle = ext4_journal_start(inode, dio_credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); goto out; @@ -1244,7 +1216,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, BUFFER_TRACE(bh, "call get_create_access"); fatal = ext4_journal_get_create_access(handle, bh); if (!fatal && !buffer_uptodate(bh)) { - memset(bh->b_data,0,inode->i_sb->s_blocksize); + memset(bh->b_data, 0, inode->i_sb->s_blocksize); set_buffer_uptodate(bh); } unlock_buffer(bh); @@ -1269,7 +1241,7 @@ err: struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, ext4_lblk_t block, int create, int *err) { - struct buffer_head * bh; + struct buffer_head *bh; bh = ext4_getblk(handle, inode, block, create, err); if (!bh) @@ -1285,13 +1257,13 @@ struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, return NULL; } -static int walk_page_buffers( handle_t *handle, - struct buffer_head *head, - unsigned from, - unsigned to, - int *partial, - int (*fn)( handle_t *handle, - struct buffer_head *bh)) +static int walk_page_buffers(handle_t *handle, + struct buffer_head *head, + unsigned from, + unsigned to, + int *partial, + int (*fn)(handle_t *handle, + struct buffer_head *bh)) { struct buffer_head *bh; unsigned block_start, block_end; @@ -1299,9 +1271,9 @@ static int walk_page_buffers( handle_t *handle, int err, ret = 0; struct buffer_head *next; - for ( bh = head, block_start = 0; - ret == 0 && (bh != head || !block_start); - block_start = block_end, bh = next) + for (bh = head, block_start = 0; + ret == 0 && (bh != head || !block_start); + block_start = block_end, bh = next) { next = bh->b_this_page; block_end = block_start + blocksize; @@ -1354,23 +1326,23 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) { - struct inode *inode = mapping->host; + struct inode *inode = mapping->host; int ret, needed_blocks = ext4_writepage_trans_blocks(inode); handle_t *handle; int retries = 0; - struct page *page; + struct page *page; pgoff_t index; - unsigned from, to; + unsigned from, to; index = pos >> PAGE_CACHE_SHIFT; - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; + from = pos & (PAGE_CACHE_SIZE - 1); + to = from + len; retry: - handle = ext4_journal_start(inode, needed_blocks); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; + handle = ext4_journal_start(inode, needed_blocks); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; } page = __grab_cache_page(mapping, index); @@ -1390,9 +1362,16 @@ retry: } if (ret) { - unlock_page(page); + unlock_page(page); ext4_journal_stop(handle); - page_cache_release(page); + page_cache_release(page); + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -1429,16 +1408,18 @@ static int ext4_ordered_write_end(struct file *file, ret = ext4_jbd2_file_inode(handle, inode); if (ret == 0) { - /* - * generic_write_end() will run mark_inode_dirty() if i_size - * changes. So let's piggyback the i_disksize mark_inode_dirty - * into that. - */ loff_t new_i_size; new_i_size = pos + copied; - if (new_i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = new_i_size; + if (new_i_size > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, new_i_size); + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_mark_inode_dirty(handle, inode); + } + ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); copied = ret2; @@ -1463,8 +1444,14 @@ static int ext4_writeback_write_end(struct file *file, loff_t new_i_size; new_i_size = pos + copied; - if (new_i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = new_i_size; + if (new_i_size > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, new_i_size); + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_mark_inode_dirty(handle, inode); + } ret2 = generic_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -1489,6 +1476,7 @@ static int ext4_journalled_write_end(struct file *file, int ret = 0, ret2; int partial = 0; unsigned from, to; + loff_t new_i_size; from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; @@ -1503,11 +1491,12 @@ static int ext4_journalled_write_end(struct file *file, to, &partial, write_end_fn); if (!partial) SetPageUptodate(page); - if (pos+copied > inode->i_size) + new_i_size = pos + copied; + if (new_i_size > inode->i_size) i_size_write(inode, pos+copied); EXT4_I(inode)->i_state |= EXT4_STATE_JDATA; - if (inode->i_size > EXT4_I(inode)->i_disksize) { - EXT4_I(inode)->i_disksize = inode->i_size; + if (new_i_size > EXT4_I(inode)->i_disksize) { + ext4_update_i_disksize(inode, new_i_size); ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) ret = ret2; @@ -1524,6 +1513,7 @@ static int ext4_journalled_write_end(struct file *file, static int ext4_da_reserve_space(struct inode *inode, int nrblocks) { + int retries = 0; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); unsigned long md_needed, mdblocks, total = 0; @@ -1532,6 +1522,7 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks) * in order to allocate nrblocks * worse case is one extent per block */ +repeat: spin_lock(&EXT4_I(inode)->i_block_reservation_lock); total = EXT4_I(inode)->i_reserved_data_blocks + nrblocks; mdblocks = ext4_calc_metadata_amount(inode, total); @@ -1540,13 +1531,14 @@ static int ext4_da_reserve_space(struct inode *inode, int nrblocks) md_needed = mdblocks - EXT4_I(inode)->i_reserved_meta_blocks; total = md_needed + nrblocks; - if (ext4_has_free_blocks(sbi, total) < total) { + if (ext4_claim_free_blocks(sbi, total)) { spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + if (ext4_should_retry_alloc(inode->i_sb, &retries)) { + yield(); + goto repeat; + } return -ENOSPC; } - /* reduce fs free blocks counter */ - percpu_counter_sub(&sbi->s_freeblocks_counter, total); - EXT4_I(inode)->i_reserved_data_blocks += nrblocks; EXT4_I(inode)->i_reserved_meta_blocks = mdblocks; @@ -1559,7 +1551,25 @@ static void ext4_da_release_space(struct inode *inode, int to_free) struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int total, mdb, mdb_free, release; + if (!to_free) + return; /* Nothing to release, exit */ + spin_lock(&EXT4_I(inode)->i_block_reservation_lock); + + if (!EXT4_I(inode)->i_reserved_data_blocks) { + /* + * if there is no reserved blocks, but we try to free some + * then the counter is messed up somewhere. + * but since this function is called from invalidate + * page, it's harmless to return without any action + */ + printk(KERN_INFO "ext4 delalloc try to release %d reserved " + "blocks for inode %lu, but there is no reserved " + "data blocks\n", to_free, inode->i_ino); + spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); + return; + } + /* recalculate the number of metablocks still need to be reserved */ total = EXT4_I(inode)->i_reserved_data_blocks - to_free; mdb = ext4_calc_metadata_amount(inode, total); @@ -1570,8 +1580,8 @@ static void ext4_da_release_space(struct inode *inode, int to_free) release = to_free + mdb_free; - /* update fs free blocks counter for truncate case */ - percpu_counter_add(&sbi->s_freeblocks_counter, release); + /* update fs dirty blocks counter for truncate case */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, release); /* update per-inode reservations */ BUG_ON(to_free > EXT4_I(inode)->i_reserved_data_blocks); @@ -1613,11 +1623,14 @@ struct mpage_da_data { unsigned long first_page, next_page; /* extent of pages */ get_block_t *get_block; struct writeback_control *wbc; + int io_done; + long pages_written; + int retval; }; /* * mpage_da_submit_io - walks through extent of pages and try to write - * them with __mpage_writepage() + * them with writepage() call back * * @mpd->inode: inode * @mpd->first_page: first page of the extent @@ -1632,37 +1645,42 @@ struct mpage_da_data { static int mpage_da_submit_io(struct mpage_da_data *mpd) { struct address_space *mapping = mpd->inode->i_mapping; - struct mpage_data mpd_pp = { - .bio = NULL, - .last_block_in_bio = 0, - .get_block = mpd->get_block, - .use_writepage = 1, - }; int ret = 0, err, nr_pages, i; unsigned long index, end; struct pagevec pvec; + long pages_skipped; BUG_ON(mpd->next_page <= mpd->first_page); - pagevec_init(&pvec, 0); index = mpd->first_page; end = mpd->next_page - 1; while (index <= end) { - /* XXX: optimize tail */ - nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + /* + * We can use PAGECACHE_TAG_DIRTY lookup here because + * even though we have cleared the dirty flag on the page + * We still keep the page in the radix tree with tag + * PAGECACHE_TAG_DIRTY. See clear_page_dirty_for_io. + * The PAGECACHE_TAG_DIRTY is cleared in set_page_writeback + * which is called via the below writepage callback. + */ + nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, + min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1); if (nr_pages == 0) break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - index = page->index; - if (index > end) - break; - index++; - - err = __mpage_writepage(page, mpd->wbc, &mpd_pp); - + pages_skipped = mpd->wbc->pages_skipped; + err = mapping->a_ops->writepage(page, mpd->wbc); + if (!err && (pages_skipped == mpd->wbc->pages_skipped)) + /* + * have successfully written the page + * without skipping the same + */ + mpd->pages_written++; /* * In error case, we have to continue because * remaining pages are still locked @@ -1673,9 +1691,6 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) } pagevec_release(&pvec); } - if (mpd_pp.bio) - mpage_bio_submit(WRITE, mpd_pp.bio); - return ret; } @@ -1698,7 +1713,7 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, int blocks = exbh->b_size >> inode->i_blkbits; sector_t pblock = exbh->b_blocknr, cur_logical; struct buffer_head *head, *bh; - unsigned long index, end; + pgoff_t index, end; struct pagevec pvec; int nr_pages, i; @@ -1741,6 +1756,13 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, if (buffer_delay(bh)) { bh->b_blocknr = pblock; clear_buffer_delay(bh); + bh->b_bdev = inode->i_sb->s_bdev; + } else if (buffer_unwritten(bh)) { + bh->b_blocknr = pblock; + clear_buffer_unwritten(bh); + set_buffer_mapped(bh); + set_buffer_new(bh); + bh->b_bdev = inode->i_sb->s_bdev; } else if (buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); @@ -1768,6 +1790,57 @@ static inline void __unmap_underlying_blocks(struct inode *inode, unmap_underlying_metadata(bdev, bh->b_blocknr + i); } +static void ext4_da_block_invalidatepages(struct mpage_da_data *mpd, + sector_t logical, long blk_cnt) +{ + int nr_pages, i; + pgoff_t index, end; + struct pagevec pvec; + struct inode *inode = mpd->inode; + struct address_space *mapping = inode->i_mapping; + + index = logical >> (PAGE_CACHE_SHIFT - inode->i_blkbits); + end = (logical + blk_cnt - 1) >> + (PAGE_CACHE_SHIFT - inode->i_blkbits); + while (index <= end) { + nr_pages = pagevec_lookup(&pvec, mapping, index, PAGEVEC_SIZE); + if (nr_pages == 0) + break; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + index = page->index; + if (index > end) + break; + index++; + + BUG_ON(!PageLocked(page)); + BUG_ON(PageWriteback(page)); + block_invalidatepage(page, 0); + ClearPageUptodate(page); + unlock_page(page); + } + } + return; +} + +static void ext4_print_free_blocks(struct inode *inode) +{ + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + printk(KERN_EMERG "Total free blocks count %lld\n", + ext4_count_free_blocks(inode->i_sb)); + printk(KERN_EMERG "Free/Dirty block details\n"); + printk(KERN_EMERG "free_blocks=%lld\n", + percpu_counter_sum(&sbi->s_freeblocks_counter)); + printk(KERN_EMERG "dirty_blocks=%lld\n", + percpu_counter_sum(&sbi->s_dirtyblocks_counter)); + printk(KERN_EMERG "Block reservation details\n"); + printk(KERN_EMERG "i_reserved_data_blocks=%lu\n", + EXT4_I(inode)->i_reserved_data_blocks); + printk(KERN_EMERG "i_reserved_meta_blocks=%lu\n", + EXT4_I(inode)->i_reserved_meta_blocks); + return; +} + /* * mpage_da_map_blocks - go through given space * @@ -1776,54 +1849,87 @@ static inline void __unmap_underlying_blocks(struct inode *inode, * * The function skips space we know is already mapped to disk blocks. * - * The function ignores errors ->get_block() returns, thus real - * error handling is postponed to __mpage_writepage() */ -static void mpage_da_map_blocks(struct mpage_da_data *mpd) +static int mpage_da_map_blocks(struct mpage_da_data *mpd) { - struct buffer_head *lbh = &mpd->lbh; - int err = 0, remain = lbh->b_size; - sector_t next = lbh->b_blocknr; + int err = 0; struct buffer_head new; + struct buffer_head *lbh = &mpd->lbh; + sector_t next; /* * We consider only non-mapped and non-allocated blocks */ if (buffer_mapped(lbh) && !buffer_delay(lbh)) - return; + return 0; + new.b_state = lbh->b_state; + new.b_blocknr = 0; + new.b_size = lbh->b_size; + next = lbh->b_blocknr; + /* + * If we didn't accumulate anything + * to write simply return + */ + if (!new.b_size) + return 0; + err = mpd->get_block(mpd->inode, next, &new, 1); + if (err) { - while (remain) { - new.b_state = lbh->b_state; - new.b_blocknr = 0; - new.b_size = remain; - err = mpd->get_block(mpd->inode, next, &new, 1); - if (err) { - /* - * Rather than implement own error handling - * here, we just leave remaining blocks - * unallocated and try again with ->writepage() - */ - break; - } - BUG_ON(new.b_size == 0); + /* If get block returns with error + * we simply return. Later writepage + * will redirty the page and writepages + * will find the dirty page again + */ + if (err == -EAGAIN) + return 0; - if (buffer_new(&new)) - __unmap_underlying_blocks(mpd->inode, &new); + if (err == -ENOSPC && + ext4_count_free_blocks(mpd->inode->i_sb)) { + mpd->retval = err; + return 0; + } /* - * If blocks are delayed marked, we need to - * put actual blocknr and drop delayed bit + * get block failure will cause us + * to loop in writepages. Because + * a_ops->writepage won't be able to + * make progress. The page will be redirtied + * by writepage and writepages will again + * try to write the same. */ - if (buffer_delay(lbh)) - mpage_put_bnr_to_bhs(mpd, next, &new); - - /* go for the remaining blocks */ - next += new.b_size >> mpd->inode->i_blkbits; - remain -= new.b_size; + printk(KERN_EMERG "%s block allocation failed for inode %lu " + "at logical offset %llu with max blocks " + "%zd with error %d\n", + __func__, mpd->inode->i_ino, + (unsigned long long)next, + lbh->b_size >> mpd->inode->i_blkbits, err); + printk(KERN_EMERG "This should not happen.!! " + "Data will be lost\n"); + if (err == -ENOSPC) { + ext4_print_free_blocks(mpd->inode); + } + /* invlaidate all the pages */ + ext4_da_block_invalidatepages(mpd, next, + lbh->b_size >> mpd->inode->i_blkbits); + return err; } + BUG_ON(new.b_size == 0); + + if (buffer_new(&new)) + __unmap_underlying_blocks(mpd->inode, &new); + + /* + * If blocks are delayed marked, we need to + * put actual blocknr and drop delayed bit + */ + if (buffer_delay(lbh) || buffer_unwritten(lbh)) + mpage_put_bnr_to_bhs(mpd, next, &new); + + return 0; } -#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | (1 << BH_Delay)) +#define BH_FLAGS ((1 << BH_Uptodate) | (1 << BH_Mapped) | \ + (1 << BH_Delay) | (1 << BH_Unwritten)) /* * mpage_add_bh_to_extent - try to add one more block to extent of blocks @@ -1837,41 +1943,61 @@ static void mpage_da_map_blocks(struct mpage_da_data *mpd) static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, sector_t logical, struct buffer_head *bh) { - struct buffer_head *lbh = &mpd->lbh; sector_t next; + size_t b_size = bh->b_size; + struct buffer_head *lbh = &mpd->lbh; + int nrblocks = lbh->b_size >> mpd->inode->i_blkbits; - next = lbh->b_blocknr + (lbh->b_size >> mpd->inode->i_blkbits); - + /* check if thereserved journal credits might overflow */ + if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) { + if (nrblocks >= EXT4_MAX_TRANS_DATA) { + /* + * With non-extent format we are limited by the journal + * credit available. Total credit needed to insert + * nrblocks contiguous blocks is dependent on the + * nrblocks. So limit nrblocks. + */ + goto flush_it; + } else if ((nrblocks + (b_size >> mpd->inode->i_blkbits)) > + EXT4_MAX_TRANS_DATA) { + /* + * Adding the new buffer_head would make it cross the + * allowed limit for which we have journal credit + * reserved. So limit the new bh->b_size + */ + b_size = (EXT4_MAX_TRANS_DATA - nrblocks) << + mpd->inode->i_blkbits; + /* we will do mpage_da_submit_io in the next loop */ + } + } /* * First block in the extent */ if (lbh->b_size == 0) { lbh->b_blocknr = logical; - lbh->b_size = bh->b_size; + lbh->b_size = b_size; lbh->b_state = bh->b_state & BH_FLAGS; return; } + next = lbh->b_blocknr + nrblocks; /* * Can we merge the block to our big extent? */ if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) { - lbh->b_size += bh->b_size; + lbh->b_size += b_size; return; } +flush_it: /* * We couldn't merge the block to our extent, so we * need to flush current extent and start new one */ - mpage_da_map_blocks(mpd); - - /* - * Now start a new extent - */ - lbh->b_size = bh->b_size; - lbh->b_state = bh->b_state & BH_FLAGS; - lbh->b_blocknr = logical; + if (mpage_da_map_blocks(mpd) == 0) + mpage_da_submit_io(mpd); + mpd->io_done = 1; + return; } /* @@ -1891,17 +2017,35 @@ static int __mpage_da_writepage(struct page *page, struct buffer_head *bh, *head, fake; sector_t logical; + if (mpd->io_done) { + /* + * Rest of the page in the page_vec + * redirty then and skip then. We will + * try to to write them again after + * starting a new transaction + */ + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return MPAGE_DA_EXTENT_TAIL; + } /* * Can we merge this page to current extent? */ if (mpd->next_page != page->index) { /* * Nope, we can't. So, we map non-allocated blocks - * and start IO on them using __mpage_writepage() + * and start IO on them using writepage() */ if (mpd->next_page != mpd->first_page) { - mpage_da_map_blocks(mpd); - mpage_da_submit_io(mpd); + if (mpage_da_map_blocks(mpd) == 0) + mpage_da_submit_io(mpd); + /* + * skip rest of the page in the page_vec + */ + mpd->io_done = 1; + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return MPAGE_DA_EXTENT_TAIL; } /* @@ -1932,6 +2076,8 @@ static int __mpage_da_writepage(struct page *page, set_buffer_dirty(bh); set_buffer_uptodate(bh); mpage_add_bh_to_extent(mpd, logical, bh); + if (mpd->io_done) + return MPAGE_DA_EXTENT_TAIL; } else { /* * Page with regular buffer heads, just add all dirty ones @@ -1940,8 +2086,12 @@ static int __mpage_da_writepage(struct page *page, bh = head; do { BUG_ON(buffer_locked(bh)); - if (buffer_dirty(bh)) + if (buffer_dirty(bh) && + (!buffer_mapped(bh) || buffer_delay(bh))) { mpage_add_bh_to_extent(mpd, logical, bh); + if (mpd->io_done) + return MPAGE_DA_EXTENT_TAIL; + } logical++; } while ((bh = bh->b_this_page) != head); } @@ -1960,46 +2110,37 @@ static int __mpage_da_writepage(struct page *page, * * This is a library function, which implements the writepages() * address_space_operation. - * - * In order to avoid duplication of logic that deals with partial pages, - * multiple bio per page, etc, we find non-allocated blocks, allocate - * them with minimal calls to ->get_block() and re-use __mpage_writepage() - * - * It's important that we call __mpage_writepage() only once for each - * involved page, otherwise we'd have to implement more complicated logic - * to deal with pages w/o PG_lock or w/ PG_writeback and so on. - * - * See comments to mpage_writepages() */ static int mpage_da_writepages(struct address_space *mapping, struct writeback_control *wbc, - get_block_t get_block) + struct mpage_da_data *mpd) { - struct mpage_da_data mpd; int ret; - if (!get_block) + if (!mpd->get_block) return generic_writepages(mapping, wbc); - mpd.wbc = wbc; - mpd.inode = mapping->host; - mpd.lbh.b_size = 0; - mpd.lbh.b_state = 0; - mpd.lbh.b_blocknr = 0; - mpd.first_page = 0; - mpd.next_page = 0; - mpd.get_block = get_block; - - ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, &mpd); + mpd->lbh.b_size = 0; + mpd->lbh.b_state = 0; + mpd->lbh.b_blocknr = 0; + mpd->first_page = 0; + mpd->next_page = 0; + mpd->io_done = 0; + mpd->pages_written = 0; + mpd->retval = 0; + ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd); /* * Handle last extent of pages */ - if (mpd.next_page != mpd.first_page) { - mpage_da_map_blocks(&mpd); - mpage_da_submit_io(&mpd); - } + if (!mpd->io_done && mpd->next_page != mpd->first_page) { + if (mpage_da_map_blocks(mpd) == 0) + mpage_da_submit_io(mpd); + mpd->io_done = 1; + ret = MPAGE_DA_EXTENT_TAIL; + } + wbc->nr_to_write -= mpd->pages_written; return ret; } @@ -2052,18 +2193,24 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, handle_t *handle = NULL; handle = ext4_journal_current_handle(); - if (!handle) { - ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, - bh_result, 0, 0, 0); - BUG_ON(!ret); - } else { - ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, - bh_result, create, 0, EXT4_DELALLOC_RSVED); - } - + BUG_ON(!handle); + ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, + bh_result, create, 0, EXT4_DELALLOC_RSVED); if (ret > 0) { + bh_result->b_size = (ret << inode->i_blkbits); + if (ext4_should_order_data(inode)) { + int retval; + retval = ext4_jbd2_file_inode(handle, inode); + if (retval) + /* + * Failed to add inode for ordered + * mode. Don't update file size + */ + return retval; + } + /* * Update on-disk size along with block allocation * we don't use 'extend_disksize' as size may change @@ -2073,18 +2220,9 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, if (disksize > i_size_read(inode)) disksize = i_size_read(inode); if (disksize > EXT4_I(inode)->i_disksize) { - /* - * XXX: replace with spinlock if seen contended -bzzz - */ - down_write(&EXT4_I(inode)->i_data_sem); - if (disksize > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = disksize; - up_write(&EXT4_I(inode)->i_data_sem); - - if (EXT4_I(inode)->i_disksize == disksize) { - ret = ext4_mark_inode_dirty(handle, inode); - return ret; - } + ext4_update_i_disksize(inode, disksize); + ret = ext4_mark_inode_dirty(handle, inode); + return ret; } ret = 0; } @@ -2204,102 +2342,177 @@ static int ext4_da_writepage(struct page *page, } /* - * For now just follow the DIO way to estimate the max credits - * needed to write out EXT4_MAX_WRITEBACK_PAGES. - * todo: need to calculate the max credits need for - * extent based files, currently the DIO credits is based on - * indirect-blocks mapping way. - * - * Probably should have a generic way to calculate credits - * for DIO, writepages, and truncate + * This is called via ext4_da_writepages() to + * calulate the total number of credits to reserve to fit + * a single extent allocation into a single transaction, + * ext4_da_writpeages() will loop calling this before + * the block allocation. */ -#define EXT4_MAX_WRITEBACK_PAGES DIO_MAX_BLOCKS -#define EXT4_MAX_WRITEBACK_CREDITS DIO_CREDITS + +static int ext4_da_writepages_trans_blocks(struct inode *inode) +{ + int max_blocks = EXT4_I(inode)->i_reserved_data_blocks; + + /* + * With non-extent format the journal credit needed to + * insert nrblocks contiguous block is dependent on + * number of contiguous block. So we will limit + * number of contiguous block to a sane value + */ + if (!(inode->i_flags & EXT4_EXTENTS_FL) && + (max_blocks > EXT4_MAX_TRANS_DATA)) + max_blocks = EXT4_MAX_TRANS_DATA; + + return ext4_chunk_trans_blocks(inode, max_blocks); +} static int ext4_da_writepages(struct address_space *mapping, - struct writeback_control *wbc) + struct writeback_control *wbc) { - struct inode *inode = mapping->host; + pgoff_t index; + int range_whole = 0; handle_t *handle = NULL; - int needed_blocks; - int ret = 0; - long to_write; - loff_t range_start = 0; + struct mpage_da_data mpd; + struct inode *inode = mapping->host; + int no_nrwrite_index_update; + long pages_written = 0, pages_skipped; + int needed_blocks, ret = 0, nr_to_writebump = 0; + struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); /* * No pages to write? This is mainly a kludge to avoid starting * a transaction for special inodes like journal inode on last iput() * because that could violate lock ordering on umount */ - if (!mapping->nrpages) + if (!mapping->nrpages || !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) return 0; + /* + * Make sure nr_to_write is >= sbi->s_mb_stream_request + * This make sure small files blocks are allocated in + * single attempt. This ensure that small files + * get less fragmented. + */ + if (wbc->nr_to_write < sbi->s_mb_stream_request) { + nr_to_writebump = sbi->s_mb_stream_request - wbc->nr_to_write; + wbc->nr_to_write = sbi->s_mb_stream_request; + } + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + + if (wbc->range_cyclic) + index = mapping->writeback_index; + else + index = wbc->range_start >> PAGE_CACHE_SHIFT; + + mpd.wbc = wbc; + mpd.inode = mapping->host; /* - * Estimate the worse case needed credits to write out - * EXT4_MAX_BUF_BLOCKS pages + * we don't want write_cache_pages to update + * nr_to_write and writeback_index */ - needed_blocks = EXT4_MAX_WRITEBACK_CREDITS; + no_nrwrite_index_update = wbc->no_nrwrite_index_update; + wbc->no_nrwrite_index_update = 1; + pages_skipped = wbc->pages_skipped; + + while (!ret && wbc->nr_to_write > 0) { - to_write = wbc->nr_to_write; - if (!wbc->range_cyclic) { /* - * If range_cyclic is not set force range_cont - * and save the old writeback_index + * we insert one extent at a time. So we need + * credit needed for single extent allocation. + * journalled mode is currently not supported + * by delalloc */ - wbc->range_cont = 1; - range_start = wbc->range_start; - } + BUG_ON(ext4_should_journal_data(inode)); + needed_blocks = ext4_da_writepages_trans_blocks(inode); - while (!ret && to_write) { /* start a new transaction*/ handle = ext4_journal_start(inode, needed_blocks); if (IS_ERR(handle)) { ret = PTR_ERR(handle); + printk(KERN_EMERG "%s: jbd2_start: " + "%ld pages, ino %lu; err %d\n", __func__, + wbc->nr_to_write, inode->i_ino, ret); + dump_stack(); goto out_writepages; } - if (ext4_should_order_data(inode)) { - /* - * With ordered mode we need to add - * the inode to the journal handle - * when we do block allocation. - */ - ret = ext4_jbd2_file_inode(handle, inode); - if (ret) { - ext4_journal_stop(handle); - goto out_writepages; - } - - } - /* - * set the max dirty pages could be write at a time - * to fit into the reserved transaction credits - */ - if (wbc->nr_to_write > EXT4_MAX_WRITEBACK_PAGES) - wbc->nr_to_write = EXT4_MAX_WRITEBACK_PAGES; + mpd.get_block = ext4_da_get_block_write; + ret = mpage_da_writepages(mapping, wbc, &mpd); - to_write -= wbc->nr_to_write; - ret = mpage_da_writepages(mapping, wbc, - ext4_da_get_block_write); ext4_journal_stop(handle); - if (wbc->nr_to_write) { + + if (mpd.retval == -ENOSPC) { + /* commit the transaction which would + * free blocks released in the transaction + * and try again + */ + jbd2_journal_force_commit_nested(sbi->s_journal); + wbc->pages_skipped = pages_skipped; + ret = 0; + } else if (ret == MPAGE_DA_EXTENT_TAIL) { + /* + * got one extent now try with + * rest of the pages + */ + pages_written += mpd.pages_written; + wbc->pages_skipped = pages_skipped; + ret = 0; + } else if (wbc->nr_to_write) /* * There is no more writeout needed * or we requested for a noblocking writeout * and we found the device congested */ - to_write += wbc->nr_to_write; break; - } - wbc->nr_to_write = to_write; } + if (pages_skipped != wbc->pages_skipped) + printk(KERN_EMERG "This should not happen leaving %s " + "with nr_to_write = %ld ret = %d\n", + __func__, wbc->nr_to_write, ret); + + /* Update index */ + index += pages_written; + if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) + /* + * set the writeback_index so that range_cyclic + * mode will write it back later + */ + mapping->writeback_index = index; out_writepages: - wbc->nr_to_write = to_write; - if (range_start) - wbc->range_start = range_start; + if (!no_nrwrite_index_update) + wbc->no_nrwrite_index_update = 0; + wbc->nr_to_write -= nr_to_writebump; return ret; } +#define FALL_BACK_TO_NONDELALLOC 1 +static int ext4_nonda_switch(struct super_block *sb) +{ + s64 free_blocks, dirty_blocks; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + /* + * switch to non delalloc mode if we are running low + * on free block. The free block accounting via percpu + * counters can get slightly wrong with FBC_BATCH getting + * accumulated on each CPU without updating global counters + * Delalloc need an accurate free block accounting. So switch + * to non delalloc when we are near to error range. + */ + free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + dirty_blocks = percpu_counter_read_positive(&sbi->s_dirtyblocks_counter); + if (2 * free_blocks < 3 * dirty_blocks || + free_blocks < (dirty_blocks + EXT4_FREEBLOCKS_WATERMARK)) { + /* + * free block count is less that 150% of dirty blocks + * or free blocks is less that watermark + */ + return 1; + } + return 0; +} + static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -2315,6 +2528,12 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, from = pos & (PAGE_CACHE_SIZE - 1); to = from + len; + if (ext4_nonda_switch(inode->i_sb)) { + *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; + return ext4_write_begin(file, mapping, pos, + len, flags, pagep, fsdata); + } + *fsdata = (void *)0; retry: /* * With delayed allocation, we don't log the i_disksize update @@ -2342,6 +2561,13 @@ retry: unlock_page(page); ext4_journal_stop(handle); page_cache_release(page); + /* + * block_write_begin may have instantiated a few blocks + * outside i_size. Trim these off again. Don't need + * i_size_read because we hold i_mutex. + */ + if (pos + len > inode->i_size) + vmtruncate(inode, inode->i_size); } if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) @@ -2365,7 +2591,7 @@ static int ext4_da_should_update_i_disksize(struct page *page, bh = page_buffers(page); idx = offset >> inode->i_blkbits; - for (i=0; i < idx; i++) + for (i = 0; i < idx; i++) bh = bh->b_this_page; if (!buffer_mapped(bh) || (buffer_delay(bh))) @@ -2383,9 +2609,22 @@ static int ext4_da_write_end(struct file *file, handle_t *handle = ext4_journal_current_handle(); loff_t new_i_size; unsigned long start, end; + int write_mode = (int)(unsigned long)fsdata; + + if (write_mode == FALL_BACK_TO_NONDELALLOC) { + if (ext4_should_order_data(inode)) { + return ext4_ordered_write_end(file, mapping, pos, + len, copied, page, fsdata); + } else if (ext4_should_writeback_data(inode)) { + return ext4_writeback_write_end(file, mapping, pos, + len, copied, page, fsdata); + } else { + BUG(); + } + } start = pos & (PAGE_CACHE_SIZE - 1); - end = start + copied -1; + end = start + copied - 1; /* * generic_write_end() will run mark_inode_dirty() if i_size @@ -2409,6 +2648,11 @@ static int ext4_da_write_end(struct file *file, EXT4_I(inode)->i_disksize = new_i_size; } up_write(&EXT4_I(inode)->i_data_sem); + /* We need to mark inode dirty even if + * new_i_size is less that inode->i_size + * bu greater than i_disksize.(hint delalloc) + */ + ext4_mark_inode_dirty(handle, inode); } } ret2 = generic_write_end(file, mapping, pos, len, copied, @@ -2500,7 +2744,7 @@ static sector_t ext4_bmap(struct address_space *mapping, sector_t block) return 0; } - return generic_block_bmap(mapping,block,ext4_get_block); + return generic_block_bmap(mapping, block, ext4_get_block); } static int bget_one(handle_t *handle, struct buffer_head *bh) @@ -3106,7 +3350,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth, if (!partial->key && *partial->p) /* Writer: end */ goto no_top; - for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--) + for (p = partial; (p > chain) && all_zeroes((__le32 *) p->bh->b_data, p->p); p--) ; /* * OK, we've found the last block that must survive. The rest of our @@ -3125,7 +3369,7 @@ static Indirect *ext4_find_shared(struct inode *inode, int depth, } /* Writer: end */ - while(partial > p) { + while (partial > p) { brelse(partial->bh); partial--; } @@ -3317,9 +3561,9 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, /* This zaps the entire block. Bottom up. */ BUFFER_TRACE(bh, "free child branches"); ext4_free_branches(handle, inode, bh, - (__le32*)bh->b_data, - (__le32*)bh->b_data + addr_per_block, - depth); + (__le32 *) bh->b_data, + (__le32 *) bh->b_data + addr_per_block, + depth); /* * We've probably journalled the indirect block several @@ -3486,6 +3730,9 @@ void ext4_truncate(struct inode *inode) * modify the block allocation tree. */ down_write(&ei->i_data_sem); + + ext4_discard_preallocations(inode); + /* * The orphan list entry will now protect us from any crash which * occurs before the truncate completes, so it is now safe to propagate @@ -3555,8 +3802,6 @@ do_indirects: ; } - ext4_discard_reservation(inode); - up_write(&ei->i_data_sem); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); @@ -3581,41 +3826,6 @@ out_stop: ext4_journal_stop(handle); } -static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, - unsigned long ino, struct ext4_iloc *iloc) -{ - ext4_group_t block_group; - unsigned long offset; - ext4_fsblk_t block; - struct ext4_group_desc *gdp; - - if (!ext4_valid_inum(sb, ino)) { - /* - * This error is already checked for in namei.c unless we are - * looking at an NFS filehandle, in which case no error - * report is needed - */ - return 0; - } - - block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); - gdp = ext4_get_group_desc(sb, block_group, NULL); - if (!gdp) - return 0; - - /* - * Figure out the offset within the block group inode table - */ - offset = ((ino - 1) % EXT4_INODES_PER_GROUP(sb)) * - EXT4_INODE_SIZE(sb); - block = ext4_inode_table(sb, gdp) + - (offset >> EXT4_BLOCK_SIZE_BITS(sb)); - - iloc->block_group = block_group; - iloc->offset = offset & (EXT4_BLOCK_SIZE(sb) - 1); - return block; -} - /* * ext4_get_inode_loc returns with an extra refcount against the inode's * underlying buffer_head on success. If 'in_mem' is true, we have all @@ -3625,19 +3835,35 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, static int __ext4_get_inode_loc(struct inode *inode, struct ext4_iloc *iloc, int in_mem) { - ext4_fsblk_t block; - struct buffer_head *bh; + struct ext4_group_desc *gdp; + struct buffer_head *bh; + struct super_block *sb = inode->i_sb; + ext4_fsblk_t block; + int inodes_per_block, inode_offset; + + iloc->bh = 0; + if (!ext4_valid_inum(sb, inode->i_ino)) + return -EIO; - block = ext4_get_inode_block(inode->i_sb, inode->i_ino, iloc); - if (!block) + iloc->block_group = (inode->i_ino - 1) / EXT4_INODES_PER_GROUP(sb); + gdp = ext4_get_group_desc(sb, iloc->block_group, NULL); + if (!gdp) return -EIO; - bh = sb_getblk(inode->i_sb, block); + /* + * Figure out the offset within the block group inode table + */ + inodes_per_block = (EXT4_BLOCK_SIZE(sb) / EXT4_INODE_SIZE(sb)); + inode_offset = ((inode->i_ino - 1) % + EXT4_INODES_PER_GROUP(sb)); + block = ext4_inode_table(sb, gdp) + (inode_offset / inodes_per_block); + iloc->offset = (inode_offset % inodes_per_block) * EXT4_INODE_SIZE(sb); + + bh = sb_getblk(sb, block); if (!bh) { - ext4_error (inode->i_sb, "ext4_get_inode_loc", - "unable to read inode block - " - "inode=%lu, block=%llu", - inode->i_ino, block); + ext4_error(sb, "ext4_get_inode_loc", "unable to read " + "inode block - inode=%lu, block=%llu", + inode->i_ino, block); return -EIO; } if (!buffer_uptodate(bh)) { @@ -3665,28 +3891,12 @@ static int __ext4_get_inode_loc(struct inode *inode, */ if (in_mem) { struct buffer_head *bitmap_bh; - struct ext4_group_desc *desc; - int inodes_per_buffer; - int inode_offset, i; - ext4_group_t block_group; - int start; - - block_group = (inode->i_ino - 1) / - EXT4_INODES_PER_GROUP(inode->i_sb); - inodes_per_buffer = bh->b_size / - EXT4_INODE_SIZE(inode->i_sb); - inode_offset = ((inode->i_ino - 1) % - EXT4_INODES_PER_GROUP(inode->i_sb)); - start = inode_offset & ~(inodes_per_buffer - 1); + int i, start; - /* Is the inode bitmap in cache? */ - desc = ext4_get_group_desc(inode->i_sb, - block_group, NULL); - if (!desc) - goto make_io; + start = inode_offset & ~(inodes_per_block - 1); - bitmap_bh = sb_getblk(inode->i_sb, - ext4_inode_bitmap(inode->i_sb, desc)); + /* Is the inode bitmap in cache? */ + bitmap_bh = sb_getblk(sb, ext4_inode_bitmap(sb, gdp)); if (!bitmap_bh) goto make_io; @@ -3699,14 +3909,14 @@ static int __ext4_get_inode_loc(struct inode *inode, brelse(bitmap_bh); goto make_io; } - for (i = start; i < start + inodes_per_buffer; i++) { + for (i = start; i < start + inodes_per_block; i++) { if (i == inode_offset) continue; if (ext4_test_bit(i, bitmap_bh->b_data)) break; } brelse(bitmap_bh); - if (i == start + inodes_per_buffer) { + if (i == start + inodes_per_block) { /* all other inodes are free, so skip I/O */ memset(bh->b_data, 0, bh->b_size); set_buffer_uptodate(bh); @@ -3717,6 +3927,36 @@ static int __ext4_get_inode_loc(struct inode *inode, make_io: /* + * If we need to do any I/O, try to pre-readahead extra + * blocks from the inode table. + */ + if (EXT4_SB(sb)->s_inode_readahead_blks) { + ext4_fsblk_t b, end, table; + unsigned num; + + table = ext4_inode_table(sb, gdp); + /* Make sure s_inode_readahead_blks is a power of 2 */ + while (EXT4_SB(sb)->s_inode_readahead_blks & + (EXT4_SB(sb)->s_inode_readahead_blks-1)) + EXT4_SB(sb)->s_inode_readahead_blks = + (EXT4_SB(sb)->s_inode_readahead_blks & + (EXT4_SB(sb)->s_inode_readahead_blks-1)); + b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1); + if (table > b) + b = table; + end = b + EXT4_SB(sb)->s_inode_readahead_blks; + num = EXT4_INODES_PER_GROUP(sb); + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) + num -= le16_to_cpu(gdp->bg_itable_unused); + table += num / inodes_per_block; + if (end > table) + end = table; + while (b <= end) + sb_breadahead(sb, b++); + } + + /* * There are other valid inodes in the buffer, this inode * has in-inode xattrs, or we don't have this inode in memory. * Read the block from disk. @@ -3726,10 +3966,9 @@ make_io: submit_bh(READ_META, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - ext4_error(inode->i_sb, "ext4_get_inode_loc", - "unable to read inode block - " - "inode=%lu, block=%llu", - inode->i_ino, block); + ext4_error(sb, __func__, + "unable to read inode block - inode=%lu, " + "block=%llu", inode->i_ino, block); brelse(bh); return -EIO; } @@ -3821,11 +4060,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) return inode; ei = EXT4_I(inode); -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL ei->i_acl = EXT4_ACL_NOT_CACHED; ei->i_default_acl = EXT4_ACL_NOT_CACHED; #endif - ei->i_block_alloc_info = NULL; ret = __ext4_get_inode_loc(inode, &iloc, 0); if (ret < 0) @@ -3835,7 +4073,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(raw_inode->i_mode); inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); - if(!(test_opt (inode->i_sb, NO_UID32))) { + if (!(test_opt(inode->i_sb, NO_UID32))) { inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } @@ -3853,7 +4091,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (inode->i_mode == 0 || !(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_ORPHAN_FS)) { /* this inode is deleted */ - brelse (bh); + brelse(bh); ret = -ESTALE; goto bad_inode; } @@ -3886,7 +4124,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > EXT4_INODE_SIZE(inode->i_sb)) { - brelse (bh); + brelse(bh); ret = -EIO; goto bad_inode; } @@ -3939,7 +4177,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) init_special_inode(inode, inode->i_mode, new_decode_dev(le32_to_cpu(raw_inode->i_block[1]))); } - brelse (iloc.bh); + brelse(iloc.bh); ext4_set_inode_flags(inode); unlock_new_inode(inode); return inode; @@ -3956,7 +4194,6 @@ static int ext4_inode_blocks_set(handle_t *handle, struct inode *inode = &(ei->vfs_inode); u64 i_blocks = inode->i_blocks; struct super_block *sb = inode->i_sb; - int err = 0; if (i_blocks <= ~0U) { /* @@ -3966,36 +4203,27 @@ static int ext4_inode_blocks_set(handle_t *handle, raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = 0; ei->i_flags &= ~EXT4_HUGE_FILE_FL; - } else if (i_blocks <= 0xffffffffffffULL) { + return 0; + } + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) + return -EFBIG; + + if (i_blocks <= 0xffffffffffffULL) { /* * i_blocks can be represented in a 48 bit variable * as multiple of 512 bytes */ - err = ext4_update_rocompat_feature(handle, sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); - if (err) - goto err_out; - /* i_block is stored in the split 48 bit fields */ raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); ei->i_flags &= ~EXT4_HUGE_FILE_FL; } else { - /* - * i_blocks should be represented in a 48 bit variable - * as multiple of file system block size - */ - err = ext4_update_rocompat_feature(handle, sb, - EXT4_FEATURE_RO_COMPAT_HUGE_FILE); - if (err) - goto err_out; ei->i_flags |= EXT4_HUGE_FILE_FL; /* i_block is stored in file system block size */ i_blocks = i_blocks >> (inode->i_blkbits - 9); raw_inode->i_blocks_lo = cpu_to_le32(i_blocks); raw_inode->i_blocks_high = cpu_to_le16(i_blocks >> 32); } -err_out: - return err; + return 0; } /* @@ -4021,14 +4249,14 @@ static int ext4_do_update_inode(handle_t *handle, ext4_get_inode_flags(ei); raw_inode->i_mode = cpu_to_le16(inode->i_mode); - if(!(test_opt(inode->i_sb, NO_UID32))) { + if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid)); /* * Fix up interoperability with old kernels. Otherwise, old inodes get * re-used with the upper 16 bits of the uid/gid intact */ - if(!ei->i_dtime) { + if (!ei->i_dtime) { raw_inode->i_uid_high = cpu_to_le16(high_16_bits(inode->i_uid)); raw_inode->i_gid_high = @@ -4116,7 +4344,7 @@ static int ext4_do_update_inode(handle_t *handle, ei->i_state &= ~EXT4_STATE_NEW; out_brelse: - brelse (bh); + brelse(bh); ext4_std_error(inode->i_sb, err); return err; } @@ -4324,57 +4552,129 @@ int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } +static int ext4_indirect_trans_blocks(struct inode *inode, int nrblocks, + int chunk) +{ + int indirects; + + /* if nrblocks are contiguous */ + if (chunk) { + /* + * With N contiguous data blocks, it need at most + * N/EXT4_ADDR_PER_BLOCK(inode->i_sb) indirect blocks + * 2 dindirect blocks + * 1 tindirect block + */ + indirects = nrblocks / EXT4_ADDR_PER_BLOCK(inode->i_sb); + return indirects + 3; + } + /* + * if nrblocks are not contiguous, worse case, each block touch + * a indirect block, and each indirect block touch a double indirect + * block, plus a triple indirect block + */ + indirects = nrblocks * 2 + 1; + return indirects; +} + +static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) + return ext4_indirect_trans_blocks(inode, nrblocks, 0); + return ext4_ext_index_trans_blocks(inode, nrblocks, 0); +} /* - * How many blocks doth make a writepage()? - * - * With N blocks per page, it may be: - * N data blocks - * 2 indirect block - * 2 dindirect - * 1 tindirect - * N+5 bitmap blocks (from the above) - * N+5 group descriptor summary blocks - * 1 inode block - * 1 superblock. - * 2 * EXT4_SINGLEDATA_TRANS_BLOCKS for the quote files + * Account for index blocks, block groups bitmaps and block group + * descriptor blocks if modify datablocks and index blocks + * worse case, the indexs blocks spread over different block groups * - * 3 * (N + 5) + 2 + 2 * EXT4_SINGLEDATA_TRANS_BLOCKS + * If datablocks are discontiguous, they are possible to spread over + * different block groups too. If they are contiugous, with flexbg, + * they could still across block group boundary. * - * With ordered or writeback data it's the same, less the N data blocks. + * Also account for superblock, inode, quota and xattr blocks + */ +int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) +{ + int groups, gdpblocks; + int idxblocks; + int ret = 0; + + /* + * How many index blocks need to touch to modify nrblocks? + * The "Chunk" flag indicating whether the nrblocks is + * physically contiguous on disk + * + * For Direct IO and fallocate, they calls get_block to allocate + * one single extent at a time, so they could set the "Chunk" flag + */ + idxblocks = ext4_index_trans_blocks(inode, nrblocks, chunk); + + ret = idxblocks; + + /* + * Now let's see how many group bitmaps and group descriptors need + * to account + */ + groups = idxblocks; + if (chunk) + groups += 1; + else + groups += nrblocks; + + gdpblocks = groups; + if (groups > EXT4_SB(inode->i_sb)->s_groups_count) + groups = EXT4_SB(inode->i_sb)->s_groups_count; + if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) + gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; + + /* bitmaps and block group descriptor blocks */ + ret += groups + gdpblocks; + + /* Blocks for super block, inode, quota and xattr blocks */ + ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); + + return ret; +} + +/* + * Calulate the total number of credits to reserve to fit + * the modification of a single pages into a single transaction, + * which may include multiple chunks of block allocations. * - * If the inode's direct blocks can hold an integral number of pages then a - * page cannot straddle two indirect blocks, and we can only touch one indirect - * and dindirect block, and the "5" above becomes "3". + * This could be called via ext4_write_begin() * - * This still overestimates under most circumstances. If we were to pass the - * start and end offsets in here as well we could do block_to_path() on each - * block and work out the exact number of indirects which are touched. Pah. + * We need to consider the worse case, when + * one new block per extent. */ - int ext4_writepage_trans_blocks(struct inode *inode) { int bpp = ext4_journal_blocks_per_page(inode); - int indirects = (EXT4_NDIR_BLOCKS % bpp) ? 5 : 3; int ret; - if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) - return ext4_ext_writepage_trans_blocks(inode, bpp); + ret = ext4_meta_trans_blocks(inode, bpp, 0); + /* Account for data blocks for journalled mode */ if (ext4_should_journal_data(inode)) - ret = 3 * (bpp + indirects) + 2; - else - ret = 2 * (bpp + indirects) + 2; - -#ifdef CONFIG_QUOTA - /* We know that structure was already allocated during DQUOT_INIT so - * we will be updating only the data blocks + inodes */ - ret += 2*EXT4_QUOTA_TRANS_BLOCKS(inode->i_sb); -#endif - + ret += bpp; return ret; } /* + * Calculate the journal credits for a chunk of data modification. + * + * This is called from DIO, fallocate or whoever calling + * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks. + * + * journal buffers for data blocks are not included here, as DIO + * and fallocate do no need to journal data buffers. + */ +int ext4_chunk_trans_blocks(struct inode *inode, int nrblocks) +{ + return ext4_meta_trans_blocks(inode, nrblocks, 1); +} + +/* * The caller must have previously called ext4_reserve_inode_write(). * Give this, we know that the caller already has write access to iloc->bh. */ @@ -4647,6 +4947,7 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) loff_t size; unsigned long len; int ret = -EINVAL; + void *fsdata; struct file *file = vma->vm_file; struct inode *inode = file->f_path.dentry->d_inode; struct address_space *mapping = inode->i_mapping; @@ -4685,11 +4986,11 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct page *page) * on the same page though */ ret = mapping->a_ops->write_begin(file, mapping, page_offset(page), - len, AOP_FLAG_UNINTERRUPTIBLE, &page, NULL); + len, AOP_FLAG_UNINTERRUPTIBLE, &page, &fsdata); if (ret < 0) goto out_unlock; ret = mapping->a_ops->write_end(file, mapping, page_offset(page), - len, len, page, NULL); + len, len, page, fsdata); if (ret < 0) goto out_unlock; ret = 0; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7a6c2f1faba..dc99b4776d5 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -23,9 +23,8 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) struct inode *inode = filp->f_dentry->d_inode; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; - unsigned short rsv_window_size; - ext4_debug ("cmd = %u, arg = %lu\n", cmd, arg); + ext4_debug("cmd = %u, arg = %lu\n", cmd, arg); switch (cmd) { case EXT4_IOC_GETFLAGS: @@ -34,7 +33,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return put_user(flags, (int __user *) arg); case EXT4_IOC_SETFLAGS: { handle_t *handle = NULL; - int err; + int err, migrate = 0; struct ext4_iloc iloc; unsigned int oldflags; unsigned int jflag; @@ -82,6 +81,17 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_RESOURCE)) goto flags_out; } + if (oldflags & EXT4_EXTENTS_FL) { + /* We don't support clearning extent flags */ + if (!(flags & EXT4_EXTENTS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (flags & EXT4_EXTENTS_FL) { + /* migrate the file */ + migrate = 1; + flags &= ~EXT4_EXTENTS_FL; + } handle = ext4_journal_start(inode, 1); if (IS_ERR(handle)) { @@ -109,6 +119,10 @@ flags_err: if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) err = ext4_change_inode_journal_flag(inode, jflag); + if (err) + goto flags_out; + if (migrate) + err = ext4_ext_migrate(inode); flags_out: mutex_unlock(&inode->i_mutex); mnt_drop_write(filp->f_path.mnt); @@ -175,53 +189,10 @@ setversion_out: return ret; } #endif - case EXT4_IOC_GETRSVSZ: - if (test_opt(inode->i_sb, RESERVATION) - && S_ISREG(inode->i_mode) - && ei->i_block_alloc_info) { - rsv_window_size = ei->i_block_alloc_info->rsv_window_node.rsv_goal_size; - return put_user(rsv_window_size, (int __user *)arg); - } - return -ENOTTY; - case EXT4_IOC_SETRSVSZ: { - int err; - - if (!test_opt(inode->i_sb, RESERVATION) ||!S_ISREG(inode->i_mode)) - return -ENOTTY; - - if (!is_owner_or_cap(inode)) - return -EACCES; - - if (get_user(rsv_window_size, (int __user *)arg)) - return -EFAULT; - - err = mnt_want_write(filp->f_path.mnt); - if (err) - return err; - - if (rsv_window_size > EXT4_MAX_RESERVE_BLOCKS) - rsv_window_size = EXT4_MAX_RESERVE_BLOCKS; - - /* - * need to allocate reservation structure for this inode - * before set the window size - */ - down_write(&ei->i_data_sem); - if (!ei->i_block_alloc_info) - ext4_init_block_alloc_info(inode); - - if (ei->i_block_alloc_info){ - struct ext4_reserve_window_node *rsv = &ei->i_block_alloc_info->rsv_window_node; - rsv->rsv_goal_size = rsv_window_size; - } - up_write(&ei->i_data_sem); - mnt_drop_write(filp->f_path.mnt); - return 0; - } case EXT4_IOC_GROUP_EXTEND: { ext4_fsblk_t n_blocks_count; struct super_block *sb = inode->i_sb; - int err; + int err, err2; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -235,8 +206,10 @@ setversion_out: err = ext4_group_extend(sb, EXT4_SB(sb)->s_es, n_blocks_count); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - jbd2_journal_flush(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (err == 0) + err = err2; mnt_drop_write(filp->f_path.mnt); return err; @@ -244,7 +217,7 @@ setversion_out: case EXT4_IOC_GROUP_ADD: { struct ext4_new_group_data input; struct super_block *sb = inode->i_sb; - int err; + int err, err2; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; @@ -259,15 +232,36 @@ setversion_out: err = ext4_group_add(sb, &input); jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - jbd2_journal_flush(EXT4_SB(sb)->s_journal); + err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (err == 0) + err = err2; mnt_drop_write(filp->f_path.mnt); return err; } case EXT4_IOC_MIGRATE: - return ext4_ext_migrate(inode, filp, cmd, arg); + { + int err; + if (!is_owner_or_cap(inode)) + return -EACCES; + + err = mnt_want_write(filp->f_path.mnt); + if (err) + return err; + /* + * inode_mutex prevent write and truncate on the file. + * Read still goes through. We take i_data_sem in + * ext4_ext_swap_inode_data before we switch the + * inode format to prevent read. + */ + mutex_lock(&(inode->i_mutex)); + err = ext4_ext_migrate(inode); + mutex_unlock(&(inode->i_mutex)); + mnt_drop_write(filp->f_path.mnt); + return err; + } default: return -ENOTTY; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 865e9ddb44d..dfe17a13405 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -477,9 +477,10 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) b2 = (unsigned char *) bitmap; for (i = 0; i < e4b->bd_sb->s_blocksize; i++) { if (b1[i] != b2[i]) { - printk("corruption in group %lu at byte %u(%u):" - " %x in copy != %x on disk/prealloc\n", - e4b->bd_group, i, i * 8, b1[i], b2[i]); + printk(KERN_ERR "corruption in group %lu " + "at byte %u(%u): %x in copy != %x " + "on disk/prealloc\n", + e4b->bd_group, i, i * 8, b1[i], b2[i]); BUG(); } } @@ -533,9 +534,6 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, void *buddy; void *buddy2; - if (!test_opt(sb, MBALLOC)) - return 0; - { static int mb_check_counter; if (mb_check_counter++ % 100 != 0) @@ -784,9 +782,11 @@ static int ext4_mb_init_cache(struct page *page, char *incore) if (bh[i] == NULL) goto out; - if (bh_uptodate_or_lock(bh[i])) + if (buffer_uptodate(bh[i]) && + !(desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) continue; + lock_buffer(bh[i]); spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh[i], @@ -2169,9 +2169,10 @@ static void ext4_mb_history_release(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); - remove_proc_entry("mb_groups", sbi->s_mb_proc); - remove_proc_entry("mb_history", sbi->s_mb_proc); - + if (sbi->s_proc != NULL) { + remove_proc_entry("mb_groups", sbi->s_proc); + remove_proc_entry("mb_history", sbi->s_proc); + } kfree(sbi->s_mb_history); } @@ -2180,10 +2181,10 @@ static void ext4_mb_history_init(struct super_block *sb) struct ext4_sb_info *sbi = EXT4_SB(sb); int i; - if (sbi->s_mb_proc != NULL) { - proc_create_data("mb_history", S_IRUGO, sbi->s_mb_proc, + if (sbi->s_proc != NULL) { + proc_create_data("mb_history", S_IRUGO, sbi->s_proc, &ext4_mb_seq_history_fops, sb); - proc_create_data("mb_groups", S_IRUGO, sbi->s_mb_proc, + proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_fops, sb); } @@ -2299,6 +2300,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, } INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); + meta_group_info[i]->bb_free_root.rb_node = NULL;; #ifdef DOUBLE_CHECK { @@ -2485,19 +2487,14 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) unsigned max; int ret; - if (!test_opt(sb, MBALLOC)) - return 0; - i = (sb->s_blocksize_bits + 2) * sizeof(unsigned short); sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { - clear_opt(sbi->s_mount_opt, MBALLOC); return -ENOMEM; } sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { - clear_opt(sbi->s_mount_opt, MBALLOC); kfree(sbi->s_mb_maxs); return -ENOMEM; } @@ -2520,16 +2517,12 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) /* init file for buddy data */ ret = ext4_mb_init_backend(sb); if (ret != 0) { - clear_opt(sbi->s_mount_opt, MBALLOC); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); return ret; } spin_lock_init(&sbi->s_md_lock); - INIT_LIST_HEAD(&sbi->s_active_transaction); - INIT_LIST_HEAD(&sbi->s_closed_transaction); - INIT_LIST_HEAD(&sbi->s_committed_transaction); spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_max_to_scan = MB_DEFAULT_MAX_TO_SCAN; @@ -2540,17 +2533,15 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) sbi->s_mb_history_filter = EXT4_MB_HISTORY_DEFAULT; sbi->s_mb_group_prealloc = MB_DEFAULT_GROUP_PREALLOC; - i = sizeof(struct ext4_locality_group) * nr_cpu_ids; - sbi->s_locality_groups = kmalloc(i, GFP_KERNEL); + sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group); if (sbi->s_locality_groups == NULL) { - clear_opt(sbi->s_mount_opt, MBALLOC); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); return -ENOMEM; } - for (i = 0; i < nr_cpu_ids; i++) { + for_each_possible_cpu(i) { struct ext4_locality_group *lg; - lg = &sbi->s_locality_groups[i]; + lg = per_cpu_ptr(sbi->s_locality_groups, i); mutex_init(&lg->lg_mutex); for (j = 0; j < PREALLOC_TB_SIZE; j++) INIT_LIST_HEAD(&lg->lg_prealloc_list[j]); @@ -2560,7 +2551,9 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) ext4_mb_init_per_dev_proc(sb); ext4_mb_history_init(sb); - printk("EXT4-fs: mballoc enabled\n"); + sbi->s_journal->j_commit_callback = release_blocks_on_commit; + + printk(KERN_INFO "EXT4-fs: mballoc enabled\n"); return 0; } @@ -2575,7 +2568,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); list_del(&pa->pa_group_list); count++; - kfree(pa); + kmem_cache_free(ext4_pspace_cachep, pa); } if (count) mb_debug("mballoc: %u PAs left\n", count); @@ -2589,18 +2582,6 @@ int ext4_mb_release(struct super_block *sb) struct ext4_group_info *grinfo; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (!test_opt(sb, MBALLOC)) - return 0; - - /* release freed, non-committed blocks */ - spin_lock(&sbi->s_md_lock); - list_splice_init(&sbi->s_closed_transaction, - &sbi->s_committed_transaction); - list_splice_init(&sbi->s_active_transaction, - &sbi->s_committed_transaction); - spin_unlock(&sbi->s_md_lock); - ext4_mb_free_committed_blocks(sb); - if (sbi->s_group_info) { for (i = 0; i < sbi->s_groups_count; i++) { grinfo = ext4_get_group_info(sb, i); @@ -2647,69 +2628,64 @@ int ext4_mb_release(struct super_block *sb) atomic_read(&sbi->s_mb_discarded)); } - kfree(sbi->s_locality_groups); - + free_percpu(sbi->s_locality_groups); ext4_mb_history_release(sb); ext4_mb_destroy_per_dev_proc(sb); return 0; } -static noinline_for_stack void -ext4_mb_free_committed_blocks(struct super_block *sb) +/* + * This function is called by the jbd2 layer once the commit has finished, + * so we know we can free the blocks that were released with that commit. + */ +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) { - struct ext4_sb_info *sbi = EXT4_SB(sb); - int err; - int i; - int count = 0; - int count2 = 0; - struct ext4_free_metadata *md; + struct super_block *sb = journal->j_private; struct ext4_buddy e4b; + struct ext4_group_info *db; + int err, count = 0, count2 = 0; + struct ext4_free_data *entry; + ext4_fsblk_t discard_block; + struct list_head *l, *ltmp; - if (list_empty(&sbi->s_committed_transaction)) - return; - - /* there is committed blocks to be freed yet */ - do { - /* get next array of blocks */ - md = NULL; - spin_lock(&sbi->s_md_lock); - if (!list_empty(&sbi->s_committed_transaction)) { - md = list_entry(sbi->s_committed_transaction.next, - struct ext4_free_metadata, list); - list_del(&md->list); - } - spin_unlock(&sbi->s_md_lock); - - if (md == NULL) - break; + list_for_each_safe(l, ltmp, &txn->t_private_list) { + entry = list_entry(l, struct ext4_free_data, list); mb_debug("gonna free %u blocks in group %lu (0x%p):", - md->num, md->group, md); + entry->count, entry->group, entry); - err = ext4_mb_load_buddy(sb, md->group, &e4b); + err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ BUG_ON(err != 0); + db = e4b.bd_info; /* there are blocks to put in buddy to make them really free */ - count += md->num; + count += entry->count; count2++; - ext4_lock_group(sb, md->group); - for (i = 0; i < md->num; i++) { - mb_debug(" %u", md->blocks[i]); - mb_free_blocks(NULL, &e4b, md->blocks[i], 1); + ext4_lock_group(sb, entry->group); + /* Take it out of per group rb tree */ + rb_erase(&entry->node, &(db->bb_free_root)); + mb_free_blocks(NULL, &e4b, entry->start_blk, entry->count); + + if (!db->bb_free_root.rb_node) { + /* No more items in the per group rb tree + * balance refcounts from ext4_mb_free_metadata() + */ + page_cache_release(e4b.bd_buddy_page); + page_cache_release(e4b.bd_bitmap_page); } - mb_debug("\n"); - ext4_unlock_group(sb, md->group); - - /* balance refcounts from ext4_mb_free_metadata() */ - page_cache_release(e4b.bd_buddy_page); - page_cache_release(e4b.bd_bitmap_page); - - kfree(md); + ext4_unlock_group(sb, entry->group); + discard_block = (ext4_fsblk_t) entry->group * EXT4_BLOCKS_PER_GROUP(sb) + + entry->start_blk + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + trace_mark(ext4_discard_blocks, "dev %s blk %llu count %u", sb->s_id, + (unsigned long long) discard_block, entry->count); + sb_issue_discard(sb, discard_block, entry->count); + + kmem_cache_free(ext4_free_ext_cachep, entry); ext4_mb_release_desc(&e4b); - - } while (md); + } mb_debug("freed %u blocks in %u structures\n", count, count2); } @@ -2721,119 +2697,52 @@ ext4_mb_free_committed_blocks(struct super_block *sb) #define EXT4_MB_STREAM_REQ "stream_req" #define EXT4_MB_GROUP_PREALLOC "group_prealloc" - - -#define MB_PROC_FOPS(name) \ -static int ext4_mb_##name##_proc_show(struct seq_file *m, void *v) \ -{ \ - struct ext4_sb_info *sbi = m->private; \ - \ - seq_printf(m, "%ld\n", sbi->s_mb_##name); \ - return 0; \ -} \ - \ -static int ext4_mb_##name##_proc_open(struct inode *inode, struct file *file)\ -{ \ - return single_open(file, ext4_mb_##name##_proc_show, PDE(inode)->data);\ -} \ - \ -static ssize_t ext4_mb_##name##_proc_write(struct file *file, \ - const char __user *buf, size_t cnt, loff_t *ppos) \ -{ \ - struct ext4_sb_info *sbi = PDE(file->f_path.dentry->d_inode)->data;\ - char str[32]; \ - long value; \ - if (cnt >= sizeof(str)) \ - return -EINVAL; \ - if (copy_from_user(str, buf, cnt)) \ - return -EFAULT; \ - value = simple_strtol(str, NULL, 0); \ - if (value <= 0) \ - return -ERANGE; \ - sbi->s_mb_##name = value; \ - return cnt; \ -} \ - \ -static const struct file_operations ext4_mb_##name##_proc_fops = { \ - .owner = THIS_MODULE, \ - .open = ext4_mb_##name##_proc_open, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ - .write = ext4_mb_##name##_proc_write, \ -}; - -MB_PROC_FOPS(stats); -MB_PROC_FOPS(max_to_scan); -MB_PROC_FOPS(min_to_scan); -MB_PROC_FOPS(order2_reqs); -MB_PROC_FOPS(stream_request); -MB_PROC_FOPS(group_prealloc); - -#define MB_PROC_HANDLER(name, var) \ -do { \ - proc = proc_create_data(name, mode, sbi->s_mb_proc, \ - &ext4_mb_##var##_proc_fops, sbi); \ - if (proc == NULL) { \ - printk(KERN_ERR "EXT4-fs: can't to create %s\n", name); \ - goto err_out; \ - } \ -} while (0) - static int ext4_mb_init_per_dev_proc(struct super_block *sb) { +#ifdef CONFIG_PROC_FS mode_t mode = S_IFREG | S_IRUGO | S_IWUSR; struct ext4_sb_info *sbi = EXT4_SB(sb); struct proc_dir_entry *proc; - char devname[64]; - if (proc_root_ext4 == NULL) { - sbi->s_mb_proc = NULL; + if (sbi->s_proc == NULL) return -EINVAL; - } - bdevname(sb->s_bdev, devname); - sbi->s_mb_proc = proc_mkdir(devname, proc_root_ext4); - - MB_PROC_HANDLER(EXT4_MB_STATS_NAME, stats); - MB_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, max_to_scan); - MB_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, min_to_scan); - MB_PROC_HANDLER(EXT4_MB_ORDER2_REQ, order2_reqs); - MB_PROC_HANDLER(EXT4_MB_STREAM_REQ, stream_request); - MB_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, group_prealloc); + EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats); + EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan); + EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan); + EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs); + EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request); + EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc); return 0; err_out: - printk(KERN_ERR "EXT4-fs: Unable to create %s\n", devname); - remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc); - remove_proc_entry(devname, proc_root_ext4); - sbi->s_mb_proc = NULL; - + remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); + remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); return -ENOMEM; +#else + return 0; +#endif } static int ext4_mb_destroy_per_dev_proc(struct super_block *sb) { +#ifdef CONFIG_PROC_FS struct ext4_sb_info *sbi = EXT4_SB(sb); - char devname[64]; - if (sbi->s_mb_proc == NULL) + if (sbi->s_proc == NULL) return -EINVAL; - bdevname(sb->s_bdev, devname); - remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_mb_proc); - remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_mb_proc); - remove_proc_entry(devname, proc_root_ext4); - + remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc); + remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc); + remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc); + remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc); +#endif return 0; } @@ -2854,11 +2763,16 @@ int __init init_ext4_mballoc(void) kmem_cache_destroy(ext4_pspace_cachep); return -ENOMEM; } -#ifdef CONFIG_PROC_FS - proc_root_ext4 = proc_mkdir("fs/ext4", NULL); - if (proc_root_ext4 == NULL) - printk(KERN_ERR "EXT4-fs: Unable to create fs/ext4\n"); -#endif + + ext4_free_ext_cachep = + kmem_cache_create("ext4_free_block_extents", + sizeof(struct ext4_free_data), + 0, SLAB_RECLAIM_ACCOUNT, NULL); + if (ext4_free_ext_cachep == NULL) { + kmem_cache_destroy(ext4_pspace_cachep); + kmem_cache_destroy(ext4_ac_cachep); + return -ENOMEM; + } return 0; } @@ -2867,9 +2781,7 @@ void exit_ext4_mballoc(void) /* XXX: synchronize_rcu(); */ kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); -#ifdef CONFIG_PROC_FS - remove_proc_entry("fs/ext4", NULL); -#endif + kmem_cache_destroy(ext4_free_ext_cachep); } @@ -2879,7 +2791,7 @@ void exit_ext4_mballoc(void) */ static noinline_for_stack int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, - handle_t *handle) + handle_t *handle, unsigned long reserv_blks) { struct buffer_head *bitmap_bh = NULL; struct ext4_super_block *es; @@ -2968,15 +2880,16 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); - + percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); /* - * free blocks account has already be reduced/reserved - * at write_begin() time for delayed allocation - * do not double accounting + * Now reduce the dirty block count also. Should not go negative */ if (!(ac->ac_flags & EXT4_MB_DELALLOC_RESERVED)) - percpu_counter_sub(&sbi->s_freeblocks_counter, - ac->ac_b_ex.fe_len); + /* release all the reserved blocks if non delalloc */ + percpu_counter_sub(&sbi->s_dirtyblocks_counter, reserv_blks); + else + percpu_counter_sub(&sbi->s_dirtyblocks_counter, + ac->ac_b_ex.fe_len); if (sbi->s_log_groups_per_flex) { ext4_group_t flex_group = ext4_flex_group(sbi, @@ -3282,6 +3195,35 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, } /* + * Return the prealloc space that have minimal distance + * from the goal block. @cpa is the prealloc + * space that is having currently known minimal distance + * from the goal block. + */ +static struct ext4_prealloc_space * +ext4_mb_check_group_pa(ext4_fsblk_t goal_block, + struct ext4_prealloc_space *pa, + struct ext4_prealloc_space *cpa) +{ + ext4_fsblk_t cur_distance, new_distance; + + if (cpa == NULL) { + atomic_inc(&pa->pa_count); + return pa; + } + cur_distance = abs(goal_block - cpa->pa_pstart); + new_distance = abs(goal_block - pa->pa_pstart); + + if (cur_distance < new_distance) + return cpa; + + /* drop the previous reference */ + atomic_dec(&cpa->pa_count); + atomic_inc(&pa->pa_count); + return pa; +} + +/* * search goal blocks in preallocated space */ static noinline_for_stack int @@ -3290,7 +3232,8 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) int order, i; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; - struct ext4_prealloc_space *pa; + struct ext4_prealloc_space *pa, *cpa = NULL; + ext4_fsblk_t goal_block; /* only data can be preallocated */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) @@ -3333,6 +3276,13 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* The max size of hash table is PREALLOC_TB_SIZE */ order = PREALLOC_TB_SIZE - 1; + goal_block = ac->ac_g_ex.fe_group * EXT4_BLOCKS_PER_GROUP(ac->ac_sb) + + ac->ac_g_ex.fe_start + + le32_to_cpu(EXT4_SB(ac->ac_sb)->s_es->s_first_data_block); + /* + * search for the prealloc space that is having + * minimal distance from the goal block. + */ for (i = order; i < PREALLOC_TB_SIZE; i++) { rcu_read_lock(); list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i], @@ -3340,17 +3290,19 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) { - atomic_inc(&pa->pa_count); - ext4_mb_use_group_pa(ac, pa); - spin_unlock(&pa->pa_lock); - ac->ac_criteria = 20; - rcu_read_unlock(); - return 1; + + cpa = ext4_mb_check_group_pa(goal_block, + pa, cpa); } spin_unlock(&pa->pa_lock); } rcu_read_unlock(); } + if (cpa) { + ext4_mb_use_group_pa(ac, cpa); + ac->ac_criteria = 20; + return 1; + } return 0; } @@ -3845,7 +3797,7 @@ out: * * FIXME!! Make sure it is valid at all the call sites */ -void ext4_mb_discard_inode_preallocations(struct inode *inode) +void ext4_discard_preallocations(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); struct super_block *sb = inode->i_sb; @@ -3857,7 +3809,7 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode) struct ext4_buddy e4b; int err; - if (!test_opt(sb, MBALLOC) || !S_ISREG(inode->i_mode)) { + if (!S_ISREG(inode->i_mode)) { /*BUG_ON(!list_empty(&ei->i_prealloc_list));*/ return; } @@ -4055,8 +4007,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) * per cpu locality group is to reduce the contention between block * request from multiple CPUs. */ - ac->ac_lg = &sbi->s_locality_groups[get_cpu()]; - put_cpu(); + ac->ac_lg = per_cpu_ptr(sbi->s_locality_groups, raw_smp_processor_id()); /* we're going to use group allocation */ ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; @@ -4330,33 +4281,32 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct ext4_allocation_request *ar, int *errp) { + int freed; struct ext4_allocation_context *ac = NULL; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block = 0; - int freed; - int inquota; + unsigned long inquota; + unsigned long reserv_blks = 0; sb = ar->inode->i_sb; sbi = EXT4_SB(sb); - if (!test_opt(sb, MBALLOC)) { - block = ext4_old_new_blocks(handle, ar->inode, ar->goal, - &(ar->len), errp); - return block; - } if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) { /* * With delalloc we already reserved the blocks */ - ar->len = ext4_has_free_blocks(sbi, ar->len); - } - - if (ar->len == 0) { - *errp = -ENOSPC; - return 0; + while (ar->len && ext4_claim_free_blocks(sbi, ar->len)) { + /* let others to free the space */ + yield(); + ar->len = ar->len >> 1; + } + if (!ar->len) { + *errp = -ENOSPC; + return 0; + } + reserv_blks = ar->len; } - while (ar->len && DQUOT_ALLOC_BLOCK(ar->inode, ar->len)) { ar->flags |= EXT4_MB_HINT_NOPREALLOC; ar->len--; @@ -4377,8 +4327,6 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, goto out1; } - ext4_mb_poll_new_transaction(sb, handle); - *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; @@ -4402,7 +4350,7 @@ repeat: } if (likely(ac->ac_status == AC_STATUS_FOUND)) { - *errp = ext4_mb_mark_diskspace_used(ac, handle); + *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); if (*errp == -EAGAIN) { ac->ac_b_ex.fe_group = 0; ac->ac_b_ex.fe_start = 0; @@ -4437,35 +4385,20 @@ out1: return block; } -static void ext4_mb_poll_new_transaction(struct super_block *sb, - handle_t *handle) -{ - struct ext4_sb_info *sbi = EXT4_SB(sb); - if (sbi->s_last_transaction == handle->h_transaction->t_tid) - return; - - /* new transaction! time to close last one and free blocks for - * committed transaction. we know that only transaction can be - * active, so previos transaction can be being logged and we - * know that transaction before previous is known to be already - * logged. this means that now we may free blocks freed in all - * transactions before previous one. hope I'm clear enough ... */ - - spin_lock(&sbi->s_md_lock); - if (sbi->s_last_transaction != handle->h_transaction->t_tid) { - mb_debug("new transaction %lu, old %lu\n", - (unsigned long) handle->h_transaction->t_tid, - (unsigned long) sbi->s_last_transaction); - list_splice_init(&sbi->s_closed_transaction, - &sbi->s_committed_transaction); - list_splice_init(&sbi->s_active_transaction, - &sbi->s_closed_transaction); - sbi->s_last_transaction = handle->h_transaction->t_tid; - } - spin_unlock(&sbi->s_md_lock); - - ext4_mb_free_committed_blocks(sb); +/* + * We can merge two free data extents only if the physical blocks + * are contiguous, AND the extents were freed by the same transaction, + * AND the blocks are associated with the same group. + */ +static int can_merge(struct ext4_free_data *entry1, + struct ext4_free_data *entry2) +{ + if ((entry1->t_tid == entry2->t_tid) && + (entry1->group == entry2->group) && + ((entry1->start_blk + entry1->count) == entry2->start_blk)) + return 1; + return 0; } static noinline_for_stack int @@ -4475,57 +4408,80 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_group_info *db = e4b->bd_info; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_free_metadata *md; - int i; + struct ext4_free_data *entry, *new_entry; + struct rb_node **n = &db->bb_free_root.rb_node, *node; + struct rb_node *parent = NULL, *new_node; + BUG_ON(e4b->bd_bitmap_page == NULL); BUG_ON(e4b->bd_buddy_page == NULL); + new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS); + new_entry->start_blk = block; + new_entry->group = group; + new_entry->count = count; + new_entry->t_tid = handle->h_transaction->t_tid; + new_node = &new_entry->node; + ext4_lock_group(sb, group); - for (i = 0; i < count; i++) { - md = db->bb_md_cur; - if (md && db->bb_tid != handle->h_transaction->t_tid) { - db->bb_md_cur = NULL; - md = NULL; + if (!*n) { + /* first free block exent. We need to + protect buddy cache from being freed, + * otherwise we'll refresh it from + * on-disk bitmap and lose not-yet-available + * blocks */ + page_cache_get(e4b->bd_buddy_page); + page_cache_get(e4b->bd_bitmap_page); + } + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_free_data, node); + if (block < entry->start_blk) + n = &(*n)->rb_left; + else if (block >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else { + ext4_error(sb, __func__, + "Double free of blocks %d (%d %d)\n", + block, entry->start_blk, entry->count); + return 0; } + } - if (md == NULL) { - ext4_unlock_group(sb, group); - md = kmalloc(sizeof(*md), GFP_NOFS); - if (md == NULL) - return -ENOMEM; - md->num = 0; - md->group = group; - - ext4_lock_group(sb, group); - if (db->bb_md_cur == NULL) { - spin_lock(&sbi->s_md_lock); - list_add(&md->list, &sbi->s_active_transaction); - spin_unlock(&sbi->s_md_lock); - /* protect buddy cache from being freed, - * otherwise we'll refresh it from - * on-disk bitmap and lose not-yet-available - * blocks */ - page_cache_get(e4b->bd_buddy_page); - page_cache_get(e4b->bd_bitmap_page); - db->bb_md_cur = md; - db->bb_tid = handle->h_transaction->t_tid; - mb_debug("new md 0x%p for group %lu\n", - md, md->group); - } else { - kfree(md); - md = db->bb_md_cur; - } + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &db->bb_free_root); + + /* Now try to see the extent can be merged to left and right */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); } + } - BUG_ON(md->num >= EXT4_BB_MAX_BLOCKS); - md->blocks[md->num] = block + i; - md->num++; - if (md->num == EXT4_BB_MAX_BLOCKS) { - /* no more space, put full container on a sb's list */ - db->bb_md_cur = NULL; + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_free_data, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &(db->bb_free_root)); + spin_lock(&sbi->s_md_lock); + list_del(&entry->list); + spin_unlock(&sbi->s_md_lock); + kmem_cache_free(ext4_free_ext_cachep, entry); } } + /* Add the extent to transaction's private list */ + spin_lock(&sbi->s_md_lock); + list_add(&new_entry->list, &handle->h_transaction->t_private_list); + spin_unlock(&sbi->s_md_lock); ext4_unlock_group(sb, group); return 0; } @@ -4553,8 +4509,6 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, *freed = 0; - ext4_mb_poll_new_transaction(sb, handle); - sbi = EXT4_SB(sb); es = EXT4_SB(sb)->s_es; if (block < le32_to_cpu(es->s_first_data_block) || diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index c7c9906c2a7..b5dff1fff1e 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -18,6 +18,8 @@ #include <linux/pagemap.h> #include <linux/seq_file.h> #include <linux/version.h> +#include <linux/blkdev.h> +#include <linux/marker.h> #include "ext4_jbd2.h" #include "ext4.h" #include "group.h" @@ -98,23 +100,29 @@ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; +static struct kmem_cache *ext4_free_ext_cachep; -#ifdef EXT4_BB_MAX_BLOCKS -#undef EXT4_BB_MAX_BLOCKS -#endif -#define EXT4_BB_MAX_BLOCKS 30 +struct ext4_free_data { + /* this links the free block information from group_info */ + struct rb_node node; -struct ext4_free_metadata { - ext4_group_t group; - unsigned short num; - ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; + /* this links the free block information from ext4_sb_info */ struct list_head list; + + /* group which free block extent belongs */ + ext4_group_t group; + + /* free block extent */ + ext4_grpblk_t start_blk; + ext4_grpblk_t count; + + /* transaction which freed this extent */ + tid_t t_tid; }; struct ext4_group_info { unsigned long bb_state; - unsigned long bb_tid; - struct ext4_free_metadata *bb_md_cur; + struct rb_root bb_free_root; unsigned short bb_first_free; unsigned short bb_free; unsigned short bb_fragments; @@ -257,13 +265,10 @@ static void ext4_mb_store_history(struct ext4_allocation_context *ac); #define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) -static struct proc_dir_entry *proc_root_ext4; struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); -static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *); -static void ext4_mb_free_committed_blocks(struct super_block *); static void ext4_mb_return_to_preallocation(struct inode *inode, struct ext4_buddy *e4b, sector_t block, int count); @@ -271,6 +276,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *, struct super_block *, struct ext4_prealloc_space *pa); static int ext4_mb_init_per_dev_proc(struct super_block *sb); static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); +static void release_blocks_on_commit(journal_t *journal, transaction_t *txn); static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index b9e077ba07e..f2a9cf498ec 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -53,7 +53,8 @@ static int finish_range(handle_t *handle, struct inode *inode, * credit. But below we try to not accumalate too much * of them by restarting the journal. */ - needed = ext4_ext_calc_credits_for_insert(inode, path); + needed = ext4_ext_calc_credits_for_single_extent(inode, + lb->last_block - lb->first_block + 1, path); /* * Make sure the credit we accumalated is not really high @@ -446,8 +447,7 @@ static int free_ext_block(handle_t *handle, struct inode *inode) } -int ext4_ext_migrate(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) +int ext4_ext_migrate(struct inode *inode) { handle_t *handle; int retval = 0, i; @@ -515,12 +515,6 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp, * when we add extents we extent the journal */ /* - * inode_mutex prevent write and truncate on the file. Read still goes - * through. We take i_data_sem in ext4_ext_swap_inode_data before we - * switch the inode format to prevent read. - */ - mutex_lock(&(inode->i_mutex)); - /* * Even though we take i_mutex we can still cause block allocation * via mmap write to holes. If we have allocated new blocks we fail * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. @@ -622,7 +616,6 @@ err_out: tmp_inode->i_nlink = 0; ext4_journal_stop(handle); - mutex_unlock(&(inode->i_mutex)); if (tmp_inode) iput(tmp_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 387ad98350c..92db9e94514 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -151,34 +151,36 @@ struct dx_map_entry static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); -static inline unsigned dx_get_hash (struct dx_entry *entry); -static void dx_set_hash (struct dx_entry *entry, unsigned value); -static unsigned dx_get_count (struct dx_entry *entries); -static unsigned dx_get_limit (struct dx_entry *entries); -static void dx_set_count (struct dx_entry *entries, unsigned value); -static void dx_set_limit (struct dx_entry *entries, unsigned value); -static unsigned dx_root_limit (struct inode *dir, unsigned infosize); -static unsigned dx_node_limit (struct inode *dir); -static struct dx_frame *dx_probe(struct dentry *dentry, +static inline unsigned dx_get_hash(struct dx_entry *entry); +static void dx_set_hash(struct dx_entry *entry, unsigned value); +static unsigned dx_get_count(struct dx_entry *entries); +static unsigned dx_get_limit(struct dx_entry *entries); +static void dx_set_count(struct dx_entry *entries, unsigned value); +static void dx_set_limit(struct dx_entry *entries, unsigned value); +static unsigned dx_root_limit(struct inode *dir, unsigned infosize); +static unsigned dx_node_limit(struct inode *dir); +static struct dx_frame *dx_probe(const struct qstr *d_name, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame, int *err); -static void dx_release (struct dx_frame *frames); -static int dx_make_map (struct ext4_dir_entry_2 *de, int size, - struct dx_hash_info *hinfo, struct dx_map_entry map[]); +static void dx_release(struct dx_frame *frames); +static int dx_make_map(struct ext4_dir_entry_2 *de, int size, + struct dx_hash_info *hinfo, struct dx_map_entry map[]); static void dx_sort_map(struct dx_map_entry *map, unsigned count); -static struct ext4_dir_entry_2 *dx_move_dirents (char *from, char *to, +static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, struct dx_map_entry *offsets, int count); -static struct ext4_dir_entry_2* dx_pack_dirents (char *base, int size); +static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size); static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block); static int ext4_htree_next_block(struct inode *dir, __u32 hash, struct dx_frame *frame, struct dx_frame *frames, __u32 *start_hash); -static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, - struct ext4_dir_entry_2 **res_dir, int *err); +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, + const struct qstr *d_name, + struct ext4_dir_entry_2 **res_dir, + int *err); static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); @@ -207,44 +209,44 @@ static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) entry->block = cpu_to_le32(value); } -static inline unsigned dx_get_hash (struct dx_entry *entry) +static inline unsigned dx_get_hash(struct dx_entry *entry) { return le32_to_cpu(entry->hash); } -static inline void dx_set_hash (struct dx_entry *entry, unsigned value) +static inline void dx_set_hash(struct dx_entry *entry, unsigned value) { entry->hash = cpu_to_le32(value); } -static inline unsigned dx_get_count (struct dx_entry *entries) +static inline unsigned dx_get_count(struct dx_entry *entries) { return le16_to_cpu(((struct dx_countlimit *) entries)->count); } -static inline unsigned dx_get_limit (struct dx_entry *entries) +static inline unsigned dx_get_limit(struct dx_entry *entries) { return le16_to_cpu(((struct dx_countlimit *) entries)->limit); } -static inline void dx_set_count (struct dx_entry *entries, unsigned value) +static inline void dx_set_count(struct dx_entry *entries, unsigned value) { ((struct dx_countlimit *) entries)->count = cpu_to_le16(value); } -static inline void dx_set_limit (struct dx_entry *entries, unsigned value) +static inline void dx_set_limit(struct dx_entry *entries, unsigned value) { ((struct dx_countlimit *) entries)->limit = cpu_to_le16(value); } -static inline unsigned dx_root_limit (struct inode *dir, unsigned infosize) +static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - EXT4_DIR_REC_LEN(2) - infosize; return entry_space / sizeof(struct dx_entry); } -static inline unsigned dx_node_limit (struct inode *dir) +static inline unsigned dx_node_limit(struct inode *dir) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); return entry_space / sizeof(struct dx_entry); @@ -254,12 +256,12 @@ static inline unsigned dx_node_limit (struct inode *dir) * Debug */ #ifdef DX_DEBUG -static void dx_show_index (char * label, struct dx_entry *entries) +static void dx_show_index(char * label, struct dx_entry *entries) { int i, n = dx_get_count (entries); - printk("%s index ", label); + printk(KERN_DEBUG "%s index ", label); for (i = 0; i < n; i++) { - printk("%x->%lu ", i? dx_get_hash(entries + i) : + printk("%x->%lu ", i ? dx_get_hash(entries + i) : 0, (unsigned long)dx_get_block(entries + i)); } printk("\n"); @@ -306,7 +308,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, struct dx_entry *entries, int levels) { unsigned blocksize = dir->i_sb->s_blocksize; - unsigned count = dx_get_count (entries), names = 0, space = 0, i; + unsigned count = dx_get_count(entries), names = 0, space = 0, i; unsigned bcount = 0; struct buffer_head *bh; int err; @@ -325,11 +327,12 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, names += stats.names; space += stats.space; bcount += stats.bcount; - brelse (bh); + brelse(bh); } if (bcount) - printk("%snames %u, fullness %u (%u%%)\n", levels?"":" ", - names, space/bcount,(space/bcount)*100/blocksize); + printk(KERN_DEBUG "%snames %u, fullness %u (%u%%)\n", + levels ? "" : " ", names, space/bcount, + (space/bcount)*100/blocksize); return (struct stats) { names, space, bcount}; } #endif /* DX_DEBUG */ @@ -344,7 +347,7 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, * back to userspace. */ static struct dx_frame * -dx_probe(struct dentry *dentry, struct inode *dir, +dx_probe(const struct qstr *d_name, struct inode *dir, struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) { unsigned count, indirect; @@ -355,8 +358,6 @@ dx_probe(struct dentry *dentry, struct inode *dir, u32 hash; frame->bh = NULL; - if (dentry) - dir = dentry->d_parent->d_inode; if (!(bh = ext4_bread (NULL,dir, 0, 0, err))) goto fail; root = (struct dx_root *) bh->b_data; @@ -372,8 +373,8 @@ dx_probe(struct dentry *dentry, struct inode *dir, } hinfo->hash_version = root->info.hash_version; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; - if (dentry) - ext4fs_dirhash(dentry->d_name.name, dentry->d_name.len, hinfo); + if (d_name) + ext4fs_dirhash(d_name->name, d_name->len, hinfo); hash = hinfo->hash; if (root->info.unused_flags & 1) { @@ -406,7 +407,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, goto fail; } - dxtrace (printk("Look up %x", hash)); + dxtrace(printk("Look up %x", hash)); while (1) { count = dx_get_count(entries); @@ -555,7 +556,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, 0, &err))) return err; /* Failure */ p++; - brelse (p->bh); + brelse(p->bh); p->bh = bh; p->at = p->entries = ((struct dx_node *) bh->b_data)->entries; } @@ -593,7 +594,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, /* On error, skip the f_pos to the next block. */ dir_file->f_pos = (dir_file->f_pos | (dir->i_sb->s_blocksize - 1)) + 1; - brelse (bh); + brelse(bh); return count; } ext4fs_dirhash(de->name, de->name_len, hinfo); @@ -635,8 +636,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, int ret, err; __u32 hashval; - dxtrace(printk("In htree_fill_tree, start hash: %x:%x\n", start_hash, - start_minor_hash)); + dxtrace(printk(KERN_DEBUG "In htree_fill_tree, start hash: %x:%x\n", + start_hash, start_minor_hash)); dir = dir_file->f_path.dentry->d_inode; if (!(EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) { hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; @@ -648,7 +649,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, } hinfo.hash = start_hash; hinfo.minor_hash = 0; - frame = dx_probe(NULL, dir_file->f_path.dentry->d_inode, &hinfo, frames, &err); + frame = dx_probe(NULL, dir, &hinfo, frames, &err); if (!frame) return err; @@ -694,8 +695,8 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, break; } dx_release(frames); - dxtrace(printk("Fill tree: returned %d entries, next hash: %x\n", - count, *next_hash)); + dxtrace(printk(KERN_DEBUG "Fill tree: returned %d entries, " + "next hash: %x\n", count, *next_hash)); return count; errout: dx_release(frames); @@ -802,17 +803,17 @@ static inline int ext4_match (int len, const char * const name, /* * Returns 0 if not found, -1 on failure, and 1 on success */ -static inline int search_dirblock(struct buffer_head * bh, +static inline int search_dirblock(struct buffer_head *bh, struct inode *dir, - struct dentry *dentry, + const struct qstr *d_name, unsigned long offset, struct ext4_dir_entry_2 ** res_dir) { struct ext4_dir_entry_2 * de; char * dlimit; int de_len; - const char *name = dentry->d_name.name; - int namelen = dentry->d_name.len; + const char *name = d_name->name; + int namelen = d_name->len; de = (struct ext4_dir_entry_2 *) bh->b_data; dlimit = bh->b_data + dir->i_sb->s_blocksize; @@ -851,12 +852,13 @@ static inline int search_dirblock(struct buffer_head * bh, * The returned buffer_head has ->b_count elevated. The caller is expected * to brelse() it when appropriate. */ -static struct buffer_head * ext4_find_entry (struct dentry *dentry, +static struct buffer_head * ext4_find_entry (struct inode *dir, + const struct qstr *d_name, struct ext4_dir_entry_2 ** res_dir) { - struct super_block * sb; - struct buffer_head * bh_use[NAMEI_RA_SIZE]; - struct buffer_head * bh, *ret = NULL; + struct super_block *sb; + struct buffer_head *bh_use[NAMEI_RA_SIZE]; + struct buffer_head *bh, *ret = NULL; ext4_lblk_t start, block, b; int ra_max = 0; /* Number of bh's in the readahead buffer, bh_use[] */ @@ -865,16 +867,15 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry, int num = 0; ext4_lblk_t nblocks; int i, err; - struct inode *dir = dentry->d_parent->d_inode; int namelen; *res_dir = NULL; sb = dir->i_sb; - namelen = dentry->d_name.len; + namelen = d_name->len; if (namelen > EXT4_NAME_LEN) return NULL; if (is_dx(dir)) { - bh = ext4_dx_find_entry(dentry, res_dir, &err); + bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the @@ -882,7 +883,8 @@ static struct buffer_head * ext4_find_entry (struct dentry *dentry, */ if (bh || (err != ERR_BAD_DX_DIR)) return bh; - dxtrace(printk("ext4_find_entry: dx failed, falling back\n")); + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); start = EXT4_I(dir)->i_dir_start_lookup; @@ -926,7 +928,7 @@ restart: brelse(bh); goto next; } - i = search_dirblock(bh, dir, dentry, + i = search_dirblock(bh, dir, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); if (i == 1) { EXT4_I(dir)->i_dir_start_lookup = block; @@ -956,11 +958,11 @@ restart: cleanup_and_exit: /* Clean up the read-ahead blocks */ for (; ra_ptr < ra_max; ra_ptr++) - brelse (bh_use[ra_ptr]); + brelse(bh_use[ra_ptr]); return ret; } -static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, +static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, struct ext4_dir_entry_2 **res_dir, int *err) { struct super_block * sb; @@ -971,14 +973,13 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, struct buffer_head *bh; ext4_lblk_t block; int retval; - int namelen = dentry->d_name.len; - const u8 *name = dentry->d_name.name; - struct inode *dir = dentry->d_parent->d_inode; + int namelen = d_name->len; + const u8 *name = d_name->name; sb = dir->i_sb; /* NFS may look up ".." - look at dx_root directory block */ if (namelen > 2 || name[0] != '.'||(name[1] != '.' && name[1] != '\0')){ - if (!(frame = dx_probe(dentry, NULL, &hinfo, frames, err))) + if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) return NULL; } else { frame = frames; @@ -1010,7 +1011,7 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, return bh; } } - brelse (bh); + brelse(bh); /* Check to see if we should continue to search */ retval = ext4_htree_next_block(dir, hash, frame, frames, NULL); @@ -1025,25 +1026,25 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, *err = -ENOENT; errout: - dxtrace(printk("%s not found\n", name)); + dxtrace(printk(KERN_DEBUG "%s not found\n", name)); dx_release (frames); return NULL; } -static struct dentry *ext4_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) +static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { - struct inode * inode; - struct ext4_dir_entry_2 * de; - struct buffer_head * bh; + struct inode *inode; + struct ext4_dir_entry_2 *de; + struct buffer_head *bh; if (dentry->d_name.len > EXT4_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); - bh = ext4_find_entry(dentry, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de); inode = NULL; if (bh) { unsigned long ino = le32_to_cpu(de->inode); - brelse (bh); + brelse(bh); if (!ext4_valid_inum(dir->i_sb, ino)) { ext4_error(dir->i_sb, "ext4_lookup", "bad inode number: %lu", ino); @@ -1062,15 +1063,14 @@ struct dentry *ext4_get_parent(struct dentry *child) unsigned long ino; struct dentry *parent; struct inode *inode; - struct dentry dotdot; + static const struct qstr dotdot = { + .name = "..", + .len = 2, + }; struct ext4_dir_entry_2 * de; struct buffer_head *bh; - dotdot.d_name.name = ".."; - dotdot.d_name.len = 2; - dotdot.d_parent = child; /* confusing, isn't it! */ - - bh = ext4_find_entry(&dotdot, &de); + bh = ext4_find_entry(child->d_inode, &dotdot, &de); inode = NULL; if (!bh) return ERR_PTR(-ENOENT); @@ -1201,10 +1201,10 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, /* create map in the end of data2 block */ map = (struct dx_map_entry *) (data2 + blocksize); - count = dx_make_map ((struct ext4_dir_entry_2 *) data1, + count = dx_make_map((struct ext4_dir_entry_2 *) data1, blocksize, hinfo, map); map -= count; - dx_sort_map (map, count); + dx_sort_map(map, count); /* Split the existing block in the middle, size-wise */ size = 0; move = 0; @@ -1225,7 +1225,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, /* Fancy dance to stay within two buffers */ de2 = dx_move_dirents(data1, data2, map + split, count - split); - de = dx_pack_dirents(data1,blocksize); + de = dx_pack_dirents(data1, blocksize); de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de); de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2); dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1)); @@ -1237,15 +1237,15 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, swap(*bh, bh2); de = de2; } - dx_insert_block (frame, hash2 + continued, newblock); - err = ext4_journal_dirty_metadata (handle, bh2); + dx_insert_block(frame, hash2 + continued, newblock); + err = ext4_journal_dirty_metadata(handle, bh2); if (err) goto journal_error; - err = ext4_journal_dirty_metadata (handle, frame->bh); + err = ext4_journal_dirty_metadata(handle, frame->bh); if (err) goto journal_error; - brelse (bh2); - dxtrace(dx_show_index ("frame", frame->entries)); + brelse(bh2); + dxtrace(dx_show_index("frame", frame->entries)); return de; journal_error: @@ -1271,7 +1271,7 @@ errout: */ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, struct inode *inode, struct ext4_dir_entry_2 *de, - struct buffer_head * bh) + struct buffer_head *bh) { struct inode *dir = dentry->d_parent->d_inode; const char *name = dentry->d_name.name; @@ -1288,11 +1288,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, while ((char *) de <= top) { if (!ext4_check_dir_entry("ext4_add_entry", dir, de, bh, offset)) { - brelse (bh); + brelse(bh); return -EIO; } - if (ext4_match (namelen, name, de)) { - brelse (bh); + if (ext4_match(namelen, name, de)) { + brelse(bh); return -EEXIST; } nlen = EXT4_DIR_REC_LEN(de->name_len); @@ -1329,7 +1329,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, } else de->inode = 0; de->name_len = namelen; - memcpy (de->name, name, namelen); + memcpy(de->name, name, namelen); /* * XXX shouldn't update any times until successful * completion of syscall, but too many callers depend @@ -1377,7 +1377,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct fake_dirent *fde; blocksize = dir->i_sb->s_blocksize; - dxtrace(printk("Creating index\n")); + dxtrace(printk(KERN_DEBUG "Creating index\n")); retval = ext4_journal_get_write_access(handle, bh); if (retval) { ext4_std_error(dir->i_sb, retval); @@ -1386,7 +1386,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, } root = (struct dx_root *) bh->b_data; - bh2 = ext4_append (handle, dir, &block, &retval); + bh2 = ext4_append(handle, dir, &block, &retval); if (!(bh2)) { brelse(bh); return retval; @@ -1412,9 +1412,9 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, root->info.info_length = sizeof(root->info); root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; entries = root->entries; - dx_set_block (entries, 1); - dx_set_count (entries, 1); - dx_set_limit (entries, dx_root_limit(dir, sizeof(root->info))); + dx_set_block(entries, 1); + dx_set_count(entries, 1); + dx_set_limit(entries, dx_root_limit(dir, sizeof(root->info))); /* Initialize as for dx_probe */ hinfo.hash_version = root->info.hash_version; @@ -1443,14 +1443,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, * may not sleep between calling this and putting something into * the entry, as someone else might have used it while you slept. */ -static int ext4_add_entry (handle_t *handle, struct dentry *dentry, - struct inode *inode) +static int ext4_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode) { struct inode *dir = dentry->d_parent->d_inode; unsigned long offset; - struct buffer_head * bh; + struct buffer_head *bh; struct ext4_dir_entry_2 *de; - struct super_block * sb; + struct super_block *sb; int retval; int dx_fallback=0; unsigned blocksize; @@ -1500,13 +1500,13 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct dx_frame frames[2], *frame; struct dx_entry *entries, *at; struct dx_hash_info hinfo; - struct buffer_head * bh; + struct buffer_head *bh; struct inode *dir = dentry->d_parent->d_inode; - struct super_block * sb = dir->i_sb; + struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int err; - frame = dx_probe(dentry, NULL, &hinfo, frames, &err); + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); if (!frame) return err; entries = frame->entries; @@ -1527,7 +1527,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, } /* Block full, should compress but for now just split */ - dxtrace(printk("using %u of %u node entries\n", + dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", dx_get_count(entries), dx_get_limit(entries))); /* Need to split index? */ if (dx_get_count(entries) == dx_get_limit(entries)) { @@ -1559,7 +1559,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (levels) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); - dxtrace(printk("Split index %i/%i\n", icount1, icount2)); + dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", + icount1, icount2)); BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ err = ext4_journal_get_write_access(handle, @@ -1567,11 +1568,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (err) goto journal_error; - memcpy ((char *) entries2, (char *) (entries + icount1), - icount2 * sizeof(struct dx_entry)); - dx_set_count (entries, icount1); - dx_set_count (entries2, icount2); - dx_set_limit (entries2, dx_node_limit(dir)); + memcpy((char *) entries2, (char *) (entries + icount1), + icount2 * sizeof(struct dx_entry)); + dx_set_count(entries, icount1); + dx_set_count(entries2, icount2); + dx_set_limit(entries2, dx_node_limit(dir)); /* Which index block gets the new entry? */ if (at - entries >= icount1) { @@ -1579,16 +1580,17 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, frame->entries = entries = entries2; swap(frame->bh, bh2); } - dx_insert_block (frames + 0, hash2, newblock); - dxtrace(dx_show_index ("node", frames[1].entries)); - dxtrace(dx_show_index ("node", + dx_insert_block(frames + 0, hash2, newblock); + dxtrace(dx_show_index("node", frames[1].entries)); + dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); err = ext4_journal_dirty_metadata(handle, bh2); if (err) goto journal_error; brelse (bh2); } else { - dxtrace(printk("Creating second level index...\n")); + dxtrace(printk(KERN_DEBUG + "Creating second level index...\n")); memcpy((char *) entries2, (char *) entries, icount * sizeof(struct dx_entry)); dx_set_limit(entries2, dx_node_limit(dir)); @@ -1630,12 +1632,12 @@ cleanup: * ext4_delete_entry deletes a directory entry by merging it with the * previous entry */ -static int ext4_delete_entry (handle_t *handle, - struct inode * dir, - struct ext4_dir_entry_2 * de_del, - struct buffer_head * bh) +static int ext4_delete_entry(handle_t *handle, + struct inode *dir, + struct ext4_dir_entry_2 *de_del, + struct buffer_head *bh) { - struct ext4_dir_entry_2 * de, * pde; + struct ext4_dir_entry_2 *de, *pde; int i; i = 0; @@ -1716,11 +1718,11 @@ static int ext4_add_nondir(handle_t *handle, * If the create succeeds, we fill in the inode information * with d_instantiate(). */ -static int ext4_create (struct inode * dir, struct dentry * dentry, int mode, - struct nameidata *nd) +static int ext4_create(struct inode *dir, struct dentry *dentry, int mode, + struct nameidata *nd) { handle_t *handle; - struct inode * inode; + struct inode *inode; int err, retries = 0; retry: @@ -1747,8 +1749,8 @@ retry: return err; } -static int ext4_mknod (struct inode * dir, struct dentry *dentry, - int mode, dev_t rdev) +static int ext4_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) { handle_t *handle; struct inode *inode; @@ -1767,11 +1769,11 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext4_new_inode (handle, dir, mode); + inode = ext4_new_inode(handle, dir, mode); err = PTR_ERR(inode); if (!IS_ERR(inode)) { init_special_inode(inode, inode->i_mode, rdev); -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR inode->i_op = &ext4_special_inode_operations; #endif err = ext4_add_nondir(handle, dentry, inode); @@ -1782,12 +1784,12 @@ retry: return err; } -static int ext4_mkdir(struct inode * dir, struct dentry * dentry, int mode) +static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode) { handle_t *handle; - struct inode * inode; - struct buffer_head * dir_block; - struct ext4_dir_entry_2 * de; + struct inode *inode; + struct buffer_head *dir_block; + struct ext4_dir_entry_2 *de; int err, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) @@ -1803,7 +1805,7 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext4_new_inode (handle, dir, S_IFDIR | mode); + inode = ext4_new_inode(handle, dir, S_IFDIR | mode); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; @@ -1811,7 +1813,7 @@ retry: inode->i_op = &ext4_dir_inode_operations; inode->i_fop = &ext4_dir_operations; inode->i_size = EXT4_I(inode)->i_disksize = inode->i_sb->s_blocksize; - dir_block = ext4_bread (handle, inode, 0, 1, &err); + dir_block = ext4_bread(handle, inode, 0, 1, &err); if (!dir_block) goto out_clear_inode; BUFFER_TRACE(dir_block, "get_write_access"); @@ -1820,26 +1822,26 @@ retry: de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len)); - strcpy (de->name, "."); + strcpy(de->name, "."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); de = ext4_next_entry(de); de->inode = cpu_to_le32(dir->i_ino); de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1)); de->name_len = 2; - strcpy (de->name, ".."); + strcpy(de->name, ".."); ext4_set_de_type(dir->i_sb, de, S_IFDIR); inode->i_nlink = 2; BUFFER_TRACE(dir_block, "call ext4_journal_dirty_metadata"); ext4_journal_dirty_metadata(handle, dir_block); - brelse (dir_block); + brelse(dir_block); ext4_mark_inode_dirty(handle, inode); - err = ext4_add_entry (handle, dentry, inode); + err = ext4_add_entry(handle, dentry, inode); if (err) { out_clear_inode: clear_nlink(inode); ext4_mark_inode_dirty(handle, inode); - iput (inode); + iput(inode); goto out_stop; } ext4_inc_count(handle, dir); @@ -1856,17 +1858,17 @@ out_stop: /* * routine to check that the specified directory is empty (for rmdir) */ -static int empty_dir (struct inode * inode) +static int empty_dir(struct inode *inode) { unsigned long offset; - struct buffer_head * bh; - struct ext4_dir_entry_2 * de, * de1; - struct super_block * sb; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de, *de1; + struct super_block *sb; int err = 0; sb = inode->i_sb; if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || - !(bh = ext4_bread (NULL, inode, 0, 0, &err))) { + !(bh = ext4_bread(NULL, inode, 0, 0, &err))) { if (err) ext4_error(inode->i_sb, __func__, "error %d reading directory #%lu offset 0", @@ -1881,23 +1883,23 @@ static int empty_dir (struct inode * inode) de1 = ext4_next_entry(de); if (le32_to_cpu(de->inode) != inode->i_ino || !le32_to_cpu(de1->inode) || - strcmp (".", de->name) || - strcmp ("..", de1->name)) { - ext4_warning (inode->i_sb, "empty_dir", - "bad directory (dir #%lu) - no `.' or `..'", - inode->i_ino); - brelse (bh); + strcmp(".", de->name) || + strcmp("..", de1->name)) { + ext4_warning(inode->i_sb, "empty_dir", + "bad directory (dir #%lu) - no `.' or `..'", + inode->i_ino); + brelse(bh); return 1; } offset = ext4_rec_len_from_disk(de->rec_len) + ext4_rec_len_from_disk(de1->rec_len); de = ext4_next_entry(de1); - while (offset < inode->i_size ) { + while (offset < inode->i_size) { if (!bh || (void *) de >= (void *) (bh->b_data+sb->s_blocksize)) { err = 0; - brelse (bh); - bh = ext4_bread (NULL, inode, + brelse(bh); + bh = ext4_bread(NULL, inode, offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); if (!bh) { if (err) @@ -1917,13 +1919,13 @@ static int empty_dir (struct inode * inode) continue; } if (le32_to_cpu(de->inode)) { - brelse (bh); + brelse(bh); return 0; } offset += ext4_rec_len_from_disk(de->rec_len); de = ext4_next_entry(de); } - brelse (bh); + brelse(bh); return 1; } @@ -1954,8 +1956,8 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) * ->i_nlink. For, say it, character device. Not a regular file, * not a directory, not a symlink and ->i_nlink > 0. */ - J_ASSERT ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); + J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); BUFFER_TRACE(EXT4_SB(sb)->s_sbh, "get_write_access"); err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh); @@ -2069,12 +2071,12 @@ out_brelse: goto out_err; } -static int ext4_rmdir (struct inode * dir, struct dentry *dentry) +static int ext4_rmdir(struct inode *dir, struct dentry *dentry) { int retval; - struct inode * inode; - struct buffer_head * bh; - struct ext4_dir_entry_2 * de; + struct inode *inode; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; handle_t *handle; /* Initialize quotas before so that eventual writes go in @@ -2085,7 +2087,7 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry) return PTR_ERR(handle); retval = -ENOENT; - bh = ext4_find_entry (dentry, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de); if (!bh) goto end_rmdir; @@ -2099,16 +2101,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry) goto end_rmdir; retval = -ENOTEMPTY; - if (!empty_dir (inode)) + if (!empty_dir(inode)) goto end_rmdir; retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_rmdir; if (!EXT4_DIR_LINK_EMPTY(inode)) - ext4_warning (inode->i_sb, "ext4_rmdir", - "empty directory has too many links (%d)", - inode->i_nlink); + ext4_warning(inode->i_sb, "ext4_rmdir", + "empty directory has too many links (%d)", + inode->i_nlink); inode->i_version++; clear_nlink(inode); /* There's no need to set i_disksize: the fact that i_nlink is @@ -2124,16 +2126,16 @@ static int ext4_rmdir (struct inode * dir, struct dentry *dentry) end_rmdir: ext4_journal_stop(handle); - brelse (bh); + brelse(bh); return retval; } -static int ext4_unlink(struct inode * dir, struct dentry *dentry) +static int ext4_unlink(struct inode *dir, struct dentry *dentry) { int retval; - struct inode * inode; - struct buffer_head * bh; - struct ext4_dir_entry_2 * de; + struct inode *inode; + struct buffer_head *bh; + struct ext4_dir_entry_2 *de; handle_t *handle; /* Initialize quotas before so that eventual writes go @@ -2147,7 +2149,7 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry) handle->h_sync = 1; retval = -ENOENT; - bh = ext4_find_entry (dentry, &de); + bh = ext4_find_entry(dir, &dentry->d_name, &de); if (!bh) goto end_unlink; @@ -2158,9 +2160,9 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry) goto end_unlink; if (!inode->i_nlink) { - ext4_warning (inode->i_sb, "ext4_unlink", - "Deleting nonexistent file (%lu), %d", - inode->i_ino, inode->i_nlink); + ext4_warning(inode->i_sb, "ext4_unlink", + "Deleting nonexistent file (%lu), %d", + inode->i_ino, inode->i_nlink); inode->i_nlink = 1; } retval = ext4_delete_entry(handle, dir, de, bh); @@ -2178,15 +2180,15 @@ static int ext4_unlink(struct inode * dir, struct dentry *dentry) end_unlink: ext4_journal_stop(handle); - brelse (bh); + brelse(bh); return retval; } -static int ext4_symlink (struct inode * dir, - struct dentry *dentry, const char * symname) +static int ext4_symlink(struct inode *dir, + struct dentry *dentry, const char *symname) { handle_t *handle; - struct inode * inode; + struct inode *inode; int l, err, retries = 0; l = strlen(symname)+1; @@ -2203,12 +2205,12 @@ retry: if (IS_DIRSYNC(dir)) handle->h_sync = 1; - inode = ext4_new_inode (handle, dir, S_IFLNK|S_IRWXUGO); + inode = ext4_new_inode(handle, dir, S_IFLNK|S_IRWXUGO); err = PTR_ERR(inode); if (IS_ERR(inode)) goto out_stop; - if (l > sizeof (EXT4_I(inode)->i_data)) { + if (l > sizeof(EXT4_I(inode)->i_data)) { inode->i_op = &ext4_symlink_inode_operations; ext4_set_aops(inode); /* @@ -2221,14 +2223,14 @@ retry: if (err) { clear_nlink(inode); ext4_mark_inode_dirty(handle, inode); - iput (inode); + iput(inode); goto out_stop; } } else { /* clear the extent format for fast symlink */ EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; inode->i_op = &ext4_fast_symlink_inode_operations; - memcpy((char*)&EXT4_I(inode)->i_data,symname,l); + memcpy((char *)&EXT4_I(inode)->i_data, symname, l); inode->i_size = l-1; } EXT4_I(inode)->i_disksize = inode->i_size; @@ -2240,8 +2242,8 @@ out_stop: return err; } -static int ext4_link (struct dentry * old_dentry, - struct inode * dir, struct dentry *dentry) +static int ext4_link(struct dentry *old_dentry, + struct inode *dir, struct dentry *dentry) { handle_t *handle; struct inode *inode = old_dentry->d_inode; @@ -2284,13 +2286,13 @@ retry: * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. */ -static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, - struct inode * new_dir,struct dentry *new_dentry) +static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) { handle_t *handle; - struct inode * old_inode, * new_inode; - struct buffer_head * old_bh, * new_bh, * dir_bh; - struct ext4_dir_entry_2 * old_de, * new_de; + struct inode *old_inode, *new_inode; + struct buffer_head *old_bh, *new_bh, *dir_bh; + struct ext4_dir_entry_2 *old_de, *new_de; int retval; old_bh = new_bh = dir_bh = NULL; @@ -2308,7 +2310,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) handle->h_sync = 1; - old_bh = ext4_find_entry (old_dentry, &old_de); + old_bh = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de); /* * Check for inode number is _not_ due to possible IO errors. * We might rmdir the source, keep it as pwd of some process @@ -2321,32 +2323,32 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, goto end_rename; new_inode = new_dentry->d_inode; - new_bh = ext4_find_entry (new_dentry, &new_de); + new_bh = ext4_find_entry(new_dir, &new_dentry->d_name, &new_de); if (new_bh) { if (!new_inode) { - brelse (new_bh); + brelse(new_bh); new_bh = NULL; } } if (S_ISDIR(old_inode->i_mode)) { if (new_inode) { retval = -ENOTEMPTY; - if (!empty_dir (new_inode)) + if (!empty_dir(new_inode)) goto end_rename; } retval = -EIO; - dir_bh = ext4_bread (handle, old_inode, 0, 0, &retval); + dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval); if (!dir_bh) goto end_rename; if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino) goto end_rename; retval = -EMLINK; - if (!new_inode && new_dir!=old_dir && + if (!new_inode && new_dir != old_dir && new_dir->i_nlink >= EXT4_LINK_MAX) goto end_rename; } if (!new_bh) { - retval = ext4_add_entry (handle, new_dentry, old_inode); + retval = ext4_add_entry(handle, new_dentry, old_inode); if (retval) goto end_rename; } else { @@ -2388,7 +2390,7 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, struct buffer_head *old_bh2; struct ext4_dir_entry_2 *old_de2; - old_bh2 = ext4_find_entry(old_dentry, &old_de2); + old_bh2 = ext4_find_entry(old_dir, &old_dentry->d_name, &old_de2); if (old_bh2) { retval = ext4_delete_entry(handle, old_dir, old_de2, old_bh2); @@ -2433,9 +2435,9 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, retval = 0; end_rename: - brelse (dir_bh); - brelse (old_bh); - brelse (new_bh); + brelse(dir_bh); + brelse(old_bh); + brelse(new_bh); ext4_journal_stop(handle); return retval; } @@ -2454,7 +2456,7 @@ const struct inode_operations ext4_dir_inode_operations = { .mknod = ext4_mknod, .rename = ext4_rename, .setattr = ext4_setattr, -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, @@ -2465,7 +2467,7 @@ const struct inode_operations ext4_dir_inode_operations = { const struct inode_operations ext4_special_inode_operations = { .setattr = ext4_setattr, -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 0a926516426..b6ec1843a01 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -416,8 +416,8 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", gdb_num); - /* - * If we are not using the primary superblock/GDT copy don't resize, + /* + * If we are not using the primary superblock/GDT copy don't resize, * because the user tools have no way of handling this. Probably a * bad time to do it anyways. */ @@ -773,7 +773,8 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (reserved_gdb || gdb_off == 0) { if (!EXT4_HAS_COMPAT_FEATURE(sb, - EXT4_FEATURE_COMPAT_RESIZE_INODE)){ + EXT4_FEATURE_COMPAT_RESIZE_INODE) + || !le16_to_cpu(es->s_reserved_gdt_blocks)) { ext4_warning(sb, __func__, "No reserved GDT blocks, can't resize"); return -EPERM; @@ -869,11 +870,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) * We can allocate memory for mb_alloc based on the new group * descriptor */ - if (test_opt(sb, MBALLOC)) { - err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); - if (err) - goto exit_journal; - } + err = ext4_mb_add_more_groupinfo(sb, input->group, gdp); + if (err) + goto exit_journal; + /* * Make the new blocks and inodes valid next. We do this before * increasing the group count so that once the group is enabled, @@ -928,6 +928,15 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) percpu_counter_add(&sbi->s_freeinodes_counter, EXT4_INODES_PER_GROUP(sb)); + if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) { + ext4_group_t flex_group; + flex_group = ext4_flex_group(sbi, input->group); + sbi->s_flex_groups[flex_group].free_blocks += + input->free_blocks_count; + sbi->s_flex_groups[flex_group].free_inodes += + EXT4_INODES_PER_GROUP(sb); + } + ext4_journal_dirty_metadata(handle, sbi->s_sbh); sb->s_dirt = 1; @@ -963,7 +972,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_group_t o_groups_count; ext4_grpblk_t last; ext4_grpblk_t add; - struct buffer_head * bh; + struct buffer_head *bh; handle_t *handle; int err; unsigned long freed_blocks; @@ -1076,8 +1085,15 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, /* * Mark mballoc pages as not up to date so that they will be updated * next time they are loaded by ext4_mb_load_buddy. + * + * XXX Bad, Bad, BAD!!! We should not be overloading the + * Uptodate flag, particularly on thte bitmap bh, as way of + * hinting to ext4_mb_load_buddy() that it needs to be + * overloaded. A user could take a LVM snapshot, then do an + * on-line fsck, and clear the uptodate flag, and this would + * not be a bug in userspace, but a bug in the kernel. FIXME!!! */ - if (test_opt(sb, MBALLOC)) { + { struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; int blocks_per_page; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d5d77958b86..9b2b2bc4ec1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -34,6 +34,8 @@ #include <linux/namei.h> #include <linux/quotaops.h> #include <linux/seq_file.h> +#include <linux/proc_fs.h> +#include <linux/marker.h> #include <linux/log2.h> #include <linux/crc16.h> #include <asm/uaccess.h> @@ -45,6 +47,8 @@ #include "namei.h" #include "group.h" +struct proc_dir_entry *ext4_proc_root; + static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); static int ext4_create_journal(struct super_block *, struct ext4_super_block *, @@ -370,66 +374,6 @@ void ext4_update_dynamic_rev(struct super_block *sb) */ } -int ext4_update_compat_feature(handle_t *handle, - struct super_block *sb, __u32 compat) -{ - int err = 0; - if (!EXT4_HAS_COMPAT_FEATURE(sb, compat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_COMPAT_FEATURE(sb, compat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - -int ext4_update_rocompat_feature(handle_t *handle, - struct super_block *sb, __u32 rocompat) -{ - int err = 0; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, rocompat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_RO_COMPAT_FEATURE(sb, rocompat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - -int ext4_update_incompat_feature(handle_t *handle, - struct super_block *sb, __u32 incompat) -{ - int err = 0; - if (!EXT4_HAS_INCOMPAT_FEATURE(sb, incompat)) { - err = ext4_journal_get_write_access(handle, - EXT4_SB(sb)->s_sbh); - if (err) - return err; - EXT4_SET_INCOMPAT_FEATURE(sb, incompat); - sb->s_dirt = 1; - handle->h_sync = 1; - BUFFER_TRACE(EXT4_SB(sb)->s_sbh, - "call ext4_journal_dirty_met adata"); - err = ext4_journal_dirty_metadata(handle, - EXT4_SB(sb)->s_sbh); - } - return err; -} - /* * Open the external journal device */ @@ -503,15 +447,18 @@ static void ext4_put_super(struct super_block *sb) ext4_mb_release(sb); ext4_ext_release(sb); ext4_xattr_put_super(sb); - jbd2_journal_destroy(sbi->s_journal); + if (jbd2_journal_destroy(sbi->s_journal) < 0) + ext4_abort(sb, __func__, "Couldn't clean up the journal"); sbi->s_journal = NULL; if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); - BUFFER_TRACE(sbi->s_sbh, "marking dirty"); - mark_buffer_dirty(sbi->s_sbh); ext4_commit_super(sb, es, 1); } + if (sbi->s_proc) { + remove_proc_entry("inode_readahead_blks", sbi->s_proc); + remove_proc_entry(sb->s_id, ext4_proc_root); + } for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); @@ -520,6 +467,7 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) @@ -562,12 +510,12 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS); if (!ei) return NULL; -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL ei->i_acl = EXT4_ACL_NOT_CACHED; ei->i_default_acl = EXT4_ACL_NOT_CACHED; #endif - ei->i_block_alloc_info = NULL; ei->vfs_inode.i_version = 1; + ei->vfs_inode.i_data.writeback_index = 0; memset(&ei->i_cached_extent, 0, sizeof(struct ext4_ext_cache)); INIT_LIST_HEAD(&ei->i_prealloc_list); spin_lock_init(&ei->i_prealloc_lock); @@ -598,7 +546,7 @@ static void init_once(void *foo) struct ext4_inode_info *ei = (struct ext4_inode_info *) foo; INIT_LIST_HEAD(&ei->i_orphan); -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR init_rwsem(&ei->xattr_sem); #endif init_rwsem(&ei->i_data_sem); @@ -624,8 +572,7 @@ static void destroy_inodecache(void) static void ext4_clear_inode(struct inode *inode) { - struct ext4_block_alloc_info *rsv = EXT4_I(inode)->i_block_alloc_info; -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL if (EXT4_I(inode)->i_acl && EXT4_I(inode)->i_acl != EXT4_ACL_NOT_CACHED) { posix_acl_release(EXT4_I(inode)->i_acl); @@ -637,10 +584,7 @@ static void ext4_clear_inode(struct inode *inode) EXT4_I(inode)->i_default_acl = EXT4_ACL_NOT_CACHED; } #endif - ext4_discard_reservation(inode); - EXT4_I(inode)->i_block_alloc_info = NULL; - if (unlikely(rsv)) - kfree(rsv); + ext4_discard_preallocations(inode); jbd2_journal_release_jbd_inode(EXT4_SB(inode->i_sb)->s_journal, &EXT4_I(inode)->jinode); } @@ -653,7 +597,7 @@ static inline void ext4_show_quota_options(struct seq_file *seq, if (sbi->s_jquota_fmt) seq_printf(seq, ",jqfmt=%s", - (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold": "vfsv0"); + (sbi->s_jquota_fmt == QFMT_VFS_OLD) ? "vfsold" : "vfsv0"); if (sbi->s_qf_names[USRQUOTA]) seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]); @@ -717,7 +661,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",debug"); if (test_opt(sb, OLDALLOC)) seq_puts(seq, ",oldalloc"); -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR if (test_opt(sb, XATTR_USER) && !(def_mount_opts & EXT4_DEFM_XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -726,7 +670,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nouser_xattr"); } #endif -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL)) seq_puts(seq, ",acl"); if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL)) @@ -751,8 +695,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) seq_puts(seq, ",nobh"); if (!test_opt(sb, EXTENTS)) seq_puts(seq, ",noextents"); - if (!test_opt(sb, MBALLOC)) - seq_puts(seq, ",nomballoc"); if (test_opt(sb, I_VERSION)) seq_puts(seq, ",i_version"); if (!test_opt(sb, DELALLOC)) @@ -772,6 +714,13 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) seq_puts(seq, ",data=writeback"); + if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS) + seq_printf(seq, ",inode_readahead_blks=%u", + sbi->s_inode_readahead_blks); + + if (test_opt(sb, DATA_ERR_ABORT)) + seq_puts(seq, ",data_err=abort"); + ext4_show_quota_options(seq, sb); return 0; } @@ -821,7 +770,7 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid, } #ifdef CONFIG_QUOTA -#define QTYPE2NAME(t) ((t) == USRQUOTA?"user":"group") +#define QTYPE2NAME(t) ((t) == USRQUOTA ? "user" : "group") #define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) static int ext4_dquot_initialize(struct inode *inode, int type); @@ -895,20 +844,22 @@ static const struct export_operations ext4_export_ops = { enum { Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, + Opt_data_err_abort, Opt_data_err_ignore, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_extents, Opt_noextents, Opt_i_version, - Opt_mballoc, Opt_nomballoc, Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_inode_readahead_blks }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_bsd_df, "bsddf"}, {Opt_minix_df, "minixdf"}, {Opt_grpid, "grpid"}, @@ -922,8 +873,6 @@ static match_table_t tokens = { {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, {Opt_nouid32, "nouid32"}, - {Opt_nocheck, "nocheck"}, - {Opt_nocheck, "check=none"}, {Opt_debug, "debug"}, {Opt_oldalloc, "oldalloc"}, {Opt_orlov, "orlov"}, @@ -946,6 +895,8 @@ static match_table_t tokens = { {Opt_data_journal, "data=journal"}, {Opt_data_ordered, "data=ordered"}, {Opt_data_writeback, "data=writeback"}, + {Opt_data_err_abort, "data_err=abort"}, + {Opt_data_err_ignore, "data_err=ignore"}, {Opt_offusrjquota, "usrjquota="}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_offgrpjquota, "grpjquota="}, @@ -960,12 +911,11 @@ static match_table_t tokens = { {Opt_extents, "extents"}, {Opt_noextents, "noextents"}, {Opt_i_version, "i_version"}, - {Opt_mballoc, "mballoc"}, - {Opt_nomballoc, "nomballoc"}, {Opt_stripe, "stripe=%u"}, {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, + {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, {Opt_err, NULL}, }; @@ -980,7 +930,7 @@ static ext4_fsblk_t get_sb_block(void **data) /*todo: use simple_strtoll with >32bit ext4 */ sb_block = simple_strtoul(options, &options, 0); if (*options && *options != ',') { - printk("EXT4-fs: Invalid sb specification: %s\n", + printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n", (char *) *data); return 1; } @@ -1059,9 +1009,6 @@ static int parse_options(char *options, struct super_block *sb, case Opt_nouid32: set_opt(sbi->s_mount_opt, NO_UID32); break; - case Opt_nocheck: - clear_opt(sbi->s_mount_opt, CHECK); - break; case Opt_debug: set_opt(sbi->s_mount_opt, DEBUG); break; @@ -1071,7 +1018,7 @@ static int parse_options(char *options, struct super_block *sb, case Opt_orlov: clear_opt(sbi->s_mount_opt, OLDALLOC); break; -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR case Opt_user_xattr: set_opt(sbi->s_mount_opt, XATTR_USER); break; @@ -1081,10 +1028,11 @@ static int parse_options(char *options, struct super_block *sb, #else case Opt_user_xattr: case Opt_nouser_xattr: - printk("EXT4 (no)user_xattr options not supported\n"); + printk(KERN_ERR "EXT4 (no)user_xattr options " + "not supported\n"); break; #endif -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL case Opt_acl: set_opt(sbi->s_mount_opt, POSIX_ACL); break; @@ -1094,7 +1042,8 @@ static int parse_options(char *options, struct super_block *sb, #else case Opt_acl: case Opt_noacl: - printk("EXT4 (no)acl options not supported\n"); + printk(KERN_ERR "EXT4 (no)acl options " + "not supported\n"); break; #endif case Opt_reservation: @@ -1177,6 +1126,12 @@ static int parse_options(char *options, struct super_block *sb, sbi->s_mount_opt |= data_opt; } break; + case Opt_data_err_abort: + set_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; + case Opt_data_err_ignore: + clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); + break; #ifdef CONFIG_QUOTA case Opt_usrjquota: qtype = USRQUOTA; @@ -1188,8 +1143,8 @@ set_qf_name: sb_any_quota_suspended(sb)) && !sbi->s_qf_names[qtype]) { printk(KERN_ERR - "EXT4-fs: Cannot change journaled " - "quota options when quota turned on.\n"); + "EXT4-fs: Cannot change journaled " + "quota options when quota turned on.\n"); return 0; } qname = match_strdup(&args[0]); @@ -1356,12 +1311,6 @@ set_qf_format: case Opt_nodelalloc: clear_opt(sbi->s_mount_opt, DELALLOC); break; - case Opt_mballoc: - set_opt(sbi->s_mount_opt, MBALLOC); - break; - case Opt_nomballoc: - clear_opt(sbi->s_mount_opt, MBALLOC); - break; case Opt_stripe: if (match_int(&args[0], &option)) return 0; @@ -1372,6 +1321,13 @@ set_qf_format: case Opt_delalloc: set_opt(sbi->s_mount_opt, DELALLOC); break; + case Opt_inode_readahead_blks: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > (1 << 30)) + return 0; + sbi->s_inode_readahead_blks = option; + break; default: printk(KERN_ERR "EXT4-fs: Unrecognized mount option \"%s\" " @@ -1472,15 +1428,9 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, EXT4_INODES_PER_GROUP(sb), sbi->s_mount_opt); - printk(KERN_INFO "EXT4 FS on %s, ", sb->s_id); - if (EXT4_SB(sb)->s_journal->j_inode == NULL) { - char b[BDEVNAME_SIZE]; - - printk("external journal on %s\n", - bdevname(EXT4_SB(sb)->s_journal->j_dev, b)); - } else { - printk("internal journal\n"); - } + printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n", + sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" : + "external", EXT4_SB(sb)->s_journal->j_devname); return res; } @@ -1503,8 +1453,11 @@ static int ext4_fill_flex_info(struct super_block *sb) sbi->s_log_groups_per_flex = sbi->s_es->s_log_groups_per_flex; groups_per_flex = 1 << sbi->s_log_groups_per_flex; - flex_group_count = (sbi->s_groups_count + groups_per_flex - 1) / - groups_per_flex; + /* We allocate both existing and potentially added groups */ + flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + + ((sbi->s_es->s_reserved_gdt_blocks +1 ) << + EXT4_DESC_PER_BLOCK_BITS(sb))) / + groups_per_flex; sbi->s_flex_groups = kzalloc(flex_group_count * sizeof(struct flex_groups), GFP_KERNEL); if (sbi->s_flex_groups == NULL) { @@ -1583,7 +1536,7 @@ static int ext4_check_descriptors(struct super_block *sb) if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) flexbg_flag = 1; - ext4_debug ("Checking group descriptors"); + ext4_debug("Checking group descriptors"); for (i = 0; i < sbi->s_groups_count; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); @@ -1598,14 +1551,14 @@ static int ext4_check_descriptors(struct super_block *sb) if (block_bitmap < first_block || block_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Block bitmap for group %lu not in group " - "(block %llu)!", i, block_bitmap); + "(block %llu)!\n", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap < first_block || inode_bitmap > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Inode bitmap for group %lu not in group " - "(block %llu)!", i, inode_bitmap); + "(block %llu)!\n", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); @@ -1613,7 +1566,7 @@ static int ext4_check_descriptors(struct super_block *sb) inode_table + sbi->s_itb_per_group - 1 > last_block) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Inode table for group %lu not in group " - "(block %llu)!", i, inode_table); + "(block %llu)!\n", i, inode_table); return 0; } spin_lock(sb_bgl_lock(sbi, i)); @@ -1622,8 +1575,10 @@ static int ext4_check_descriptors(struct super_block *sb) "Checksum for group %lu failed (%u!=%u)\n", i, le16_to_cpu(ext4_group_desc_csum(sbi, i, gdp)), le16_to_cpu(gdp->bg_checksum)); - if (!(sb->s_flags & MS_RDONLY)) + if (!(sb->s_flags & MS_RDONLY)) { + spin_unlock(sb_bgl_lock(sbi, i)); return 0; + } } spin_unlock(sb_bgl_lock(sbi, i)); if (!flexbg_flag) @@ -1713,9 +1668,9 @@ static void ext4_orphan_cleanup(struct super_block *sb, DQUOT_INIT(inode); if (inode->i_nlink) { printk(KERN_DEBUG - "%s: truncating inode %lu to %Ld bytes\n", + "%s: truncating inode %lu to %lld bytes\n", __func__, inode->i_ino, inode->i_size); - jbd_debug(2, "truncating inode %lu to %Ld bytes\n", + jbd_debug(2, "truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); ext4_truncate(inode); nr_truncates++; @@ -1756,13 +1711,13 @@ static void ext4_orphan_cleanup(struct super_block *sb, * * Note, this does *not* consider any metadata overhead for vfs i_blocks. */ -static loff_t ext4_max_size(int blkbits) +static loff_t ext4_max_size(int blkbits, int has_huge_files) { loff_t res; loff_t upper_limit = MAX_LFS_FILESIZE; /* small i_blocks in vfs inode? */ - if (sizeof(blkcnt_t) < sizeof(u64)) { + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* * CONFIG_LSF is not enabled implies the inode * i_block represent total blocks in 512 bytes @@ -1792,7 +1747,7 @@ static loff_t ext4_max_size(int blkbits) * block limit, and also a limit of (2^48 - 1) 512-byte sectors in i_blocks. * We need to be 1 filesystem block less than the 2^48 sector limit. */ -static loff_t ext4_max_bitmap_size(int bits) +static loff_t ext4_max_bitmap_size(int bits, int has_huge_files) { loff_t res = EXT4_NDIR_BLOCKS; int meta_blocks; @@ -1805,11 +1760,11 @@ static loff_t ext4_max_bitmap_size(int bits) * total number of 512 bytes blocks of the file */ - if (sizeof(blkcnt_t) < sizeof(u64)) { + if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) { /* - * CONFIG_LSF is not enabled implies the inode - * i_block represent total blocks in 512 bytes - * 32 == size of vfs inode i_blocks * 8 + * !has_huge_files or CONFIG_LSF is not enabled + * implies the inode i_block represent total blocks in + * 512 bytes 32 == size of vfs inode i_blocks * 8 */ upper_limit = (1LL << 32) - 1; @@ -1913,11 +1868,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned long journal_devnum = 0; unsigned long def_mount_opts; struct inode *root; + char *cp; int ret = -EINVAL; int blocksize; int db_count; int i; - int needs_recovery; + int needs_recovery, has_huge_files; __le32 features; __u64 blocks_count; int err; @@ -1929,10 +1885,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_mount_opt = 0; sbi->s_resuid = EXT4_DEF_RESUID; sbi->s_resgid = EXT4_DEF_RESGID; + sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; unlock_kernel(); + /* Cleanup superblock name */ + for (cp = sb->s_id; (cp = strchr(cp, '/'));) + *cp = '!'; + blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { printk(KERN_ERR "EXT4-fs: unable to set blocksize\n"); @@ -1972,11 +1933,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, GRPID); if (def_mount_opts & EXT4_DEFM_UID16) set_opt(sbi->s_mount_opt, NO_UID32); -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR if (def_mount_opts & EXT4_DEFM_XATTR_USER) set_opt(sbi->s_mount_opt, XATTR_USER); #endif -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL if (def_mount_opts & EXT4_DEFM_ACL) set_opt(sbi->s_mount_opt, POSIX_ACL); #endif @@ -2011,11 +1972,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_warning(sb, __func__, "extents feature not enabled on this filesystem, " "use tune2fs.\n"); - /* - * turn on mballoc code by default in ext4 filesystem - * Use -o nomballoc to turn it off - */ - set_opt(sbi->s_mount_opt, MBALLOC); /* * enable delayed allocation by default @@ -2040,16 +1996,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "running e2fsck is recommended\n"); /* - * Since ext4 is still considered development code, we require - * that the TEST_FILESYS flag in s->flags be set. - */ - if (!(le32_to_cpu(es->s_flags) & EXT2_FLAGS_TEST_FILESYS)) { - printk(KERN_WARNING "EXT4-fs: %s: not marked " - "OK to use with test code.\n", sb->s_id); - goto failed_mount; - } - - /* * Check feature flags regardless of the revision level, since we * previously didn't change the revision level when setting the flags, * so there is a chance incompat flags are set on a rev 0 filesystem. @@ -2068,7 +2014,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_id, le32_to_cpu(features)); goto failed_mount; } - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_HUGE_FILE)) { + has_huge_files = EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_HUGE_FILE); + if (has_huge_files) { /* * Large file size enabled file system can only be * mount if kernel is build with CONFIG_LSF @@ -2118,8 +2066,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } } - sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits); - sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits); + sbi->s_bitmap_maxbytes = ext4_max_bitmap_size(sb->s_blocksize_bits, + has_huge_files); + sb->s_maxbytes = ext4_max_size(sb->s_blocksize_bits, has_huge_files); if (le32_to_cpu(es->s_rev_level) == EXT4_GOOD_OLD_REV) { sbi->s_inode_size = EXT4_GOOD_OLD_INODE_SIZE; @@ -2218,6 +2167,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } +#ifdef CONFIG_PROC_FS + if (ext4_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root); + + if (sbi->s_proc) + proc_create_data("inode_readahead_blks", 0644, sbi->s_proc, + &ext4_ui_proc_fops, + &sbi->s_inode_readahead_blks); +#endif + bgl_lock_init(&sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { @@ -2256,24 +2215,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) err = percpu_counter_init(&sbi->s_dirs_counter, ext4_count_dirs(sb)); } + if (!err) { + err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0); + } if (err) { printk(KERN_ERR "EXT4-fs: insufficient memory\n"); goto failed_mount3; } - /* per fileystem reservation list head & lock */ - spin_lock_init(&sbi->s_rsv_window_lock); - sbi->s_rsv_window_root = RB_ROOT; - /* Add a single, static dummy reservation to the start of the - * reservation window list --- it gives us a placeholder for - * append-at-start-of-list which makes the allocation logic - * _much_ simpler. */ - sbi->s_rsv_window_head.rsv_start = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; - sbi->s_rsv_window_head.rsv_end = EXT4_RESERVE_WINDOW_NOT_ALLOCATED; - sbi->s_rsv_window_head.rsv_alloc_hit = 0; - sbi->s_rsv_window_head.rsv_goal_size = 0; - ext4_rsv_window_add(sb, &sbi->s_rsv_window_head); - sbi->s_stripe = ext4_get_stripe_size(sbi); /* @@ -2443,6 +2392,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "available.\n"); } + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { + printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " + "requested data journaling mode\n"); + clear_opt(sbi->s_mount_opt, DELALLOC); + } else if (test_opt(sb, DELALLOC)) + printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); + + ext4_ext_init(sb); + err = ext4_mb_init(sb, needs_recovery); + if (err) { + printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n", + err); + goto failed_mount4; + } + /* * akpm: core read_super() calls in here with the superblock locked. * That deadlocks, because orphan cleanup needs to lock the superblock @@ -2462,16 +2426,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA ? "ordered": "writeback"); - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { - printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - " - "requested data journaling mode\n"); - clear_opt(sbi->s_mount_opt, DELALLOC); - } else if (test_opt(sb, DELALLOC)) - printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); - - ext4_ext_init(sb); - ext4_mb_init(sb, needs_recovery); - lock_kernel(); return 0; @@ -2488,11 +2442,16 @@ failed_mount3: percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); + percpu_counter_destroy(&sbi->s_dirtyblocks_counter); failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); failed_mount: + if (sbi->s_proc) { + remove_proc_entry("inode_readahead_blks", sbi->s_proc); + remove_proc_entry(sb->s_id, ext4_proc_root); + } #ifdef CONFIG_QUOTA for (i = 0; i < MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); @@ -2526,6 +2485,10 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; + if (test_opt(sb, DATA_ERR_ABORT)) + journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; + else + journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; spin_unlock(&journal->j_state_lock); } @@ -2551,7 +2514,7 @@ static journal_t *ext4_get_journal(struct super_block *sb, return NULL; } - jbd_debug(2, "Journal inode found at %p: %Ld bytes\n", + jbd_debug(2, "Journal inode found at %p: %lld bytes\n", journal_inode, journal_inode->i_size); if (!S_ISREG(journal_inode->i_mode)) { printk(KERN_ERR "EXT4-fs: invalid journal inode.\n"); @@ -2714,6 +2677,11 @@ static int ext4_load_journal(struct super_block *sb, return -EINVAL; } + if (journal->j_flags & JBD2_BARRIER) + printk(KERN_INFO "EXT4-fs: barriers enabled\n"); + else + printk(KERN_INFO "EXT4-fs: barriers disabled\n"); + if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) { err = jbd2_journal_update_format(journal); if (err) { @@ -2798,13 +2766,34 @@ static void ext4_commit_super(struct super_block *sb, if (!sbh) return; + if (buffer_write_io_error(sbh)) { + /* + * Oh, dear. A previous attempt to write the + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + printk(KERN_ERR "ext4: previous I/O error to " + "superblock detected for %s.\n", sb->s_id); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } es->s_wtime = cpu_to_le32(get_seconds()); ext4_free_blocks_count_set(es, ext4_count_free_blocks(sb)); es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb)); BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); - if (sync) + if (sync) { sync_dirty_buffer(sbh); + if (buffer_write_io_error(sbh)) { + printk(KERN_ERR "ext4: I/O error while writing " + "superblock for %s.\n", sb->s_id); + clear_buffer_write_io_error(sbh); + set_buffer_uptodate(sbh); + } + } } @@ -2819,7 +2808,9 @@ static void ext4_mark_recovery_complete(struct super_block *sb, journal_t *journal = EXT4_SB(sb)->s_journal; jbd2_journal_lock_updates(journal); - jbd2_journal_flush(journal); + if (jbd2_journal_flush(journal) < 0) + goto out; + lock_super(sb); if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { @@ -2828,6 +2819,8 @@ static void ext4_mark_recovery_complete(struct super_block *sb, ext4_commit_super(sb, es, 1); } unlock_super(sb); + +out: jbd2_journal_unlock_updates(journal); } @@ -2906,6 +2899,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) { tid_t target; + trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); sb->s_dirt = 0; if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { if (wait) @@ -2927,7 +2921,13 @@ static void ext4_write_super_lockfs(struct super_block *sb) /* Now we set up the journal barrier. */ jbd2_journal_lock_updates(journal); - jbd2_journal_flush(journal); + + /* + * We don't want to clear needs_recovery flag when we failed + * to flush the journal. + */ + if (jbd2_journal_flush(journal) < 0) + return; /* Journal blocked and flushed, clear needs_recovery flag. */ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); @@ -3161,7 +3161,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_type = EXT4_SUPER_MAGIC; buf->f_bsize = sb->s_blocksize; buf->f_blocks = ext4_blocks_count(es) - sbi->s_overhead_last; - buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter); + buf->f_bfree = percpu_counter_sum_positive(&sbi->s_freeblocks_counter) - + percpu_counter_sum_positive(&sbi->s_dirtyblocks_counter); ext4_free_blocks_count_set(es, buf->f_bfree); buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es); if (buf->f_bfree < ext4_r_blocks_count(es)) @@ -3366,8 +3367,12 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, * otherwise be livelocked... */ jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal); - jbd2_journal_flush(EXT4_SB(sb)->s_journal); + err = jbd2_journal_flush(EXT4_SB(sb)->s_journal); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (err) { + path_put(&nd.path); + return err; + } } err = vfs_quota_on_path(sb, type, format_id, &nd.path); @@ -3431,7 +3436,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, handle_t *handle = journal_current_handle(); if (!handle) { - printk(KERN_WARNING "EXT4-fs: Quota write (off=%Lu, len=%Lu)" + printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)" " cancelled because transaction is not started.\n", (unsigned long long)off, (unsigned long long)len); return -EIO; @@ -3492,18 +3497,82 @@ static int ext4_get_sb(struct file_system_type *fs_type, return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); } +#ifdef CONFIG_PROC_FS +static int ext4_ui_proc_show(struct seq_file *m, void *v) +{ + unsigned int *p = m->private; + + seq_printf(m, "%u\n", *p); + return 0; +} + +static int ext4_ui_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, ext4_ui_proc_show, PDE(inode)->data); +} + +static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf, + size_t cnt, loff_t *ppos) +{ + unsigned int *p = PDE(file->f_path.dentry->d_inode)->data; + char str[32]; + unsigned long value; + + if (cnt >= sizeof(str)) + return -EINVAL; + if (copy_from_user(str, buf, cnt)) + return -EFAULT; + value = simple_strtol(str, NULL, 0); + if (value < 0) + return -ERANGE; + *p = value; + return cnt; +} + +const struct file_operations ext4_ui_proc_fops = { + .owner = THIS_MODULE, + .open = ext4_ui_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, + .write = ext4_ui_proc_write, +}; +#endif + +static struct file_system_type ext4_fs_type = { + .owner = THIS_MODULE, + .name = "ext4", + .get_sb = ext4_get_sb, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +#ifdef CONFIG_EXT4DEV_COMPAT +static int ext4dev_get_sb(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data, struct vfsmount *mnt) +{ + printk(KERN_WARNING "EXT4-fs: Update your userspace programs " + "to mount using ext4\n"); + printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility " + "will go away by 2.6.31\n"); + return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt); +} + static struct file_system_type ext4dev_fs_type = { .owner = THIS_MODULE, .name = "ext4dev", - .get_sb = ext4_get_sb, + .get_sb = ext4dev_get_sb, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, }; +MODULE_ALIAS("ext4dev"); +#endif static int __init init_ext4_fs(void) { int err; + ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = init_ext4_mballoc(); if (err) return err; @@ -3514,9 +3583,16 @@ static int __init init_ext4_fs(void) err = init_inodecache(); if (err) goto out1; - err = register_filesystem(&ext4dev_fs_type); + err = register_filesystem(&ext4_fs_type); if (err) goto out; +#ifdef CONFIG_EXT4DEV_COMPAT + err = register_filesystem(&ext4dev_fs_type); + if (err) { + unregister_filesystem(&ext4_fs_type); + goto out; + } +#endif return 0; out: destroy_inodecache(); @@ -3529,10 +3605,14 @@ out2: static void __exit exit_ext4_fs(void) { + unregister_filesystem(&ext4_fs_type); +#ifdef CONFIG_EXT4DEV_COMPAT unregister_filesystem(&ext4dev_fs_type); +#endif destroy_inodecache(); exit_ext4_xattr(); exit_ext4_mballoc(); + remove_proc_entry("fs/ext4", NULL); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index e9178643dc0..00740cb32be 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -23,10 +23,10 @@ #include "ext4.h" #include "xattr.h" -static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd) +static void *ext4_follow_link(struct dentry *dentry, struct nameidata *nd) { struct ext4_inode_info *ei = EXT4_I(dentry->d_inode); - nd_set_link(nd, (char*)ei->i_data); + nd_set_link(nd, (char *) ei->i_data); return NULL; } @@ -34,7 +34,7 @@ const struct inode_operations ext4_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = page_follow_link_light, .put_link = page_put_link, -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, @@ -45,7 +45,7 @@ const struct inode_operations ext4_symlink_inode_operations = { const struct inode_operations ext4_fast_symlink_inode_operations = { .readlink = generic_readlink, .follow_link = ext4_follow_link, -#ifdef CONFIG_EXT4DEV_FS_XATTR +#ifdef CONFIG_EXT4_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = ext4_listxattr, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 8954208b489..80626d516fe 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -99,12 +99,12 @@ static struct mb_cache *ext4_xattr_cache; static struct xattr_handler *ext4_xattr_handler_map[] = { [EXT4_XATTR_INDEX_USER] = &ext4_xattr_user_handler, -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL [EXT4_XATTR_INDEX_POSIX_ACL_ACCESS] = &ext4_xattr_acl_access_handler, [EXT4_XATTR_INDEX_POSIX_ACL_DEFAULT] = &ext4_xattr_acl_default_handler, #endif [EXT4_XATTR_INDEX_TRUSTED] = &ext4_xattr_trusted_handler, -#ifdef CONFIG_EXT4DEV_FS_SECURITY +#ifdef CONFIG_EXT4_FS_SECURITY [EXT4_XATTR_INDEX_SECURITY] = &ext4_xattr_security_handler, #endif }; @@ -112,11 +112,11 @@ static struct xattr_handler *ext4_xattr_handler_map[] = { struct xattr_handler *ext4_xattr_handlers[] = { &ext4_xattr_user_handler, &ext4_xattr_trusted_handler, -#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL +#ifdef CONFIG_EXT4_FS_POSIX_ACL &ext4_xattr_acl_access_handler, &ext4_xattr_acl_default_handler, #endif -#ifdef CONFIG_EXT4DEV_FS_SECURITY +#ifdef CONFIG_EXT4_FS_SECURITY &ext4_xattr_security_handler, #endif NULL @@ -959,6 +959,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, struct ext4_xattr_block_find bs = { .s = { .not_found = -ENODATA, }, }; + unsigned long no_expand; int error; if (!name) @@ -966,6 +967,9 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (strlen(name) > 255) return -ERANGE; down_write(&EXT4_I(inode)->xattr_sem); + no_expand = EXT4_I(inode)->i_state & EXT4_STATE_NO_EXPAND; + EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; + error = ext4_get_inode_loc(inode, &is.iloc); if (error) goto cleanup; @@ -1042,6 +1046,8 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, cleanup: brelse(is.iloc.bh); brelse(bs.bh); + if (no_expand == 0) + EXT4_I(inode)->i_state &= ~EXT4_STATE_NO_EXPAND; up_write(&EXT4_I(inode)->xattr_sem); return error; } diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index 5992fe979bb..8ede88b18c2 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -51,8 +51,8 @@ struct ext4_xattr_entry { (((name_len) + EXT4_XATTR_ROUND + \ sizeof(struct ext4_xattr_entry)) & ~EXT4_XATTR_ROUND) #define EXT4_XATTR_NEXT(entry) \ - ( (struct ext4_xattr_entry *)( \ - (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len)) ) + ((struct ext4_xattr_entry *)( \ + (char *)(entry) + EXT4_XATTR_LEN((entry)->e_name_len))) #define EXT4_XATTR_SIZE(size) \ (((size) + EXT4_XATTR_ROUND) & ~EXT4_XATTR_ROUND) @@ -63,7 +63,7 @@ struct ext4_xattr_entry { EXT4_I(inode)->i_extra_isize)) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) -# ifdef CONFIG_EXT4DEV_FS_XATTR +# ifdef CONFIG_EXT4_FS_XATTR extern struct xattr_handler ext4_xattr_user_handler; extern struct xattr_handler ext4_xattr_trusted_handler; @@ -88,7 +88,7 @@ extern void exit_ext4_xattr(void); extern struct xattr_handler *ext4_xattr_handlers[]; -# else /* CONFIG_EXT4DEV_FS_XATTR */ +# else /* CONFIG_EXT4_FS_XATTR */ static inline int ext4_xattr_get(struct inode *inode, int name_index, const char *name, @@ -141,9 +141,9 @@ ext4_expand_extra_isize_ea(struct inode *inode, int new_extra_isize, #define ext4_xattr_handlers NULL -# endif /* CONFIG_EXT4DEV_FS_XATTR */ +# endif /* CONFIG_EXT4_FS_XATTR */ -#ifdef CONFIG_EXT4DEV_FS_SECURITY +#ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir); #else diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 302e95c4af7..fb98b3d847e 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -6,6 +6,7 @@ #include <linux/module.h> #include <linux/fs.h> #include <linux/msdos_fs.h> +#include <linux/blkdev.h> struct fatent_operations { void (*ent_blocknr)(struct super_block *, int, int *, sector_t *); @@ -535,6 +536,7 @@ int fat_free_clusters(struct inode *inode, int cluster) struct fat_entry fatent; struct buffer_head *bhs[MAX_BUF_PER_PAGE]; int i, err, nr_bhs; + int first_cl = cluster; nr_bhs = 0; fatent_init(&fatent); @@ -551,6 +553,18 @@ int fat_free_clusters(struct inode *inode, int cluster) goto error; } + /* + * Issue discard for the sectors we no longer care about, + * batching contiguous clusters into one request + */ + if (cluster != fatent.entry + 1) { + int nr_clus = fatent.entry - first_cl + 1; + + sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl), + nr_clus * sbi->sec_per_clus); + first_cl = cluster; + } + ops->ent_put(&fatent, FAT_ENT_FREE); if (sbi->free_clusters != -1) { sbi->free_clusters++; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 80ff3381fa2..d12cdf2a040 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -855,7 +855,7 @@ enum { Opt_obsolate, Opt_flush, Opt_tz_utc, Opt_err, }; -static match_table_t fat_tokens = { +static const match_table_t fat_tokens = { {Opt_check_r, "check=relaxed"}, {Opt_check_s, "check=strict"}, {Opt_check_n, "check=normal"}, @@ -890,14 +890,14 @@ static match_table_t fat_tokens = { {Opt_tz_utc, "tz=UTC"}, {Opt_err, NULL}, }; -static match_table_t msdos_tokens = { +static const match_table_t msdos_tokens = { {Opt_nodots, "nodots"}, {Opt_nodots, "dotsOK=no"}, {Opt_dots, "dots"}, {Opt_dots, "dotsOK=yes"}, {Opt_err, NULL} }; -static match_table_t vfat_tokens = { +static const match_table_t vfat_tokens = { {Opt_charset, "iocharset=%s"}, {Opt_shortname_lower, "shortname=lower"}, {Opt_shortname_win95, "shortname=win95"}, diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 25adfc3c693..d0ff0b8cf30 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -8,7 +8,7 @@ * pages against inodes. ie: data writeback. Writeout of the * inode itself is not handled here. * - * 10Apr2002 akpm@zip.com.au + * 10Apr2002 Andrew Morton * Split out of fs/inode.c * Additions for address_space-based writeback */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index d2249f174e2..6a84388cacf 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -354,7 +354,7 @@ enum { OPT_ERR }; -static match_table_t tokens = { +static const match_table_t tokens = { {OPT_FD, "fd=%u"}, {OPT_ROOTMODE, "rootmode=%o"}, {OPT_USER_ID, "user_id=%u"}, diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 13391e54661..c962283d4e7 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1265,6 +1265,8 @@ static void blocking_cb(struct gfs2_sbd *sdp, struct lm_lockname *name, holdtime = gl->gl_tchange + gl->gl_ops->go_min_hold_time; if (time_before(now, holdtime)) delay = holdtime - now; + if (test_bit(GLF_REPLY_PENDING, &gl->gl_flags)) + delay = gl->gl_ops->go_min_hold_time; spin_lock(&gl->gl_spin); handle_callback(gl, state, 1, delay); @@ -1578,8 +1580,6 @@ static const char *hflags2str(char *buf, unsigned flags, unsigned long iflags) *p++ = 'a'; if (flags & GL_EXACT) *p++ = 'E'; - if (flags & GL_ATIME) - *p++ = 'a'; if (flags & GL_NOCACHE) *p++ = 'c'; if (test_bit(HIF_HOLDER, &iflags)) @@ -1816,15 +1816,17 @@ restart: if (gl) { gi->gl = hlist_entry(gl->gl_list.next, struct gfs2_glock, gl_list); - if (gi->gl) - gfs2_glock_hold(gi->gl); + } else { + gi->gl = hlist_entry(gl_hash_table[gi->hash].hb_list.first, + struct gfs2_glock, gl_list); } + if (gi->gl) + gfs2_glock_hold(gi->gl); read_unlock(gl_lock_addr(gi->hash)); if (gl) gfs2_glock_put(gl); - if (gl && gi->gl == NULL) - gi->hash++; while (gi->gl == NULL) { + gi->hash++; if (gi->hash >= GFS2_GL_HASH_SIZE) return 1; read_lock(gl_lock_addr(gi->hash)); @@ -1833,7 +1835,6 @@ restart: if (gi->gl) gfs2_glock_hold(gi->gl); read_unlock(gl_lock_addr(gi->hash)); - gi->hash++; } if (gi->sdp != gi->gl->gl_sbd) diff --git a/fs/gfs2/glock.h b/fs/gfs2/glock.h index 971d92af70f..695c6b19361 100644 --- a/fs/gfs2/glock.h +++ b/fs/gfs2/glock.h @@ -24,7 +24,6 @@ #define GL_ASYNC 0x00000040 #define GL_EXACT 0x00000080 #define GL_SKIP 0x00000100 -#define GL_ATIME 0x00000200 #define GL_NOCACHE 0x00000400 #define GLR_TRYFAILED 13 diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 448697a5c46..f566ec1b4e8 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -386,20 +386,21 @@ struct gfs2_statfs_change_host { #define GFS2_DATA_ORDERED 2 struct gfs2_args { - char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ - char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ - char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ - int ar_spectator; /* Don't get a journal because we're always RO */ - int ar_ignore_local_fs; /* Don't optimize even if local_fs is 1 */ - int ar_localflocks; /* Let the VFS do flock|fcntl locks for us */ - int ar_localcaching; /* Local-style caching (dangerous on multihost) */ - int ar_debug; /* Oops on errors instead of trying to be graceful */ - int ar_upgrade; /* Upgrade ondisk/multihost format */ - unsigned int ar_num_glockd; /* Number of glockd threads */ - int ar_posix_acl; /* Enable posix acls */ - int ar_quota; /* off/account/on */ - int ar_suiddir; /* suiddir support */ - int ar_data; /* ordered/writeback */ + char ar_lockproto[GFS2_LOCKNAME_LEN]; /* Name of the Lock Protocol */ + char ar_locktable[GFS2_LOCKNAME_LEN]; /* Name of the Lock Table */ + char ar_hostdata[GFS2_LOCKNAME_LEN]; /* Host specific data */ + unsigned int ar_spectator:1; /* Don't get a journal */ + unsigned int ar_ignore_local_fs:1; /* Ignore optimisations */ + unsigned int ar_localflocks:1; /* Let the VFS do flock|fcntl */ + unsigned int ar_localcaching:1; /* Local caching */ + unsigned int ar_debug:1; /* Oops on errors */ + unsigned int ar_upgrade:1; /* Upgrade ondisk format */ + unsigned int ar_posix_acl:1; /* Enable posix acls */ + unsigned int ar_quota:2; /* off/account/on */ + unsigned int ar_suiddir:1; /* suiddir support */ + unsigned int ar_data:2; /* ordered/writeback */ + unsigned int ar_meta:1; /* mount metafs */ + unsigned int ar_num_glockd; /* Number of glockd threads */ }; struct gfs2_tune { @@ -419,7 +420,6 @@ struct gfs2_tune { unsigned int gt_quota_scale_den; /* Denominator */ unsigned int gt_quota_cache_secs; unsigned int gt_quota_quantum; /* Secs between syncs to quota file */ - unsigned int gt_atime_quantum; /* Min secs between atime updates */ unsigned int gt_new_files_jdata; unsigned int gt_max_readahead; /* Max bytes to read-ahead from disk */ unsigned int gt_stall_secs; /* Detects trouble! */ @@ -432,7 +432,7 @@ enum { SDF_JOURNAL_CHECKED = 0, SDF_JOURNAL_LIVE = 1, SDF_SHUTDOWN = 2, - SDF_NOATIME = 3, + SDF_NOBARRIERS = 3, }; #define GFS2_FSNAME_LEN 256 @@ -461,7 +461,6 @@ struct gfs2_sb_host { struct gfs2_sbd { struct super_block *sd_vfs; - struct super_block *sd_vfs_meta; struct kobject sd_kobj; unsigned long sd_flags; /* SDF_... */ struct gfs2_sb_host sd_sb; @@ -499,7 +498,9 @@ struct gfs2_sbd { /* Inode Stuff */ - struct inode *sd_master_dir; + struct dentry *sd_master_dir; + struct dentry *sd_root_dir; + struct inode *sd_jindex; struct inode *sd_inum_inode; struct inode *sd_statfs_inode; @@ -634,7 +635,6 @@ struct gfs2_sbd { /* Debugging crud */ unsigned long sd_last_warning; - struct vfsmount *sd_gfs2mnt; struct dentry *debugfs_dir; /* debugfs directory */ struct dentry *debugfs_dentry_glocks; /* for debugfs */ }; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 8b0806a3294..7cee695fa44 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -18,6 +18,7 @@ #include <linux/crc32.h> #include <linux/lm_interface.h> #include <linux/security.h> +#include <linux/time.h> #include "gfs2.h" #include "incore.h" @@ -249,6 +250,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) { struct gfs2_dinode_host *di = &ip->i_di; const struct gfs2_dinode *str = buf; + struct timespec atime; u16 height, depth; if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) @@ -275,8 +277,10 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) di->di_size = be64_to_cpu(str->di_size); i_size_write(&ip->i_inode, di->di_size); gfs2_set_inode_blocks(&ip->i_inode, be64_to_cpu(str->di_blocks)); - ip->i_inode.i_atime.tv_sec = be64_to_cpu(str->di_atime); - ip->i_inode.i_atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); + atime.tv_sec = be64_to_cpu(str->di_atime); + atime.tv_nsec = be32_to_cpu(str->di_atime_nsec); + if (timespec_compare(&ip->i_inode.i_atime, &atime) < 0) + ip->i_inode.i_atime = atime; ip->i_inode.i_mtime.tv_sec = be64_to_cpu(str->di_mtime); ip->i_inode.i_mtime.tv_nsec = be32_to_cpu(str->di_mtime_nsec); ip->i_inode.i_ctime.tv_sec = be64_to_cpu(str->di_ctime); @@ -1033,13 +1037,11 @@ struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, if (bh) brelse(bh); - if (!inode) - return ERR_PTR(-ENOMEM); return inode; fail_gunlock2: gfs2_glock_dq_uninit(ghs + 1); - if (inode) + if (inode && !IS_ERR(inode)) iput(inode); fail_gunlock: gfs2_glock_dq(ghs); @@ -1140,54 +1142,6 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, return 0; } -/* - * gfs2_ok_to_move - check if it's ok to move a directory to another directory - * @this: move this - * @to: to here - * - * Follow @to back to the root and make sure we don't encounter @this - * Assumes we already hold the rename lock. - * - * Returns: errno - */ - -int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) -{ - struct inode *dir = &to->i_inode; - struct super_block *sb = dir->i_sb; - struct inode *tmp; - struct qstr dotdot; - int error = 0; - - gfs2_str2qstr(&dotdot, ".."); - - igrab(dir); - - for (;;) { - if (dir == &this->i_inode) { - error = -EINVAL; - break; - } - if (dir == sb->s_root->d_inode) { - error = 0; - break; - } - - tmp = gfs2_lookupi(dir, &dotdot, 1); - if (IS_ERR(tmp)) { - error = PTR_ERR(tmp); - break; - } - - iput(dir); - dir = tmp; - } - - iput(dir); - - return error; -} - /** * gfs2_readlinki - return the contents of a symlink * @ip: the symlink's inode @@ -1207,8 +1161,8 @@ int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) unsigned int x; int error; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh); - error = gfs2_glock_nq_atime(&i_gh); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); + error = gfs2_glock_nq(&i_gh); if (error) { gfs2_holder_uninit(&i_gh); return error; @@ -1243,101 +1197,6 @@ out: return error; } -/** - * gfs2_glock_nq_atime - Acquire a hold on an inode's glock, and - * conditionally update the inode's atime - * @gh: the holder to acquire - * - * Tests atime (access time) for gfs2_read, gfs2_readdir and gfs2_mmap - * Update if the difference between the current time and the inode's current - * atime is greater than an interval specified at mount. - * - * Returns: errno - */ - -int gfs2_glock_nq_atime(struct gfs2_holder *gh) -{ - struct gfs2_glock *gl = gh->gh_gl; - struct gfs2_sbd *sdp = gl->gl_sbd; - struct gfs2_inode *ip = gl->gl_object; - s64 quantum = gfs2_tune_get(sdp, gt_atime_quantum); - unsigned int state; - int flags; - int error; - struct timespec tv = CURRENT_TIME; - - if (gfs2_assert_warn(sdp, gh->gh_flags & GL_ATIME) || - gfs2_assert_warn(sdp, !(gh->gh_flags & GL_ASYNC)) || - gfs2_assert_warn(sdp, gl->gl_ops == &gfs2_inode_glops)) - return -EINVAL; - - state = gh->gh_state; - flags = gh->gh_flags; - - error = gfs2_glock_nq(gh); - if (error) - return error; - - if (test_bit(SDF_NOATIME, &sdp->sd_flags) || - (sdp->sd_vfs->s_flags & MS_RDONLY)) - return 0; - - if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) { - gfs2_glock_dq(gh); - gfs2_holder_reinit(LM_ST_EXCLUSIVE, gh->gh_flags & ~LM_FLAG_ANY, - gh); - error = gfs2_glock_nq(gh); - if (error) - return error; - - /* Verify that atime hasn't been updated while we were - trying to get exclusive lock. */ - - tv = CURRENT_TIME; - if (tv.tv_sec - ip->i_inode.i_atime.tv_sec >= quantum) { - struct buffer_head *dibh; - struct gfs2_dinode *di; - - error = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (error == -EROFS) - return 0; - if (error) - goto fail; - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto fail_end_trans; - - ip->i_inode.i_atime = tv; - - gfs2_trans_add_bh(ip->i_gl, dibh, 1); - di = (struct gfs2_dinode *)dibh->b_data; - di->di_atime = cpu_to_be64(ip->i_inode.i_atime.tv_sec); - di->di_atime_nsec = cpu_to_be32(ip->i_inode.i_atime.tv_nsec); - brelse(dibh); - - gfs2_trans_end(sdp); - } - - /* If someone else has asked for the glock, - unlock and let them have it. Then reacquire - in the original state. */ - if (gfs2_glock_is_blocking(gl)) { - gfs2_glock_dq(gh); - gfs2_holder_reinit(state, flags, gh); - return gfs2_glock_nq(gh); - } - } - - return 0; - -fail_end_trans: - gfs2_trans_end(sdp); -fail: - gfs2_glock_dq(gh); - return error; -} - static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) { diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 58f9607d6a8..2d43f69610a 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -91,9 +91,7 @@ int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, const struct gfs2_inode *ip); int gfs2_permission(struct inode *inode, int mask); -int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to); int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); -int gfs2_glock_nq_atime(struct gfs2_holder *gh); int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); diff --git a/fs/gfs2/locking/dlm/mount.c b/fs/gfs2/locking/dlm/mount.c index 09d78c216f4..0c4cbe6c828 100644 --- a/fs/gfs2/locking/dlm/mount.c +++ b/fs/gfs2/locking/dlm/mount.c @@ -144,7 +144,8 @@ static int gdlm_mount(char *table_name, char *host_data, error = dlm_new_lockspace(ls->fsname, strlen(ls->fsname), &ls->dlm_lockspace, - DLM_LSFL_FS | (nodir ? DLM_LSFL_NODIR : 0), + DLM_LSFL_FS | DLM_LSFL_NEWEXCL | + (nodir ? DLM_LSFL_NODIR : 0), GDLM_LVB_SIZE); if (error) { log_error("dlm_new_lockspace error %d", error); diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 6c6af9f5e3a..ad305854bdc 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -18,6 +18,7 @@ #include <linux/delay.h> #include <linux/kthread.h> #include <linux/freezer.h> +#include <linux/bio.h> #include "gfs2.h" #include "incore.h" @@ -584,7 +585,6 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) memset(bh->b_data, 0, bh->b_size); set_buffer_uptodate(bh); clear_buffer_dirty(bh); - unlock_buffer(bh); gfs2_ail1_empty(sdp, 0); tail = current_tail(sdp); @@ -601,8 +601,23 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) hash = gfs2_disk_hash(bh->b_data, sizeof(struct gfs2_log_header)); lh->lh_hash = cpu_to_be32(hash); - set_buffer_dirty(bh); - if (sync_dirty_buffer(bh)) + bh->b_end_io = end_buffer_write_sync; + if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) + goto skip_barrier; + get_bh(bh); + submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh); + wait_on_buffer(bh); + if (buffer_eopnotsupp(bh)) { + clear_buffer_eopnotsupp(bh); + set_buffer_uptodate(bh); + set_bit(SDF_NOBARRIERS, &sdp->sd_flags); + lock_buffer(bh); +skip_barrier: + get_bh(bh); + submit_bh(WRITE_SYNC | (1 << BIO_RW_META), bh); + wait_on_buffer(bh); + } + if (!buffer_uptodate(bh)) gfs2_io_error_bh(sdp, bh); brelse(bh); diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c index b941f9f9f95..f96eb90a2cf 100644 --- a/fs/gfs2/mount.c +++ b/fs/gfs2/mount.c @@ -42,10 +42,11 @@ enum { Opt_nosuiddir, Opt_data_writeback, Opt_data_ordered, + Opt_meta, Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_lockproto, "lockproto=%s"}, {Opt_locktable, "locktable=%s"}, {Opt_hostdata, "hostdata=%s"}, @@ -66,6 +67,7 @@ static match_table_t tokens = { {Opt_nosuiddir, "nosuiddir"}, {Opt_data_writeback, "data=writeback"}, {Opt_data_ordered, "data=ordered"}, + {Opt_meta, "meta"}, {Opt_err, NULL} }; @@ -239,6 +241,11 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, char *data_arg, int remount) case Opt_data_ordered: args->ar_data = GFS2_DATA_ORDERED; break; + case Opt_meta: + if (remount && args->ar_meta != 1) + goto cant_remount; + args->ar_meta = 1; + break; case Opt_err: default: fs_info(sdp, "unknown option: %s\n", o); diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index e64a1b04117..27563816e1c 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -512,8 +512,8 @@ static int gfs2_readpage(struct file *file, struct page *page) int error; unlock_page(page); - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); - error = gfs2_glock_nq_atime(&gh); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + error = gfs2_glock_nq(&gh); if (unlikely(error)) goto out; error = AOP_TRUNCATED_PAGE; @@ -594,8 +594,8 @@ static int gfs2_readpages(struct file *file, struct address_space *mapping, struct gfs2_holder gh; int ret; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); - ret = gfs2_glock_nq_atime(&gh); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + ret = gfs2_glock_nq(&gh); if (unlikely(ret)) goto out_uninit; if (!gfs2_is_stuffed(ip)) @@ -636,8 +636,8 @@ static int gfs2_write_begin(struct file *file, struct address_space *mapping, unsigned to = from + len; struct page *page; - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &ip->i_gh); - error = gfs2_glock_nq_atime(&ip->i_gh); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &ip->i_gh); + error = gfs2_glock_nq(&ip->i_gh); if (unlikely(error)) goto out_uninit; @@ -975,7 +975,7 @@ static int gfs2_ok_for_dio(struct gfs2_inode *ip, int rw, loff_t offset) if (gfs2_is_stuffed(ip)) return 0; - if (offset > i_size_read(&ip->i_inode)) + if (offset >= i_size_read(&ip->i_inode)) return 0; return 1; } @@ -1000,8 +1000,8 @@ static ssize_t gfs2_direct_IO(int rw, struct kiocb *iocb, * unfortunately have the option of only flushing a range like * the VFS does. */ - gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, GL_ATIME, &gh); - rv = gfs2_glock_nq_atime(&gh); + gfs2_holder_init(ip->i_gl, LM_ST_DEFERRED, 0, &gh); + rv = gfs2_glock_nq(&gh); if (rv) return rv; rv = gfs2_ok_for_dio(ip, rw, offset); diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index e9a366d4411..3a747f8e218 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -89,8 +89,8 @@ static int gfs2_readdir(struct file *file, void *dirent, filldir_t filldir) u64 offset = file->f_pos; int error; - gfs2_holder_init(dip->i_gl, LM_ST_SHARED, GL_ATIME, &d_gh); - error = gfs2_glock_nq_atime(&d_gh); + gfs2_holder_init(dip->i_gl, LM_ST_SHARED, 0, &d_gh); + error = gfs2_glock_nq(&d_gh); if (error) { gfs2_holder_uninit(&d_gh); return error; @@ -153,8 +153,8 @@ static int gfs2_get_flags(struct file *filp, u32 __user *ptr) int error; u32 fsflags; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &gh); - error = gfs2_glock_nq_atime(&gh); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &gh); + error = gfs2_glock_nq(&gh); if (error) return error; @@ -351,8 +351,8 @@ static int gfs2_page_mkwrite(struct vm_area_struct *vma, struct page *page) struct gfs2_alloc *al; int ret; - gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_ATIME, &gh); - ret = gfs2_glock_nq_atime(&gh); + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + ret = gfs2_glock_nq(&gh); if (ret) goto out; @@ -434,8 +434,8 @@ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) struct gfs2_holder i_gh; int error; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, GL_ATIME, &i_gh); - error = gfs2_glock_nq_atime(&i_gh); + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); + error = gfs2_glock_nq(&i_gh); if (error) { gfs2_holder_uninit(&i_gh); return error; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index b4d1d649063..b117fcf2c4f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -40,6 +40,44 @@ #define DO 0 #define UNDO 1 +static const u32 gfs2_old_fs_formats[] = { + 0 +}; + +static const u32 gfs2_old_multihost_formats[] = { + 0 +}; + +/** + * gfs2_tune_init - Fill a gfs2_tune structure with default values + * @gt: tune + * + */ + +static void gfs2_tune_init(struct gfs2_tune *gt) +{ + spin_lock_init(>->gt_spin); + + gt->gt_demote_secs = 300; + gt->gt_incore_log_blocks = 1024; + gt->gt_log_flush_secs = 60; + gt->gt_recoverd_secs = 60; + gt->gt_logd_secs = 1; + gt->gt_quotad_secs = 5; + gt->gt_quota_simul_sync = 64; + gt->gt_quota_warn_period = 10; + gt->gt_quota_scale_num = 1; + gt->gt_quota_scale_den = 1; + gt->gt_quota_cache_secs = 300; + gt->gt_quota_quantum = 60; + gt->gt_new_files_jdata = 0; + gt->gt_max_readahead = 1 << 18; + gt->gt_stall_secs = 600; + gt->gt_complain_secs = 10; + gt->gt_statfs_quantum = 30; + gt->gt_statfs_slow = 0; +} + static struct gfs2_sbd *init_sbd(struct super_block *sb) { struct gfs2_sbd *sdp; @@ -96,21 +134,271 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) return sdp; } -static void init_vfs(struct super_block *sb, unsigned noatime) + +/** + * gfs2_check_sb - Check superblock + * @sdp: the filesystem + * @sb: The superblock + * @silent: Don't print a message if the check fails + * + * Checks the version code of the FS is one that we understand how to + * read and that the sizes of the various on-disk structures have not + * changed. + */ + +static int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent) { - struct gfs2_sbd *sdp = sb->s_fs_info; + unsigned int x; - sb->s_magic = GFS2_MAGIC; - sb->s_op = &gfs2_super_ops; - sb->s_export_op = &gfs2_export_ops; - sb->s_time_gran = 1; - sb->s_maxbytes = MAX_LFS_FILESIZE; + if (sb->sb_magic != GFS2_MAGIC || + sb->sb_type != GFS2_METATYPE_SB) { + if (!silent) + printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n"); + return -EINVAL; + } + + /* If format numbers match exactly, we're done. */ + + if (sb->sb_fs_format == GFS2_FORMAT_FS && + sb->sb_multihost_format == GFS2_FORMAT_MULTI) + return 0; + + if (sb->sb_fs_format != GFS2_FORMAT_FS) { + for (x = 0; gfs2_old_fs_formats[x]; x++) + if (gfs2_old_fs_formats[x] == sb->sb_fs_format) + break; + + if (!gfs2_old_fs_formats[x]) { + printk(KERN_WARNING + "GFS2: code version (%u, %u) is incompatible " + "with ondisk format (%u, %u)\n", + GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, + sb->sb_fs_format, sb->sb_multihost_format); + printk(KERN_WARNING + "GFS2: I don't know how to upgrade this FS\n"); + return -EINVAL; + } + } + + if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) { + for (x = 0; gfs2_old_multihost_formats[x]; x++) + if (gfs2_old_multihost_formats[x] == + sb->sb_multihost_format) + break; + + if (!gfs2_old_multihost_formats[x]) { + printk(KERN_WARNING + "GFS2: code version (%u, %u) is incompatible " + "with ondisk format (%u, %u)\n", + GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, + sb->sb_fs_format, sb->sb_multihost_format); + printk(KERN_WARNING + "GFS2: I don't know how to upgrade this FS\n"); + return -EINVAL; + } + } + + if (!sdp->sd_args.ar_upgrade) { + printk(KERN_WARNING + "GFS2: code version (%u, %u) is incompatible " + "with ondisk format (%u, %u)\n", + GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, + sb->sb_fs_format, sb->sb_multihost_format); + printk(KERN_INFO + "GFS2: Use the \"upgrade\" mount option to upgrade " + "the FS\n"); + printk(KERN_INFO "GFS2: See the manual for more details\n"); + return -EINVAL; + } + + return 0; +} + +static void end_bio_io_page(struct bio *bio, int error) +{ + struct page *page = bio->bi_private; - if (sb->s_flags & (MS_NOATIME | MS_NODIRATIME)) - set_bit(noatime, &sdp->sd_flags); + if (!error) + SetPageUptodate(page); + else + printk(KERN_WARNING "gfs2: error %d reading superblock\n", error); + unlock_page(page); +} + +static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf) +{ + const struct gfs2_sb *str = buf; + + sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic); + sb->sb_type = be32_to_cpu(str->sb_header.mh_type); + sb->sb_format = be32_to_cpu(str->sb_header.mh_format); + sb->sb_fs_format = be32_to_cpu(str->sb_fs_format); + sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format); + sb->sb_bsize = be32_to_cpu(str->sb_bsize); + sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift); + sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr); + sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino); + sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr); + sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino); + + memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); + memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); +} + +/** + * gfs2_read_super - Read the gfs2 super block from disk + * @sdp: The GFS2 super block + * @sector: The location of the super block + * @error: The error code to return + * + * This uses the bio functions to read the super block from disk + * because we want to be 100% sure that we never read cached data. + * A super block is read twice only during each GFS2 mount and is + * never written to by the filesystem. The first time its read no + * locks are held, and the only details which are looked at are those + * relating to the locking protocol. Once locking is up and working, + * the sb is read again under the lock to establish the location of + * the master directory (contains pointers to journals etc) and the + * root directory. + * + * Returns: 0 on success or error + */ + +static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) +{ + struct super_block *sb = sdp->sd_vfs; + struct gfs2_sb *p; + struct page *page; + struct bio *bio; + + page = alloc_page(GFP_NOFS); + if (unlikely(!page)) + return -ENOBUFS; + + ClearPageUptodate(page); + ClearPageDirty(page); + lock_page(page); + + bio = bio_alloc(GFP_NOFS, 1); + if (unlikely(!bio)) { + __free_page(page); + return -ENOBUFS; + } - /* Don't let the VFS update atimes. GFS2 handles this itself. */ - sb->s_flags |= MS_NOATIME | MS_NODIRATIME; + bio->bi_sector = sector * (sb->s_blocksize >> 9); + bio->bi_bdev = sb->s_bdev; + bio_add_page(bio, page, PAGE_SIZE, 0); + + bio->bi_end_io = end_bio_io_page; + bio->bi_private = page; + submit_bio(READ_SYNC | (1 << BIO_RW_META), bio); + wait_on_page_locked(page); + bio_put(bio); + if (!PageUptodate(page)) { + __free_page(page); + return -EIO; + } + p = kmap(page); + gfs2_sb_in(&sdp->sd_sb, p); + kunmap(page); + __free_page(page); + return 0; +} +/** + * gfs2_read_sb - Read super block + * @sdp: The GFS2 superblock + * @gl: the glock for the superblock (assumed to be held) + * @silent: Don't print message if mount fails + * + */ + +static int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent) +{ + u32 hash_blocks, ind_blocks, leaf_blocks; + u32 tmp_blocks; + unsigned int x; + int error; + + error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); + if (error) { + if (!silent) + fs_err(sdp, "can't read superblock\n"); + return error; + } + + error = gfs2_check_sb(sdp, &sdp->sd_sb, silent); + if (error) + return error; + + sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - + GFS2_BASIC_BLOCK_SHIFT; + sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode)) / sizeof(u64); + sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / sizeof(u64); + sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header); + sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2; + sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1; + sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64); + sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_meta_header)) / + sizeof(struct gfs2_quota_change); + + /* Compute maximum reservation required to add a entry to a directory */ + + hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH), + sdp->sd_jbsize); + + ind_blocks = 0; + for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) { + tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs); + ind_blocks += tmp_blocks; + } + + leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH; + + sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks; + + sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode); + sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs; + for (x = 2;; x++) { + u64 space, d; + u32 m; + + space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs; + d = space; + m = do_div(d, sdp->sd_inptrs); + + if (d != sdp->sd_heightsize[x - 1] || m) + break; + sdp->sd_heightsize[x] = space; + } + sdp->sd_max_height = x; + sdp->sd_heightsize[x] = ~0; + gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT); + + sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - + sizeof(struct gfs2_dinode); + sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs; + for (x = 2;; x++) { + u64 space, d; + u32 m; + + space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs; + d = space; + m = do_div(d, sdp->sd_inptrs); + + if (d != sdp->sd_jheightsize[x - 1] || m) + break; + sdp->sd_jheightsize[x] = space; + } + sdp->sd_max_jheight = x; + sdp->sd_jheightsize[x] = ~0; + gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); + + return 0; } static int init_names(struct gfs2_sbd *sdp, int silent) @@ -224,51 +512,59 @@ fail: return error; } -static inline struct inode *gfs2_lookup_root(struct super_block *sb, - u64 no_addr) +static int gfs2_lookup_root(struct super_block *sb, struct dentry **dptr, + u64 no_addr, const char *name) { - return gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); + struct gfs2_sbd *sdp = sb->s_fs_info; + struct dentry *dentry; + struct inode *inode; + + inode = gfs2_inode_lookup(sb, DT_DIR, no_addr, 0, 0); + if (IS_ERR(inode)) { + fs_err(sdp, "can't read in %s inode: %ld\n", name, PTR_ERR(inode)); + return PTR_ERR(inode); + } + dentry = d_alloc_root(inode); + if (!dentry) { + fs_err(sdp, "can't alloc %s dentry\n", name); + iput(inode); + return -ENOMEM; + } + dentry->d_op = &gfs2_dops; + *dptr = dentry; + return 0; } -static int init_sb(struct gfs2_sbd *sdp, int silent, int undo) +static int init_sb(struct gfs2_sbd *sdp, int silent) { struct super_block *sb = sdp->sd_vfs; struct gfs2_holder sb_gh; u64 no_addr; - struct inode *inode; - int error = 0; + int ret; - if (undo) { - if (sb->s_root) { - dput(sb->s_root); - sb->s_root = NULL; - } - return 0; + ret = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops, + LM_ST_SHARED, 0, &sb_gh); + if (ret) { + fs_err(sdp, "can't acquire superblock glock: %d\n", ret); + return ret; } - error = gfs2_glock_nq_num(sdp, GFS2_SB_LOCK, &gfs2_meta_glops, - LM_ST_SHARED, 0, &sb_gh); - if (error) { - fs_err(sdp, "can't acquire superblock glock: %d\n", error); - return error; - } - - error = gfs2_read_sb(sdp, sb_gh.gh_gl, silent); - if (error) { - fs_err(sdp, "can't read superblock: %d\n", error); + ret = gfs2_read_sb(sdp, sb_gh.gh_gl, silent); + if (ret) { + fs_err(sdp, "can't read superblock: %d\n", ret); goto out; } /* Set up the buffer cache and SB for real */ if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) { - error = -EINVAL; + ret = -EINVAL; fs_err(sdp, "FS block size (%u) is too small for device " "block size (%u)\n", sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev)); goto out; } if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { - error = -EINVAL; + ret = -EINVAL; fs_err(sdp, "FS block size (%u) is too big for machine " "page size (%u)\n", sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE); @@ -278,26 +574,21 @@ static int init_sb(struct gfs2_sbd *sdp, int silent, int undo) /* Get the root inode */ no_addr = sdp->sd_sb.sb_root_dir.no_addr; - if (sb->s_type == &gfs2meta_fs_type) - no_addr = sdp->sd_sb.sb_master_dir.no_addr; - inode = gfs2_lookup_root(sb, no_addr); - if (IS_ERR(inode)) { - error = PTR_ERR(inode); - fs_err(sdp, "can't read in root inode: %d\n", error); + ret = gfs2_lookup_root(sb, &sdp->sd_root_dir, no_addr, "root"); + if (ret) goto out; - } - sb->s_root = d_alloc_root(inode); - if (!sb->s_root) { - fs_err(sdp, "can't get root dentry\n"); - error = -ENOMEM; - iput(inode); - } else - sb->s_root->d_op = &gfs2_dops; - + /* Get the master inode */ + no_addr = sdp->sd_sb.sb_master_dir.no_addr; + ret = gfs2_lookup_root(sb, &sdp->sd_master_dir, no_addr, "master"); + if (ret) { + dput(sdp->sd_root_dir); + goto out; + } + sb->s_root = dget(sdp->sd_args.ar_meta ? sdp->sd_master_dir : sdp->sd_root_dir); out: gfs2_glock_dq_uninit(&sb_gh); - return error; + return ret; } /** @@ -372,6 +663,7 @@ static void gfs2_lm_others_may_mount(struct gfs2_sbd *sdp) static int init_journal(struct gfs2_sbd *sdp, int undo) { + struct inode *master = sdp->sd_master_dir->d_inode; struct gfs2_holder ji_gh; struct task_struct *p; struct gfs2_inode *ip; @@ -383,7 +675,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) goto fail_recoverd; } - sdp->sd_jindex = gfs2_lookup_simple(sdp->sd_master_dir, "jindex"); + sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); if (IS_ERR(sdp->sd_jindex)) { fs_err(sdp, "can't lookup journal index: %d\n", error); return PTR_ERR(sdp->sd_jindex); @@ -506,25 +798,17 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) { int error = 0; struct gfs2_inode *ip; - struct inode *inode; + struct inode *master = sdp->sd_master_dir->d_inode; if (undo) goto fail_qinode; - inode = gfs2_lookup_root(sdp->sd_vfs, sdp->sd_sb.sb_master_dir.no_addr); - if (IS_ERR(inode)) { - error = PTR_ERR(inode); - fs_err(sdp, "can't read in master directory: %d\n", error); - goto fail; - } - sdp->sd_master_dir = inode; - error = init_journal(sdp, undo); if (error) - goto fail_master; + goto fail; /* Read in the master inode number inode */ - sdp->sd_inum_inode = gfs2_lookup_simple(sdp->sd_master_dir, "inum"); + sdp->sd_inum_inode = gfs2_lookup_simple(master, "inum"); if (IS_ERR(sdp->sd_inum_inode)) { error = PTR_ERR(sdp->sd_inum_inode); fs_err(sdp, "can't read in inum inode: %d\n", error); @@ -533,7 +817,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) /* Read in the master statfs inode */ - sdp->sd_statfs_inode = gfs2_lookup_simple(sdp->sd_master_dir, "statfs"); + sdp->sd_statfs_inode = gfs2_lookup_simple(master, "statfs"); if (IS_ERR(sdp->sd_statfs_inode)) { error = PTR_ERR(sdp->sd_statfs_inode); fs_err(sdp, "can't read in statfs inode: %d\n", error); @@ -541,7 +825,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) } /* Read in the resource index inode */ - sdp->sd_rindex = gfs2_lookup_simple(sdp->sd_master_dir, "rindex"); + sdp->sd_rindex = gfs2_lookup_simple(master, "rindex"); if (IS_ERR(sdp->sd_rindex)) { error = PTR_ERR(sdp->sd_rindex); fs_err(sdp, "can't get resource index inode: %d\n", error); @@ -552,7 +836,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo) sdp->sd_rindex_uptodate = 0; /* Read in the quota inode */ - sdp->sd_quota_inode = gfs2_lookup_simple(sdp->sd_master_dir, "quota"); + sdp->sd_quota_inode = gfs2_lookup_simple(master, "quota"); if (IS_ERR(sdp->sd_quota_inode)) { error = PTR_ERR(sdp->sd_quota_inode); fs_err(sdp, "can't get quota file inode: %d\n", error); @@ -571,8 +855,6 @@ fail_inum: iput(sdp->sd_inum_inode); fail_journal: init_journal(sdp, UNDO); -fail_master: - iput(sdp->sd_master_dir); fail: return error; } @@ -583,6 +865,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) char buf[30]; int error = 0; struct gfs2_inode *ip; + struct inode *master = sdp->sd_master_dir->d_inode; if (sdp->sd_args.ar_spectator) return 0; @@ -590,7 +873,7 @@ static int init_per_node(struct gfs2_sbd *sdp, int undo) if (undo) goto fail_qc_gh; - pn = gfs2_lookup_simple(sdp->sd_master_dir, "per_node"); + pn = gfs2_lookup_simple(master, "per_node"); if (IS_ERR(pn)) { error = PTR_ERR(pn); fs_err(sdp, "can't find per_node directory: %d\n", error); @@ -800,7 +1083,11 @@ static int fill_super(struct super_block *sb, void *data, int silent) goto fail; } - init_vfs(sb, SDF_NOATIME); + sb->s_magic = GFS2_MAGIC; + sb->s_op = &gfs2_super_ops; + sb->s_export_op = &gfs2_export_ops; + sb->s_time_gran = 1; + sb->s_maxbytes = MAX_LFS_FILESIZE; /* Set up the buffer cache and fill in some fake block size values to allow us to read-in the on-disk superblock. */ @@ -828,7 +1115,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) if (error) goto fail_lm; - error = init_sb(sdp, silent, DO); + error = init_sb(sdp, silent); if (error) goto fail_locking; @@ -869,7 +1156,11 @@ fail_per_node: fail_inodes: init_inodes(sdp, UNDO); fail_sb: - init_sb(sdp, 0, UNDO); + if (sdp->sd_root_dir) + dput(sdp->sd_root_dir); + if (sdp->sd_master_dir) + dput(sdp->sd_master_dir); + sb->s_root = NULL; fail_locking: init_locking(sdp, &mount_gh, UNDO); fail_lm: @@ -887,151 +1178,63 @@ fail: } static int gfs2_get_sb(struct file_system_type *fs_type, int flags, - const char *dev_name, void *data, struct vfsmount *mnt) + const char *dev_name, void *data, struct vfsmount *mnt) { - struct super_block *sb; - struct gfs2_sbd *sdp; - int error = get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); - if (error) - goto out; - sb = mnt->mnt_sb; - sdp = sb->s_fs_info; - sdp->sd_gfs2mnt = mnt; -out: - return error; + return get_sb_bdev(fs_type, flags, dev_name, data, fill_super, mnt); } -static int fill_super_meta(struct super_block *sb, struct super_block *new, - void *data, int silent) +static struct super_block *get_gfs2_sb(const char *dev_name) { - struct gfs2_sbd *sdp = sb->s_fs_info; - struct inode *inode; - int error = 0; - - new->s_fs_info = sdp; - sdp->sd_vfs_meta = sb; - - init_vfs(new, SDF_NOATIME); - - /* Get the master inode */ - inode = igrab(sdp->sd_master_dir); - - new->s_root = d_alloc_root(inode); - if (!new->s_root) { - fs_err(sdp, "can't get root dentry\n"); - error = -ENOMEM; - iput(inode); - } else - new->s_root->d_op = &gfs2_dops; - - return error; -} - -static int set_bdev_super(struct super_block *s, void *data) -{ - s->s_bdev = data; - s->s_dev = s->s_bdev->bd_dev; - return 0; -} - -static int test_bdev_super(struct super_block *s, void *data) -{ - return s->s_bdev == data; -} - -static struct super_block* get_gfs2_sb(const char *dev_name) -{ - struct kstat stat; + struct super_block *sb; struct nameidata nd; - struct super_block *sb = NULL, *s; int error; error = path_lookup(dev_name, LOOKUP_FOLLOW, &nd); if (error) { - printk(KERN_WARNING "GFS2: path_lookup on %s returned error\n", - dev_name); - goto out; - } - error = vfs_getattr(nd.path.mnt, nd.path.dentry, &stat); - - list_for_each_entry(s, &gfs2_fs_type.fs_supers, s_instances) { - if ((S_ISBLK(stat.mode) && s->s_dev == stat.rdev) || - (S_ISDIR(stat.mode) && - s == nd.path.dentry->d_inode->i_sb)) { - sb = s; - goto free_nd; - } + printk(KERN_WARNING "GFS2: path_lookup on %s returned error %d\n", + dev_name, error); + return NULL; } - - printk(KERN_WARNING "GFS2: Unrecognized block device or " - "mount point %s\n", dev_name); - -free_nd: + sb = nd.path.dentry->d_inode->i_sb; + if (sb && (sb->s_type == &gfs2_fs_type)) + atomic_inc(&sb->s_active); + else + sb = NULL; path_put(&nd.path); -out: return sb; } static int gfs2_get_sb_meta(struct file_system_type *fs_type, int flags, const char *dev_name, void *data, struct vfsmount *mnt) { - int error = 0; - struct super_block *sb = NULL, *new; + struct super_block *sb = NULL; struct gfs2_sbd *sdp; sb = get_gfs2_sb(dev_name); if (!sb) { printk(KERN_WARNING "GFS2: gfs2 mount does not exist\n"); - error = -ENOENT; - goto error; + return -ENOENT; } sdp = sb->s_fs_info; - if (sdp->sd_vfs_meta) { - printk(KERN_WARNING "GFS2: gfs2meta mount already exists\n"); - error = -EBUSY; - goto error; - } - down(&sb->s_bdev->bd_mount_sem); - new = sget(fs_type, test_bdev_super, set_bdev_super, sb->s_bdev); - up(&sb->s_bdev->bd_mount_sem); - if (IS_ERR(new)) { - error = PTR_ERR(new); - goto error; - } - new->s_flags = flags; - strlcpy(new->s_id, sb->s_id, sizeof(new->s_id)); - sb_set_blocksize(new, sb->s_blocksize); - error = fill_super_meta(sb, new, data, flags & MS_SILENT ? 1 : 0); - if (error) { - up_write(&new->s_umount); - deactivate_super(new); - goto error; - } - - new->s_flags |= MS_ACTIVE; - - /* Grab a reference to the gfs2 mount point */ - atomic_inc(&sdp->sd_gfs2mnt->mnt_count); - return simple_set_mnt(mnt, new); -error: - return error; + mnt->mnt_sb = sb; + mnt->mnt_root = dget(sdp->sd_master_dir); + return 0; } static void gfs2_kill_sb(struct super_block *sb) { - if (sb->s_fs_info) { - gfs2_delete_debugfs_file(sb->s_fs_info); - gfs2_meta_syncfs(sb->s_fs_info); + struct gfs2_sbd *sdp = sb->s_fs_info; + if (sdp) { + gfs2_meta_syncfs(sdp); + dput(sdp->sd_root_dir); + dput(sdp->sd_master_dir); + sdp->sd_root_dir = NULL; + sdp->sd_master_dir = NULL; } + shrink_dcache_sb(sb); kill_block_super(sb); -} - -static void gfs2_kill_sb_meta(struct super_block *sb) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - generic_shutdown_super(sb); - sdp->sd_vfs_meta = NULL; - atomic_dec(&sdp->sd_gfs2mnt->mnt_count); + if (sdp) + gfs2_delete_debugfs_file(sdp); } struct file_system_type gfs2_fs_type = { @@ -1046,7 +1249,6 @@ struct file_system_type gfs2meta_fs_type = { .name = "gfs2meta", .fs_flags = FS_REQUIRES_DEV, .get_sb = gfs2_get_sb_meta, - .kill_sb = gfs2_kill_sb_meta, .owner = THIS_MODULE, }; diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index e2c62f73a77..534e1e2c65c 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -159,9 +159,13 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, gfs2_holder_init(dip->i_gl, LM_ST_EXCLUSIVE, 0, ghs); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, ghs + 1); - error = gfs2_glock_nq_m(2, ghs); + error = gfs2_glock_nq(ghs); /* parent */ if (error) - goto out; + goto out_parent; + + error = gfs2_glock_nq(ghs + 1); /* child */ + if (error) + goto out_child; error = gfs2_permission(dir, MAY_WRITE | MAY_EXEC); if (error) @@ -245,8 +249,10 @@ out_alloc: if (alloc_required) gfs2_alloc_put(dip); out_gunlock: - gfs2_glock_dq_m(2, ghs); -out: + gfs2_glock_dq(ghs + 1); +out_child: + gfs2_glock_dq(ghs); +out_parent: gfs2_holder_uninit(ghs); gfs2_holder_uninit(ghs + 1); if (!error) { @@ -302,7 +308,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) error = gfs2_unlink_ok(dip, &dentry->d_name, ip); if (error) - goto out_rgrp; + goto out_gunlock; error = gfs2_trans_begin(sdp, 2*RES_DINODE + RES_LEAF + RES_RG_BIT, 0); if (error) @@ -316,6 +322,7 @@ static int gfs2_unlink(struct inode *dir, struct dentry *dentry) out_end_trans: gfs2_trans_end(sdp); +out_gunlock: gfs2_glock_dq(ghs + 2); out_rgrp: gfs2_holder_uninit(ghs + 2); @@ -485,7 +492,6 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry) struct gfs2_holder ri_gh; int error; - error = gfs2_rindex_hold(sdp, &ri_gh); if (error) return error; @@ -495,9 +501,17 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry) rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr); gfs2_holder_init(rgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + 2); - error = gfs2_glock_nq_m(3, ghs); + error = gfs2_glock_nq(ghs); /* parent */ if (error) - goto out; + goto out_parent; + + error = gfs2_glock_nq(ghs + 1); /* child */ + if (error) + goto out_child; + + error = gfs2_glock_nq(ghs + 2); /* rgrp */ + if (error) + goto out_rgrp; error = gfs2_unlink_ok(dip, &dentry->d_name, ip); if (error) @@ -523,11 +537,15 @@ static int gfs2_rmdir(struct inode *dir, struct dentry *dentry) gfs2_trans_end(sdp); out_gunlock: - gfs2_glock_dq_m(3, ghs); -out: - gfs2_holder_uninit(ghs); - gfs2_holder_uninit(ghs + 1); + gfs2_glock_dq(ghs + 2); +out_rgrp: gfs2_holder_uninit(ghs + 2); + gfs2_glock_dq(ghs + 1); +out_child: + gfs2_holder_uninit(ghs + 1); + gfs2_glock_dq(ghs); +out_parent: + gfs2_holder_uninit(ghs); gfs2_glock_dq_uninit(&ri_gh); return error; } @@ -571,6 +589,54 @@ static int gfs2_mknod(struct inode *dir, struct dentry *dentry, int mode, return 0; } +/* + * gfs2_ok_to_move - check if it's ok to move a directory to another directory + * @this: move this + * @to: to here + * + * Follow @to back to the root and make sure we don't encounter @this + * Assumes we already hold the rename lock. + * + * Returns: errno + */ + +static int gfs2_ok_to_move(struct gfs2_inode *this, struct gfs2_inode *to) +{ + struct inode *dir = &to->i_inode; + struct super_block *sb = dir->i_sb; + struct inode *tmp; + struct qstr dotdot; + int error = 0; + + gfs2_str2qstr(&dotdot, ".."); + + igrab(dir); + + for (;;) { + if (dir == &this->i_inode) { + error = -EINVAL; + break; + } + if (dir == sb->s_root->d_inode) { + error = 0; + break; + } + + tmp = gfs2_lookupi(dir, &dotdot, 1); + if (IS_ERR(tmp)) { + error = PTR_ERR(tmp); + break; + } + + iput(dir); + dir = tmp; + } + + iput(dir); + + return error; +} + /** * gfs2_rename - Rename a file * @odir: Parent directory of old file name @@ -589,7 +655,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_inode *ip = GFS2_I(odentry->d_inode); struct gfs2_inode *nip = NULL; struct gfs2_sbd *sdp = GFS2_SB(odir); - struct gfs2_holder ghs[5], r_gh; + struct gfs2_holder ghs[5], r_gh = { .gh_gl = NULL, }; struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; @@ -603,19 +669,20 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, return 0; } - /* Make sure we aren't trying to move a dirctory into it's subdir */ - - if (S_ISDIR(ip->i_inode.i_mode) && odip != ndip) { - dir_rename = 1; - error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, 0, - &r_gh); + if (odip != ndip) { + error = gfs2_glock_nq_init(sdp->sd_rename_gl, LM_ST_EXCLUSIVE, + 0, &r_gh); if (error) goto out; - error = gfs2_ok_to_move(ip, ndip); - if (error) - goto out_gunlock_r; + if (S_ISDIR(ip->i_inode.i_mode)) { + dir_rename = 1; + /* don't move a dirctory into it's subdir */ + error = gfs2_ok_to_move(ip, ndip); + if (error) + goto out_gunlock_r; + } } num_gh = 1; @@ -639,9 +706,11 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, gfs2_holder_init(nrgd->rd_gl, LM_ST_EXCLUSIVE, 0, ghs + num_gh++); } - error = gfs2_glock_nq_m(num_gh, ghs); - if (error) - goto out_uninit; + for (x = 0; x < num_gh; x++) { + error = gfs2_glock_nq(ghs + x); + if (error) + goto out_gunlock; + } /* Check out the old directory */ @@ -804,12 +873,12 @@ out_alloc: if (alloc_required) gfs2_alloc_put(ndip); out_gunlock: - gfs2_glock_dq_m(num_gh, ghs); -out_uninit: - for (x = 0; x < num_gh; x++) + while (x--) { + gfs2_glock_dq(ghs + x); gfs2_holder_uninit(ghs + x); + } out_gunlock_r: - if (dir_rename) + if (r_gh.gh_gl) gfs2_glock_dq_uninit(&r_gh); out: return error; diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index f66ea0f7a35..d5355d9b592 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -20,6 +20,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/lm_interface.h> +#include <linux/time.h> #include "gfs2.h" #include "incore.h" @@ -38,6 +39,7 @@ #include "dir.h" #include "eattr.h" #include "bmap.h" +#include "meta_io.h" /** * gfs2_write_inode - Make sure the inode is stable on the disk @@ -50,16 +52,74 @@ static int gfs2_write_inode(struct inode *inode, int sync) { struct gfs2_inode *ip = GFS2_I(inode); - - /* Check this is a "normal" inode */ - if (test_bit(GIF_USER, &ip->i_flags)) { - if (current->flags & PF_MEMALLOC) - return 0; - if (sync) - gfs2_log_flush(GFS2_SB(inode), ip->i_gl); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_holder gh; + struct buffer_head *bh; + struct timespec atime; + struct gfs2_dinode *di; + int ret = 0; + + /* Check this is a "normal" inode, etc */ + if (!test_bit(GIF_USER, &ip->i_flags) || + (current->flags & PF_MEMALLOC)) + return 0; + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) + goto do_flush; + ret = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (ret) + goto do_unlock; + ret = gfs2_meta_inode_buffer(ip, &bh); + if (ret == 0) { + di = (struct gfs2_dinode *)bh->b_data; + atime.tv_sec = be64_to_cpu(di->di_atime); + atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); + if (timespec_compare(&inode->i_atime, &atime) > 0) { + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_dinode_out(ip, bh->b_data); + } + brelse(bh); } + gfs2_trans_end(sdp); +do_unlock: + gfs2_glock_dq_uninit(&gh); +do_flush: + if (sync != 0) + gfs2_log_flush(GFS2_SB(inode), ip->i_gl); + return ret; +} - return 0; +/** + * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one + * @sdp: the filesystem + * + * Returns: errno + */ + +static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) +{ + struct gfs2_holder t_gh; + int error; + + gfs2_quota_sync(sdp); + gfs2_statfs_sync(sdp); + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, + &t_gh); + if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return error; + + gfs2_meta_syncfs(sdp); + gfs2_log_shutdown(sdp); + + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + + if (t_gh.gh_gl) + gfs2_glock_dq_uninit(&t_gh); + + gfs2_quota_cleanup(sdp); + + return error; } /** @@ -73,12 +133,6 @@ static void gfs2_put_super(struct super_block *sb) struct gfs2_sbd *sdp = sb->s_fs_info; int error; - if (!sdp) - return; - - if (!strncmp(sb->s_type->name, "gfs2meta", 8)) - return; /* Nothing to do */ - /* Unfreeze the filesystem, if we need to */ mutex_lock(&sdp->sd_freeze_lock); @@ -101,7 +155,6 @@ static void gfs2_put_super(struct super_block *sb) /* Release stuff */ - iput(sdp->sd_master_dir); iput(sdp->sd_jindex); iput(sdp->sd_inum_inode); iput(sdp->sd_statfs_inode); @@ -152,6 +205,7 @@ static void gfs2_write_super(struct super_block *sb) * * Flushes the log to disk. */ + static int gfs2_sync_fs(struct super_block *sb, int wait) { sb->s_dirt = 0; @@ -270,14 +324,6 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) } } - if (*flags & (MS_NOATIME | MS_NODIRATIME)) - set_bit(SDF_NOATIME, &sdp->sd_flags); - else - clear_bit(SDF_NOATIME, &sdp->sd_flags); - - /* Don't let the VFS update atimes. GFS2 handles this itself. */ - *flags |= MS_NOATIME | MS_NODIRATIME; - return error; } @@ -295,6 +341,7 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) * inode's blocks, or alternatively pass the baton on to another * node for later deallocation. */ + static void gfs2_drop_inode(struct inode *inode) { struct gfs2_inode *ip = GFS2_I(inode); @@ -333,6 +380,16 @@ static void gfs2_clear_inode(struct inode *inode) } } +static int is_ancestor(const struct dentry *d1, const struct dentry *d2) +{ + do { + if (d1 == d2) + return 1; + d1 = d1->d_parent; + } while (!IS_ROOT(d1)); + return 0; +} + /** * gfs2_show_options - Show mount options for /proc/mounts * @s: seq_file structure @@ -346,6 +403,8 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; struct gfs2_args *args = &sdp->sd_args; + if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) + seq_printf(s, ",meta"); if (args->ar_lockproto[0]) seq_printf(s, ",lockproto=%s", args->ar_lockproto); if (args->ar_locktable[0]) @@ -414,6 +473,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) * conversion on the iopen lock, but we can change that later. This * is safe, just less efficient. */ + static void gfs2_delete_inode(struct inode *inode) { struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; @@ -478,8 +538,6 @@ out: clear_inode(inode); } - - static struct inode *gfs2_alloc_inode(struct super_block *sb) { struct gfs2_inode *ip; diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index ca831991cbc..c3ba3d9d0aa 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -33,313 +33,6 @@ #include "trans.h" #include "util.h" -static const u32 gfs2_old_fs_formats[] = { - 0 -}; - -static const u32 gfs2_old_multihost_formats[] = { - 0 -}; - -/** - * gfs2_tune_init - Fill a gfs2_tune structure with default values - * @gt: tune - * - */ - -void gfs2_tune_init(struct gfs2_tune *gt) -{ - spin_lock_init(>->gt_spin); - - gt->gt_demote_secs = 300; - gt->gt_incore_log_blocks = 1024; - gt->gt_log_flush_secs = 60; - gt->gt_recoverd_secs = 60; - gt->gt_logd_secs = 1; - gt->gt_quotad_secs = 5; - gt->gt_quota_simul_sync = 64; - gt->gt_quota_warn_period = 10; - gt->gt_quota_scale_num = 1; - gt->gt_quota_scale_den = 1; - gt->gt_quota_cache_secs = 300; - gt->gt_quota_quantum = 60; - gt->gt_atime_quantum = 3600; - gt->gt_new_files_jdata = 0; - gt->gt_max_readahead = 1 << 18; - gt->gt_stall_secs = 600; - gt->gt_complain_secs = 10; - gt->gt_statfs_quantum = 30; - gt->gt_statfs_slow = 0; -} - -/** - * gfs2_check_sb - Check superblock - * @sdp: the filesystem - * @sb: The superblock - * @silent: Don't print a message if the check fails - * - * Checks the version code of the FS is one that we understand how to - * read and that the sizes of the various on-disk structures have not - * changed. - */ - -int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent) -{ - unsigned int x; - - if (sb->sb_magic != GFS2_MAGIC || - sb->sb_type != GFS2_METATYPE_SB) { - if (!silent) - printk(KERN_WARNING "GFS2: not a GFS2 filesystem\n"); - return -EINVAL; - } - - /* If format numbers match exactly, we're done. */ - - if (sb->sb_fs_format == GFS2_FORMAT_FS && - sb->sb_multihost_format == GFS2_FORMAT_MULTI) - return 0; - - if (sb->sb_fs_format != GFS2_FORMAT_FS) { - for (x = 0; gfs2_old_fs_formats[x]; x++) - if (gfs2_old_fs_formats[x] == sb->sb_fs_format) - break; - - if (!gfs2_old_fs_formats[x]) { - printk(KERN_WARNING - "GFS2: code version (%u, %u) is incompatible " - "with ondisk format (%u, %u)\n", - GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, - sb->sb_fs_format, sb->sb_multihost_format); - printk(KERN_WARNING - "GFS2: I don't know how to upgrade this FS\n"); - return -EINVAL; - } - } - - if (sb->sb_multihost_format != GFS2_FORMAT_MULTI) { - for (x = 0; gfs2_old_multihost_formats[x]; x++) - if (gfs2_old_multihost_formats[x] == - sb->sb_multihost_format) - break; - - if (!gfs2_old_multihost_formats[x]) { - printk(KERN_WARNING - "GFS2: code version (%u, %u) is incompatible " - "with ondisk format (%u, %u)\n", - GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, - sb->sb_fs_format, sb->sb_multihost_format); - printk(KERN_WARNING - "GFS2: I don't know how to upgrade this FS\n"); - return -EINVAL; - } - } - - if (!sdp->sd_args.ar_upgrade) { - printk(KERN_WARNING - "GFS2: code version (%u, %u) is incompatible " - "with ondisk format (%u, %u)\n", - GFS2_FORMAT_FS, GFS2_FORMAT_MULTI, - sb->sb_fs_format, sb->sb_multihost_format); - printk(KERN_INFO - "GFS2: Use the \"upgrade\" mount option to upgrade " - "the FS\n"); - printk(KERN_INFO "GFS2: See the manual for more details\n"); - return -EINVAL; - } - - return 0; -} - - -static void end_bio_io_page(struct bio *bio, int error) -{ - struct page *page = bio->bi_private; - - if (!error) - SetPageUptodate(page); - else - printk(KERN_WARNING "gfs2: error %d reading superblock\n", error); - unlock_page(page); -} - -static void gfs2_sb_in(struct gfs2_sb_host *sb, const void *buf) -{ - const struct gfs2_sb *str = buf; - - sb->sb_magic = be32_to_cpu(str->sb_header.mh_magic); - sb->sb_type = be32_to_cpu(str->sb_header.mh_type); - sb->sb_format = be32_to_cpu(str->sb_header.mh_format); - sb->sb_fs_format = be32_to_cpu(str->sb_fs_format); - sb->sb_multihost_format = be32_to_cpu(str->sb_multihost_format); - sb->sb_bsize = be32_to_cpu(str->sb_bsize); - sb->sb_bsize_shift = be32_to_cpu(str->sb_bsize_shift); - sb->sb_master_dir.no_addr = be64_to_cpu(str->sb_master_dir.no_addr); - sb->sb_master_dir.no_formal_ino = be64_to_cpu(str->sb_master_dir.no_formal_ino); - sb->sb_root_dir.no_addr = be64_to_cpu(str->sb_root_dir.no_addr); - sb->sb_root_dir.no_formal_ino = be64_to_cpu(str->sb_root_dir.no_formal_ino); - - memcpy(sb->sb_lockproto, str->sb_lockproto, GFS2_LOCKNAME_LEN); - memcpy(sb->sb_locktable, str->sb_locktable, GFS2_LOCKNAME_LEN); -} - -/** - * gfs2_read_super - Read the gfs2 super block from disk - * @sdp: The GFS2 super block - * @sector: The location of the super block - * @error: The error code to return - * - * This uses the bio functions to read the super block from disk - * because we want to be 100% sure that we never read cached data. - * A super block is read twice only during each GFS2 mount and is - * never written to by the filesystem. The first time its read no - * locks are held, and the only details which are looked at are those - * relating to the locking protocol. Once locking is up and working, - * the sb is read again under the lock to establish the location of - * the master directory (contains pointers to journals etc) and the - * root directory. - * - * Returns: 0 on success or error - */ - -int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector) -{ - struct super_block *sb = sdp->sd_vfs; - struct gfs2_sb *p; - struct page *page; - struct bio *bio; - - page = alloc_page(GFP_NOFS); - if (unlikely(!page)) - return -ENOBUFS; - - ClearPageUptodate(page); - ClearPageDirty(page); - lock_page(page); - - bio = bio_alloc(GFP_NOFS, 1); - if (unlikely(!bio)) { - __free_page(page); - return -ENOBUFS; - } - - bio->bi_sector = sector * (sb->s_blocksize >> 9); - bio->bi_bdev = sb->s_bdev; - bio_add_page(bio, page, PAGE_SIZE, 0); - - bio->bi_end_io = end_bio_io_page; - bio->bi_private = page; - submit_bio(READ_SYNC | (1 << BIO_RW_META), bio); - wait_on_page_locked(page); - bio_put(bio); - if (!PageUptodate(page)) { - __free_page(page); - return -EIO; - } - p = kmap(page); - gfs2_sb_in(&sdp->sd_sb, p); - kunmap(page); - __free_page(page); - return 0; -} - -/** - * gfs2_read_sb - Read super block - * @sdp: The GFS2 superblock - * @gl: the glock for the superblock (assumed to be held) - * @silent: Don't print message if mount fails - * - */ - -int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent) -{ - u32 hash_blocks, ind_blocks, leaf_blocks; - u32 tmp_blocks; - unsigned int x; - int error; - - error = gfs2_read_super(sdp, GFS2_SB_ADDR >> sdp->sd_fsb2bb_shift); - if (error) { - if (!silent) - fs_err(sdp, "can't read superblock\n"); - return error; - } - - error = gfs2_check_sb(sdp, &sdp->sd_sb, silent); - if (error) - return error; - - sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - - GFS2_BASIC_BLOCK_SHIFT; - sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; - sdp->sd_diptrs = (sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_dinode)) / sizeof(u64); - sdp->sd_inptrs = (sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_meta_header)) / sizeof(u64); - sdp->sd_jbsize = sdp->sd_sb.sb_bsize - sizeof(struct gfs2_meta_header); - sdp->sd_hash_bsize = sdp->sd_sb.sb_bsize / 2; - sdp->sd_hash_bsize_shift = sdp->sd_sb.sb_bsize_shift - 1; - sdp->sd_hash_ptrs = sdp->sd_hash_bsize / sizeof(u64); - sdp->sd_qc_per_block = (sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_meta_header)) / - sizeof(struct gfs2_quota_change); - - /* Compute maximum reservation required to add a entry to a directory */ - - hash_blocks = DIV_ROUND_UP(sizeof(u64) * (1 << GFS2_DIR_MAX_DEPTH), - sdp->sd_jbsize); - - ind_blocks = 0; - for (tmp_blocks = hash_blocks; tmp_blocks > sdp->sd_diptrs;) { - tmp_blocks = DIV_ROUND_UP(tmp_blocks, sdp->sd_inptrs); - ind_blocks += tmp_blocks; - } - - leaf_blocks = 2 + GFS2_DIR_MAX_DEPTH; - - sdp->sd_max_dirres = hash_blocks + ind_blocks + leaf_blocks; - - sdp->sd_heightsize[0] = sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_dinode); - sdp->sd_heightsize[1] = sdp->sd_sb.sb_bsize * sdp->sd_diptrs; - for (x = 2;; x++) { - u64 space, d; - u32 m; - - space = sdp->sd_heightsize[x - 1] * sdp->sd_inptrs; - d = space; - m = do_div(d, sdp->sd_inptrs); - - if (d != sdp->sd_heightsize[x - 1] || m) - break; - sdp->sd_heightsize[x] = space; - } - sdp->sd_max_height = x; - sdp->sd_heightsize[x] = ~0; - gfs2_assert(sdp, sdp->sd_max_height <= GFS2_MAX_META_HEIGHT); - - sdp->sd_jheightsize[0] = sdp->sd_sb.sb_bsize - - sizeof(struct gfs2_dinode); - sdp->sd_jheightsize[1] = sdp->sd_jbsize * sdp->sd_diptrs; - for (x = 2;; x++) { - u64 space, d; - u32 m; - - space = sdp->sd_jheightsize[x - 1] * sdp->sd_inptrs; - d = space; - m = do_div(d, sdp->sd_inptrs); - - if (d != sdp->sd_jheightsize[x - 1] || m) - break; - sdp->sd_jheightsize[x] = space; - } - sdp->sd_max_jheight = x; - sdp->sd_jheightsize[x] = ~0; - gfs2_assert(sdp, sdp->sd_max_jheight <= GFS2_MAX_META_HEIGHT); - - return 0; -} - /** * gfs2_jindex_hold - Grab a lock on the jindex * @sdp: The GFS2 superblock @@ -581,39 +274,6 @@ fail: return error; } -/** - * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one - * @sdp: the filesystem - * - * Returns: errno - */ - -int gfs2_make_fs_ro(struct gfs2_sbd *sdp) -{ - struct gfs2_holder t_gh; - int error; - - gfs2_quota_sync(sdp); - gfs2_statfs_sync(sdp); - - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, - &t_gh); - if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - return error; - - gfs2_meta_syncfs(sdp); - gfs2_log_shutdown(sdp); - - clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - - if (t_gh.gh_gl) - gfs2_glock_dq_uninit(&t_gh); - - gfs2_quota_cleanup(sdp); - - return error; -} - static void gfs2_statfs_change_in(struct gfs2_statfs_change_host *sc, const void *buf) { const struct gfs2_statfs_change *str = buf; diff --git a/fs/gfs2/super.h b/fs/gfs2/super.h index 44361ecc44f..50a4c9b1215 100644 --- a/fs/gfs2/super.h +++ b/fs/gfs2/super.h @@ -12,11 +12,6 @@ #include "incore.h" -void gfs2_tune_init(struct gfs2_tune *gt); - -int gfs2_check_sb(struct gfs2_sbd *sdp, struct gfs2_sb_host *sb, int silent); -int gfs2_read_sb(struct gfs2_sbd *sdp, struct gfs2_glock *gl, int silent); -int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector); void gfs2_lm_unmount(struct gfs2_sbd *sdp); static inline unsigned int gfs2_jindex_size(struct gfs2_sbd *sdp) @@ -40,7 +35,6 @@ int gfs2_lookup_in_master_dir(struct gfs2_sbd *sdp, char *filename, struct gfs2_inode **ipp); int gfs2_make_fs_rw(struct gfs2_sbd *sdp); -int gfs2_make_fs_ro(struct gfs2_sbd *sdp); int gfs2_statfs_init(struct gfs2_sbd *sdp); void gfs2_statfs_change(struct gfs2_sbd *sdp, diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 74846559fc3..7e1879f1a02 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -269,14 +269,6 @@ ARGS_ATTR(quota, "%u\n"); ARGS_ATTR(suiddir, "%d\n"); ARGS_ATTR(data, "%d\n"); -/* one oddball doesn't fit the macro mold */ -static ssize_t noatime_show(struct gfs2_sbd *sdp, char *buf) -{ - return snprintf(buf, PAGE_SIZE, "%d\n", - !!test_bit(SDF_NOATIME, &sdp->sd_flags)); -} -static struct args_attr args_attr_noatime = __ATTR_RO(noatime); - static struct attribute *args_attrs[] = { &args_attr_lockproto.attr, &args_attr_locktable.attr, @@ -292,7 +284,6 @@ static struct attribute *args_attrs[] = { &args_attr_quota.attr, &args_attr_suiddir.attr, &args_attr_data.attr, - &args_attr_noatime.attr, NULL, }; @@ -407,7 +398,6 @@ TUNE_ATTR(incore_log_blocks, 0); TUNE_ATTR(log_flush_secs, 0); TUNE_ATTR(quota_warn_period, 0); TUNE_ATTR(quota_quantum, 0); -TUNE_ATTR(atime_quantum, 0); TUNE_ATTR(max_readahead, 0); TUNE_ATTR(complain_secs, 0); TUNE_ATTR(statfs_slow, 0); @@ -427,7 +417,6 @@ static struct attribute *tune_attrs[] = { &tune_attr_log_flush_secs.attr, &tune_attr_quota_warn_period.attr, &tune_attr_quota_quantum.attr, - &tune_attr_atime_quantum.attr, &tune_attr_max_readahead.attr, &tune_attr_complain_secs.attr, &tune_attr_statfs_slow.attr, diff --git a/fs/hfs/catalog.c b/fs/hfs/catalog.c index ba851576ebb..6d98f116ca0 100644 --- a/fs/hfs/catalog.c +++ b/fs/hfs/catalog.c @@ -190,6 +190,10 @@ int hfs_cat_find_brec(struct super_block *sb, u32 cnid, fd->search_key->cat.ParID = rec.thread.ParID; len = fd->search_key->cat.CName.len = rec.thread.CName.len; + if (len > HFS_NAMELEN) { + printk(KERN_ERR "hfs: bad catalog namelength\n"); + return -EIO; + } memcpy(fd->search_key->cat.CName.name, rec.thread.CName.name, len); return hfs_brec_find(fd); } diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 4abb1047c68..3c7c7637719 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -173,7 +173,7 @@ enum { opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { { opt_uid, "uid=%u" }, { opt_gid, "gid=%u" }, { opt_umask, "umask=%o" }, diff --git a/fs/hfsplus/bitmap.c b/fs/hfsplus/bitmap.c index d128a25b74d..ea30afc2a03 100644 --- a/fs/hfsplus/bitmap.c +++ b/fs/hfsplus/bitmap.c @@ -32,6 +32,10 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma mutex_lock(&HFSPLUS_SB(sb).alloc_file->i_mutex); mapping = HFSPLUS_SB(sb).alloc_file->i_mapping; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); + if (IS_ERR(page)) { + start = size; + goto out; + } pptr = kmap(page); curr = pptr + (offset & (PAGE_CACHE_BITS - 1)) / 32; i = offset % 32; @@ -73,6 +77,10 @@ int hfsplus_block_allocate(struct super_block *sb, u32 size, u32 offset, u32 *ma break; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); + if (IS_ERR(page)) { + start = size; + goto out; + } curr = pptr = kmap(page); if ((size ^ offset) / PAGE_CACHE_BITS) end = pptr + PAGE_CACHE_BITS / 32; @@ -120,6 +128,10 @@ found: offset += PAGE_CACHE_BITS; page = read_mapping_page(mapping, offset / PAGE_CACHE_BITS, NULL); + if (IS_ERR(page)) { + start = size; + goto out; + } pptr = kmap(page); curr = pptr; end = pptr + PAGE_CACHE_BITS / 32; diff --git a/fs/hfsplus/catalog.c b/fs/hfsplus/catalog.c index ba117c445e7..f6874acb2cf 100644 --- a/fs/hfsplus/catalog.c +++ b/fs/hfsplus/catalog.c @@ -168,6 +168,11 @@ int hfsplus_find_cat(struct super_block *sb, u32 cnid, return -EIO; } + if (be16_to_cpu(tmp.thread.nodeName.length) > 255) { + printk(KERN_ERR "hfs: catalog name length corrupted\n"); + return -EIO; + } + hfsplus_cat_build_key_uni(fd->search_key, be32_to_cpu(tmp.thread.parentID), &tmp.thread.nodeName); return hfs_brec_find(fd); diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index 9997cbf8beb..9699c56d323 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -25,7 +25,7 @@ enum { opt_force, opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { { opt_creator, "creator=%s" }, { opt_type, "type=%s" }, { opt_umask, "umask=%o" }, diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index e834e578c93..eb74531a0a8 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -356,7 +356,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_SOFTLOCK)) { printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; - } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { + } else if ((vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) && !(sb->s_flags & MS_RDONLY)) { printk(KERN_WARNING "hfs: write access to a journaled filesystem is not supported, " "use the force option at your own risk, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index b8ae9c90ada..29ad461d568 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -215,7 +215,7 @@ enum { Opt_timeshift, Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_help, "help"}, {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3f58923fb39..61edc701b0e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -57,7 +57,7 @@ enum { Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_size, "size=%s"}, {Opt_nr_inodes, "nr_inodes=%s"}, {Opt_mode, "mode=%o"}, diff --git a/fs/inotify_user.c b/fs/inotify_user.c index 60249429a25..d85c7d931cd 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -323,7 +323,7 @@ out: } /* - * remove_kevent - cleans up and ultimately frees the given kevent + * remove_kevent - cleans up the given kevent * * Caller must hold dev->ev_mutex. */ @@ -334,7 +334,13 @@ static void remove_kevent(struct inotify_device *dev, dev->event_count--; dev->queue_size -= sizeof(struct inotify_event) + kevent->event.len; +} +/* + * free_kevent - frees the given kevent. + */ +static void free_kevent(struct inotify_kernel_event *kevent) +{ kfree(kevent->name); kmem_cache_free(event_cachep, kevent); } @@ -350,6 +356,7 @@ static void inotify_dev_event_dequeue(struct inotify_device *dev) struct inotify_kernel_event *kevent; kevent = inotify_dev_get_event(dev); remove_kevent(dev, kevent); + free_kevent(kevent); } } @@ -433,17 +440,15 @@ static ssize_t inotify_read(struct file *file, char __user *buf, dev = file->private_data; while (1) { - int events; prepare_to_wait(&dev->wq, &wait, TASK_INTERRUPTIBLE); mutex_lock(&dev->ev_mutex); - events = !list_empty(&dev->events); - mutex_unlock(&dev->ev_mutex); - if (events) { + if (!list_empty(&dev->events)) { ret = 0; break; } + mutex_unlock(&dev->ev_mutex); if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; @@ -462,7 +467,6 @@ static ssize_t inotify_read(struct file *file, char __user *buf, if (ret) return ret; - mutex_lock(&dev->ev_mutex); while (1) { struct inotify_kernel_event *kevent; @@ -481,6 +485,13 @@ static ssize_t inotify_read(struct file *file, char __user *buf, } break; } + remove_kevent(dev, kevent); + + /* + * Must perform the copy_to_user outside the mutex in order + * to avoid a lock order reversal with mmap_sem. + */ + mutex_unlock(&dev->ev_mutex); if (copy_to_user(buf, &kevent->event, event_size)) { ret = -EFAULT; @@ -498,7 +509,9 @@ static ssize_t inotify_read(struct file *file, char __user *buf, count -= kevent->event.len; } - remove_kevent(dev, kevent); + free_kevent(kevent); + + mutex_lock(&dev->ev_mutex); } mutex_unlock(&dev->ev_mutex); diff --git a/fs/ioctl.c b/fs/ioctl.c index 7db32b3382d..d152856c371 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -13,9 +13,14 @@ #include <linux/security.h> #include <linux/module.h> #include <linux/uaccess.h> +#include <linux/writeback.h> +#include <linux/buffer_head.h> #include <asm/ioctls.h> +/* So that the fiemap access checks can't overflow on 32 bit machines. */ +#define FIEMAP_MAX_EXTENTS (UINT_MAX / sizeof(struct fiemap_extent)) + /** * vfs_ioctl - call filesystem specific ioctl methods * @filp: open file to invoke ioctl method on @@ -71,6 +76,276 @@ static int ioctl_fibmap(struct file *filp, int __user *p) return put_user(res, p); } +/** + * fiemap_fill_next_extent - Fiemap helper function + * @fieinfo: Fiemap context passed into ->fiemap + * @logical: Extent logical start offset, in bytes + * @phys: Extent physical start offset, in bytes + * @len: Extent length, in bytes + * @flags: FIEMAP_EXTENT flags that describe this extent + * + * Called from file system ->fiemap callback. Will populate extent + * info as passed in via arguments and copy to user memory. On + * success, extent count on fieinfo is incremented. + * + * Returns 0 on success, -errno on error, 1 if this was the last + * extent that will fit in user array. + */ +#define SET_UNKNOWN_FLAGS (FIEMAP_EXTENT_DELALLOC) +#define SET_NO_UNMOUNTED_IO_FLAGS (FIEMAP_EXTENT_DATA_ENCRYPTED) +#define SET_NOT_ALIGNED_FLAGS (FIEMAP_EXTENT_DATA_TAIL|FIEMAP_EXTENT_DATA_INLINE) +int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, + u64 phys, u64 len, u32 flags) +{ + struct fiemap_extent extent; + struct fiemap_extent *dest = fieinfo->fi_extents_start; + + /* only count the extents */ + if (fieinfo->fi_extents_max == 0) { + fieinfo->fi_extents_mapped++; + return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; + } + + if (fieinfo->fi_extents_mapped >= fieinfo->fi_extents_max) + return 1; + + if (flags & SET_UNKNOWN_FLAGS) + flags |= FIEMAP_EXTENT_UNKNOWN; + if (flags & SET_NO_UNMOUNTED_IO_FLAGS) + flags |= FIEMAP_EXTENT_ENCODED; + if (flags & SET_NOT_ALIGNED_FLAGS) + flags |= FIEMAP_EXTENT_NOT_ALIGNED; + + memset(&extent, 0, sizeof(extent)); + extent.fe_logical = logical; + extent.fe_physical = phys; + extent.fe_length = len; + extent.fe_flags = flags; + + dest += fieinfo->fi_extents_mapped; + if (copy_to_user(dest, &extent, sizeof(extent))) + return -EFAULT; + + fieinfo->fi_extents_mapped++; + if (fieinfo->fi_extents_mapped == fieinfo->fi_extents_max) + return 1; + return (flags & FIEMAP_EXTENT_LAST) ? 1 : 0; +} +EXPORT_SYMBOL(fiemap_fill_next_extent); + +/** + * fiemap_check_flags - check validity of requested flags for fiemap + * @fieinfo: Fiemap context passed into ->fiemap + * @fs_flags: Set of fiemap flags that the file system understands + * + * Called from file system ->fiemap callback. This will compute the + * intersection of valid fiemap flags and those that the fs supports. That + * value is then compared against the user supplied flags. In case of bad user + * flags, the invalid values will be written into the fieinfo structure, and + * -EBADR is returned, which tells ioctl_fiemap() to return those values to + * userspace. For this reason, a return code of -EBADR should be preserved. + * + * Returns 0 on success, -EBADR on bad flags. + */ +int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags) +{ + u32 incompat_flags; + + incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags); + if (incompat_flags) { + fieinfo->fi_flags = incompat_flags; + return -EBADR; + } + return 0; +} +EXPORT_SYMBOL(fiemap_check_flags); + +static int fiemap_check_ranges(struct super_block *sb, + u64 start, u64 len, u64 *new_len) +{ + *new_len = len; + + if (len == 0) + return -EINVAL; + + if (start > sb->s_maxbytes) + return -EFBIG; + + /* + * Shrink request scope to what the fs can actually handle. + */ + if ((len > sb->s_maxbytes) || + (sb->s_maxbytes - len) < start) + *new_len = sb->s_maxbytes - start; + + return 0; +} + +static int ioctl_fiemap(struct file *filp, unsigned long arg) +{ + struct fiemap fiemap; + struct fiemap_extent_info fieinfo = { 0, }; + struct inode *inode = filp->f_path.dentry->d_inode; + struct super_block *sb = inode->i_sb; + u64 len; + int error; + + if (!inode->i_op->fiemap) + return -EOPNOTSUPP; + + if (copy_from_user(&fiemap, (struct fiemap __user *)arg, + sizeof(struct fiemap))) + return -EFAULT; + + if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) + return -EINVAL; + + error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, + &len); + if (error) + return error; + + fieinfo.fi_flags = fiemap.fm_flags; + fieinfo.fi_extents_max = fiemap.fm_extent_count; + fieinfo.fi_extents_start = (struct fiemap_extent *)(arg + sizeof(fiemap)); + + if (fiemap.fm_extent_count != 0 && + !access_ok(VERIFY_WRITE, fieinfo.fi_extents_start, + fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) + return -EFAULT; + + if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) + filemap_write_and_wait(inode->i_mapping); + + error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); + fiemap.fm_flags = fieinfo.fi_flags; + fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; + if (copy_to_user((char *)arg, &fiemap, sizeof(fiemap))) + error = -EFAULT; + + return error; +} + +#ifdef CONFIG_BLOCK + +#define blk_to_logical(inode, blk) (blk << (inode)->i_blkbits) +#define logical_to_blk(inode, offset) (offset >> (inode)->i_blkbits); + +/* + * @inode - the inode to map + * @arg - the pointer to userspace where we copy everything to + * @get_block - the fs's get_block function + * + * This does FIEMAP for block based inodes. Basically it will just loop + * through get_block until we hit the number of extents we want to map, or we + * go past the end of the file and hit a hole. + * + * If it is possible to have data blocks beyond a hole past @inode->i_size, then + * please do not use this function, it will stop at the first unmapped block + * beyond i_size + */ +int generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, u64 start, + u64 len, get_block_t *get_block) +{ + struct buffer_head tmp; + unsigned int start_blk; + long long length = 0, map_len = 0; + u64 logical = 0, phys = 0, size = 0; + u32 flags = FIEMAP_EXTENT_MERGED; + int ret = 0; + + if ((ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC))) + return ret; + + start_blk = logical_to_blk(inode, start); + + /* guard against change */ + mutex_lock(&inode->i_mutex); + + length = (long long)min_t(u64, len, i_size_read(inode)); + map_len = length; + + do { + /* + * we set b_size to the total size we want so it will map as + * many contiguous blocks as possible at once + */ + memset(&tmp, 0, sizeof(struct buffer_head)); + tmp.b_size = map_len; + + ret = get_block(inode, start_blk, &tmp, 0); + if (ret) + break; + + /* HOLE */ + if (!buffer_mapped(&tmp)) { + /* + * first hole after going past the EOF, this is our + * last extent + */ + if (length <= 0) { + flags = FIEMAP_EXTENT_MERGED|FIEMAP_EXTENT_LAST; + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, + flags); + break; + } + + length -= blk_to_logical(inode, 1); + + /* if we have holes up to/past EOF then we're done */ + if (length <= 0) + break; + + start_blk++; + } else { + if (length <= 0 && size) { + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, + flags); + if (ret) + break; + } + + logical = blk_to_logical(inode, start_blk); + phys = blk_to_logical(inode, tmp.b_blocknr); + size = tmp.b_size; + flags = FIEMAP_EXTENT_MERGED; + + length -= tmp.b_size; + start_blk += logical_to_blk(inode, size); + + /* + * if we are past the EOF we need to loop again to see + * if there is a hole so we can mark this extent as the + * last one, and if not keep mapping things until we + * find a hole, or we run out of slots in the extent + * array + */ + if (length <= 0) + continue; + + ret = fiemap_fill_next_extent(fieinfo, logical, phys, + size, flags); + if (ret) + break; + } + cond_resched(); + } while (1); + + mutex_unlock(&inode->i_mutex); + + /* if ret is 1 then we just hit the end of the extent array */ + if (ret == 1) + ret = 0; + + return ret; +} +EXPORT_SYMBOL(generic_block_fiemap); + +#endif /* CONFIG_BLOCK */ + static int file_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -80,6 +355,8 @@ static int file_ioctl(struct file *filp, unsigned int cmd, switch (cmd) { case FIBMAP: return ioctl_fibmap(filp, p); + case FS_IOC_FIEMAP: + return ioctl_fiemap(filp, arg); case FIGETBSZ: return put_user(inode->i_sb->s_blocksize, p); case FIONREAD: diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 26948a6033b..3f8af0f1505 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -310,7 +310,7 @@ enum { Opt_nocompress, Opt_hide, Opt_showassoc, Opt_dmode, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_norock, "norock"}, {Opt_nojoliet, "nojoliet"}, {Opt_unhide, "unhide"}, diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 91389c8aee8..9203c3332f1 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -20,6 +20,7 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> +#include <linux/marker.h> #include <linux/errno.h> #include <linux/slab.h> @@ -93,7 +94,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh) int ret = 0; struct buffer_head *bh = jh2bh(jh); - if (jh->b_jlist == BJ_None && !buffer_locked(bh) && !buffer_dirty(bh)) { + if (jh->b_jlist == BJ_None && !buffer_locked(bh) && + !buffer_dirty(bh) && !buffer_write_io_error(bh)) { JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __jbd2_journal_remove_checkpoint(jh) + 1; jbd_unlock_bh_state(bh); @@ -126,14 +128,29 @@ void __jbd2_log_wait_for_space(journal_t *journal) /* * Test again, another process may have checkpointed while we - * were waiting for the checkpoint lock + * were waiting for the checkpoint lock. If there are no + * outstanding transactions there is nothing to checkpoint and + * we can't make progress. Abort the journal in this case. */ spin_lock(&journal->j_state_lock); + spin_lock(&journal->j_list_lock); nblocks = jbd_space_needed(journal); if (__jbd2_log_space_left(journal) < nblocks) { + int chkpt = journal->j_checkpoint_transactions != NULL; + + spin_unlock(&journal->j_list_lock); spin_unlock(&journal->j_state_lock); - jbd2_log_do_checkpoint(journal); + if (chkpt) { + jbd2_log_do_checkpoint(journal); + } else { + printk(KERN_ERR "%s: no transactions\n", + __func__); + jbd2_journal_abort(journal, 0); + } + spin_lock(&journal->j_state_lock); + } else { + spin_unlock(&journal->j_list_lock); } mutex_unlock(&journal->j_checkpoint_mutex); } @@ -160,21 +177,25 @@ static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh) * buffers. Note that we take the buffers in the opposite ordering * from the one in which they were submitted for IO. * + * Return 0 on success, and return <0 if some buffers have failed + * to be written out. + * * Called with j_list_lock held. */ -static void __wait_cp_io(journal_t *journal, transaction_t *transaction) +static int __wait_cp_io(journal_t *journal, transaction_t *transaction) { struct journal_head *jh; struct buffer_head *bh; tid_t this_tid; int released = 0; + int ret = 0; this_tid = transaction->t_tid; restart: /* Did somebody clean up the transaction in the meanwhile? */ if (journal->j_checkpoint_transactions != transaction || transaction->t_tid != this_tid) - return; + return ret; while (!released && transaction->t_checkpoint_io_list) { jh = transaction->t_checkpoint_io_list; bh = jh2bh(jh); @@ -194,6 +215,9 @@ restart: spin_lock(&journal->j_list_lock); goto restart; } + if (unlikely(buffer_write_io_error(bh))) + ret = -EIO; + /* * Now in whatever state the buffer currently is, we know that * it has been written out and so we can drop it from the list @@ -203,6 +227,8 @@ restart: jbd2_journal_remove_journal_head(bh); __brelse(bh); } + + return ret; } #define NR_BATCH 64 @@ -226,7 +252,8 @@ __flush_batch(journal_t *journal, struct buffer_head **bhs, int *batch_count) * Try to flush one buffer from the checkpoint list to disk. * * Return 1 if something happened which requires us to abort the current - * scan of the checkpoint list. + * scan of the checkpoint list. Return <0 if the buffer has failed to + * be written out. * * Called with j_list_lock held and drops it if 1 is returned * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it @@ -258,6 +285,9 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, jbd2_log_wait_commit(journal, tid); ret = 1; } else if (!buffer_dirty(bh)) { + ret = 1; + if (unlikely(buffer_write_io_error(bh))) + ret = -EIO; J_ASSERT_JH(jh, !buffer_jbddirty(bh)); BUFFER_TRACE(bh, "remove from checkpoint"); __jbd2_journal_remove_checkpoint(jh); @@ -265,7 +295,6 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, jbd_unlock_bh_state(bh); jbd2_journal_remove_journal_head(bh); __brelse(bh); - ret = 1; } else { /* * Important: we are about to write the buffer, and @@ -298,6 +327,7 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, * to disk. We submit larger chunks of data at once. * * The journal should be locked before calling this function. + * Called with j_checkpoint_mutex held. */ int jbd2_log_do_checkpoint(journal_t *journal) { @@ -313,6 +343,8 @@ int jbd2_log_do_checkpoint(journal_t *journal) * journal straight away. */ result = jbd2_cleanup_journal_tail(journal); + trace_mark(jbd2_checkpoint, "dev %s need_checkpoint %d", + journal->j_devname, result); jbd_debug(1, "cleanup_journal_tail returned %d\n", result); if (result <= 0) return result; @@ -321,6 +353,7 @@ int jbd2_log_do_checkpoint(journal_t *journal) * OK, we need to start writing disk blocks. Take one transaction * and write it. */ + result = 0; spin_lock(&journal->j_list_lock); if (!journal->j_checkpoint_transactions) goto out; @@ -339,7 +372,7 @@ restart: int batch_count = 0; struct buffer_head *bhs[NR_BATCH]; struct journal_head *jh; - int retry = 0; + int retry = 0, err; while (!retry && transaction->t_checkpoint_list) { struct buffer_head *bh; @@ -353,6 +386,8 @@ restart: } retry = __process_buffer(journal, jh, bhs, &batch_count, transaction); + if (retry < 0 && !result) + result = retry; if (!retry && (need_resched() || spin_needbreak(&journal->j_list_lock))) { spin_unlock(&journal->j_list_lock); @@ -377,14 +412,18 @@ restart: * Now we have cleaned up the first transaction's checkpoint * list. Let's clean up the second one */ - __wait_cp_io(journal, transaction); + err = __wait_cp_io(journal, transaction); + if (!result) + result = err; } out: spin_unlock(&journal->j_list_lock); - result = jbd2_cleanup_journal_tail(journal); if (result < 0) - return result; - return 0; + jbd2_journal_abort(journal, result); + else + result = jbd2_cleanup_journal_tail(journal); + + return (result < 0) ? result : 0; } /* @@ -400,8 +439,9 @@ out: * This is the only part of the journaling code which really needs to be * aware of transaction aborts. Checkpointing involves writing to the * main filesystem area rather than to the journal, so it can proceed - * even in abort state, but we must not update the journal superblock if - * we have an abort error outstanding. + * even in abort state, but we must not update the super block if + * checkpointing may have failed. Otherwise, we would lose some metadata + * buffers which should be written-back to the filesystem. */ int jbd2_cleanup_journal_tail(journal_t *journal) @@ -410,6 +450,9 @@ int jbd2_cleanup_journal_tail(journal_t *journal) tid_t first_tid; unsigned long blocknr, freed; + if (is_journal_aborted(journal)) + return 1; + /* OK, work out the oldest transaction remaining in the log, and * the log block it starts at. * diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index f2ad061e95e..8b119e16aa3 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -16,6 +16,7 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> +#include <linux/marker.h> #include <linux/errno.h> #include <linux/slab.h> #include <linux/mm.h> @@ -126,8 +127,7 @@ static int journal_submit_commit_record(journal_t *journal, JBUFFER_TRACE(descriptor, "submit commit block"); lock_buffer(bh); - get_bh(bh); - set_buffer_dirty(bh); + clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; @@ -147,12 +147,9 @@ static int journal_submit_commit_record(journal_t *journal, * to remember if we sent a barrier request */ if (ret == -EOPNOTSUPP && barrier_done) { - char b[BDEVNAME_SIZE]; - printk(KERN_WARNING - "JBD: barrier-based sync failed on %s - " - "disabling barriers\n", - bdevname(journal->j_dev, b)); + "JBD: barrier-based sync failed on %s - " + "disabling barriers\n", journal->j_devname); spin_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_BARRIER; spin_unlock(&journal->j_state_lock); @@ -160,7 +157,7 @@ static int journal_submit_commit_record(journal_t *journal, /* And try again, without the barrier */ lock_buffer(bh); set_buffer_uptodate(bh); - set_buffer_dirty(bh); + clear_buffer_dirty(bh); ret = submit_bh(WRITE, bh); } *cbh = bh; @@ -371,6 +368,8 @@ void jbd2_journal_commit_transaction(journal_t *journal) commit_transaction = journal->j_running_transaction; J_ASSERT(commit_transaction->t_state == T_RUNNING); + trace_mark(jbd2_start_commit, "dev %s transaction %d", + journal->j_devname, commit_transaction->t_tid); jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); @@ -505,9 +504,10 @@ void jbd2_journal_commit_transaction(journal_t *journal) jh = commit_transaction->t_buffers; /* If we're in abort mode, we just un-journal the buffer and - release it for background writing. */ + release it. */ if (is_journal_aborted(journal)) { + clear_buffer_jbddirty(jh2bh(jh)); JBUFFER_TRACE(jh, "journal is aborting: refile"); jbd2_journal_refile_buffer(journal, jh); /* If that was the last one, we need to clean up @@ -681,11 +681,11 @@ start_journal_io: */ err = journal_finish_inode_data_buffers(journal, commit_transaction); if (err) { - char b[BDEVNAME_SIZE]; - printk(KERN_WARNING "JBD2: Detected IO errors while flushing file data " - "on %s\n", bdevname(journal->j_fs_dev, b)); + "on %s\n", journal->j_devname); + if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR) + jbd2_journal_abort(journal, err); err = 0; } @@ -786,6 +786,9 @@ wait_for_iobuf: /* AKPM: bforget here */ } + if (err) + jbd2_journal_abort(journal, err); + jbd_debug(3, "JBD: commit phase 5\n"); if (!JBD2_HAS_INCOMPAT_FEATURE(journal, @@ -884,6 +887,8 @@ restart_loop: if (buffer_jbddirty(bh)) { JBUFFER_TRACE(jh, "add to new checkpointing trans"); __jbd2_journal_insert_checkpoint(jh, commit_transaction); + if (is_journal_aborted(journal)) + clear_buffer_jbddirty(bh); JBUFFER_TRACE(jh, "refile for checkpoint writeback"); __jbd2_journal_refile_buffer(jh); jbd_unlock_bh_state(bh); @@ -990,6 +995,12 @@ restart_loop: } spin_unlock(&journal->j_list_lock); + if (journal->j_commit_callback) + journal->j_commit_callback(journal, commit_transaction); + + trace_mark(jbd2_end_commit, "dev %s transaction %d head %d", + journal->j_devname, commit_transaction->t_tid, + journal->j_tail_sequence); jbd_debug(1, "JBD: commit %d complete, head %d\n", journal->j_commit_sequence, journal->j_tail_sequence); diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 8207a01c4ed..783de118de9 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -597,13 +597,9 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, if (ret) *retp = ret; else { - char b[BDEVNAME_SIZE]; - printk(KERN_ALERT "%s: journal block not found " "at offset %lu on %s\n", - __func__, - blocknr, - bdevname(journal->j_dev, b)); + __func__, blocknr, journal->j_devname); err = -EIO; __journal_abort_soft(journal, err); } @@ -901,10 +897,7 @@ static struct proc_dir_entry *proc_jbd2_stats; static void jbd2_stats_proc_init(journal_t *journal) { - char name[BDEVNAME_SIZE]; - - bdevname(journal->j_dev, name); - journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats); + journal->j_proc_entry = proc_mkdir(journal->j_devname, proc_jbd2_stats); if (journal->j_proc_entry) { proc_create_data("history", S_IRUGO, journal->j_proc_entry, &jbd2_seq_history_fops, journal); @@ -915,12 +908,9 @@ static void jbd2_stats_proc_init(journal_t *journal) static void jbd2_stats_proc_exit(journal_t *journal) { - char name[BDEVNAME_SIZE]; - - bdevname(journal->j_dev, name); remove_proc_entry("info", journal->j_proc_entry); remove_proc_entry("history", journal->j_proc_entry); - remove_proc_entry(name, proc_jbd2_stats); + remove_proc_entry(journal->j_devname, proc_jbd2_stats); } static void journal_init_stats(journal_t *journal) @@ -1018,6 +1008,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, { journal_t *journal = journal_init_common(); struct buffer_head *bh; + char *p; int n; if (!journal) @@ -1039,6 +1030,10 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, journal->j_fs_dev = fs_dev; journal->j_blk_offset = start; journal->j_maxlen = len; + bdevname(journal->j_dev, journal->j_devname); + p = journal->j_devname; + while ((p = strchr(p, '/'))) + *p = '!'; jbd2_stats_proc_init(journal); bh = __getblk(journal->j_dev, start, journal->j_blocksize); @@ -1061,6 +1056,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) { struct buffer_head *bh; journal_t *journal = journal_init_common(); + char *p; int err; int n; unsigned long long blocknr; @@ -1070,6 +1066,12 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) journal->j_dev = journal->j_fs_dev = inode->i_sb->s_bdev; journal->j_inode = inode; + bdevname(journal->j_dev, journal->j_devname); + p = journal->j_devname; + while ((p = strchr(p, '/'))) + *p = '!'; + p = journal->j_devname + strlen(journal->j_devname); + sprintf(p, ":%lu", journal->j_inode->i_ino); jbd_debug(1, "journal %p: inode %s/%ld, size %Ld, bits %d, blksize %ld\n", journal, inode->i_sb->s_id, inode->i_ino, @@ -1253,6 +1255,22 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) goto out; } + if (buffer_write_io_error(bh)) { + /* + * Oh, dear. A previous attempt to write the journal + * superblock failed. This could happen because the + * USB device was yanked out. Or it could happen to + * be a transient write error and maybe the block will + * be remapped. Nothing we can do but to retry the + * write and hope for the best. + */ + printk(KERN_ERR "JBD2: previous I/O error detected " + "for journal superblock update for %s.\n", + journal->j_devname); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + spin_lock(&journal->j_state_lock); jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); @@ -1264,9 +1282,16 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) BUFFER_TRACE(bh, "marking dirty"); mark_buffer_dirty(bh); - if (wait) + if (wait) { sync_dirty_buffer(bh); - else + if (buffer_write_io_error(bh)) { + printk(KERN_ERR "JBD2: I/O error detected " + "when updating journal superblock for %s.\n", + journal->j_devname); + clear_buffer_write_io_error(bh); + set_buffer_uptodate(bh); + } + } else ll_rw_block(SWRITE, 1, &bh); out: @@ -1426,9 +1451,12 @@ recovery_error: * * Release a journal_t structure once it is no longer in use by the * journaled object. + * Return <0 if we couldn't clean up the journal. */ -void jbd2_journal_destroy(journal_t *journal) +int jbd2_journal_destroy(journal_t *journal) { + int err = 0; + /* Wait for the commit thread to wake up and die. */ journal_kill_thread(journal); @@ -1451,11 +1479,16 @@ void jbd2_journal_destroy(journal_t *journal) J_ASSERT(journal->j_checkpoint_transactions == NULL); spin_unlock(&journal->j_list_lock); - /* We can now mark the journal as empty. */ - journal->j_tail = 0; - journal->j_tail_sequence = ++journal->j_transaction_sequence; if (journal->j_sb_buffer) { - jbd2_journal_update_superblock(journal, 1); + if (!is_journal_aborted(journal)) { + /* We can now mark the journal as empty. */ + journal->j_tail = 0; + journal->j_tail_sequence = + ++journal->j_transaction_sequence; + jbd2_journal_update_superblock(journal, 1); + } else { + err = -EIO; + } brelse(journal->j_sb_buffer); } @@ -1467,6 +1500,8 @@ void jbd2_journal_destroy(journal_t *journal) jbd2_journal_destroy_revoke(journal); kfree(journal->j_wbuf); kfree(journal); + + return err; } @@ -1692,10 +1727,16 @@ int jbd2_journal_flush(journal_t *journal) spin_lock(&journal->j_list_lock); while (!err && journal->j_checkpoint_transactions != NULL) { spin_unlock(&journal->j_list_lock); + mutex_lock(&journal->j_checkpoint_mutex); err = jbd2_log_do_checkpoint(journal); + mutex_unlock(&journal->j_checkpoint_mutex); spin_lock(&journal->j_list_lock); } spin_unlock(&journal->j_list_lock); + + if (is_journal_aborted(journal)) + return -EIO; + jbd2_cleanup_journal_tail(journal); /* Finally, mark the journal as really needing no recovery. @@ -1717,7 +1758,7 @@ int jbd2_journal_flush(journal_t *journal) J_ASSERT(journal->j_head == journal->j_tail); J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); spin_unlock(&journal->j_state_lock); - return err; + return 0; } /** @@ -1761,23 +1802,6 @@ int jbd2_journal_wipe(journal_t *journal, int write) } /* - * journal_dev_name: format a character string to describe on what - * device this journal is present. - */ - -static const char *journal_dev_name(journal_t *journal, char *buffer) -{ - struct block_device *bdev; - - if (journal->j_inode) - bdev = journal->j_inode->i_sb->s_bdev; - else - bdev = journal->j_dev; - - return bdevname(bdev, buffer); -} - -/* * Journal abort has very specific semantics, which we describe * for journal abort. * @@ -1793,13 +1817,12 @@ static const char *journal_dev_name(journal_t *journal, char *buffer) void __jbd2_journal_abort_hard(journal_t *journal) { transaction_t *transaction; - char b[BDEVNAME_SIZE]; if (journal->j_flags & JBD2_ABORT) return; printk(KERN_ERR "Aborting journal on device %s.\n", - journal_dev_name(journal, b)); + journal->j_devname); spin_lock(&journal->j_state_lock); journal->j_flags |= JBD2_ABORT; diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 058f50f65b7..73063285b13 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -225,7 +225,7 @@ do { \ */ int jbd2_journal_recover(journal_t *journal) { - int err; + int err, err2; journal_superblock_t * sb; struct recovery_info info; @@ -263,7 +263,10 @@ int jbd2_journal_recover(journal_t *journal) journal->j_transaction_sequence = ++info.end_transaction; jbd2_journal_clear_revoke(journal); - sync_blockdev(journal->j_fs_dev); + err2 = sync_blockdev(journal->j_fs_dev); + if (!err) + err = err2; + return err; } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e5d540588fa..39b7805a599 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -52,6 +52,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); INIT_LIST_HEAD(&transaction->t_inode_list); + INIT_LIST_HEAD(&transaction->t_private_list); /* Set up the commit timer for the new transaction. */ journal->j_commit_timer.expires = round_jiffies(transaction->t_expires); diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h index 31559f45fdd..4c41db91eaa 100644 --- a/fs/jffs2/jffs2_fs_i.h +++ b/fs/jffs2/jffs2_fs_i.h @@ -12,7 +12,6 @@ #ifndef _JFFS2_FS_I #define _JFFS2_FS_I -#include <linux/version.h> #include <linux/rbtree.h> #include <linux/posix_acl.h> #include <linux/mutex.h> diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 3630718be39..0dae345e481 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -199,7 +199,7 @@ enum { Opt_usrquota, Opt_grpquota, Opt_uid, Opt_gid, Opt_umask }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_integrity, "integrity"}, {Opt_nointegrity, "nointegrity"}, {Opt_iocharset, "iocharset=%s"}, diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile index 7725a0a9a55..97f6073ab33 100644 --- a/fs/lockd/Makefile +++ b/fs/lockd/Makefile @@ -5,6 +5,6 @@ obj-$(CONFIG_LOCKD) += lockd.o lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \ - svcproc.o svcsubs.o mon.o xdr.o + svcproc.o svcsubs.o mon.o xdr.o grace.o lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o lockd-objs := $(lockd-objs-y) diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 0b45fd3a4bf..8307dd64bf4 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -54,14 +54,13 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4; int status; - status = lockd_up(nlm_init->protocol); + status = lockd_up(); if (status < 0) return ERR_PTR(status); - host = nlmclnt_lookup_host((struct sockaddr_in *)nlm_init->address, + host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen, nlm_init->protocol, nlm_version, - nlm_init->hostname, - strlen(nlm_init->hostname)); + nlm_init->hostname); if (host == NULL) { lockd_down(); return ERR_PTR(-ENOLCK); @@ -142,7 +141,7 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) /* * The server lockd has called us back to tell us the lock was granted */ -__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock) +__be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) { const struct file_lock *fl = &lock->fl; const struct nfs_fh *fh = &lock->fh; @@ -166,7 +165,7 @@ __be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; - if (!nlm_cmp_addr(&block->b_host->h_addr, addr)) + if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) continue; if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; @@ -216,7 +215,7 @@ reclaimer(void *ptr) /* This one ensures that our parent doesn't terminate while the * reclaim is in progress */ lock_kernel(); - lockd_up(0); /* note: this cannot fail as lockd is already running */ + lockd_up(); /* note: this cannot fail as lockd is already running */ dprintk("lockd: reclaiming locks for host %s\n", host->h_name); diff --git a/fs/lockd/grace.c b/fs/lockd/grace.c new file mode 100644 index 00000000000..183cc1f0af1 --- /dev/null +++ b/fs/lockd/grace.c @@ -0,0 +1,59 @@ +/* + * Common code for control of lockd and nfsv4 grace periods. + */ + +#include <linux/module.h> +#include <linux/lockd/bind.h> + +static LIST_HEAD(grace_list); +static DEFINE_SPINLOCK(grace_lock); + +/** + * locks_start_grace + * @lm: who this grace period is for + * + * A grace period is a period during which locks should not be given + * out. Currently grace periods are only enforced by the two lock + * managers (lockd and nfsd), using the locks_in_grace() function to + * check when they are in a grace period. + * + * This function is called to start a grace period. + */ +void locks_start_grace(struct lock_manager *lm) +{ + spin_lock(&grace_lock); + list_add(&lm->list, &grace_list); + spin_unlock(&grace_lock); +} +EXPORT_SYMBOL_GPL(locks_start_grace); + +/** + * locks_end_grace + * @lm: who this grace period is for + * + * Call this function to state that the given lock manager is ready to + * resume regular locking. The grace period will not end until all lock + * managers that called locks_start_grace() also call locks_end_grace(). + * Note that callers count on it being safe to call this more than once, + * and the second call should be a no-op. + */ +void locks_end_grace(struct lock_manager *lm) +{ + spin_lock(&grace_lock); + list_del_init(&lm->list); + spin_unlock(&grace_lock); +} +EXPORT_SYMBOL_GPL(locks_end_grace); + +/** + * locks_in_grace + * + * Lock managers call this function to determine when it is OK for them + * to answer ordinary lock requests, and when they should accept only + * lock reclaims. + */ +int locks_in_grace(void) +{ + return !list_empty(&grace_list); +} +EXPORT_SYMBOL_GPL(locks_in_grace); diff --git a/fs/lockd/host.c b/fs/lockd/host.c index a17664c7eac..9fd8889097b 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -11,16 +11,17 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/in.h> +#include <linux/in6.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> #include <linux/lockd/sm_inter.h> #include <linux/mutex.h> +#include <net/ipv6.h> #define NLMDBG_FACILITY NLMDBG_HOSTCACHE #define NLM_HOST_NRHASH 32 -#define NLM_ADDRHASH(addr) (ntohl(addr) & (NLM_HOST_NRHASH-1)) #define NLM_HOST_REBIND (60 * HZ) #define NLM_HOST_EXPIRE (300 * HZ) #define NLM_HOST_COLLECT (120 * HZ) @@ -30,42 +31,115 @@ static unsigned long next_gc; static int nrhosts; static DEFINE_MUTEX(nlm_host_mutex); - static void nlm_gc_hosts(void); -static struct nsm_handle * __nsm_find(const struct sockaddr_in *, - const char *, unsigned int, int); -static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, - const char *hostname, - unsigned int hostname_len); +static struct nsm_handle *nsm_find(const struct sockaddr *sap, + const size_t salen, + const char *hostname, + const size_t hostname_len, + const int create); + +struct nlm_lookup_host_info { + const int server; /* search for server|client */ + const struct sockaddr *sap; /* address to search for */ + const size_t salen; /* it's length */ + const unsigned short protocol; /* transport to search for*/ + const u32 version; /* NLM version to search for */ + const char *hostname; /* remote's hostname */ + const size_t hostname_len; /* it's length */ + const struct sockaddr *src_sap; /* our address (optional) */ + const size_t src_len; /* it's length */ +}; + +/* + * Hash function must work well on big- and little-endian platforms + */ +static unsigned int __nlm_hash32(const __be32 n) +{ + unsigned int hash = (__force u32)n ^ ((__force u32)n >> 16); + return hash ^ (hash >> 8); +} + +static unsigned int __nlm_hash_addr4(const struct sockaddr *sap) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + return __nlm_hash32(sin->sin_addr.s_addr); +} + +static unsigned int __nlm_hash_addr6(const struct sockaddr *sap) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + const struct in6_addr addr = sin6->sin6_addr; + return __nlm_hash32(addr.s6_addr32[0]) ^ + __nlm_hash32(addr.s6_addr32[1]) ^ + __nlm_hash32(addr.s6_addr32[2]) ^ + __nlm_hash32(addr.s6_addr32[3]); +} + +static unsigned int nlm_hash_address(const struct sockaddr *sap) +{ + unsigned int hash; + + switch (sap->sa_family) { + case AF_INET: + hash = __nlm_hash_addr4(sap); + break; + case AF_INET6: + hash = __nlm_hash_addr6(sap); + break; + default: + hash = 0; + } + return hash & (NLM_HOST_NRHASH - 1); +} + +static void nlm_clear_port(struct sockaddr *sap) +{ + switch (sap->sa_family) { + case AF_INET: + ((struct sockaddr_in *)sap)->sin_port = 0; + break; + case AF_INET6: + ((struct sockaddr_in6 *)sap)->sin6_port = 0; + break; + } +} + +static void nlm_display_address(const struct sockaddr *sap, + char *buf, const size_t len) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + + switch (sap->sa_family) { + case AF_UNSPEC: + snprintf(buf, len, "unspecified"); + break; + case AF_INET: + snprintf(buf, len, NIPQUAD_FMT, NIPQUAD(sin->sin_addr.s_addr)); + break; + case AF_INET6: + if (ipv6_addr_v4mapped(&sin6->sin6_addr)) + snprintf(buf, len, NIPQUAD_FMT, + NIPQUAD(sin6->sin6_addr.s6_addr32[3])); + else + snprintf(buf, len, NIP6_FMT, NIP6(sin6->sin6_addr)); + break; + default: + snprintf(buf, len, "unsupported address family"); + break; + } +} /* * Common host lookup routine for server & client */ -static struct nlm_host *nlm_lookup_host(int server, - const struct sockaddr_in *sin, - int proto, u32 version, - const char *hostname, - unsigned int hostname_len, - const struct sockaddr_in *ssin) +static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) { struct hlist_head *chain; struct hlist_node *pos; struct nlm_host *host; struct nsm_handle *nsm = NULL; - int hash; - - dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT - ", p=%d, v=%u, my role=%s, name=%.*s)\n", - NIPQUAD(ssin->sin_addr.s_addr), - NIPQUAD(sin->sin_addr.s_addr), proto, version, - server? "server" : "client", - hostname_len, - hostname? hostname : "<none>"); - - hash = NLM_ADDRHASH(sin->sin_addr.s_addr); - - /* Lock hash table */ mutex_lock(&nlm_host_mutex); if (time_after_eq(jiffies, next_gc)) @@ -78,22 +152,22 @@ static struct nlm_host *nlm_lookup_host(int server, * different NLM rpc_clients into one single nlm_host object. * This would allow us to have one nlm_host per address. */ - chain = &nlm_hosts[hash]; + chain = &nlm_hosts[nlm_hash_address(ni->sap)]; hlist_for_each_entry(host, pos, chain, h_hash) { - if (!nlm_cmp_addr(&host->h_addr, sin)) + if (!nlm_cmp_addr(nlm_addr(host), ni->sap)) continue; /* See if we have an NSM handle for this client */ if (!nsm) nsm = host->h_nsmhandle; - if (host->h_proto != proto) + if (host->h_proto != ni->protocol) continue; - if (host->h_version != version) + if (host->h_version != ni->version) continue; - if (host->h_server != server) + if (host->h_server != ni->server) continue; - if (!nlm_cmp_addr(&host->h_saddr, ssin)) + if (!nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap)) continue; /* Move to head of hash chain. */ @@ -101,30 +175,41 @@ static struct nlm_host *nlm_lookup_host(int server, hlist_add_head(&host->h_hash, chain); nlm_get_host(host); + dprintk("lockd: nlm_lookup_host found host %s (%s)\n", + host->h_name, host->h_addrbuf); goto out; } - if (nsm) - atomic_inc(&nsm->sm_count); - - host = NULL; - /* Sadly, the host isn't in our hash table yet. See if - * we have an NSM handle for it. If not, create one. + /* + * The host wasn't in our hash table. If we don't + * have an NSM handle for it yet, create one. */ - if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len))) - goto out; + if (nsm) + atomic_inc(&nsm->sm_count); + else { + host = NULL; + nsm = nsm_find(ni->sap, ni->salen, + ni->hostname, ni->hostname_len, 1); + if (!nsm) { + dprintk("lockd: nlm_lookup_host failed; " + "no nsm handle\n"); + goto out; + } + } host = kzalloc(sizeof(*host), GFP_KERNEL); if (!host) { nsm_release(nsm); + dprintk("lockd: nlm_lookup_host failed; no memory\n"); goto out; } host->h_name = nsm->sm_name; - host->h_addr = *sin; - host->h_addr.sin_port = 0; /* ouch! */ - host->h_saddr = *ssin; - host->h_version = version; - host->h_proto = proto; + memcpy(nlm_addr(host), ni->sap, ni->salen); + host->h_addrlen = ni->salen; + nlm_clear_port(nlm_addr(host)); + memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); + host->h_version = ni->version; + host->h_proto = ni->protocol; host->h_rpcclnt = NULL; mutex_init(&host->h_mutex); host->h_nextrebind = jiffies + NLM_HOST_REBIND; @@ -135,7 +220,7 @@ static struct nlm_host *nlm_lookup_host(int server, host->h_state = 0; /* pseudo NSM state */ host->h_nsmstate = 0; /* real NSM state */ host->h_nsmhandle = nsm; - host->h_server = server; + host->h_server = ni->server; hlist_add_head(&host->h_hash, chain); INIT_LIST_HEAD(&host->h_lockowners); spin_lock_init(&host->h_lock); @@ -143,6 +228,15 @@ static struct nlm_host *nlm_lookup_host(int server, INIT_LIST_HEAD(&host->h_reclaim); nrhosts++; + + nlm_display_address((struct sockaddr *)&host->h_addr, + host->h_addrbuf, sizeof(host->h_addrbuf)); + nlm_display_address((struct sockaddr *)&host->h_srcaddr, + host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf)); + + dprintk("lockd: nlm_lookup_host created host %s\n", + host->h_name); + out: mutex_unlock(&nlm_host_mutex); return host; @@ -170,33 +264,103 @@ nlm_destroy_host(struct nlm_host *host) kfree(host); } -/* - * Find an NLM server handle in the cache. If there is none, create it. +/** + * nlmclnt_lookup_host - Find an NLM host handle matching a remote server + * @sap: network address of server + * @salen: length of server address + * @protocol: transport protocol to use + * @version: NLM protocol version + * @hostname: '\0'-terminated hostname of server + * + * Returns an nlm_host structure that matches the passed-in + * [server address, transport protocol, NLM version, server hostname]. + * If one doesn't already exist in the host cache, a new handle is + * created and returned. */ -struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin, - int proto, u32 version, - const char *hostname, - unsigned int hostname_len) +struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, + const size_t salen, + const unsigned short protocol, + const u32 version, const char *hostname) { - struct sockaddr_in ssin = {0}; - - return nlm_lookup_host(0, sin, proto, version, - hostname, hostname_len, &ssin); + const struct sockaddr source = { + .sa_family = AF_UNSPEC, + }; + struct nlm_lookup_host_info ni = { + .server = 0, + .sap = sap, + .salen = salen, + .protocol = protocol, + .version = version, + .hostname = hostname, + .hostname_len = strlen(hostname), + .src_sap = &source, + .src_len = sizeof(source), + }; + + dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, + (hostname ? hostname : "<none>"), version, + (protocol == IPPROTO_UDP ? "udp" : "tcp")); + + return nlm_lookup_host(&ni); } -/* - * Find an NLM client handle in the cache. If there is none, create it. +/** + * nlmsvc_lookup_host - Find an NLM host handle matching a remote client + * @rqstp: incoming NLM request + * @hostname: name of client host + * @hostname_len: length of client hostname + * + * Returns an nlm_host structure that matches the [client address, + * transport protocol, NLM version, client hostname] of the passed-in + * NLM request. If one doesn't already exist in the host cache, a + * new handle is created and returned. + * + * Before possibly creating a new nlm_host, construct a sockaddr + * for a specific source address in case the local system has + * multiple network addresses. The family of the address in + * rq_daddr is guaranteed to be the same as the family of the + * address in rq_addr, so it's safe to use the same family for + * the source address. */ -struct nlm_host * -nlmsvc_lookup_host(struct svc_rqst *rqstp, - const char *hostname, unsigned int hostname_len) +struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, + const char *hostname, + const size_t hostname_len) { - struct sockaddr_in ssin = {0}; + struct sockaddr_in sin = { + .sin_family = AF_INET, + }; + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + }; + struct nlm_lookup_host_info ni = { + .server = 1, + .sap = svc_addr(rqstp), + .salen = rqstp->rq_addrlen, + .protocol = rqstp->rq_prot, + .version = rqstp->rq_vers, + .hostname = hostname, + .hostname_len = hostname_len, + .src_len = rqstp->rq_addrlen, + }; + + dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, + (int)hostname_len, hostname, rqstp->rq_vers, + (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); + + switch (ni.sap->sa_family) { + case AF_INET: + sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; + ni.src_sap = (struct sockaddr *)&sin; + break; + case AF_INET6: + ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); + ni.src_sap = (struct sockaddr *)&sin6; + break; + default: + return NULL; + } - ssin.sin_addr = rqstp->rq_daddr.addr; - return nlm_lookup_host(1, svc_addr_in(rqstp), - rqstp->rq_prot, rqstp->rq_vers, - hostname, hostname_len, &ssin); + return nlm_lookup_host(&ni); } /* @@ -207,9 +371,8 @@ nlm_bind_host(struct nlm_host *host) { struct rpc_clnt *clnt; - dprintk("lockd: nlm_bind_host("NIPQUAD_FMT"->"NIPQUAD_FMT")\n", - NIPQUAD(host->h_saddr.sin_addr), - NIPQUAD(host->h_addr.sin_addr)); + dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n", + host->h_name, host->h_addrbuf, host->h_srcaddrbuf); /* Lock host handle */ mutex_lock(&host->h_mutex); @@ -221,7 +384,7 @@ nlm_bind_host(struct nlm_host *host) if (time_after_eq(jiffies, host->h_nextrebind)) { rpc_force_rebind(clnt); host->h_nextrebind = jiffies + NLM_HOST_REBIND; - dprintk("lockd: next rebind in %ld jiffies\n", + dprintk("lockd: next rebind in %lu jiffies\n", host->h_nextrebind - jiffies); } } else { @@ -234,9 +397,9 @@ nlm_bind_host(struct nlm_host *host) }; struct rpc_create_args args = { .protocol = host->h_proto, - .address = (struct sockaddr *)&host->h_addr, - .addrsize = sizeof(host->h_addr), - .saddress = (struct sockaddr *)&host->h_saddr, + .address = nlm_addr(host), + .addrsize = host->h_addrlen, + .saddress = nlm_srcaddr(host), .timeout = &timeparms, .servername = host->h_name, .program = &nlm_program, @@ -324,12 +487,16 @@ void nlm_host_rebooted(const struct sockaddr_in *sin, struct nsm_handle *nsm; struct nlm_host *host; - dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n", - hostname, NIPQUAD(sin->sin_addr)); - - /* Find the NSM handle for this peer */ - if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0))) + nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin), + hostname, hostname_len, 0); + if (nsm == NULL) { + dprintk("lockd: never saw rebooted peer '%.*s' before\n", + hostname_len, hostname); return; + } + + dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n", + hostname_len, hostname, nsm->sm_addrbuf); /* When reclaiming locks on this peer, make sure that * we set up a new notification */ @@ -461,22 +628,23 @@ nlm_gc_hosts(void) static LIST_HEAD(nsm_handles); static DEFINE_SPINLOCK(nsm_lock); -static struct nsm_handle * -__nsm_find(const struct sockaddr_in *sin, - const char *hostname, unsigned int hostname_len, - int create) +static struct nsm_handle *nsm_find(const struct sockaddr *sap, + const size_t salen, + const char *hostname, + const size_t hostname_len, + const int create) { struct nsm_handle *nsm = NULL; struct nsm_handle *pos; - if (!sin) + if (!sap) return NULL; if (hostname && memchr(hostname, '/', hostname_len) != NULL) { if (printk_ratelimit()) { printk(KERN_WARNING "Invalid hostname \"%.*s\" " "in NFS lock request\n", - hostname_len, hostname); + (int)hostname_len, hostname); } return NULL; } @@ -489,7 +657,7 @@ retry: if (strlen(pos->sm_name) != hostname_len || memcmp(pos->sm_name, hostname, hostname_len)) continue; - } else if (!nlm_cmp_addr(&pos->sm_addr, sin)) + } else if (!nlm_cmp_addr(nsm_addr(pos), sap)) continue; atomic_inc(&pos->sm_count); kfree(nsm); @@ -509,10 +677,13 @@ retry: if (nsm == NULL) return NULL; - nsm->sm_addr = *sin; + memcpy(nsm_addr(nsm), sap, salen); + nsm->sm_addrlen = salen; nsm->sm_name = (char *) (nsm + 1); memcpy(nsm->sm_name, hostname, hostname_len); nsm->sm_name[hostname_len] = '\0'; + nlm_display_address((struct sockaddr *)&nsm->sm_addr, + nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf)); atomic_set(&nsm->sm_count, 1); goto retry; @@ -521,13 +692,6 @@ found: return nsm; } -static struct nsm_handle * -nsm_find(const struct sockaddr_in *sin, const char *hostname, - unsigned int hostname_len) -{ - return __nsm_find(sin, hostname, hostname_len, 1); -} - /* * Release an NSM handle */ diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index e4d563543b1..4e7e958e8f6 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -51,7 +51,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) memset(&args, 0, sizeof(args)); args.mon_name = nsm->sm_name; - args.addr = nsm->sm_addr.sin_addr.s_addr; + args.addr = nsm_addr_in(nsm)->sin_addr.s_addr; args.prog = NLM_PROGRAM; args.vers = 3; args.proc = NLMPROC_NSM_NOTIFY; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 5bd9bf0fa9d..c631a83931c 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -51,7 +51,6 @@ static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; -int nlmsvc_grace_period; unsigned long nlmsvc_timeout; /* @@ -85,27 +84,23 @@ static unsigned long get_lockd_grace_period(void) return nlm_timeout * 5 * HZ; } -unsigned long get_nfs_grace_period(void) -{ - unsigned long lockdgrace = get_lockd_grace_period(); - unsigned long nfsdgrace = 0; - - if (nlmsvc_ops) - nfsdgrace = nlmsvc_ops->get_grace_period(); - - return max(lockdgrace, nfsdgrace); -} -EXPORT_SYMBOL(get_nfs_grace_period); +static struct lock_manager lockd_manager = { +}; -static unsigned long set_grace_period(void) +static void grace_ender(struct work_struct *not_used) { - nlmsvc_grace_period = 1; - return get_nfs_grace_period() + jiffies; + locks_end_grace(&lockd_manager); } -static inline void clear_grace_period(void) +static DECLARE_DELAYED_WORK(grace_period_end, grace_ender); + +static void set_grace_period(void) { - nlmsvc_grace_period = 0; + unsigned long grace_period = get_lockd_grace_period(); + + locks_start_grace(&lockd_manager); + cancel_delayed_work_sync(&grace_period_end); + schedule_delayed_work(&grace_period_end, grace_period); } /* @@ -116,7 +111,6 @@ lockd(void *vrqstp) { int err = 0, preverr = 0; struct svc_rqst *rqstp = vrqstp; - unsigned long grace_period_expire; /* try_to_freeze() is called from svc_recv() */ set_freezable(); @@ -139,7 +133,7 @@ lockd(void *vrqstp) nlm_timeout = LOCKD_DFLT_TIMEO; nlmsvc_timeout = nlm_timeout * HZ; - grace_period_expire = set_grace_period(); + set_grace_period(); /* * The main request loop. We don't terminate until the last @@ -153,21 +147,12 @@ lockd(void *vrqstp) flush_signals(current); if (nlmsvc_ops) { nlmsvc_invalidate_all(); - grace_period_expire = set_grace_period(); + set_grace_period(); } continue; } - /* - * Retry any blocked locks that have been notified by - * the VFS. Don't do this during grace period. - * (Theoretically, there shouldn't even be blocked locks - * during grace period). - */ - if (!nlmsvc_grace_period) { - timeout = nlmsvc_retry_blocked(); - } else if (time_before(grace_period_expire, jiffies)) - clear_grace_period(); + timeout = nlmsvc_retry_blocked(); /* * Find a socket with data available and call its @@ -195,6 +180,7 @@ lockd(void *vrqstp) svc_process(rqstp); } flush_signals(current); + cancel_delayed_work_sync(&grace_period_end); if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); @@ -203,25 +189,28 @@ lockd(void *vrqstp) } /* - * Make any sockets that are needed but not present. - * If nlm_udpport or nlm_tcpport were set as module - * options, make those sockets unconditionally + * Ensure there are active UDP and TCP listeners for lockd. + * + * Even if we have only TCP NFS mounts and/or TCP NFSDs, some + * local services (such as rpc.statd) still require UDP, and + * some NFS servers do not yet support NLM over TCP. + * + * Returns zero if all listeners are available; otherwise a + * negative errno value is returned. */ -static int make_socks(struct svc_serv *serv, int proto) +static int make_socks(struct svc_serv *serv) { static int warned; struct svc_xprt *xprt; int err = 0; - if (proto == IPPROTO_UDP || nlm_udpport) { - xprt = svc_find_xprt(serv, "udp", 0, 0); - if (!xprt) - err = svc_create_xprt(serv, "udp", nlm_udpport, - SVC_SOCK_DEFAULTS); - else - svc_xprt_put(xprt); - } - if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) { + xprt = svc_find_xprt(serv, "udp", 0, 0); + if (!xprt) + err = svc_create_xprt(serv, "udp", nlm_udpport, + SVC_SOCK_DEFAULTS); + else + svc_xprt_put(xprt); + if (err >= 0) { xprt = svc_find_xprt(serv, "tcp", 0, 0); if (!xprt) err = svc_create_xprt(serv, "tcp", nlm_tcpport, @@ -241,8 +230,7 @@ static int make_socks(struct svc_serv *serv, int proto) /* * Bring up the lockd process if it's not already up. */ -int -lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ +int lockd_up(void) { struct svc_serv *serv; int error = 0; @@ -251,11 +239,8 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ /* * Check whether we're already up and running. */ - if (nlmsvc_rqst) { - if (proto) - error = make_socks(nlmsvc_rqst->rq_server, proto); + if (nlmsvc_rqst) goto out; - } /* * Sanity check: if there's no pid, @@ -266,13 +251,14 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ "lockd_up: no pid, %d users??\n", nlmsvc_users); error = -ENOMEM; - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); goto out; } - if ((error = make_socks(serv, proto)) < 0) + error = make_socks(serv); + if (error < 0) goto destroy_and_out; /* diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4a714f64515..014f6ce4817 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -88,12 +88,6 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, dprintk("lockd: TEST4 called\n"); resp->cookie = argp->cookie; - /* Don't accept test requests during grace period */ - if (nlmsvc_grace_period) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; @@ -122,12 +116,6 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; - /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; @@ -146,7 +134,8 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, /* Now try to lock the file */ resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, - argp->block, &argp->cookie); + argp->block, &argp->cookie, + argp->reclaim); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; else @@ -169,7 +158,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -202,7 +191,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -231,7 +220,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock); + resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } @@ -341,7 +330,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { + if (locks_in_grace() && !argp->reclaim) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -374,7 +363,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -432,11 +421,9 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, { struct sockaddr_in saddr; - memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr)); - dprintk("lockd: SM_NOTIFY called\n"); - if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) - || ntohs(saddr.sin_port) >= 1024) { + + if (!nlm_privileged_requester(rqstp)) { char buf[RPC_MAX_ADDRBUFLEN]; printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", svc_print_addr(rqstp, buf, sizeof(buf))); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index cf0d5c2c318..6063a8e4b9f 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -360,7 +360,7 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block) __be32 nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_host *host, struct nlm_lock *lock, int wait, - struct nlm_cookie *cookie) + struct nlm_cookie *cookie, int reclaim) { struct nlm_block *block = NULL; int error; @@ -406,6 +406,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } + if (locks_in_grace() && !reclaim) { + ret = nlm_lck_denied_grace_period; + goto out; + } + if (reclaim && !locks_in_grace()) { + ret = nlm_lck_denied_grace_period; + goto out; + } + if (!wait) lock->fl.fl_flags &= ~FL_SLEEP; error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); @@ -502,6 +511,10 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } + if (locks_in_grace()) { + ret = nlm_lck_denied_grace_period; + goto out; + } error = vfs_test_lock(file->f_file, &lock->fl); if (error == FILE_LOCK_DEFERRED) { ret = nlmsvc_defer_lock_rqst(rqstp, block); @@ -582,6 +595,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); + if (locks_in_grace()) + return nlm_lck_denied_grace_period; + mutex_lock(&file->f_mutex); block = nlmsvc_lookup_block(file, lock); mutex_unlock(&file->f_mutex); diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 76262c1986f..548b0bb2b84 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -117,12 +117,6 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, dprintk("lockd: TEST called\n"); resp->cookie = argp->cookie; - /* Don't accept test requests during grace period */ - if (nlmsvc_grace_period) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; @@ -152,12 +146,6 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; - /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; @@ -176,7 +164,8 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, /* Now try to lock the file */ resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock, - argp->block, &argp->cookie)); + argp->block, &argp->cookie, + argp->reclaim)); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; else @@ -199,7 +188,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -232,7 +221,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -261,7 +250,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock); + resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } @@ -373,7 +362,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { + if (locks_in_grace() && !argp->reclaim) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -406,7 +395,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -464,11 +453,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, { struct sockaddr_in saddr; - memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr)); - dprintk("lockd: SM_NOTIFY called\n"); - if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) - || ntohs(saddr.sin_port) >= 1024) { + + if (!nlm_privileged_requester(rqstp)) { char buf[RPC_MAX_ADDRBUFLEN]; printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", svc_print_addr(rqstp, buf, sizeof(buf))); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 198b4e55b37..34c2766e27c 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -418,7 +418,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb); static int nlmsvc_match_ip(void *datap, struct nlm_host *host) { - return nlm_cmp_addr(&host->h_saddr, datap); + return nlm_cmp_addr(nlm_srcaddr(host), datap); } /** diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 3e459e18cc3..1f226290c67 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -351,8 +351,6 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp) argp->state = ntohl(*p++); /* Preserve the address in network byte order */ argp->addr = *p++; - argp->vers = *p++; - argp->proto = *p++; return xdr_argsize_check(rqstp, p); } diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 43ff9397e6c..50c493a8ad8 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -358,8 +358,6 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp argp->state = ntohl(*p++); /* Preserve the address in network byte order */ argp->addr = *p++; - argp->vers = *p++; - argp->proto = *p++; return xdr_argsize_check(rqstp, p); } diff --git a/fs/mpage.c b/fs/mpage.c index dbcc7af76a1..552b80b3fac 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -6,7 +6,7 @@ * Contains functions related to preparing and submitting BIOs which contain * multiple pagecache pages. * - * 15May2002 akpm@zip.com.au + * 15May2002 Andrew Morton * Initial version * 27Jun2002 axboe@suse.de * use bio_add_page() to build bio's just the right size diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index f447f4b4476..6a09760c596 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -105,7 +105,8 @@ int nfs_callback_up(void) mutex_lock(&nfs_callback_mutex); if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) goto out; - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, + AF_INET, NULL); ret = -ENOMEM; if (!serv) goto out_err; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 5ee23e7058b..7547600b617 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -675,7 +675,7 @@ static int nfs_init_server(struct nfs_server *server, server->nfs_client = clp; /* Initialise the client representation from the mount data */ - server->flags = data->flags & NFS_MOUNT_FLAGMASK; + server->flags = data->flags; if (data->rsize) server->rsize = nfs_block_size(data->rsize, NULL); @@ -850,7 +850,6 @@ static struct nfs_server *nfs_alloc_server(void) INIT_LIST_HEAD(&server->client_link); INIT_LIST_HEAD(&server->master_link); - init_waitqueue_head(&server->active_wq); atomic_set(&server->active, 0); server->io_stats = nfs_alloc_iostats(); @@ -1073,7 +1072,7 @@ static int nfs4_init_server(struct nfs_server *server, goto error; /* Initialise the client representation from the mount data */ - server->flags = data->flags & NFS_MOUNT_FLAGMASK; + server->flags = data->flags; server->caps |= NFS_CAP_ATOMIC_OPEN; if (data->rsize) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 74f92b717f7..2ab70d46ecb 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -156,6 +156,7 @@ typedef struct { decode_dirent_t decode; int plus; unsigned long timestamp; + unsigned long gencount; int timestamp_valid; } nfs_readdir_descriptor_t; @@ -177,7 +178,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) struct file *file = desc->file; struct inode *inode = file->f_path.dentry->d_inode; struct rpc_cred *cred = nfs_file_cred(file); - unsigned long timestamp; + unsigned long timestamp, gencount; int error; dfprintk(DIRCACHE, "NFS: %s: reading cookie %Lu into page %lu\n", @@ -186,6 +187,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) again: timestamp = jiffies; + gencount = nfs_inc_attr_generation_counter(); error = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, desc->entry->cookie, page, NFS_SERVER(inode)->dtsize, desc->plus); if (error < 0) { @@ -199,6 +201,7 @@ int nfs_readdir_filler(nfs_readdir_descriptor_t *desc, struct page *page) goto error; } desc->timestamp = timestamp; + desc->gencount = gencount; desc->timestamp_valid = 1; SetPageUptodate(page); /* Ensure consistent page alignment of the data. @@ -224,9 +227,10 @@ int dir_decode(nfs_readdir_descriptor_t *desc) if (IS_ERR(p)) return PTR_ERR(p); desc->ptr = p; - if (desc->timestamp_valid) + if (desc->timestamp_valid) { desc->entry->fattr->time_start = desc->timestamp; - else + desc->entry->fattr->gencount = desc->gencount; + } else desc->entry->fattr->valid &= ~NFS_ATTR_FATTR; return 0; } @@ -471,7 +475,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, struct rpc_cred *cred = nfs_file_cred(file); struct page *page = NULL; int status; - unsigned long timestamp; + unsigned long timestamp, gencount; dfprintk(DIRCACHE, "NFS: uncached_readdir() searching for cookie %Lu\n", (unsigned long long)*desc->dir_cookie); @@ -482,6 +486,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, goto out; } timestamp = jiffies; + gencount = nfs_inc_attr_generation_counter(); status = NFS_PROTO(inode)->readdir(file->f_path.dentry, cred, *desc->dir_cookie, page, NFS_SERVER(inode)->dtsize, @@ -490,6 +495,7 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc, void *dirent, desc->ptr = kmap(page); /* matching kunmap in nfs_do_filldir */ if (status >= 0) { desc->timestamp = timestamp; + desc->gencount = gencount; desc->timestamp_valid = 1; if ((status = dir_decode(desc)) == 0) desc->entry->prev_cookie = *desc->dir_cookie; @@ -655,7 +661,7 @@ static int nfs_fsync_dir(struct file *filp, struct dentry *dentry, int datasync) */ void nfs_force_lookup_revalidate(struct inode *dir) { - NFS_I(dir)->cache_change_attribute = jiffies; + NFS_I(dir)->cache_change_attribute++; } /* @@ -667,6 +673,8 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry) { if (IS_ROOT(dentry)) return 1; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) + return 0; if (!nfs_verify_change_attribute(dir, dentry->d_time)) return 0; /* Revalidate nfsi->cache_change_attribute before we declare a match */ @@ -750,6 +758,8 @@ int nfs_neg_need_reval(struct inode *dir, struct dentry *dentry, /* Don't revalidate a negative dentry if we're creating a new file */ if (nd != NULL && nfs_lookup_check_intent(nd, LOOKUP_CREATE) != 0) return 0; + if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONEG) + return 1; return !nfs_check_verifier(dir, dentry); } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 78460657f5c..d319b49f8f0 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -188,13 +188,16 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) /* origin == SEEK_END => we must revalidate the cached file length */ if (origin == SEEK_END) { struct inode *inode = filp->f_mapping->host; + int retval = nfs_revalidate_file_size(inode, filp); if (retval < 0) return (loff_t)retval; - } - lock_kernel(); /* BKL needed? */ - loff = generic_file_llseek_unlocked(filp, offset, origin); - unlock_kernel(); + + spin_lock(&inode->i_lock); + loff = generic_file_llseek_unlocked(filp, offset, origin); + spin_unlock(&inode->i_lock); + } else + loff = generic_file_llseek_unlocked(filp, offset, origin); return loff; } @@ -699,13 +702,6 @@ static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) filp->f_path.dentry->d_name.name, fl->fl_type, fl->fl_flags); - /* - * No BSD flocks over NFS allowed. - * Note: we could try to fake a POSIX lock request here by - * using ((u32) filp | 0x80000000) or some such as the pid. - * Not sure whether that would be unique, though, or whether - * that would break in other places. - */ if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 52daefa2f52..b9195c02a86 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -305,8 +305,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) init_special_inode(inode, inode->i_mode, fattr->rdev); nfsi->read_cache_jiffies = fattr->time_start; - nfsi->last_updated = now; - nfsi->cache_change_attribute = now; + nfsi->attr_gencount = fattr->gencount; inode->i_atime = fattr->atime; inode->i_mtime = fattr->mtime; inode->i_ctime = fattr->ctime; @@ -453,6 +452,7 @@ out_big: void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) { if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { + spin_lock(&inode->i_lock); if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; @@ -462,7 +462,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) inode->i_uid = attr->ia_uid; if ((attr->ia_valid & ATTR_GID) != 0) inode->i_gid = attr->ia_gid; - spin_lock(&inode->i_lock); NFS_I(inode)->cache_validity |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; spin_unlock(&inode->i_lock); } @@ -472,37 +471,6 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr) } } -static int nfs_wait_schedule(void *word) -{ - if (signal_pending(current)) - return -ERESTARTSYS; - schedule(); - return 0; -} - -/* - * Wait for the inode to get unlocked. - */ -static int nfs_wait_on_inode(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - int error; - - error = wait_on_bit_lock(&nfsi->flags, NFS_INO_REVALIDATING, - nfs_wait_schedule, TASK_KILLABLE); - - return error; -} - -static void nfs_wake_up_inode(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - clear_bit(NFS_INO_REVALIDATING, &nfsi->flags); - smp_mb__after_clear_bit(); - wake_up_bit(&nfsi->flags, NFS_INO_REVALIDATING); -} - int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) { struct inode *inode = dentry->d_inode; @@ -697,20 +665,15 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) dfprintk(PAGECACHE, "NFS: revalidating (%s/%Ld)\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode)); - nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); if (is_bad_inode(inode)) - goto out_nowait; + goto out; if (NFS_STALE(inode)) - goto out_nowait; - - status = nfs_wait_on_inode(inode); - if (status < 0) goto out; - status = -ESTALE; if (NFS_STALE(inode)) goto out; + nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE); status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr); if (status != 0) { dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) getattr failed, error=%d\n", @@ -724,16 +687,13 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) goto out; } - spin_lock(&inode->i_lock); - status = nfs_update_inode(inode, &fattr); + status = nfs_refresh_inode(inode, &fattr); if (status) { - spin_unlock(&inode->i_lock); dfprintk(PAGECACHE, "nfs_revalidate_inode: (%s/%Ld) refresh failed, error=%d\n", inode->i_sb->s_id, (long long)NFS_FILEID(inode), status); goto out; } - spin_unlock(&inode->i_lock); if (nfsi->cache_validity & NFS_INO_INVALID_ACL) nfs_zap_acl_cache(inode); @@ -743,9 +703,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) (long long)NFS_FILEID(inode)); out: - nfs_wake_up_inode(inode); - - out_nowait: return status; } @@ -908,9 +865,6 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat return -EIO; } - /* Do atomic weak cache consistency updates */ - nfs_wcc_update_inode(inode, fattr); - if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && nfsi->change_attr != fattr->change_attr) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; @@ -939,15 +893,81 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat if (invalid != 0) nfsi->cache_validity |= invalid; - else - nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR - | NFS_INO_INVALID_ATIME - | NFS_INO_REVAL_PAGECACHE); nfsi->read_cache_jiffies = fattr->time_start; return 0; } +static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0; +} + +static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + return nfs_size_to_loff_t(fattr->size) > i_size_read(inode); +} + +static unsigned long nfs_attr_generation_counter; + +static unsigned long nfs_read_attr_generation_counter(void) +{ + smp_rmb(); + return nfs_attr_generation_counter; +} + +unsigned long nfs_inc_attr_generation_counter(void) +{ + unsigned long ret; + smp_rmb(); + ret = ++nfs_attr_generation_counter; + smp_wmb(); + return ret; +} + +void nfs_fattr_init(struct nfs_fattr *fattr) +{ + fattr->valid = 0; + fattr->time_start = jiffies; + fattr->gencount = nfs_inc_attr_generation_counter(); +} + +/** + * nfs_inode_attrs_need_update - check if the inode attributes need updating + * @inode - pointer to inode + * @fattr - attributes + * + * Attempt to divine whether or not an RPC call reply carrying stale + * attributes got scheduled after another call carrying updated ones. + * + * To do so, the function first assumes that a more recent ctime means + * that the attributes in fattr are newer, however it also attempt to + * catch the case where ctime either didn't change, or went backwards + * (if someone reset the clock on the server) by looking at whether + * or not this RPC call was started after the inode was last updated. + * Note also the check for wraparound of 'attr_gencount' + * + * The function returns 'true' if it thinks the attributes in 'fattr' are + * more recent than the ones cached in the inode. + * + */ +static int nfs_inode_attrs_need_update(const struct inode *inode, const struct nfs_fattr *fattr) +{ + const struct nfs_inode *nfsi = NFS_I(inode); + + return ((long)fattr->gencount - (long)nfsi->attr_gencount) > 0 || + nfs_ctime_need_update(inode, fattr) || + nfs_size_need_update(inode, fattr) || + ((long)nfsi->attr_gencount - (long)nfs_read_attr_generation_counter() > 0); +} + +static int nfs_refresh_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + if (nfs_inode_attrs_need_update(inode, fattr)) + return nfs_update_inode(inode, fattr); + return nfs_check_inode_attributes(inode, fattr); +} + /** * nfs_refresh_inode - try to update the inode attribute cache * @inode - pointer to inode @@ -960,21 +980,28 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat */ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); int status; if ((fattr->valid & NFS_ATTR_FATTR) == 0) return 0; spin_lock(&inode->i_lock); - if (time_after(fattr->time_start, nfsi->last_updated)) - status = nfs_update_inode(inode, fattr); - else - status = nfs_check_inode_attributes(inode, fattr); - + status = nfs_refresh_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); return status; } +static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr *fattr) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; + if (S_ISDIR(inode->i_mode)) + nfsi->cache_validity |= NFS_INO_INVALID_DATA; + if ((fattr->valid & NFS_ATTR_FATTR) == 0) + return 0; + return nfs_refresh_inode_locked(inode, fattr); +} + /** * nfs_post_op_update_inode - try to update the inode attribute cache * @inode - pointer to inode @@ -991,14 +1018,12 @@ int nfs_refresh_inode(struct inode *inode, struct nfs_fattr *fattr) */ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) { - struct nfs_inode *nfsi = NFS_I(inode); + int status; spin_lock(&inode->i_lock); - nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE; - if (S_ISDIR(inode->i_mode)) - nfsi->cache_validity |= NFS_INO_INVALID_DATA; + status = nfs_post_op_update_inode_locked(inode, fattr); spin_unlock(&inode->i_lock); - return nfs_refresh_inode(inode, fattr); + return status; } /** @@ -1014,6 +1039,15 @@ int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr) */ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr) { + int status; + + spin_lock(&inode->i_lock); + /* Don't do a WCC update if these attributes are already stale */ + if ((fattr->valid & NFS_ATTR_FATTR) == 0 || + !nfs_inode_attrs_need_update(inode, fattr)) { + fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC); + goto out_noforce; + } if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 && (fattr->valid & NFS_ATTR_WCC_V4) == 0) { fattr->pre_change_attr = NFS_I(inode)->change_attr; @@ -1026,7 +1060,10 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa fattr->pre_size = i_size_read(inode); fattr->valid |= NFS_ATTR_WCC; } - return nfs_post_op_update_inode(inode, fattr); +out_noforce: + status = nfs_post_op_update_inode_locked(inode, fattr); + spin_unlock(&inode->i_lock); + return status; } /* @@ -1092,7 +1129,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) } /* If ctime has changed we should definitely clear access+acl caches */ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) - invalid |= NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; } else if (nfsi->change_attr != fattr->change_attr) { dprintk("NFS: change_attr change on server for file %s/%ld\n", inode->i_sb->s_id, inode->i_ino); @@ -1126,6 +1163,9 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) inode->i_gid != fattr->gid) invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL; + if (inode->i_nlink != fattr->nlink) + invalid |= NFS_INO_INVALID_ATTR; + inode->i_mode = fattr->mode; inode->i_nlink = fattr->nlink; inode->i_uid = fattr->uid; @@ -1145,18 +1185,13 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr) nfs_inc_stats(inode, NFSIOS_ATTRINVALIDATE); nfsi->attrtimeo = NFS_MINATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; - nfsi->last_updated = now; + nfsi->attr_gencount = nfs_inc_attr_generation_counter(); } else { if (!time_in_range(now, nfsi->attrtimeo_timestamp, nfsi->attrtimeo_timestamp + nfsi->attrtimeo)) { if ((nfsi->attrtimeo <<= 1) > NFS_MAXATTRTIMEO(inode)) nfsi->attrtimeo = NFS_MAXATTRTIMEO(inode); nfsi->attrtimeo_timestamp = now; } - /* - * Avoid jiffy wraparound issues with nfsi->last_updated - */ - if (!time_in_range(nfsi->last_updated, nfsi->read_cache_jiffies, now)) - nfsi->last_updated = nfsi->read_cache_jiffies; } invalid &= ~NFS_INO_INVALID_ATTR; /* Don't invalidate the data if we were to blame */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 24241fcbb98..d212ee41caf 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -153,6 +153,7 @@ extern void nfs4_clear_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); /* super.c */ +void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *); extern struct file_system_type nfs_xdev_fs_type; #ifdef CONFIG_NFS_V4 extern struct file_system_type nfs4_xdev_fs_type; @@ -163,8 +164,8 @@ extern struct rpc_stat nfs_rpcstat; extern int __init register_nfs_fs(void); extern void __exit unregister_nfs_fs(void); -extern void nfs_sb_active(struct nfs_server *server); -extern void nfs_sb_deactive(struct nfs_server *server); +extern void nfs_sb_active(struct super_block *sb); +extern void nfs_sb_deactive(struct super_block *sb); /* namespace.c */ extern char *nfs_path(const char *base, @@ -276,3 +277,23 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) PAGE_SIZE - 1) >> PAGE_SHIFT; } +#define IPV6_SCOPE_DELIMITER '%' + +/* + * Set the port number in an address. Be agnostic about the address + * family. + */ +static inline void nfs_set_port(struct sockaddr *sap, unsigned short port) +{ + struct sockaddr_in *ap = (struct sockaddr_in *)sap; + struct sockaddr_in6 *ap6 = (struct sockaddr_in6 *)sap; + + switch (sap->sa_family) { + case AF_INET: + ap->sin_port = htons(port); + break; + case AF_INET6: + ap6->sin6_port = htons(port); + break; + } +} diff --git a/fs/nfs/mount_clnt.c b/fs/nfs/mount_clnt.c index 779d2eb649c..086a6830d78 100644 --- a/fs/nfs/mount_clnt.c +++ b/fs/nfs/mount_clnt.c @@ -14,6 +14,7 @@ #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/sched.h> #include <linux/nfs_fs.h> +#include "internal.h" #ifdef RPC_DEBUG # define NFSDBG_FACILITY NFSDBG_MOUNT @@ -98,7 +99,7 @@ out_call_err: out_mnt_err: dprintk("NFS: MNT server returned result %d\n", result.status); - status = -EACCES; + status = nfs_stat_to_errno(result.status); goto out; } diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 66df08dd1ca..64a288ee046 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -105,7 +105,10 @@ static void * nfs_follow_mountpoint(struct dentry *dentry, struct nameidata *nd) dprintk("--> nfs_follow_mountpoint()\n"); - BUG_ON(IS_ROOT(dentry)); + err = -ESTALE; + if (IS_ROOT(dentry)) + goto out_err; + dprintk("%s: enter\n", __func__); dput(nd->path.dentry); nd->path.dentry = dget(dentry); @@ -189,7 +192,7 @@ static struct vfsmount *nfs_do_clone_mount(struct nfs_server *server, struct nfs_clone_mount *mountdata) { #ifdef CONFIG_NFS_V4 - struct vfsmount *mnt = NULL; + struct vfsmount *mnt = ERR_PTR(-EINVAL); switch (server->nfs_client->rpc_ops->version) { case 2: case 3: diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 423842f51ac..cef62557c87 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -229,6 +229,7 @@ struct posix_acl *nfs3_proc_getacl(struct inode *inode, int type) dprintk("NFS call getacl\n"); msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_GETACL]; + nfs_fattr_init(&fattr); status = rpc_call_sync(server->client_acl, &msg, 0); dprintk("NFS reply getacl: %d\n", status); @@ -322,6 +323,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, dprintk("NFS call setacl\n"); msg.rpc_proc = &server->client_acl->cl_procinfo[ACLPROC3_SETACL]; + nfs_fattr_init(&fattr); status = rpc_call_sync(server->client_acl, &msg, 0); nfs_access_zap_cache(inode); nfs_zap_acl_cache(inode); diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 1e750e4574a..c55be7a7679 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -699,7 +699,7 @@ nfs3_proc_statfs(struct nfs_server *server, struct nfs_fh *fhandle, } static int -nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, +do_proc_fsinfo(struct rpc_clnt *client, struct nfs_fh *fhandle, struct nfs_fsinfo *info) { struct rpc_message msg = { @@ -711,11 +711,27 @@ nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, dprintk("NFS call fsinfo\n"); nfs_fattr_init(info->fattr); - status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + status = rpc_call_sync(client, &msg, 0); dprintk("NFS reply fsinfo: %d\n", status); return status; } +/* + * Bare-bones access to fsinfo: this is for nfs_get_root/nfs_get_sb via + * nfs_create_server + */ +static int +nfs3_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, + struct nfs_fsinfo *info) +{ + int status; + + status = do_proc_fsinfo(server->client, fhandle, info); + if (status && server->nfs_client->cl_rpcclient != server->client) + status = do_proc_fsinfo(server->nfs_client->cl_rpcclient, fhandle, info); + return status; +} + static int nfs3_proc_pathconf(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_pathconf *info) diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index b112857301f..30befc39b3c 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -93,21 +93,52 @@ static int nfs4_validate_fspath(const struct vfsmount *mnt_parent, return 0; } -/* - * Check if the string represents a "valid" IPv4 address - */ -static inline int valid_ipaddr4(const char *buf) +static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, + char *page, char *page2, + const struct nfs4_fs_location *location) { - int rc, count, in[4]; - - rc = sscanf(buf, "%d.%d.%d.%d", &in[0], &in[1], &in[2], &in[3]); - if (rc != 4) - return -EINVAL; - for (count = 0; count < 4; count++) { - if (in[count] > 255) - return -EINVAL; + struct vfsmount *mnt = ERR_PTR(-ENOENT); + char *mnt_path; + int page2len; + unsigned int s; + + mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); + if (IS_ERR(mnt_path)) + return mnt; + mountdata->mnt_path = mnt_path; + page2 += strlen(mnt_path) + 1; + page2len = PAGE_SIZE - strlen(mnt_path) - 1; + + for (s = 0; s < location->nservers; s++) { + const struct nfs4_string *buf = &location->servers[s]; + struct sockaddr_storage addr; + + if (buf->len <= 0 || buf->len >= PAGE_SIZE) + continue; + + mountdata->addr = (struct sockaddr *)&addr; + + if (memchr(buf->data, IPV6_SCOPE_DELIMITER, buf->len)) + continue; + nfs_parse_ip_address(buf->data, buf->len, + mountdata->addr, &mountdata->addrlen); + if (mountdata->addr->sa_family == AF_UNSPEC) + continue; + nfs_set_port(mountdata->addr, NFS_PORT); + + strncpy(page2, buf->data, page2len); + page2[page2len] = '\0'; + mountdata->hostname = page2; + + snprintf(page, PAGE_SIZE, "%s:%s", + mountdata->hostname, + mountdata->mnt_path); + + mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, mountdata); + if (!IS_ERR(mnt)) + break; } - return 0; + return mnt; } /** @@ -128,7 +159,6 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, .authflavor = NFS_SB(mnt_parent->mnt_sb)->client->cl_auth->au_flavor, }; char *page = NULL, *page2 = NULL; - unsigned int s; int loc, error; if (locations == NULL || locations->nlocations <= 0) @@ -152,53 +182,16 @@ static struct vfsmount *nfs_follow_referral(const struct vfsmount *mnt_parent, goto out; } - loc = 0; - while (loc < locations->nlocations && IS_ERR(mnt)) { + for (loc = 0; loc < locations->nlocations; loc++) { const struct nfs4_fs_location *location = &locations->locations[loc]; - char *mnt_path; if (location == NULL || location->nservers <= 0 || - location->rootpath.ncomponents == 0) { - loc++; + location->rootpath.ncomponents == 0) continue; - } - mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); - if (IS_ERR(mnt_path)) { - loc++; - continue; - } - mountdata.mnt_path = mnt_path; - - s = 0; - while (s < location->nservers) { - struct sockaddr_in addr = { - .sin_family = AF_INET, - .sin_port = htons(NFS_PORT), - }; - - if (location->servers[s].len <= 0 || - valid_ipaddr4(location->servers[s].data) < 0) { - s++; - continue; - } - - mountdata.hostname = location->servers[s].data; - addr.sin_addr.s_addr = in_aton(mountdata.hostname), - mountdata.addr = (struct sockaddr *)&addr; - mountdata.addrlen = sizeof(addr); - - snprintf(page, PAGE_SIZE, "%s:%s", - mountdata.hostname, - mountdata.mnt_path); - - mnt = vfs_kern_mount(&nfs4_referral_fs_type, 0, page, &mountdata); - if (!IS_ERR(mnt)) { - break; - } - s++; - } - loc++; + mnt = try_location(&mountdata, page, page2, location); + if (!IS_ERR(mnt)) + break; } out: diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c index 46763d1cd39..8478fc25dae 100644 --- a/fs/nfs/nfsroot.c +++ b/fs/nfs/nfsroot.c @@ -127,7 +127,7 @@ enum { Opt_err }; -static match_table_t __initdata tokens = { +static match_table_t __initconst tokens = { {Opt_port, "port=%u"}, {Opt_rsize, "rsize=%u"}, {Opt_wsize, "wsize=%u"}, diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 4dbb84df1b6..193465210d7 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -65,14 +65,20 @@ nfs_proc_get_root(struct nfs_server *server, struct nfs_fh *fhandle, dprintk("%s: call getattr\n", __func__); nfs_fattr_init(fattr); - status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + status = rpc_call_sync(server->client, &msg, 0); + /* Retry with default authentication if different */ + if (status && server->nfs_client->cl_rpcclient != server->client) + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); dprintk("%s: reply getattr: %d\n", __func__, status); if (status) return status; dprintk("%s: call statfs\n", __func__); msg.rpc_proc = &nfs_procedures[NFSPROC_STATFS]; msg.rpc_resp = &fsinfo; - status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); + status = rpc_call_sync(server->client, &msg, 0); + /* Retry with default authentication if different */ + if (status && server->nfs_client->cl_rpcclient != server->client) + status = rpc_call_sync(server->nfs_client->cl_rpcclient, &msg, 0); dprintk("%s: reply statfs: %d\n", __func__, status); if (status) return status; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 9abcd2b329f..8b28b95c9e4 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -91,6 +91,7 @@ enum { /* Mount options that take string arguments */ Opt_sec, Opt_proto, Opt_mountproto, Opt_mounthost, Opt_addr, Opt_mountaddr, Opt_clientaddr, + Opt_lookupcache, /* Special mount options */ Opt_userspace, Opt_deprecated, Opt_sloppy, @@ -98,7 +99,7 @@ enum { Opt_err }; -static match_table_t nfs_mount_option_tokens = { +static const match_table_t nfs_mount_option_tokens = { { Opt_userspace, "bg" }, { Opt_userspace, "fg" }, { Opt_userspace, "retry=%s" }, @@ -154,6 +155,8 @@ static match_table_t nfs_mount_option_tokens = { { Opt_mounthost, "mounthost=%s" }, { Opt_mountaddr, "mountaddr=%s" }, + { Opt_lookupcache, "lookupcache=%s" }, + { Opt_err, NULL } }; @@ -163,7 +166,7 @@ enum { Opt_xprt_err }; -static match_table_t nfs_xprt_protocol_tokens = { +static const match_table_t nfs_xprt_protocol_tokens = { { Opt_xprt_udp, "udp" }, { Opt_xprt_tcp, "tcp" }, { Opt_xprt_rdma, "rdma" }, @@ -180,7 +183,7 @@ enum { Opt_sec_err }; -static match_table_t nfs_secflavor_tokens = { +static const match_table_t nfs_secflavor_tokens = { { Opt_sec_none, "none" }, { Opt_sec_none, "null" }, { Opt_sec_sys, "sys" }, @@ -200,6 +203,22 @@ static match_table_t nfs_secflavor_tokens = { { Opt_sec_err, NULL } }; +enum { + Opt_lookupcache_all, Opt_lookupcache_positive, + Opt_lookupcache_none, + + Opt_lookupcache_err +}; + +static match_table_t nfs_lookupcache_tokens = { + { Opt_lookupcache_all, "all" }, + { Opt_lookupcache_positive, "pos" }, + { Opt_lookupcache_positive, "positive" }, + { Opt_lookupcache_none, "none" }, + + { Opt_lookupcache_err, NULL } +}; + static void nfs_umount_begin(struct super_block *); static int nfs_statfs(struct dentry *, struct kstatfs *); @@ -209,7 +228,6 @@ static int nfs_get_sb(struct file_system_type *, int, const char *, void *, stru static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt); static void nfs_kill_super(struct super_block *); -static void nfs_put_super(struct super_block *); static int nfs_remount(struct super_block *sb, int *flags, char *raw_data); static struct file_system_type nfs_fs_type = { @@ -232,7 +250,6 @@ static const struct super_operations nfs_sops = { .alloc_inode = nfs_alloc_inode, .destroy_inode = nfs_destroy_inode, .write_inode = nfs_write_inode, - .put_super = nfs_put_super, .statfs = nfs_statfs, .clear_inode = nfs_clear_inode, .umount_begin = nfs_umount_begin, @@ -337,26 +354,20 @@ void __exit unregister_nfs_fs(void) unregister_filesystem(&nfs_fs_type); } -void nfs_sb_active(struct nfs_server *server) +void nfs_sb_active(struct super_block *sb) { - atomic_inc(&server->active); -} + struct nfs_server *server = NFS_SB(sb); -void nfs_sb_deactive(struct nfs_server *server) -{ - if (atomic_dec_and_test(&server->active)) - wake_up(&server->active_wq); + if (atomic_inc_return(&server->active) == 1) + atomic_inc(&sb->s_active); } -static void nfs_put_super(struct super_block *sb) +void nfs_sb_deactive(struct super_block *sb) { struct nfs_server *server = NFS_SB(sb); - /* - * Make sure there are no outstanding ops to this server. - * If so, wait for them to finish before allowing the - * unmount to continue. - */ - wait_event(server->active_wq, atomic_read(&server->active) == 0); + + if (atomic_dec_and_test(&server->active)) + deactivate_super(sb); } /* @@ -664,25 +675,6 @@ static void nfs_umount_begin(struct super_block *sb) } /* - * Set the port number in an address. Be agnostic about the address family. - */ -static void nfs_set_port(struct sockaddr *sap, unsigned short port) -{ - switch (sap->sa_family) { - case AF_INET: { - struct sockaddr_in *ap = (struct sockaddr_in *)sap; - ap->sin_port = htons(port); - break; - } - case AF_INET6: { - struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap; - ap->sin6_port = htons(port); - break; - } - } -} - -/* * Sanity-check a server address provided by the mount command. * * Address family must be initialized, and address must not be @@ -724,20 +716,22 @@ static void nfs_parse_ipv4_address(char *string, size_t str_len, *addr_len = 0; } -#define IPV6_SCOPE_DELIMITER '%' - #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, - const char *delim, - struct sockaddr_in6 *sin6) +static int nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, + const char *delim, + struct sockaddr_in6 *sin6) { char *p; size_t len; - if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) - return ; + if ((string + str_len) == delim) + return 1; + if (*delim != IPV6_SCOPE_DELIMITER) - return; + return 0; + + if (!(ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL)) + return 0; len = (string + str_len) - delim - 1; p = kstrndup(delim + 1, len, GFP_KERNEL); @@ -750,14 +744,20 @@ static void nfs_parse_ipv6_scope_id(const char *string, const size_t str_len, scope_id = dev->ifindex; dev_put(dev); } else { - /* scope_id is set to zero on error */ - strict_strtoul(p, 10, &scope_id); + if (strict_strtoul(p, 10, &scope_id) == 0) { + kfree(p); + return 0; + } } kfree(p); + sin6->sin6_scope_id = scope_id; dfprintk(MOUNT, "NFS: IPv6 scope ID = %lu\n", scope_id); + return 1; } + + return 0; } static void nfs_parse_ipv6_address(char *string, size_t str_len, @@ -773,9 +773,11 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len, sin6->sin6_family = AF_INET6; *addr_len = sizeof(*sin6); - if (in6_pton(string, str_len, addr, IPV6_SCOPE_DELIMITER, &delim)) { - nfs_parse_ipv6_scope_id(string, str_len, delim, sin6); - return; + if (in6_pton(string, str_len, addr, + IPV6_SCOPE_DELIMITER, &delim) != 0) { + if (nfs_parse_ipv6_scope_id(string, str_len, + delim, sin6) != 0) + return; } } @@ -798,7 +800,7 @@ static void nfs_parse_ipv6_address(char *string, size_t str_len, * If there is a problem constructing the new sockaddr, set the address * family to AF_UNSPEC. */ -static void nfs_parse_ip_address(char *string, size_t str_len, +void nfs_parse_ip_address(char *string, size_t str_len, struct sockaddr *sap, size_t *addr_len) { unsigned int i, colons; @@ -1258,6 +1260,30 @@ static int nfs_parse_mount_options(char *raw, &mnt->mount_server.addrlen); kfree(string); break; + case Opt_lookupcache: + string = match_strdup(args); + if (string == NULL) + goto out_nomem; + token = match_token(string, + nfs_lookupcache_tokens, args); + kfree(string); + switch (token) { + case Opt_lookupcache_all: + mnt->flags &= ~(NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE); + break; + case Opt_lookupcache_positive: + mnt->flags &= ~NFS_MOUNT_LOOKUP_CACHE_NONE; + mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG; + break; + case Opt_lookupcache_none: + mnt->flags |= NFS_MOUNT_LOOKUP_CACHE_NONEG|NFS_MOUNT_LOOKUP_CACHE_NONE; + break; + default: + errors++; + dfprintk(MOUNT, "NFS: invalid " + "lookupcache argument\n"); + }; + break; /* * Special options @@ -1279,6 +1305,12 @@ static int nfs_parse_mount_options(char *raw, } } + if (errors > 0) { + dfprintk(MOUNT, "NFS: parsing encountered %d error%s\n", + errors, (errors == 1 ? "" : "s")); + if (!sloppy) + return 0; + } return 1; out_nomem: @@ -1552,7 +1584,7 @@ static int nfs_validate_mount_data(void *options, * Translate to nfs_parsed_mount_data, which nfs_fill_super * can deal with. */ - args->flags = data->flags; + args->flags = data->flags & NFS_MOUNT_FLAGMASK; args->rsize = data->rsize; args->wsize = data->wsize; args->timeo = data->timeo; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index f089e5839d7..ecc29534777 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -99,7 +99,7 @@ static void nfs_async_unlink_release(void *calldata) nfs_dec_sillycount(data->dir); nfs_free_unlinkdata(data); - nfs_sb_deactive(NFS_SB(sb)); + nfs_sb_deactive(sb); } static const struct rpc_call_ops nfs_unlink_ops = { @@ -118,6 +118,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n .rpc_message = &msg, .callback_ops = &nfs_unlink_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; struct rpc_task *task; @@ -149,7 +150,7 @@ static int nfs_do_call_unlink(struct dentry *parent, struct inode *dir, struct n nfs_dec_sillycount(dir); return 0; } - nfs_sb_active(NFS_SERVER(dir)); + nfs_sb_active(dir->i_sb); data->args.fh = NFS_FH(dir); nfs_fattr_init(&data->res.dir_attr); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 3229e217c77..9f9845859fc 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1427,8 +1427,9 @@ static int nfs_write_mapping(struct address_space *mapping, int how) .bdi = mapping->backing_dev_info, .sync_mode = WB_SYNC_NONE, .nr_to_write = LONG_MAX, + .range_start = 0, + .range_end = LLONG_MAX, .for_writepages = 1, - .range_cyclic = 1, }; int ret; diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 15c6faeec77..b2786a5f9af 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -70,7 +70,6 @@ nlm_fclose(struct file *filp) static struct nlmsvc_binding nfsd_nlm_ops = { .fopen = nlm_fopen, /* open file for locking */ .fclose = nlm_fclose, /* close file */ - .get_grace_period = get_nfs4_grace_period, }; void diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 4d617ea28cf..9dbd2eb9128 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -63,7 +63,8 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + nfserr = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); if (nfserr) RETURN_STATUS(nfserr); @@ -530,7 +531,7 @@ nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: FSSTAT(3) %s\n", SVCFH_fmt(&argp->fh)); - nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats); + nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0); fh_put(&argp->fh); RETURN_STATUS(nfserr); } @@ -558,7 +559,8 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, resp->f_maxfilesize = ~(u32) 0; resp->f_properties = NFS3_FSF_DEFAULT; - nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); + nfserr = fh_verify(rqstp, &argp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); /* Check special features of the file system. May request * different read/write sizes for file systems known to have diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index b6ed38380ab..54b8b4140c8 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -443,7 +443,7 @@ init_state(struct posix_acl_state *state, int cnt) * enough space for either: */ alloc = sizeof(struct posix_ace_state_array) - + cnt*sizeof(struct posix_ace_state); + + cnt*sizeof(struct posix_user_ace_state); state->users = kzalloc(alloc, GFP_KERNEL); if (!state->users) return -ENOMEM; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 702fa577aa6..094747a1227 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -225,7 +225,8 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); WRITE32(OP_CB_RECALL); - WRITEMEM(&cb_rec->cbr_stateid, sizeof(stateid_t)); + WRITE32(cb_rec->cbr_stateid.si_generation); + WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t)); WRITE32(cb_rec->cbr_trunc); WRITE32(len); WRITEMEM(cb_rec->cbr_fhval, len); @@ -379,6 +380,7 @@ static int do_probe_callback(void *data) .addrsize = sizeof(addr), .timeout = &timeparms, .program = &cb_program, + .prognumber = cb->cb_prog, .version = nfs_cb_version[1]->number, .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), @@ -396,9 +398,6 @@ static int do_probe_callback(void *data) addr.sin_port = htons(cb->cb_port); addr.sin_addr.s_addr = htonl(cb->cb_addr); - /* Initialize rpc_stat */ - memset(args.program->stats, 0, sizeof(struct rpc_stat)); - /* Create RPC client */ client = rpc_create(&args); if (IS_ERR(client)) { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 2e51adac65d..669461e291a 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -201,10 +201,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Openowner is now set, so sequence id will get bumped. Now we need * these checks before we do any creates: */ status = nfserr_grace; - if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + if (locks_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) goto out; status = nfserr_no_grace; - if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + if (!locks_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) goto out; switch (open->op_claim_type) { @@ -575,7 +575,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; - if (nfs4_in_grace()) + if (locks_in_grace()) return nfserr_grace; status = nfsd_unlink(rqstp, &cstate->current_fh, 0, remove->rm_name, remove->rm_namelen); @@ -596,7 +596,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!cstate->save_fh.fh_dentry) return status; - if (nfs4_in_grace() && !(cstate->save_fh.fh_export->ex_flags + if (locks_in_grace() && !(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK)) return nfserr_grace; status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, @@ -867,11 +867,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, int slack_bytes; __be32 status; - status = nfserr_resource; - cstate = cstate_alloc(); - if (cstate == NULL) - goto out; - resp->xbuf = &rqstp->rq_res; resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len; resp->tagp = resp->p; @@ -890,6 +885,11 @@ nfsd4_proc_compound(struct svc_rqst *rqstp, if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION) goto out; + status = nfserr_resource; + cstate = cstate_alloc(); + if (cstate == NULL) + goto out; + status = nfs_ok; while (!status && resp->opcnt < args->opcnt) { op = &args->ops[resp->opcnt++]; @@ -957,9 +957,9 @@ encode_op: nfsd4_increment_op_stats(op->opnum); } + cstate_free(cstate); out: nfsd4_release_compoundargs(args); - cstate_free(cstate); dprintk("nfsv4 compound returned %d\n", ntohl(status)); return status; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1578d7a2667..0cc7ff5d5ab 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -61,7 +61,6 @@ static time_t lease_time = 90; /* default lease time */ static time_t user_lease_time = 90; static time_t boot_time; -static int in_grace = 1; static u32 current_ownerid = 1; static u32 current_fileid = 1; static u32 current_delegid = 1; @@ -1640,7 +1639,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta case NFS4_OPEN_CLAIM_NULL: /* Let's not give out any delegations till everyone's * had the chance to reclaim theirs.... */ - if (nfs4_in_grace()) + if (locks_in_grace()) goto out; if (!atomic_read(&cb->cb_set) || !sop->so_confirmed) goto out; @@ -1816,12 +1815,15 @@ out: return status; } +struct lock_manager nfsd4_manager = { +}; + static void -end_grace(void) +nfsd4_end_grace(void) { dprintk("NFSD: end of grace period\n"); nfsd4_recdir_purge_old(); - in_grace = 0; + locks_end_grace(&nfsd4_manager); } static time_t @@ -1838,8 +1840,8 @@ nfs4_laundromat(void) nfs4_lock_state(); dprintk("NFSD: laundromat service - starting\n"); - if (in_grace) - end_grace(); + if (locks_in_grace()) + nfsd4_end_grace(); list_for_each_safe(pos, next, &client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { @@ -1974,7 +1976,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) return nfserr_bad_stateid; else if (ONE_STATEID(stateid) && (flags & RD_STATE)) return nfs_ok; - else if (nfs4_in_grace()) { + else if (locks_in_grace()) { /* Answer in remaining cases depends on existance of * conflicting state; so we must wait out the grace period. */ return nfserr_grace; @@ -1993,7 +1995,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) static inline int io_during_grace_disallowed(struct inode *inode, int flags) { - return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE)) + return locks_in_grace() && (flags & (RD_STATE | WR_STATE)) && mandatory_lock(inode); } @@ -2693,10 +2695,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, filp = lock_stp->st_vfs_file; status = nfserr_grace; - if (nfs4_in_grace() && !lock->lk_reclaim) + if (locks_in_grace() && !lock->lk_reclaim) goto out; status = nfserr_no_grace; - if (!nfs4_in_grace() && lock->lk_reclaim) + if (!locks_in_grace() && lock->lk_reclaim) goto out; locks_init_lock(&file_lock); @@ -2779,7 +2781,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, int error; __be32 status; - if (nfs4_in_grace()) + if (locks_in_grace()) return nfserr_grace; if (check_lock_length(lockt->lt_offset, lockt->lt_length)) @@ -3192,9 +3194,9 @@ __nfs4_state_start(void) unsigned long grace_time; boot_time = get_seconds(); - grace_time = get_nfs_grace_period(); + grace_time = get_nfs4_grace_period(); lease_time = user_lease_time; - in_grace = 1; + locks_start_grace(&nfsd4_manager); printk(KERN_INFO "NFSD: starting %ld-second grace period\n", grace_time/HZ); laundry_wq = create_singlethread_workqueue("nfsd4"); @@ -3213,12 +3215,6 @@ nfs4_state_start(void) return; } -int -nfs4_in_grace(void) -{ - return in_grace; -} - time_t nfs4_lease_time(void) { diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 14ba4d9b285..afcdf4b7684 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -413,6 +413,18 @@ out_nfserr: } static __be32 +nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid) +{ + DECODE_HEAD; + + READ_BUF(sizeof(stateid_t)); + READ32(sid->si_generation); + COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access) { DECODE_HEAD; @@ -429,10 +441,9 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) DECODE_HEAD; close->cl_stateowner = NULL; - READ_BUF(4 + sizeof(stateid_t)); + READ_BUF(4); READ32(close->cl_seqid); - READ32(close->cl_stateid.si_generation); - COPYMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t)); + return nfsd4_decode_stateid(argp, &close->cl_stateid); DECODE_TAIL; } @@ -493,13 +504,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create static inline __be32 nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) { - DECODE_HEAD; - - READ_BUF(sizeof(stateid_t)); - READ32(dr->dr_stateid.si_generation); - COPYMEM(&dr->dr_stateid.si_opaque, sizeof(stateid_opaque_t)); - - DECODE_TAIL; + return nfsd4_decode_stateid(argp, &dr->dr_stateid); } static inline __be32 @@ -542,20 +547,22 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) READ32(lock->lk_is_new); if (lock->lk_is_new) { - READ_BUF(36); + READ_BUF(4); READ32(lock->lk_new_open_seqid); - READ32(lock->lk_new_open_stateid.si_generation); - - COPYMEM(&lock->lk_new_open_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid); + if (status) + return status; + READ_BUF(8 + sizeof(clientid_t)); READ32(lock->lk_new_lock_seqid); COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t)); READ32(lock->lk_new_owner.len); READ_BUF(lock->lk_new_owner.len); READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len); } else { - READ_BUF(20); - READ32(lock->lk_old_lock_stateid.si_generation); - COPYMEM(&lock->lk_old_lock_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid); + if (status) + return status; + READ_BUF(4); READ32(lock->lk_old_lock_seqid); } @@ -587,13 +594,15 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) DECODE_HEAD; locku->lu_stateowner = NULL; - READ_BUF(24 + sizeof(stateid_t)); + READ_BUF(8); READ32(locku->lu_type); if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) goto xdr_error; READ32(locku->lu_seqid); - READ32(locku->lu_stateid.si_generation); - COPYMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &locku->lu_stateid); + if (status) + return status; + READ_BUF(16); READ64(locku->lu_offset); READ64(locku->lu_length); @@ -678,8 +687,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) READ32(open->op_delegate_type); break; case NFS4_OPEN_CLAIM_DELEGATE_CUR: - READ_BUF(sizeof(stateid_t) + 4); - COPYMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + if (status) + return status; + READ_BUF(4); READ32(open->op_fname.len); READ_BUF(open->op_fname.len); SAVEMEM(open->op_fname.data, open->op_fname.len); @@ -699,9 +710,10 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con DECODE_HEAD; open_conf->oc_stateowner = NULL; - READ_BUF(4 + sizeof(stateid_t)); - READ32(open_conf->oc_req_stateid.si_generation); - COPYMEM(&open_conf->oc_req_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); + if (status) + return status; + READ_BUF(4); READ32(open_conf->oc_seqid); DECODE_TAIL; @@ -713,9 +725,10 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d DECODE_HEAD; open_down->od_stateowner = NULL; - READ_BUF(12 + sizeof(stateid_t)); - READ32(open_down->od_stateid.si_generation); - COPYMEM(&open_down->od_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &open_down->od_stateid); + if (status) + return status; + READ_BUF(12); READ32(open_down->od_seqid); READ32(open_down->od_share_access); READ32(open_down->od_share_deny); @@ -743,9 +756,10 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) { DECODE_HEAD; - READ_BUF(sizeof(stateid_t) + 12); - READ32(read->rd_stateid.si_generation); - COPYMEM(&read->rd_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &read->rd_stateid); + if (status) + return status; + READ_BUF(12); READ64(read->rd_offset); READ32(read->rd_length); @@ -834,15 +848,13 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) { - DECODE_HEAD; - - READ_BUF(sizeof(stateid_t)); - READ32(setattr->sa_stateid.si_generation); - COPYMEM(&setattr->sa_stateid.si_opaque, sizeof(stateid_opaque_t)); - if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, &setattr->sa_acl))) - goto out; + __be32 status; - DECODE_TAIL; + status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); + if (status) + return status; + return nfsd4_decode_fattr(argp, setattr->sa_bmval, + &setattr->sa_iattr, &setattr->sa_acl); } static __be32 @@ -927,9 +939,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) int len; DECODE_HEAD; - READ_BUF(sizeof(stateid_opaque_t) + 20); - READ32(write->wr_stateid.si_generation); - COPYMEM(&write->wr_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &write->wr_stateid); + if (status) + return status; + READ_BUF(16); READ64(write->wr_offset); READ32(write->wr_stable_how); if (write->wr_stable_how > 2) @@ -1183,7 +1196,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) * Header routine to setup seqid operation replay cache */ #define ENCODE_SEQID_OP_HEAD \ - __be32 *p; \ __be32 *save; \ \ save = resp->p; @@ -1950,6 +1962,17 @@ fail: return -EINVAL; } +static void +nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid) +{ + ENCODE_HEAD; + + RESERVE_SPACE(sizeof(stateid_t)); + WRITE32(sid->si_generation); + WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + ADJUST_ARGS(); +} + static __be32 nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) { @@ -1969,12 +1992,9 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c { ENCODE_SEQID_OP_HEAD; - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(close->cl_stateid.si_generation); - WRITEMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } + if (!nfserr) + nfsd4_encode_stateid(resp, &close->cl_stateid); + ENCODE_SEQID_OP_TAIL(close->cl_stateowner); return nfserr; } @@ -2074,12 +2094,9 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo { ENCODE_SEQID_OP_HEAD; - if (!nfserr) { - RESERVE_SPACE(4 + sizeof(stateid_t)); - WRITE32(lock->lk_resp_stateid.si_generation); - WRITEMEM(&lock->lk_resp_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } else if (nfserr == nfserr_denied) + if (!nfserr) + nfsd4_encode_stateid(resp, &lock->lk_resp_stateid); + else if (nfserr == nfserr_denied) nfsd4_encode_lock_denied(resp, &lock->lk_denied); ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); @@ -2099,13 +2116,9 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l { ENCODE_SEQID_OP_HEAD; - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(locku->lu_stateid.si_generation); - WRITEMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } - + if (!nfserr) + nfsd4_encode_stateid(resp, &locku->lu_stateid); + ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); return nfserr; } @@ -2128,14 +2141,14 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li static __be32 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) { + ENCODE_HEAD; ENCODE_SEQID_OP_HEAD; if (nfserr) goto out; - RESERVE_SPACE(36 + sizeof(stateid_t)); - WRITE32(open->op_stateid.si_generation); - WRITEMEM(&open->op_stateid.si_opaque, sizeof(stateid_opaque_t)); + nfsd4_encode_stateid(resp, &open->op_stateid); + RESERVE_SPACE(40); WRITECINFO(open->op_cinfo); WRITE32(open->op_rflags); WRITE32(2); @@ -2148,8 +2161,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op case NFS4_OPEN_DELEGATE_NONE: break; case NFS4_OPEN_DELEGATE_READ: - RESERVE_SPACE(20 + sizeof(stateid_t)); - WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + nfsd4_encode_stateid(resp, &open->op_delegate_stateid); + RESERVE_SPACE(20); WRITE32(open->op_recall); /* @@ -2162,8 +2175,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op ADJUST_ARGS(); break; case NFS4_OPEN_DELEGATE_WRITE: - RESERVE_SPACE(32 + sizeof(stateid_t)); - WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + nfsd4_encode_stateid(resp, &open->op_delegate_stateid); + RESERVE_SPACE(32); WRITE32(0); /* @@ -2195,13 +2208,9 @@ static __be32 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) { ENCODE_SEQID_OP_HEAD; - - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(oc->oc_resp_stateid.si_generation); - WRITEMEM(&oc->oc_resp_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } + + if (!nfserr) + nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); return nfserr; @@ -2211,13 +2220,9 @@ static __be32 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) { ENCODE_SEQID_OP_HEAD; - - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(od->od_stateid.si_generation); - WRITEMEM(&od->od_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } + + if (!nfserr) + nfsd4_encode_stateid(resp, &od->od_stateid); ENCODE_SEQID_OP_TAIL(od->od_stateowner); return nfserr; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c53e65f8f3a..97543df5824 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -614,10 +614,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) return -EINVAL; err = nfsd_create_serv(); if (!err) { - int proto = 0; - err = svc_addsock(nfsd_serv, fd, buf, &proto); + err = svc_addsock(nfsd_serv, fd, buf); if (err >= 0) { - err = lockd_up(proto); + err = lockd_up(); if (err < 0) svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf); } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index ea37c96f044..cd25d91895a 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -302,17 +302,27 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) if (error) goto out; - if (!(access & NFSD_MAY_LOCK)) { - /* - * pseudoflavor restrictions are not enforced on NLM, - * which clients virtually always use auth_sys for, - * even while using RPCSEC_GSS for NFS. - */ - error = check_nfsd_access(exp, rqstp); - if (error) - goto out; - } + /* + * pseudoflavor restrictions are not enforced on NLM, + * which clients virtually always use auth_sys for, + * even while using RPCSEC_GSS for NFS. + */ + if (access & NFSD_MAY_LOCK) + goto skip_pseudoflavor_check; + /* + * Clients may expect to be able to use auth_sys during mount, + * even if they use gss for everything else; see section 2.3.2 + * of rfc 2623. + */ + if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT + && exp->ex_path.dentry == dentry) + goto skip_pseudoflavor_check; + + error = check_nfsd_access(exp, rqstp); + if (error) + goto out; +skip_pseudoflavor_check: /* Finally, check access permissions. */ error = nfsd_permission(rqstp, exp, dentry, access); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 0766f95d236..5cffeca7ace 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -65,7 +65,8 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + nfserr = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); return nfsd_return_attrs(nfserr, resp); } @@ -521,7 +522,8 @@ nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh)); - nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats); + nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, + NFSD_MAY_BYPASS_GSS_ON_ROOT); fh_put(&argp->fh); return nfserr; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 80292ff5e92..59eeb46f82c 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -229,6 +229,7 @@ int nfsd_create_serv(void) atomic_set(&nfsd_busy, 0); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, + AF_INET, nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; @@ -243,25 +244,20 @@ static int nfsd_init_socks(int port) if (!list_empty(&nfsd_serv->sv_permsocks)) return 0; - error = lockd_up(IPPROTO_UDP); - if (error >= 0) { - error = svc_create_xprt(nfsd_serv, "udp", port, + error = svc_create_xprt(nfsd_serv, "udp", port, SVC_SOCK_DEFAULTS); - if (error < 0) - lockd_down(); - } if (error < 0) return error; - error = lockd_up(IPPROTO_TCP); - if (error >= 0) { - error = svc_create_xprt(nfsd_serv, "tcp", port, + error = svc_create_xprt(nfsd_serv, "tcp", port, SVC_SOCK_DEFAULTS); - if (error < 0) - lockd_down(); - } if (error < 0) return error; + + error = lockd_up(); + if (error < 0) + return error; + return 0; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 18060bed526..aa1d0d6489a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -83,7 +83,6 @@ struct raparm_hbucket { spinlock_t pb_lock; } ____cacheline_aligned_in_smp; -static struct raparms * raparml; #define RAPARM_HASH_BITS 4 #define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS) #define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) @@ -1866,9 +1865,9 @@ out: * N.B. After this call fhp needs an fh_put */ __be32 -nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat) +nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access) { - __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); + __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access); if (!err && vfs_statfs(fhp->fh_dentry,stat)) err = nfserr_io; return err; @@ -1966,11 +1965,20 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, void nfsd_racache_shutdown(void) { - if (!raparml) - return; + struct raparms *raparm, *last_raparm; + unsigned int i; + dprintk("nfsd: freeing readahead buffers.\n"); - kfree(raparml); - raparml = NULL; + + for (i = 0; i < RAPARM_HASH_SIZE; i++) { + raparm = raparm_hash[i].pb_head; + while(raparm) { + last_raparm = raparm; + raparm = raparm->p_next; + kfree(last_raparm); + } + raparm_hash[i].pb_head = NULL; + } } /* * Initialize readahead param cache @@ -1981,35 +1989,38 @@ nfsd_racache_init(int cache_size) int i; int j = 0; int nperbucket; + struct raparms **raparm = NULL; - if (raparml) + if (raparm_hash[0].pb_head) return 0; - if (cache_size < 2*RAPARM_HASH_SIZE) - cache_size = 2*RAPARM_HASH_SIZE; - raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL); - - if (!raparml) { - printk(KERN_WARNING - "nfsd: Could not allocate memory read-ahead cache.\n"); - return -ENOMEM; - } + nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); + if (nperbucket < 2) + nperbucket = 2; + cache_size = nperbucket * RAPARM_HASH_SIZE; dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); - for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) { - raparm_hash[i].pb_head = NULL; + + for (i = 0; i < RAPARM_HASH_SIZE; i++) { spin_lock_init(&raparm_hash[i].pb_lock); - } - nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); - for (i = 0; i < cache_size - 1; i++) { - if (i % nperbucket == 0) - raparm_hash[j++].pb_head = raparml + i; - if (i % nperbucket < nperbucket-1) - raparml[i].p_next = raparml + i + 1; + + raparm = &raparm_hash[i].pb_head; + for (j = 0; j < nperbucket; j++) { + *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL); + if (!*raparm) + goto out_nomem; + raparm = &(*raparm)->p_next; + } + *raparm = NULL; } nfsdstats.ra_size = cache_size; return 0; + +out_nomem: + dprintk("nfsd: kmalloc failed, freeing readahead buffers\n"); + nfsd_racache_shutdown(); + return -ENOMEM; } #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c index 64965e1c21c..9b0efdad891 100644 --- a/fs/nls/nls_base.c +++ b/fs/nls/nls_base.c @@ -13,9 +13,7 @@ #include <linux/nls.h> #include <linux/kernel.h> #include <linux/errno.h> -#ifdef CONFIG_KMOD #include <linux/kmod.h> -#endif #include <linux/spinlock.h> static struct nls_table default_table; @@ -215,24 +213,7 @@ static struct nls_table *find_nls(char *charset) struct nls_table *load_nls(char *charset) { - struct nls_table *nls; -#ifdef CONFIG_KMOD - int ret; -#endif - - nls = find_nls(charset); - if (nls) - return nls; - -#ifdef CONFIG_KMOD - ret = request_module("nls_%s", charset); - if (ret != 0) { - printk("Unable to load NLS charset %s\n", charset); - return NULL; - } - nls = find_nls(charset); -#endif - return nls; + return try_then_request_module(find_nls(charset), "nls_%s", charset); } void unload_nls(struct nls_table *nls) diff --git a/fs/ntfs/namei.c b/fs/ntfs/namei.c index e1781c8b165..9e8a95be7a1 100644 --- a/fs/ntfs/namei.c +++ b/fs/ntfs/namei.c @@ -174,7 +174,6 @@ static struct dentry *ntfs_lookup(struct inode *dir_ino, struct dentry *dent, // TODO: Consider moving this lot to a separate function! (AIA) handle_name: { - struct dentry *real_dent, *new_dent; MFT_RECORD *m; ntfs_attr_search_ctx *ctx; ntfs_inode *ni = NTFS_I(dent_inode); @@ -255,93 +254,9 @@ handle_name: } nls_name.hash = full_name_hash(nls_name.name, nls_name.len); - /* - * Note: No need for dent->d_lock lock as i_mutex is held on the - * parent inode. - */ - - /* Does a dentry matching the nls_name exist already? */ - real_dent = d_lookup(dent->d_parent, &nls_name); - /* If not, create it now. */ - if (!real_dent) { - real_dent = d_alloc(dent->d_parent, &nls_name); - kfree(nls_name.name); - if (!real_dent) { - err = -ENOMEM; - goto err_out; - } - new_dent = d_splice_alias(dent_inode, real_dent); - if (new_dent) - dput(real_dent); - else - new_dent = real_dent; - ntfs_debug("Done. (Created new dentry.)"); - return new_dent; - } + dent = d_add_ci(dent, dent_inode, &nls_name); kfree(nls_name.name); - /* Matching dentry exists, check if it is negative. */ - if (real_dent->d_inode) { - if (unlikely(real_dent->d_inode != dent_inode)) { - /* This can happen because bad inodes are unhashed. */ - BUG_ON(!is_bad_inode(dent_inode)); - BUG_ON(!is_bad_inode(real_dent->d_inode)); - } - /* - * Already have the inode and the dentry attached, decrement - * the reference count to balance the ntfs_iget() we did - * earlier on. We found the dentry using d_lookup() so it - * cannot be disconnected and thus we do not need to worry - * about any NFS/disconnectedness issues here. - */ - iput(dent_inode); - ntfs_debug("Done. (Already had inode and dentry.)"); - return real_dent; - } - /* - * Negative dentry: instantiate it unless the inode is a directory and - * has a 'disconnected' dentry (i.e. IS_ROOT and DCACHE_DISCONNECTED), - * in which case d_move() that in place of the found dentry. - */ - if (!S_ISDIR(dent_inode->i_mode)) { - /* Not a directory; everything is easy. */ - d_instantiate(real_dent, dent_inode); - ntfs_debug("Done. (Already had negative file dentry.)"); - return real_dent; - } - spin_lock(&dcache_lock); - if (list_empty(&dent_inode->i_dentry)) { - /* - * Directory without a 'disconnected' dentry; we need to do - * d_instantiate() by hand because it takes dcache_lock which - * we already hold. - */ - list_add(&real_dent->d_alias, &dent_inode->i_dentry); - real_dent->d_inode = dent_inode; - spin_unlock(&dcache_lock); - security_d_instantiate(real_dent, dent_inode); - ntfs_debug("Done. (Already had negative directory dentry.)"); - return real_dent; - } - /* - * Directory with a 'disconnected' dentry; get a reference to the - * 'disconnected' dentry. - */ - new_dent = list_entry(dent_inode->i_dentry.next, struct dentry, - d_alias); - dget_locked(new_dent); - spin_unlock(&dcache_lock); - /* Do security vodoo. */ - security_d_instantiate(real_dent, dent_inode); - /* Move new_dent in place of real_dent. */ - d_move(new_dent, real_dent); - /* Balance the ntfs_iget() we did above. */ - iput(dent_inode); - /* Throw away real_dent. */ - dput(real_dent); - /* Use new_dent as the actual dentry. */ - ntfs_debug("Done. (Already had negative, disconnected directory " - "dentry.)"); - return new_dent; + return dent; eio_err_out: ntfs_error(vol->sb, "Illegal file name attribute. Run chkdsk."); diff --git a/fs/ntfs/usnjrnl.h b/fs/ntfs/usnjrnl.h index 3a8af75351e..4087fbdac32 100644 --- a/fs/ntfs/usnjrnl.h +++ b/fs/ntfs/usnjrnl.h @@ -113,7 +113,7 @@ typedef struct { * Reason flags (32-bit). Cumulative flags describing the change(s) to the * file since it was last opened. I think the names speak for themselves but * if you disagree check out the descriptions in the Linux NTFS project NTFS - * documentation: http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html + * documentation: http://www.linux-ntfs.org/ */ enum { USN_REASON_DATA_OVERWRITE = const_cpu_to_le32(0x00000001), @@ -145,7 +145,7 @@ typedef le32 USN_REASON_FLAGS; * Source info flags (32-bit). Information about the source of the change(s) * to the file. For detailed descriptions of what these mean, see the Linux * NTFS project NTFS documentation: - * http://linux-ntfs.sourceforge.net/ntfs/files/usnjrnl.html + * http://www.linux-ntfs.org/ */ enum { USN_SOURCE_DATA_MANAGEMENT = const_cpu_to_le32(0x00000001), diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index f6956de56fd..589dcdfdfe3 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -34,7 +34,8 @@ ocfs2-objs := \ symlink.o \ sysfile.o \ uptodate.o \ - ver.o + ver.o \ + xattr.o ocfs2_stackglue-objs := stackglue.o ocfs2_stack_o2cb-objs := stack_o2cb.o diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 10bfb466e06..0cc2deb9394 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -49,6 +49,340 @@ #include "buffer_head_io.h" + +/* + * Operations for a specific extent tree type. + * + * To implement an on-disk btree (extent tree) type in ocfs2, add + * an ocfs2_extent_tree_operations structure and the matching + * ocfs2_init_<thingy>_extent_tree() function. That's pretty much it + * for the allocation portion of the extent tree. + */ +struct ocfs2_extent_tree_operations { + /* + * last_eb_blk is the block number of the right most leaf extent + * block. Most on-disk structures containing an extent tree store + * this value for fast access. The ->eo_set_last_eb_blk() and + * ->eo_get_last_eb_blk() operations access this value. They are + * both required. + */ + void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et, + u64 blkno); + u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et); + + /* + * The on-disk structure usually keeps track of how many total + * clusters are stored in this extent tree. This function updates + * that value. new_clusters is the delta, and must be + * added to the total. Required. + */ + void (*eo_update_clusters)(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 new_clusters); + + /* + * If ->eo_insert_check() exists, it is called before rec is + * inserted into the extent tree. It is optional. + */ + int (*eo_insert_check)(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); + int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et); + + /* + * -------------------------------------------------------------- + * The remaining are internal to ocfs2_extent_tree and don't have + * accessor functions + */ + + /* + * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el. + * It is required. + */ + void (*eo_fill_root_el)(struct ocfs2_extent_tree *et); + + /* + * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if + * it exists. If it does not, et->et_max_leaf_clusters is set + * to 0 (unlimited). Optional. + */ + void (*eo_fill_max_leaf_clusters)(struct inode *inode, + struct ocfs2_extent_tree *et); +}; + + +/* + * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check + * in the methods. + */ +static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et); +static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno); +static void ocfs2_dinode_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters); +static int ocfs2_dinode_insert_check(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); +static int ocfs2_dinode_sanity_check(struct inode *inode, + struct ocfs2_extent_tree *et); +static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); +static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { + .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, + .eo_update_clusters = ocfs2_dinode_update_clusters, + .eo_insert_check = ocfs2_dinode_insert_check, + .eo_sanity_check = ocfs2_dinode_sanity_check, + .eo_fill_root_el = ocfs2_dinode_fill_root_el, +}; + +static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + di->i_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + return le64_to_cpu(di->i_last_eb_blk); +} + +static void ocfs2_dinode_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_dinode *di = et->et_object; + + le32_add_cpu(&di->i_clusters, clusters); + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); +} + +static int ocfs2_dinode_insert_check(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); + mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && + (OCFS2_I(inode)->ip_clusters != rec->e_cpos), + "Device %s, asking for sparse allocation: inode %llu, " + "cpos %u, clusters %u\n", + osb->dev_str, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + rec->e_cpos, + OCFS2_I(inode)->ip_clusters); + + return 0; +} + +static int ocfs2_dinode_sanity_check(struct inode *inode, + struct ocfs2_extent_tree *et) +{ + int ret = 0; + struct ocfs2_dinode *di; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + + di = et->et_object; + if (!OCFS2_IS_VALID_DINODE(di)) { + ret = -EIO; + ocfs2_error(inode->i_sb, + "Inode %llu has invalid path root", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + } + + return ret; +} + +static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + et->et_root_el = &di->id2.i_list; +} + + +static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_value_root *xv = et->et_object; + + et->et_root_el = &xv->xr_list; +} + +static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_xattr_value_root *xv = + (struct ocfs2_xattr_value_root *)et->et_object; + + xv->xr_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_value_root *xv = + (struct ocfs2_xattr_value_root *) et->et_object; + + return le64_to_cpu(xv->xr_last_eb_blk); +} + +static void ocfs2_xattr_value_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_xattr_value_root *xv = + (struct ocfs2_xattr_value_root *)et->et_object; + + le32_add_cpu(&xv->xr_clusters, clusters); +} + +static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { + .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, + .eo_update_clusters = ocfs2_xattr_value_update_clusters, + .eo_fill_root_el = ocfs2_xattr_value_fill_root_el, +}; + +static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_block *xb = et->et_object; + + et->et_root_el = &xb->xb_attrs.xb_root.xt_list; +} + +static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode, + struct ocfs2_extent_tree *et) +{ + et->et_max_leaf_clusters = + ocfs2_clusters_for_bytes(inode->i_sb, + OCFS2_MAX_XATTR_TREE_LEAF_SIZE); +} + +static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_xattr_block *xb = et->et_object; + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + + xt->xt_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_block *xb = et->et_object; + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + + return le64_to_cpu(xt->xt_last_eb_blk); +} + +static void ocfs2_xattr_tree_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_xattr_block *xb = et->et_object; + + le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); +} + +static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { + .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, + .eo_update_clusters = ocfs2_xattr_tree_update_clusters, + .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el, + .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, +}; + +static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh, + void *obj, + struct ocfs2_extent_tree_operations *ops) +{ + et->et_ops = ops; + et->et_root_bh = bh; + if (!obj) + obj = (void *)bh->b_data; + et->et_object = obj; + + et->et_ops->eo_fill_root_el(et); + if (!et->et_ops->eo_fill_max_leaf_clusters) + et->et_max_leaf_clusters = 0; + else + et->et_ops->eo_fill_max_leaf_clusters(inode, et); +} + +void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops); +} + +void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, inode, bh, NULL, + &ocfs2_xattr_tree_et_ops); +} + +void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh, + struct ocfs2_xattr_value_root *xv) +{ + __ocfs2_init_extent_tree(et, inode, bh, xv, + &ocfs2_xattr_value_et_ops); +} + +static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 new_last_eb_blk) +{ + et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk); +} + +static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + return et->et_ops->eo_get_last_eb_blk(et); +} + +static inline void ocfs2_et_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters) +{ + et->et_ops->eo_update_clusters(inode, et, clusters); +} + +static inline int ocfs2_et_insert_check(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + int ret = 0; + + if (et->et_ops->eo_insert_check) + ret = et->et_ops->eo_insert_check(inode, et, rec); + return ret; +} + +static inline int ocfs2_et_sanity_check(struct inode *inode, + struct ocfs2_extent_tree *et) +{ + int ret = 0; + + if (et->et_ops->eo_sanity_check) + ret = et->et_ops->eo_sanity_check(inode, et); + return ret; +} + static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, struct ocfs2_extent_block *eb); @@ -205,17 +539,6 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, } /* - * Allocate and initialize a new path based on a disk inode tree. - */ -static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh) -{ - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - struct ocfs2_extent_list *el = &di->id2.i_list; - - return ocfs2_new_path(di_bh, el); -} - -/* * Convenience function to journal all components in a path. */ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, @@ -368,39 +691,35 @@ struct ocfs2_merge_ctxt { */ int ocfs2_num_free_extents(struct ocfs2_super *osb, struct inode *inode, - struct ocfs2_dinode *fe) + struct ocfs2_extent_tree *et) { int retval; - struct ocfs2_extent_list *el; + struct ocfs2_extent_list *el = NULL; struct ocfs2_extent_block *eb; struct buffer_head *eb_bh = NULL; + u64 last_eb_blk = 0; mlog_entry_void(); - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); - retval = -EIO; - goto bail; - } + el = et->et_root_el; + last_eb_blk = ocfs2_et_get_last_eb_blk(et); - if (fe->i_last_eb_blk) { - retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), - &eb_bh, OCFS2_BH_CACHED, inode); + if (last_eb_blk) { + retval = ocfs2_read_block(inode, last_eb_blk, + &eb_bh); if (retval < 0) { mlog_errno(retval); goto bail; } eb = (struct ocfs2_extent_block *) eb_bh->b_data; el = &eb->h_list; - } else - el = &fe->id2.i_list; + } BUG_ON(el->l_tree_depth != 0); retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); bail: - if (eb_bh) - brelse(eb_bh); + brelse(eb_bh); mlog_exit(retval); return retval; @@ -486,8 +805,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, bail: if (status < 0) { for(i = 0; i < wanted; i++) { - if (bhs[i]) - brelse(bhs[i]); + brelse(bhs[i]); bhs[i] = NULL; } } @@ -531,7 +849,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) static int ocfs2_add_branch(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, struct buffer_head *eb_bh, struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) @@ -540,7 +858,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, u64 next_blkno, new_last_eb_blk; struct buffer_head *bh; struct buffer_head **new_eb_bhs = NULL; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *eb_el; struct ocfs2_extent_list *el; @@ -550,13 +867,11 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, BUG_ON(!last_eb_bh || !*last_eb_bh); - fe = (struct ocfs2_dinode *) fe_bh->b_data; - if (eb_bh) { eb = (struct ocfs2_extent_block *) eb_bh->b_data; el = &eb->h_list; } else - el = &fe->id2.i_list; + el = et->et_root_el; /* we never add a branch to a leaf. */ BUG_ON(!el->l_tree_depth); @@ -646,7 +961,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, mlog_errno(status); goto bail; } - status = ocfs2_journal_access(handle, inode, fe_bh, + status = ocfs2_journal_access(handle, inode, et->et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -662,7 +977,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, } /* Link the new branch into the rest of the tree (el will - * either be on the fe, or the extent block passed in. */ + * either be on the root_bh, or the extent block passed in. */ i = le16_to_cpu(el->l_next_free_rec); el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); @@ -671,7 +986,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, /* fe needs a new last extent block pointer, as does the * next_leaf on the previously last-extent-block. */ - fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); + ocfs2_et_set_last_eb_blk(et, new_last_eb_blk); eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); @@ -679,7 +994,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, status = ocfs2_journal_dirty(handle, *last_eb_bh); if (status < 0) mlog_errno(status); - status = ocfs2_journal_dirty(handle, fe_bh); + status = ocfs2_journal_dirty(handle, et->et_root_bh); if (status < 0) mlog_errno(status); if (eb_bh) { @@ -700,8 +1015,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, bail: if (new_eb_bhs) { for (i = 0; i < new_blocks; i++) - if (new_eb_bhs[i]) - brelse(new_eb_bhs[i]); + brelse(new_eb_bhs[i]); kfree(new_eb_bhs); } @@ -717,16 +1031,15 @@ bail: static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, struct ocfs2_alloc_context *meta_ac, struct buffer_head **ret_new_eb_bh) { int status, i; u32 new_clusters; struct buffer_head *new_eb_bh = NULL; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *fe_el; + struct ocfs2_extent_list *root_el; struct ocfs2_extent_list *eb_el; mlog_entry_void(); @@ -746,8 +1059,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, } eb_el = &eb->h_list; - fe = (struct ocfs2_dinode *) fe_bh->b_data; - fe_el = &fe->id2.i_list; + root_el = et->et_root_el; status = ocfs2_journal_access(handle, inode, new_eb_bh, OCFS2_JOURNAL_ACCESS_CREATE); @@ -756,11 +1068,11 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, goto bail; } - /* copy the fe data into the new extent block */ - eb_el->l_tree_depth = fe_el->l_tree_depth; - eb_el->l_next_free_rec = fe_el->l_next_free_rec; - for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) - eb_el->l_recs[i] = fe_el->l_recs[i]; + /* copy the root extent list data into the new extent block */ + eb_el->l_tree_depth = root_el->l_tree_depth; + eb_el->l_next_free_rec = root_el->l_next_free_rec; + for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) + eb_el->l_recs[i] = root_el->l_recs[i]; status = ocfs2_journal_dirty(handle, new_eb_bh); if (status < 0) { @@ -768,7 +1080,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, goto bail; } - status = ocfs2_journal_access(handle, inode, fe_bh, + status = ocfs2_journal_access(handle, inode, et->et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -777,21 +1089,21 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, new_clusters = ocfs2_sum_rightmost_rec(eb_el); - /* update fe now */ - le16_add_cpu(&fe_el->l_tree_depth, 1); - fe_el->l_recs[0].e_cpos = 0; - fe_el->l_recs[0].e_blkno = eb->h_blkno; - fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); - for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) - memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); - fe_el->l_next_free_rec = cpu_to_le16(1); + /* update root_bh now */ + le16_add_cpu(&root_el->l_tree_depth, 1); + root_el->l_recs[0].e_cpos = 0; + root_el->l_recs[0].e_blkno = eb->h_blkno; + root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); + for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) + memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); + root_el->l_next_free_rec = cpu_to_le16(1); /* If this is our 1st tree depth shift, then last_eb_blk * becomes the allocated extent block */ - if (fe_el->l_tree_depth == cpu_to_le16(1)) - fe->i_last_eb_blk = eb->h_blkno; + if (root_el->l_tree_depth == cpu_to_le16(1)) + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); - status = ocfs2_journal_dirty(handle, fe_bh); + status = ocfs2_journal_dirty(handle, et->et_root_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -801,8 +1113,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, new_eb_bh = NULL; status = 0; bail: - if (new_eb_bh) - brelse(new_eb_bh); + brelse(new_eb_bh); mlog_exit(status); return status; @@ -817,22 +1128,21 @@ bail: * 1) a lowest extent block is found, then we pass it back in * *lowest_eb_bh and return '0' * - * 2) the search fails to find anything, but the dinode has room. We + * 2) the search fails to find anything, but the root_el has room. We * pass NULL back in *lowest_eb_bh, but still return '0' * - * 3) the search fails to find anything AND the dinode is full, in + * 3) the search fails to find anything AND the root_el is full, in * which case we return > 0 * * return status < 0 indicates an error. */ static int ocfs2_find_branch_target(struct ocfs2_super *osb, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, struct buffer_head **target_bh) { int status = 0, i; u64 blkno; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; struct buffer_head *bh = NULL; @@ -842,8 +1152,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, *target_bh = NULL; - fe = (struct ocfs2_dinode *) fe_bh->b_data; - el = &fe->id2.i_list; + el = et->et_root_el; while(le16_to_cpu(el->l_tree_depth) > 1) { if (le16_to_cpu(el->l_next_free_rec) == 0) { @@ -864,13 +1173,10 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, goto bail; } - if (bh) { - brelse(bh); - bh = NULL; - } + brelse(bh); + bh = NULL; - status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, - inode); + status = ocfs2_read_block(inode, blkno, &bh); if (status < 0) { mlog_errno(status); goto bail; @@ -886,8 +1192,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, if (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count)) { - if (lowest_bh) - brelse(lowest_bh); + brelse(lowest_bh); lowest_bh = bh; get_bh(lowest_bh); } @@ -895,14 +1200,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, /* If we didn't find one and the fe doesn't have any room, * then return '1' */ - if (!lowest_bh - && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count)) + el = et->et_root_el; + if (!lowest_bh && (el->l_next_free_rec == el->l_count)) status = 1; *target_bh = lowest_bh; bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; @@ -919,19 +1223,19 @@ bail: * *last_eb_bh will be updated by ocfs2_add_branch(). */ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, - struct buffer_head *di_bh, int *final_depth, + struct ocfs2_extent_tree *et, int *final_depth, struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) { int ret, shift; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - int depth = le16_to_cpu(di->id2.i_list.l_tree_depth); + struct ocfs2_extent_list *el = et->et_root_el; + int depth = le16_to_cpu(el->l_tree_depth); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *bh = NULL; BUG_ON(meta_ac == NULL); - shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh); + shift = ocfs2_find_branch_target(osb, inode, et, &bh); if (shift < 0) { ret = shift; mlog_errno(ret); @@ -948,7 +1252,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, /* ocfs2_shift_tree_depth will return us a buffer with * the new extent block (so we can pass that to * ocfs2_add_branch). */ - ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh, + ret = ocfs2_shift_tree_depth(osb, handle, inode, et, meta_ac, &bh); if (ret < 0) { mlog_errno(ret); @@ -975,7 +1279,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, /* call ocfs2_add_branch to add the final part of the tree with * the new data. */ mlog(0, "add branch. bh = %p\n", bh); - ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh, + ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh, meta_ac); if (ret < 0) { mlog_errno(ret); @@ -990,15 +1294,6 @@ out: } /* - * This is only valid for leaf nodes, which are the only ones that can - * have empty extents anyway. - */ -static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) -{ - return !rec->e_leaf_clusters; -} - -/* * This function will discard the rightmost extent record. */ static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) @@ -1245,8 +1540,7 @@ static int __ocfs2_find_path(struct inode *inode, brelse(bh); bh = NULL; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, - &bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, blkno, &bh); if (ret) { mlog_errno(ret); goto out; @@ -2067,11 +2361,11 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, struct ocfs2_path *right_path, int subtree_index, struct ocfs2_cached_dealloc_ctxt *dealloc, - int *deleted) + int *deleted, + struct ocfs2_extent_tree *et) { int ret, i, del_right_subtree = 0, right_has_empty = 0; - struct buffer_head *root_bh, *di_bh = path_root_bh(right_path); - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path); struct ocfs2_extent_list *right_leaf_el, *left_leaf_el; struct ocfs2_extent_block *eb; @@ -2123,7 +2417,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, * We have to update i_last_eb_blk during the meta * data delete. */ - ret = ocfs2_journal_access(handle, inode, di_bh, + ret = ocfs2_journal_access(handle, inode, et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -2198,7 +2492,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, ocfs2_update_edge_lengths(inode, handle, left_path); eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; - di->i_last_eb_blk = eb->h_blkno; + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); /* * Removal of the extent in the left leaf was skipped @@ -2208,7 +2502,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, if (right_has_empty) ocfs2_remove_empty_extent(left_leaf_el); - ret = ocfs2_journal_dirty(handle, di_bh); + ret = ocfs2_journal_dirty(handle, et_root_bh); if (ret) mlog_errno(ret); @@ -2331,7 +2625,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, int orig_credits, struct ocfs2_path *path, struct ocfs2_cached_dealloc_ctxt *dealloc, - struct ocfs2_path **empty_extent_path) + struct ocfs2_path **empty_extent_path, + struct ocfs2_extent_tree *et) { int ret, subtree_root, deleted; u32 right_cpos; @@ -2404,7 +2699,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, ret = ocfs2_rotate_subtree_left(inode, handle, left_path, right_path, subtree_root, - dealloc, &deleted); + dealloc, &deleted, et); if (ret == -EAGAIN) { /* * The rotation has to temporarily stop due to @@ -2447,29 +2742,20 @@ out: } static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, - struct ocfs2_path *path, - struct ocfs2_cached_dealloc_ctxt *dealloc) + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_extent_tree *et) { int ret, subtree_index; u32 cpos; struct ocfs2_path *left_path = NULL; - struct ocfs2_dinode *di; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; - /* - * XXX: This code assumes that the root is an inode, which is - * true for now but may change as tree code gets generic. - */ - di = (struct ocfs2_dinode *)path_root_bh(path)->b_data; - if (!OCFS2_IS_VALID_DINODE(di)) { - ret = -EIO; - ocfs2_error(inode->i_sb, - "Inode %llu has invalid path root", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - goto out; - } + ret = ocfs2_et_sanity_check(inode, et); + if (ret) + goto out; /* * There's two ways we handle this depending on * whether path is the only existing one. @@ -2526,7 +2812,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, ocfs2_update_edge_lengths(inode, handle, left_path); eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; - di->i_last_eb_blk = eb->h_blkno; + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); } else { /* * 'path' is also the leftmost path which @@ -2537,12 +2823,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, */ ocfs2_unlink_path(inode, handle, dealloc, path, 1); - el = &di->id2.i_list; + el = et->et_root_el; el->l_tree_depth = 0; el->l_next_free_rec = 0; memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); - di->i_last_eb_blk = 0; + ocfs2_et_set_last_eb_blk(et, 0); } ocfs2_journal_dirty(handle, path_root_bh(path)); @@ -2570,7 +2856,8 @@ out: */ static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, struct ocfs2_path *path, - struct ocfs2_cached_dealloc_ctxt *dealloc) + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_extent_tree *et) { int ret, orig_credits = handle->h_buffer_credits; struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; @@ -2584,7 +2871,7 @@ static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, if (path->p_tree_depth == 0) { rightmost_no_delete: /* - * In-inode extents. This is trivially handled, so do + * Inline extents. This is trivially handled, so do * it up front. */ ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, @@ -2638,7 +2925,7 @@ rightmost_no_delete: */ ret = ocfs2_remove_rightmost_path(inode, handle, path, - dealloc); + dealloc, et); if (ret) mlog_errno(ret); goto out; @@ -2650,7 +2937,7 @@ rightmost_no_delete: */ try_rotate: ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path, - dealloc, &restart_path); + dealloc, &restart_path, et); if (ret && ret != -EAGAIN) { mlog_errno(ret); goto out; @@ -2662,7 +2949,7 @@ try_rotate: ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, tmp_path, dealloc, - &restart_path); + &restart_path, et); if (ret && ret != -EAGAIN) { mlog_errno(ret); goto out; @@ -2948,6 +3235,7 @@ static int ocfs2_merge_rec_left(struct inode *inode, handle_t *handle, struct ocfs2_extent_rec *split_rec, struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_extent_tree *et, int index) { int ret, i, subtree_index = 0, has_empty_extent = 0; @@ -3068,7 +3356,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, le16_to_cpu(el->l_next_free_rec) == 1) { ret = ocfs2_remove_rightmost_path(inode, handle, - right_path, dealloc); + right_path, + dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -3095,7 +3384,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, int split_index, struct ocfs2_extent_rec *split_rec, struct ocfs2_cached_dealloc_ctxt *dealloc, - struct ocfs2_merge_ctxt *ctxt) + struct ocfs2_merge_ctxt *ctxt, + struct ocfs2_extent_tree *et) { int ret = 0; @@ -3113,7 +3403,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * illegal. */ ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc); + dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -3156,7 +3446,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); /* The merge left us with an empty extent, remove it. */ - ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); + ret = ocfs2_rotate_tree_left(inode, handle, path, + dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -3170,7 +3461,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, */ ret = ocfs2_merge_rec_left(inode, path, handle, rec, - dealloc, + dealloc, et, split_index); if (ret) { @@ -3179,7 +3470,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, } ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc); + dealloc, et); /* * Error from this last rotate is not critical, so * print but don't bubble it up. @@ -3199,7 +3490,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, ret = ocfs2_merge_rec_left(inode, path, handle, split_rec, - dealloc, + dealloc, et, split_index); if (ret) { mlog_errno(ret); @@ -3222,7 +3513,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * our leaf. Try to rotate it away. */ ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc); + dealloc, et); if (ret) mlog_errno(ret); ret = 0; @@ -3356,16 +3647,6 @@ rotate: ocfs2_rotate_leaf(el, insert_rec); } -static inline void ocfs2_update_dinode_clusters(struct inode *inode, - struct ocfs2_dinode *di, - u32 clusters) -{ - le32_add_cpu(&di->i_clusters, clusters); - spin_lock(&OCFS2_I(inode)->ip_lock); - OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); - spin_unlock(&OCFS2_I(inode)->ip_lock); -} - static void ocfs2_adjust_rightmost_records(struct inode *inode, handle_t *handle, struct ocfs2_path *path, @@ -3567,8 +3848,8 @@ static void ocfs2_split_record(struct inode *inode, } /* - * This function only does inserts on an allocation b-tree. For dinode - * lists, ocfs2_insert_at_leaf() is called directly. + * This function only does inserts on an allocation b-tree. For tree + * depth = 0, ocfs2_insert_at_leaf() is called directly. * * right_path is the path we want to do the actual insert * in. left_path should only be passed in if we need to update that @@ -3665,7 +3946,7 @@ out: static int ocfs2_do_insert_extent(struct inode *inode, handle_t *handle, - struct buffer_head *di_bh, + struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *insert_rec, struct ocfs2_insert_type *type) { @@ -3673,13 +3954,11 @@ static int ocfs2_do_insert_extent(struct inode *inode, u32 cpos; struct ocfs2_path *right_path = NULL; struct ocfs2_path *left_path = NULL; - struct ocfs2_dinode *di; struct ocfs2_extent_list *el; - di = (struct ocfs2_dinode *) di_bh->b_data; - el = &di->id2.i_list; + el = et->et_root_el; - ret = ocfs2_journal_access(handle, inode, di_bh, + ret = ocfs2_journal_access(handle, inode, et->et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -3691,7 +3970,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, goto out_update_clusters; } - right_path = ocfs2_new_inode_path(di_bh); + right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); if (!right_path) { ret = -ENOMEM; mlog_errno(ret); @@ -3741,7 +4020,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, * ocfs2_rotate_tree_right() might have extended the * transaction without re-journaling our tree root. */ - ret = ocfs2_journal_access(handle, inode, di_bh, + ret = ocfs2_journal_access(handle, inode, et->et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -3766,10 +4045,10 @@ static int ocfs2_do_insert_extent(struct inode *inode, out_update_clusters: if (type->ins_split == SPLIT_NONE) - ocfs2_update_dinode_clusters(inode, di, - le16_to_cpu(insert_rec->e_leaf_clusters)); + ocfs2_et_update_clusters(inode, et, + le16_to_cpu(insert_rec->e_leaf_clusters)); - ret = ocfs2_journal_dirty(handle, di_bh); + ret = ocfs2_journal_dirty(handle, et->et_root_bh); if (ret) mlog_errno(ret); @@ -3899,7 +4178,8 @@ out: static void ocfs2_figure_contig_type(struct inode *inode, struct ocfs2_insert_type *insert, struct ocfs2_extent_list *el, - struct ocfs2_extent_rec *insert_rec) + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_extent_tree *et) { int i; enum ocfs2_contig_type contig_type = CONTIG_NONE; @@ -3915,6 +4195,21 @@ static void ocfs2_figure_contig_type(struct inode *inode, } } insert->ins_contig = contig_type; + + if (insert->ins_contig != CONTIG_NONE) { + struct ocfs2_extent_rec *rec = + &el->l_recs[insert->ins_contig_index]; + unsigned int len = le16_to_cpu(rec->e_leaf_clusters) + + le16_to_cpu(insert_rec->e_leaf_clusters); + + /* + * Caller might want us to limit the size of extents, don't + * calculate contiguousness if we might exceed that limit. + */ + if (et->et_max_leaf_clusters && + (len > et->et_max_leaf_clusters)) + insert->ins_contig = CONTIG_NONE; + } } /* @@ -3923,8 +4218,8 @@ static void ocfs2_figure_contig_type(struct inode *inode, * ocfs2_figure_appending_type() will figure out whether we'll have to * insert at the tail of the rightmost leaf. * - * This should also work against the dinode list for tree's with 0 - * depth. If we consider the dinode list to be the rightmost leaf node + * This should also work against the root extent list for tree's with 0 + * depth. If we consider the root extent list to be the rightmost leaf node * then the logic here makes sense. */ static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, @@ -3975,14 +4270,13 @@ set_tail_append: * structure. */ static int ocfs2_figure_insert_type(struct inode *inode, - struct buffer_head *di_bh, + struct ocfs2_extent_tree *et, struct buffer_head **last_eb_bh, struct ocfs2_extent_rec *insert_rec, int *free_records, struct ocfs2_insert_type *insert) { int ret; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; struct ocfs2_path *path = NULL; @@ -3990,7 +4284,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, insert->ins_split = SPLIT_NONE; - el = &di->id2.i_list; + el = et->et_root_el; insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); if (el->l_tree_depth) { @@ -4000,9 +4294,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, * ocfs2_figure_insert_type() and ocfs2_add_branch() * may want it later. */ - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - le64_to_cpu(di->i_last_eb_blk), &bh, - OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh); if (ret) { mlog_exit(ret); goto out; @@ -4023,12 +4315,12 @@ static int ocfs2_figure_insert_type(struct inode *inode, le16_to_cpu(el->l_next_free_rec); if (!insert->ins_tree_depth) { - ocfs2_figure_contig_type(inode, insert, el, insert_rec); + ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); ocfs2_figure_appending_type(insert, el, insert_rec); return 0; } - path = ocfs2_new_inode_path(di_bh); + path = ocfs2_new_path(et->et_root_bh, et->et_root_el); if (!path) { ret = -ENOMEM; mlog_errno(ret); @@ -4057,7 +4349,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, * into two types of appends: simple record append, or a * rotate inside the tail leaf. */ - ocfs2_figure_contig_type(inode, insert, el, insert_rec); + ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); /* * The insert code isn't quite ready to deal with all cases of @@ -4078,7 +4370,8 @@ static int ocfs2_figure_insert_type(struct inode *inode, * the case that we're doing a tail append, so maybe we can * take advantage of that information somehow. */ - if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) { + if (ocfs2_et_get_last_eb_blk(et) == + path_leaf_bh(path)->b_blocknr) { /* * Ok, ocfs2_find_path() returned us the rightmost * tree path. This might be an appending insert. There are @@ -4108,7 +4401,7 @@ out: int ocfs2_insert_extent(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, u32 cpos, u64 start_blk, u32 new_clusters, @@ -4121,26 +4414,21 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, struct ocfs2_insert_type insert = {0, }; struct ocfs2_extent_rec rec; - BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); - mlog(0, "add %u clusters at position %u to inode %llu\n", new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); - mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && - (OCFS2_I(inode)->ip_clusters != cpos), - "Device %s, asking for sparse allocation: inode %llu, " - "cpos %u, clusters %u\n", - osb->dev_str, - (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, - OCFS2_I(inode)->ip_clusters); - memset(&rec, 0, sizeof(rec)); rec.e_cpos = cpu_to_le32(cpos); rec.e_blkno = cpu_to_le64(start_blk); rec.e_leaf_clusters = cpu_to_le16(new_clusters); rec.e_flags = flags; + status = ocfs2_et_insert_check(inode, et, &rec); + if (status) { + mlog_errno(status); + goto bail; + } - status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, + status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec, &free_records, &insert); if (status < 0) { mlog_errno(status); @@ -4154,7 +4442,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, free_records, insert.ins_tree_depth); if (insert.ins_contig == CONTIG_NONE && free_records == 0) { - status = ocfs2_grow_tree(inode, handle, fe_bh, + status = ocfs2_grow_tree(inode, handle, et, &insert.ins_tree_depth, &last_eb_bh, meta_ac); if (status) { @@ -4164,17 +4452,124 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, } /* Finally, we can add clusters. This might rotate the tree for us. */ - status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); + status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert); if (status < 0) mlog_errno(status); - else + else if (et->et_ops == &ocfs2_dinode_et_ops) ocfs2_extent_map_insert_rec(inode, &rec); bail: - if (last_eb_bh) - brelse(last_eb_bh); + brelse(last_eb_bh); + + mlog_exit(status); + return status; +} + +/* + * Allcate and add clusters into the extent b-tree. + * The new clusters(clusters_to_add) will be inserted at logical_offset. + * The extent b-tree's root is specified by et, and + * it is not limited to the file storage. Any extent tree can use this + * function if it implements the proper ocfs2_extent_tree. + */ +int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct ocfs2_extent_tree *et, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) +{ + int status = 0; + int free_extents; + enum ocfs2_alloc_restarted reason = RESTART_NONE; + u32 bit_off, num_bits; + u64 block; + u8 flags = 0; + + BUG_ON(!clusters_to_add); + + if (mark_unwritten) + flags = OCFS2_EXT_UNWRITTEN; + + free_extents = ocfs2_num_free_extents(osb, inode, et); + if (free_extents < 0) { + status = free_extents; + mlog_errno(status); + goto leave; + } + + /* there are two cases which could cause us to EAGAIN in the + * we-need-more-metadata case: + * 1) we haven't reserved *any* + * 2) we are so fragmented, we've needed to add metadata too + * many times. */ + if (!free_extents && !meta_ac) { + mlog(0, "we haven't reserved any metadata!\n"); + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } else if ((!free_extents) + && (ocfs2_alloc_context_bits_left(meta_ac) + < ocfs2_extend_meta_needed(et->et_root_el))) { + mlog(0, "filesystem is really fragmented...\n"); + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } + + status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, + clusters_to_add, &bit_off, &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + BUG_ON(num_bits > clusters_to_add); + + /* reserve our write early -- insert_extent may update the inode */ + status = ocfs2_journal_access(handle, inode, et->et_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + block = ocfs2_clusters_to_blocks(osb->sb, bit_off); + mlog(0, "Allocating %u clusters at block %u for inode %llu\n", + num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); + status = ocfs2_insert_extent(osb, handle, inode, et, + *logical_offset, block, + num_bits, flags, meta_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_dirty(handle, et->et_root_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + clusters_to_add -= num_bits; + *logical_offset += num_bits; + + if (clusters_to_add) { + mlog(0, "need to alloc once more, wanted = %u\n", + clusters_to_add); + status = -EAGAIN; + reason = RESTART_TRANS; + } + +leave: mlog_exit(status); + if (reason_ret) + *reason_ret = reason; return status; } @@ -4201,7 +4596,7 @@ static void ocfs2_make_right_split_rec(struct super_block *sb, static int ocfs2_split_and_insert(struct inode *inode, handle_t *handle, struct ocfs2_path *path, - struct buffer_head *di_bh, + struct ocfs2_extent_tree *et, struct buffer_head **last_eb_bh, int split_index, struct ocfs2_extent_rec *orig_split_rec, @@ -4215,7 +4610,6 @@ static int ocfs2_split_and_insert(struct inode *inode, struct ocfs2_extent_rec split_rec = *orig_split_rec; struct ocfs2_insert_type insert; struct ocfs2_extent_block *eb; - struct ocfs2_dinode *di; leftright: /* @@ -4224,8 +4618,7 @@ leftright: */ rec = path_leaf_el(path)->l_recs[split_index]; - di = (struct ocfs2_dinode *)di_bh->b_data; - rightmost_el = &di->id2.i_list; + rightmost_el = et->et_root_el; depth = le16_to_cpu(rightmost_el->l_tree_depth); if (depth) { @@ -4236,8 +4629,8 @@ leftright: if (le16_to_cpu(rightmost_el->l_next_free_rec) == le16_to_cpu(rightmost_el->l_count)) { - ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh, - meta_ac); + ret = ocfs2_grow_tree(inode, handle, et, + &depth, last_eb_bh, meta_ac); if (ret) { mlog_errno(ret); goto out; @@ -4274,8 +4667,7 @@ leftright: do_leftright = 1; } - ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, - &insert); + ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); if (ret) { mlog_errno(ret); goto out; @@ -4317,8 +4709,9 @@ out: * of the tree is required. All other cases will degrade into a less * optimal tree layout. * - * last_eb_bh should be the rightmost leaf block for any inode with a - * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call. + * last_eb_bh should be the rightmost leaf block for any extent + * btree. Since a split may grow the tree or a merge might shrink it, + * the caller cannot trust the contents of that buffer after this call. * * This code is optimized for readability - several passes might be * made over certain portions of the tree. All of those blocks will @@ -4326,7 +4719,7 @@ out: * extra overhead is not expressed in terms of disk reads. */ static int __ocfs2_mark_extent_written(struct inode *inode, - struct buffer_head *di_bh, + struct ocfs2_extent_tree *et, handle_t *handle, struct ocfs2_path *path, int split_index, @@ -4366,11 +4759,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode, */ if (path->p_tree_depth) { struct ocfs2_extent_block *eb; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - le64_to_cpu(di->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); if (ret) { mlog_exit(ret); goto out; @@ -4403,7 +4794,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode, if (ctxt.c_split_covers_rec) el->l_recs[split_index] = *split_rec; else - ret = ocfs2_split_and_insert(inode, handle, path, di_bh, + ret = ocfs2_split_and_insert(inode, handle, path, et, &last_eb_bh, split_index, split_rec, meta_ac); if (ret) @@ -4411,7 +4802,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode, } else { ret = ocfs2_try_to_merge_extent(inode, handle, path, split_index, split_rec, - dealloc, &ctxt); + dealloc, &ctxt, et); if (ret) mlog_errno(ret); } @@ -4429,7 +4820,8 @@ out: * * The caller is responsible for passing down meta_ac if we'll need it. */ -int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_mark_extent_written(struct inode *inode, + struct ocfs2_extent_tree *et, handle_t *handle, u32 cpos, u32 len, u32 phys, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) @@ -4455,10 +4847,14 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, /* * XXX: This should be fixed up so that we just re-insert the * next extent records. + * + * XXX: This is a hack on the extent tree, maybe it should be + * an op? */ - ocfs2_extent_map_trunc(inode, 0); + if (et->et_ops == &ocfs2_dinode_et_ops) + ocfs2_extent_map_trunc(inode, 0); - left_path = ocfs2_new_inode_path(di_bh); + left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -4489,8 +4885,9 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; - ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path, - index, &split_rec, meta_ac, dealloc); + ret = __ocfs2_mark_extent_written(inode, et, handle, left_path, + index, &split_rec, meta_ac, + dealloc); if (ret) mlog_errno(ret); @@ -4499,13 +4896,12 @@ out: return ret; } -static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, +static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, handle_t *handle, struct ocfs2_path *path, int index, u32 new_range, struct ocfs2_alloc_context *meta_ac) { int ret, depth, credits = handle->h_buffer_credits; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct buffer_head *last_eb_bh = NULL; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *rightmost_el, *el; @@ -4522,9 +4918,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, depth = path->p_tree_depth; if (depth > 0) { - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - le64_to_cpu(di->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -4535,7 +4930,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, } else rightmost_el = path_leaf_el(path); - credits += path->p_tree_depth + ocfs2_extend_meta_needed(di); + credits += path->p_tree_depth + + ocfs2_extend_meta_needed(et->et_root_el); ret = ocfs2_extend_trans(handle, credits); if (ret) { mlog_errno(ret); @@ -4544,7 +4940,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, if (le16_to_cpu(rightmost_el->l_next_free_rec) == le16_to_cpu(rightmost_el->l_count)) { - ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh, + ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh, meta_ac); if (ret) { mlog_errno(ret); @@ -4558,7 +4954,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, insert.ins_split = SPLIT_RIGHT; insert.ins_tree_depth = depth; - ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert); + ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); if (ret) mlog_errno(ret); @@ -4570,7 +4966,8 @@ out: static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, struct ocfs2_path *path, int index, struct ocfs2_cached_dealloc_ctxt *dealloc, - u32 cpos, u32 len) + u32 cpos, u32 len, + struct ocfs2_extent_tree *et) { int ret; u32 left_cpos, rec_range, trunc_range; @@ -4582,7 +4979,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, struct ocfs2_extent_block *eb; if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { - ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); + ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -4713,7 +5110,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, ocfs2_journal_dirty(handle, path_leaf_bh(path)); - ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); + ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -4724,7 +5121,8 @@ out: return ret; } -int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_remove_extent(struct inode *inode, + struct ocfs2_extent_tree *et, u32 cpos, u32 len, handle_t *handle, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) @@ -4733,11 +5131,11 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, u32 rec_range, trunc_range; struct ocfs2_extent_rec *rec; struct ocfs2_extent_list *el; - struct ocfs2_path *path; + struct ocfs2_path *path = NULL; ocfs2_extent_map_trunc(inode, 0); - path = ocfs2_new_inode_path(di_bh); + path = ocfs2_new_path(et->et_root_bh, et->et_root_el); if (!path) { ret = -ENOMEM; mlog_errno(ret); @@ -4790,13 +5188,13 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, - cpos, len); + cpos, len, et); if (ret) { mlog_errno(ret); goto out; } } else { - ret = ocfs2_split_tree(inode, di_bh, handle, path, index, + ret = ocfs2_split_tree(inode, et, handle, path, index, trunc_range, meta_ac); if (ret) { mlog_errno(ret); @@ -4845,7 +5243,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, } ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, - cpos, len); + cpos, len, et); if (ret) { mlog_errno(ret); goto out; @@ -5188,8 +5586,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, goto bail; } - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, - OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); if (status < 0) { iput(inode); mlog_errno(status); @@ -5264,8 +5661,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, bail: if (tl_inode) iput(tl_inode); - if (tl_bh) - brelse(tl_bh); + brelse(tl_bh); if (status < 0 && (*tl_copy)) { kfree(*tl_copy); @@ -6008,20 +6404,13 @@ bail: return status; } -static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh) +static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) { set_buffer_uptodate(bh); mark_buffer_dirty(bh); return 0; } -static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh) -{ - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - return ocfs2_journal_dirty_data(handle, bh); -} - static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, unsigned int from, unsigned int to, struct page *page, int zero, u64 *phys) @@ -6040,17 +6429,18 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, * here if they aren't - ocfs2_map_page_blocks() * might've skipped some */ - if (ocfs2_should_order_data(inode)) { - ret = walk_page_buffers(handle, - page_buffers(page), - from, to, &partial, - ocfs2_ordered_zero_func); - if (ret < 0) - mlog_errno(ret); - } else { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, &partial, + ocfs2_zero_func); + if (ret < 0) + mlog_errno(ret); + else if (ocfs2_should_order_data(inode)) { + ret = ocfs2_jbd2_file_inode(handle, inode); +#ifdef CONFIG_OCFS2_COMPAT_JBD ret = walk_page_buffers(handle, page_buffers(page), from, to, &partial, - ocfs2_writeback_zero_func); + ocfs2_journal_dirty_data); +#endif if (ret < 0) mlog_errno(ret); } @@ -6215,20 +6605,29 @@ out: return ret; } -static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di) +static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode, + struct ocfs2_dinode *di) { unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits; + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); - memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2)); + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + memset(&di->id2, 0, blocksize - + offsetof(struct ocfs2_dinode, id2) - + xattrsize); + else + memset(&di->id2, 0, blocksize - + offsetof(struct ocfs2_dinode, id2)); } void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di) { - ocfs2_zero_dinode_id2(inode, di); + ocfs2_zero_dinode_id2_with_xattr(inode, di); di->id2.i_list.l_tree_depth = 0; di->id2.i_list.l_next_free_rec = 0; - di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb)); + di->id2.i_list.l_count = cpu_to_le16( + ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di)); } void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) @@ -6245,9 +6644,10 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) * We clear the entire i_data structure here so that all * fields can be properly initialized. */ - ocfs2_zero_dinode_id2(inode, di); + ocfs2_zero_dinode_id2_with_xattr(inode, di); - idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb)); + idata->id_count = cpu_to_le16( + ocfs2_max_inline_data_with_xattr(inode->i_sb, di)); } int ocfs2_convert_inline_data_to_extents(struct inode *inode, @@ -6262,6 +6662,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct ocfs2_alloc_context *data_ac = NULL; struct page **pages = NULL; loff_t end = osb->s_clustersize; + struct ocfs2_extent_tree et; has_data = i_size_read(inode) ? 1 : 0; @@ -6361,7 +6762,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * this proves to be false, we could always re-build * the in-inode data from our pages. */ - ret = ocfs2_insert_extent(osb, handle, inode, di_bh, + ocfs2_init_dinode_extent_tree(&et, inode, di_bh); + ret = ocfs2_insert_extent(osb, handle, inode, &et, 0, block, 1, 0, NULL); if (ret) { mlog_errno(ret); @@ -6404,13 +6806,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, handle_t *handle = NULL; struct inode *tl_inode = osb->osb_tl_inode; struct ocfs2_path *path = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; mlog_entry_void(); new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, i_size_read(inode)); - path = ocfs2_new_inode_path(fe_bh); + path = ocfs2_new_path(fe_bh, &di->id2.i_list); if (!path) { status = -ENOMEM; mlog_errno(status); @@ -6581,8 +6984,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); if (fe->id2.i_list.l_tree_depth) { - status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk), + &last_eb_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -6695,8 +7098,7 @@ static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) mlog(ML_NOTICE, "Truncate completion has non-empty dealloc context\n"); - if (tc->tc_last_eb_bh) - brelse(tc->tc_last_eb_bh); + brelse(tc->tc_last_eb_bh); kfree(tc); } diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 42ff94bd801..70257c84cfb 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -26,30 +26,102 @@ #ifndef OCFS2_ALLOC_H #define OCFS2_ALLOC_H + +/* + * For xattr tree leaf, we limit the leaf byte size to be 64K. + */ +#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536 + +/* + * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract + * the b-tree operations in ocfs2. Now all the b-tree operations are not + * limited to ocfs2_dinode only. Any data which need to allocate clusters + * to store can use b-tree. And it only needs to implement its ocfs2_extent_tree + * and operation. + * + * ocfs2_extent_tree becomes the first-class object for extent tree + * manipulation. Callers of the alloc.c code need to fill it via one of + * the ocfs2_init_*_extent_tree() operations below. + * + * ocfs2_extent_tree contains info for the root of the b-tree, it must have a + * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree + * functions. + * ocfs2_extent_tree_operations abstract the normal operations we do for + * the root of extent b-tree. + */ +struct ocfs2_extent_tree_operations; +struct ocfs2_extent_tree { + struct ocfs2_extent_tree_operations *et_ops; + struct buffer_head *et_root_bh; + struct ocfs2_extent_list *et_root_el; + void *et_object; + unsigned int et_max_leaf_clusters; +}; + +/* + * ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the + * specified object buffer. + */ +void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh); +void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh); +void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh, + struct ocfs2_xattr_value_root *xv); + struct ocfs2_alloc_context; int ocfs2_insert_extent(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, u32 cpos, u64 start_blk, u32 new_clusters, u8 flags, struct ocfs2_alloc_context *meta_ac); + +enum ocfs2_alloc_restarted { + RESTART_NONE = 0, + RESTART_TRANS, + RESTART_META +}; +int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct ocfs2_extent_tree *et, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret); struct ocfs2_cached_dealloc_ctxt; -int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_mark_extent_written(struct inode *inode, + struct ocfs2_extent_tree *et, handle_t *handle, u32 cpos, u32 len, u32 phys, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc); -int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_remove_extent(struct inode *inode, + struct ocfs2_extent_tree *et, u32 cpos, u32 len, handle_t *handle, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc); int ocfs2_num_free_extents(struct ocfs2_super *osb, struct inode *inode, - struct ocfs2_dinode *fe); -/* how many new metadata chunks would an allocation need at maximum? */ -static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe) + struct ocfs2_extent_tree *et); + +/* + * how many new metadata chunks would an allocation need at maximum? + * + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ +static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el) { /* * Rather than do all the work of determining how much we need @@ -59,7 +131,7 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe) * new tree_depth==0 extent_block, and one block at the new * top-of-the tree. */ - return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2; + return le16_to_cpu(root_el->l_tree_depth) + 2; } void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di); @@ -146,4 +218,13 @@ static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el, return le16_to_cpu(rec->e_leaf_clusters); } +/* + * This is only valid for leaf nodes, which are the only ones that can + * have empty extents anyway. + */ +static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) +{ + return !rec->e_leaf_clusters; +} + #endif /* OCFS2_ALLOC_H */ diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 506c24fb507..c22543b3342 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -68,9 +68,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, goto bail; } - status = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, - &bh, OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); if (status < 0) { mlog_errno(status); goto bail; @@ -128,8 +126,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, err = 0; bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(err); return err; @@ -261,13 +258,11 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page) { int ret; struct buffer_head *di_bh = NULL; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); BUG_ON(!PageLocked(page)); BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); - ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh, - OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -485,11 +480,14 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, } if (ocfs2_should_order_data(inode)) { + ret = ocfs2_jbd2_file_inode(handle, inode); +#ifdef CONFIG_OCFS2_COMPAT_JBD ret = walk_page_buffers(handle, page_buffers(page), from, to, NULL, ocfs2_journal_dirty_data); - if (ret < 0) +#endif + if (ret < 0) mlog_errno(ret); } out: @@ -594,7 +592,7 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock, goto bail; } - if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) { + if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno && create) { ocfs2_error(inode->i_sb, "Inode %llu has a hole at block %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -669,7 +667,7 @@ static void ocfs2_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; - journal_invalidatepage(journal, page, offset); + jbd2_journal_invalidatepage(journal, page, offset); } static int ocfs2_releasepage(struct page *page, gfp_t wait) @@ -678,7 +676,7 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait) if (!page_has_buffers(page)) return 0; - return journal_try_to_free_buffers(journal, page, wait); + return jbd2_journal_try_to_free_buffers(journal, page, wait); } static ssize_t ocfs2_direct_IO(int rw, @@ -1074,11 +1072,15 @@ static void ocfs2_write_failure(struct inode *inode, tmppage = wc->w_pages[i]; if (page_has_buffers(tmppage)) { - if (ocfs2_should_order_data(inode)) + if (ocfs2_should_order_data(inode)) { + ocfs2_jbd2_file_inode(wc->w_handle, inode); +#ifdef CONFIG_OCFS2_COMPAT_JBD walk_page_buffers(wc->w_handle, page_buffers(tmppage), from, to, NULL, ocfs2_journal_dirty_data); +#endif + } block_commit_write(tmppage, from, to); } @@ -1242,6 +1244,7 @@ static int ocfs2_write_cluster(struct address_space *mapping, int ret, i, new, should_zero = 0; u64 v_blkno, p_blkno; struct inode *inode = mapping->host; + struct ocfs2_extent_tree et; new = phys == 0 ? 1 : 0; if (new || unwritten) @@ -1255,10 +1258,10 @@ static int ocfs2_write_cluster(struct address_space *mapping, * any additional semaphores or cluster locks. */ tmp_pos = cpos; - ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, - &tmp_pos, 1, 0, wc->w_di_bh, - wc->w_handle, data_ac, - meta_ac, NULL); + ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, + &tmp_pos, 1, 0, wc->w_di_bh, + wc->w_handle, data_ac, + meta_ac, NULL); /* * This shouldn't happen because we must have already * calculated the correct meta data allocation required. The @@ -1276,7 +1279,8 @@ static int ocfs2_write_cluster(struct address_space *mapping, goto out; } } else if (unwritten) { - ret = ocfs2_mark_extent_written(inode, wc->w_di_bh, + ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); + ret = ocfs2_mark_extent_written(inode, &et, wc->w_handle, cpos, 1, phys, meta_ac, &wc->w_dealloc); if (ret < 0) { @@ -1665,6 +1669,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle; + struct ocfs2_extent_tree et; ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); if (ret) { @@ -1712,14 +1717,23 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * ocfs2_lock_allocators(). It greatly over-estimates * the work to be done. */ - ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, - extents_to_split, &data_ac, &meta_ac); + mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u," + " clusters_to_add = %u, extents_to_split = %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), + clusters_to_alloc, extents_to_split); + + ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); + ret = ocfs2_lock_allocators(inode, &et, + clusters_to_alloc, extents_to_split, + &data_ac, &meta_ac); if (ret) { mlog_errno(ret); goto out; } - credits = ocfs2_calc_extend_credits(inode->i_sb, di, + credits = ocfs2_calc_extend_credits(inode->i_sb, + &di->id2.i_list, clusters_to_alloc); } @@ -1905,11 +1919,15 @@ int ocfs2_write_end_nolock(struct address_space *mapping, } if (page_has_buffers(tmppage)) { - if (ocfs2_should_order_data(inode)) + if (ocfs2_should_order_data(inode)) { + ocfs2_jbd2_file_inode(wc->w_handle, inode); +#ifdef CONFIG_OCFS2_COMPAT_JBD walk_page_buffers(wc->w_handle, page_buffers(tmppage), from, to, NULL, ocfs2_journal_dirty_data); +#endif + } block_commit_write(tmppage, from, to); } } diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index f136639f5b4..7e947c67246 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -66,7 +66,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, /* remove from dirty list before I/O. */ clear_buffer_dirty(bh); - get_bh(bh); /* for end_buffer_write_sync() */ + get_bh(bh); /* for end_buffer_write_sync() */ bh->b_end_io = end_buffer_write_sync; submit_bh(WRITE, bh); @@ -88,22 +88,103 @@ out: return ret; } -int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, - struct buffer_head *bhs[], int flags, - struct inode *inode) +int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, + unsigned int nr, struct buffer_head *bhs[]) +{ + int status = 0; + unsigned int i; + struct buffer_head *bh; + + if (!nr) { + mlog(ML_BH_IO, "No buffers will be read!\n"); + goto bail; + } + + for (i = 0 ; i < nr ; i++) { + if (bhs[i] == NULL) { + bhs[i] = sb_getblk(osb->sb, block++); + if (bhs[i] == NULL) { + status = -EIO; + mlog_errno(status); + goto bail; + } + } + bh = bhs[i]; + + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "trying to sync read a jbd " + "managed bh (blocknr = %llu), skipping\n", + (unsigned long long)bh->b_blocknr); + continue; + } + + if (buffer_dirty(bh)) { + /* This should probably be a BUG, or + * at least return an error. */ + mlog(ML_ERROR, + "trying to sync read a dirty " + "buffer! (blocknr = %llu), skipping\n", + (unsigned long long)bh->b_blocknr); + continue; + } + + lock_buffer(bh); + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "block %llu had the JBD bit set " + "while I was in lock_buffer!", + (unsigned long long)bh->b_blocknr); + BUG(); + } + + clear_buffer_uptodate(bh); + get_bh(bh); /* for end_buffer_read_sync() */ + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + } + + for (i = nr; i > 0; i--) { + bh = bhs[i - 1]; + + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "the journal got the buffer while it was " + "locked for io! (blocknr = %llu)\n", + (unsigned long long)bh->b_blocknr); + BUG(); + } + + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* Status won't be cleared from here on out, + * so we can safely record this and loop back + * to cleanup the other buffers. */ + status = -EIO; + put_bh(bh); + bhs[i - 1] = NULL; + } + } + +bail: + return status; +} + +int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, + struct buffer_head *bhs[], int flags) { int status = 0; - struct super_block *sb; int i, ignore_cache = 0; struct buffer_head *bh; - mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n", - (unsigned long long)block, nr, flags, inode); + mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n", + inode, (unsigned long long)block, nr, flags); + BUG_ON(!inode); BUG_ON((flags & OCFS2_BH_READAHEAD) && - (!inode || !(flags & OCFS2_BH_CACHED))); + (flags & OCFS2_BH_IGNORE_CACHE)); - if (osb == NULL || osb->sb == NULL || bhs == NULL) { + if (bhs == NULL) { status = -EINVAL; mlog_errno(status); goto bail; @@ -122,26 +203,19 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, goto bail; } - sb = osb->sb; - - if (flags & OCFS2_BH_CACHED && !inode) - flags &= ~OCFS2_BH_CACHED; - - if (inode) - mutex_lock(&OCFS2_I(inode)->ip_io_mutex); + mutex_lock(&OCFS2_I(inode)->ip_io_mutex); for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { - bhs[i] = sb_getblk(sb, block++); + bhs[i] = sb_getblk(inode->i_sb, block++); if (bhs[i] == NULL) { - if (inode) - mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); status = -EIO; mlog_errno(status); goto bail; } } bh = bhs[i]; - ignore_cache = 0; + ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE); /* There are three read-ahead cases here which we need to * be concerned with. All three assume a buffer has @@ -167,26 +241,27 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, * before our is-it-in-flight check. */ - if (flags & OCFS2_BH_CACHED && - !ocfs2_buffer_uptodate(inode, bh)) { + if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) { mlog(ML_UPTODATE, "bh (%llu), inode %llu not uptodate\n", (unsigned long long)bh->b_blocknr, (unsigned long long)OCFS2_I(inode)->ip_blkno); + /* We're using ignore_cache here to say + * "go to disk" */ ignore_cache = 1; } /* XXX: Can we ever get this and *not* have the cached * flag set? */ if (buffer_jbd(bh)) { - if (!(flags & OCFS2_BH_CACHED) || ignore_cache) + if (ignore_cache) mlog(ML_BH_IO, "trying to sync read a jbd " "managed bh (blocknr = %llu)\n", (unsigned long long)bh->b_blocknr); continue; } - if (!(flags & OCFS2_BH_CACHED) || ignore_cache) { + if (ignore_cache) { if (buffer_dirty(bh)) { /* This should probably be a BUG, or * at least return an error. */ @@ -221,7 +296,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, * previously read-ahead buffer may have * completed I/O while we were waiting for the * buffer lock. */ - if ((flags & OCFS2_BH_CACHED) + if (!(flags & OCFS2_BH_IGNORE_CACHE) && !(flags & OCFS2_BH_READAHEAD) && ocfs2_buffer_uptodate(inode, bh)) { unlock_buffer(bh); @@ -265,15 +340,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, /* Always set the buffer in the cache, even if it was * a forced read, or read-ahead which hasn't yet * completed. */ - if (inode) - ocfs2_set_buffer_uptodate(inode, bh); + ocfs2_set_buffer_uptodate(inode, bh); } - if (inode) - mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", (unsigned long long)block, nr, - (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags); + ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes", + flags); bail: diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h index c2e78614c3e..75e1dcb1ade 100644 --- a/fs/ocfs2/buffer_head_io.h +++ b/fs/ocfs2/buffer_head_io.h @@ -31,31 +31,29 @@ void ocfs2_end_buffer_io_sync(struct buffer_head *bh, int uptodate); -static inline int ocfs2_read_block(struct ocfs2_super *osb, +static inline int ocfs2_read_block(struct inode *inode, u64 off, - struct buffer_head **bh, - int flags, - struct inode *inode); + struct buffer_head **bh); int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, struct inode *inode); -int ocfs2_read_blocks(struct ocfs2_super *osb, +int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, struct buffer_head *bhs[], - int flags, - struct inode *inode); + int flags); +int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, + unsigned int nr, struct buffer_head *bhs[]); int ocfs2_write_super_or_backup(struct ocfs2_super *osb, struct buffer_head *bh); -#define OCFS2_BH_CACHED 1 +#define OCFS2_BH_IGNORE_CACHE 1 #define OCFS2_BH_READAHEAD 8 -static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, - struct buffer_head **bh, int flags, - struct inode *inode) +static inline int ocfs2_read_block(struct inode *inode, u64 off, + struct buffer_head **bh) { int status = 0; @@ -65,8 +63,7 @@ static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, goto bail; } - status = ocfs2_read_blocks(osb, off, 1, bh, - flags, inode); + status = ocfs2_read_blocks(inode, off, 1, bh, 0); bail: return status; diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 23c732f2752..d8a0cb92cef 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -109,6 +109,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(CONN), define_mask(QUORUM), define_mask(EXPORT), + define_mask(XATTR), define_mask(ERROR), define_mask(NOTICE), define_mask(KTHREAD), diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 597e064bb94..57670c68047 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -112,6 +112,7 @@ #define ML_CONN 0x0000000004000000ULL /* net connection management */ #define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */ #define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ +#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ /* bits that are infrequently given and frequently matched in the high word */ #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index d8bfa0eb41b..52276c02f71 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -138,20 +138,20 @@ static int nst_seq_show(struct seq_file *seq, void *v) " message id: %d\n" " message type: %u\n" " message key: 0x%08x\n" - " sock acquiry: %lu.%lu\n" - " send start: %lu.%lu\n" - " wait start: %lu.%lu\n", + " sock acquiry: %lu.%ld\n" + " send start: %lu.%ld\n" + " wait start: %lu.%ld\n", nst, (unsigned long)nst->st_task->pid, (unsigned long)nst->st_task->tgid, nst->st_task->comm, nst->st_node, nst->st_sc, nst->st_id, nst->st_msg_type, nst->st_msg_key, nst->st_sock_time.tv_sec, - (unsigned long)nst->st_sock_time.tv_usec, + (long)nst->st_sock_time.tv_usec, nst->st_send_time.tv_sec, - (unsigned long)nst->st_send_time.tv_usec, + (long)nst->st_send_time.tv_usec, nst->st_status_time.tv_sec, - nst->st_status_time.tv_usec); + (long)nst->st_status_time.tv_usec); } spin_unlock(&o2net_debug_lock); @@ -276,7 +276,7 @@ static void *sc_seq_next(struct seq_file *seq, void *v, loff_t *pos) return sc; /* unused, just needs to be null when done */ } -#define TV_SEC_USEC(TV) TV.tv_sec, (unsigned long)TV.tv_usec +#define TV_SEC_USEC(TV) TV.tv_sec, (long)TV.tv_usec static int sc_seq_show(struct seq_file *seq, void *v) { @@ -309,12 +309,12 @@ static int sc_seq_show(struct seq_file *seq, void *v) " remote node: %s\n" " page off: %zu\n" " handshake ok: %u\n" - " timer: %lu.%lu\n" - " data ready: %lu.%lu\n" - " advance start: %lu.%lu\n" - " advance stop: %lu.%lu\n" - " func start: %lu.%lu\n" - " func stop: %lu.%lu\n" + " timer: %lu.%ld\n" + " data ready: %lu.%ld\n" + " advance start: %lu.%ld\n" + " advance stop: %lu.%ld\n" + " func start: %lu.%ld\n" + " func stop: %lu.%ld\n" " func key: %u\n" " func type: %u\n", sc, diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index a27d61581bd..2bcf706d9dd 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -143,8 +143,8 @@ static void o2net_sc_postpone_idle(struct o2net_sock_container *sc); static void o2net_sc_reset_idle_timer(struct o2net_sock_container *sc); #ifdef CONFIG_DEBUG_FS -void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, - u32 msgkey, struct task_struct *task, u8 node) +static void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node) { INIT_LIST_HEAD(&nst->st_net_debug_item); nst->st_task = task; @@ -153,31 +153,61 @@ void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, nst->st_node = node; } -void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +static void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) { do_gettimeofday(&nst->st_sock_time); } -void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +static void o2net_set_nst_send_time(struct o2net_send_tracking *nst) { do_gettimeofday(&nst->st_send_time); } -void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +static void o2net_set_nst_status_time(struct o2net_send_tracking *nst) { do_gettimeofday(&nst->st_status_time); } -void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, +static void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, struct o2net_sock_container *sc) { nst->st_sc = sc; } -void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) +static void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id) { nst->st_id = msg_id; } + +#else /* CONFIG_DEBUG_FS */ + +static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, + u32 msgkey, struct task_struct *task, u8 node) +{ +} + +static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) +{ +} + +static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) +{ +} + +static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) +{ +} + +static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, + struct o2net_sock_container *sc) +{ +} + +static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, + u32 msg_id) +{ +} + #endif /* CONFIG_DEBUG_FS */ static inline int o2net_reconnect_delay(void) diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index 18307ff81b7..8d58cfe410b 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -224,42 +224,10 @@ struct o2net_send_tracking { struct timeval st_send_time; struct timeval st_status_time; }; - -void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, - u32 msgkey, struct task_struct *task, u8 node); -void o2net_set_nst_sock_time(struct o2net_send_tracking *nst); -void o2net_set_nst_send_time(struct o2net_send_tracking *nst); -void o2net_set_nst_status_time(struct o2net_send_tracking *nst); -void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, - struct o2net_sock_container *sc); -void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, u32 msg_id); - #else struct o2net_send_tracking { u32 dummy; }; - -static inline void o2net_init_nst(struct o2net_send_tracking *nst, u32 msgtype, - u32 msgkey, struct task_struct *task, u8 node) -{ -} -static inline void o2net_set_nst_sock_time(struct o2net_send_tracking *nst) -{ -} -static inline void o2net_set_nst_send_time(struct o2net_send_tracking *nst) -{ -} -static inline void o2net_set_nst_status_time(struct o2net_send_tracking *nst) -{ -} -static inline void o2net_set_nst_sock_container(struct o2net_send_tracking *nst, - struct o2net_sock_container *sc) -{ -} -static inline void o2net_set_nst_msg_id(struct o2net_send_tracking *nst, - u32 msg_id) -{ -} #endif /* CONFIG_DEBUG_FS */ #endif /* O2CLUSTER_TCP_INTERNAL_H */ diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 8a187584808..026e6eb8518 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -82,6 +82,49 @@ static int ocfs2_do_extend_dir(struct super_block *sb, struct ocfs2_alloc_context *meta_ac, struct buffer_head **new_bh); +static struct buffer_head *ocfs2_bread(struct inode *inode, + int block, int *err, int reada) +{ + struct buffer_head *bh = NULL; + int tmperr; + u64 p_blkno; + int readflags = 0; + + if (reada) + readflags |= OCFS2_BH_READAHEAD; + + if (((u64)block << inode->i_sb->s_blocksize_bits) >= + i_size_read(inode)) { + BUG_ON(!reada); + return NULL; + } + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, + NULL); + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (tmperr < 0) { + mlog_errno(tmperr); + goto fail; + } + + tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags); + if (tmperr < 0) + goto fail; + + tmperr = 0; + + *err = 0; + return bh; + +fail: + brelse(bh); + bh = NULL; + + *err = -EIO; + return NULL; +} + /* * bh passed here can be an inode block or a dir data block, depending * on the inode inline data flag. @@ -188,8 +231,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name, struct ocfs2_dinode *di; struct ocfs2_inline_data *data; - ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno, - &di_bh, OCFS2_BH_CACHED, dir); + ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -260,14 +302,13 @@ restart: } if ((bh = bh_use[ra_ptr++]) == NULL) goto next; - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - /* read error, skip block & hope for the best */ + if (ocfs2_read_block(dir, block, &bh)) { + /* read error, skip block & hope for the best. + * ocfs2_read_block() has released the bh. */ ocfs2_error(dir->i_sb, "reading directory %llu, " "offset %lu\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, block); - brelse(bh); goto next; } i = ocfs2_search_dirblock(bh, dir, name, namelen, @@ -417,8 +458,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle, struct ocfs2_dinode *di; struct ocfs2_inline_data *data; - ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno, - &di_bh, OCFS2_BH_CACHED, dir); + ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -596,8 +636,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode, struct ocfs2_inline_data *data; struct ocfs2_dir_entry *de; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, - &di_bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); if (ret) { mlog(ML_ERROR, "Unable to read inode block for dir %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -716,8 +755,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, for (i = ra_sectors >> (sb->s_blocksize_bits - 9); i > 0; i--) { tmp = ocfs2_bread(inode, ++blk, &err, 1); - if (tmp) - brelse(tmp); + brelse(tmp); } last_ra_blk = blk; ra_sectors = 8; @@ -899,10 +937,8 @@ int ocfs2_find_files_on_disk(const char *name, leave: if (status < 0) { *dirent = NULL; - if (*dirent_bh) { - brelse(*dirent_bh); - *dirent_bh = NULL; - } + brelse(*dirent_bh); + *dirent_bh = NULL; } mlog_exit(status); @@ -951,8 +987,7 @@ int ocfs2_check_dir_for_entry(struct inode *dir, ret = 0; bail: - if (dirent_bh) - brelse(dirent_bh); + brelse(dirent_bh); mlog_exit(ret); return ret; @@ -1127,8 +1162,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, status = 0; bail: - if (new_bh) - brelse(new_bh); + brelse(new_bh); mlog_exit(status); return status; @@ -1192,6 +1226,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, struct buffer_head *dirdata_bh = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; handle_t *handle; + struct ocfs2_extent_tree et; + + ocfs2_init_dinode_extent_tree(&et, dir, di_bh); alloc = ocfs2_clusters_for_bytes(sb, bytes); @@ -1300,19 +1337,24 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, di->i_size = cpu_to_le64(sb->s_blocksize); di->i_ctime = di->i_mtime = cpu_to_le64(dir->i_ctime.tv_sec); di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(dir->i_ctime.tv_nsec); - dir->i_blocks = ocfs2_inode_sector_count(dir); /* * This should never fail as our extent list is empty and all * related blocks have been journaled already. */ - ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0, - NULL); + ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len, + 0, NULL); if (ret) { mlog_errno(ret); - goto out; + goto out_commit; } + /* + * Set i_blocks after the extent insert for the most up to + * date ip_clusters value. + */ + dir->i_blocks = ocfs2_inode_sector_count(dir); + ret = ocfs2_journal_dirty(handle, di_bh); if (ret) { mlog_errno(ret); @@ -1332,11 +1374,11 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, } blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); - ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno, - len, 0, NULL); + ret = ocfs2_insert_extent(osb, handle, dir, &et, 1, + blkno, len, 0, NULL); if (ret) { mlog_errno(ret); - goto out; + goto out_commit; } } @@ -1378,9 +1420,9 @@ static int ocfs2_do_extend_dir(struct super_block *sb, if (extend) { u32 offset = OCFS2_I(dir)->ip_clusters; - status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, - 1, 0, parent_fe_bh, handle, - data_ac, meta_ac, NULL); + status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, + 1, 0, parent_fe_bh, handle, + data_ac, meta_ac, NULL); BUG_ON(status == -EAGAIN); if (status < 0) { mlog_errno(status); @@ -1425,12 +1467,14 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, int credits, num_free_extents, drop_alloc_sem = 0; loff_t dir_i_size; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + struct ocfs2_extent_list *el = &fe->id2.i_list; struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle = NULL; struct buffer_head *new_bh = NULL; struct ocfs2_dir_entry * de; struct super_block *sb = osb->sb; + struct ocfs2_extent_tree et; mlog_entry_void(); @@ -1474,7 +1518,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, spin_lock(&OCFS2_I(dir)->ip_lock); if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { spin_unlock(&OCFS2_I(dir)->ip_lock); - num_free_extents = ocfs2_num_free_extents(osb, dir, fe); + ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh); + num_free_extents = ocfs2_num_free_extents(osb, dir, &et); if (num_free_extents < 0) { status = num_free_extents; mlog_errno(status); @@ -1482,7 +1527,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, } if (!num_free_extents) { - status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac); + status = ocfs2_reserve_new_metadata(osb, el, &meta_ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -1497,7 +1542,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, goto bail; } - credits = ocfs2_calc_extend_credits(sb, fe, 1); + credits = ocfs2_calc_extend_credits(sb, el, 1); } else { spin_unlock(&OCFS2_I(dir)->ip_lock); credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; @@ -1563,8 +1608,7 @@ bail: if (meta_ac) ocfs2_free_alloc_context(meta_ac); - if (new_bh) - brelse(new_bh); + brelse(new_bh); mlog_exit(status); return status; @@ -1691,8 +1735,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, status = 0; bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; @@ -1751,7 +1794,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, *ret_de_bh = bh; bh = NULL; out: - if (bh) - brelse(bh); + brelse(bh); return ret; } diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index eae3d643a5e..ec684426034 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2024,8 +2024,7 @@ static int ocfs2_inode_lock_update(struct inode *inode, } else { /* Boo, we have to go to disk. */ /* read bh, cast, ocfs2_refresh_inode */ - status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno, - bh, OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, oi->ip_blkno, bh); if (status < 0) { mlog_errno(status); goto bail_refresh; @@ -2086,11 +2085,7 @@ static int ocfs2_assign_bh(struct inode *inode, return 0; } - status = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, - ret_bh, - OCFS2_BH_CACHED, - inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh); if (status < 0) mlog_errno(status); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index c58668a326f..2baedac5823 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -25,6 +25,7 @@ #include <linux/fs.h> #include <linux/init.h> #include <linux/types.h> +#include <linux/fiemap.h> #define MLOG_MASK_PREFIX ML_EXTENT_MAP #include <cluster/masklog.h> @@ -32,6 +33,7 @@ #include "ocfs2.h" #include "alloc.h" +#include "dlmglue.h" #include "extent_map.h" #include "inode.h" #include "super.h" @@ -282,6 +284,50 @@ out: kfree(new_emi); } +static int ocfs2_last_eb_is_empty(struct inode *inode, + struct ocfs2_dinode *di) +{ + int ret, next_free; + u64 last_eb_blk = le64_to_cpu(di->i_last_eb_blk); + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_list *el; + + ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { + ret = -EROFS; + OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); + goto out; + } + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + + next_free = le16_to_cpu(el->l_next_free_rec); + + if (next_free == 0 || + (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) + ret = 1; + +out: + brelse(eb_bh); + return ret; +} + /* * Return the 1st index within el which contains an extent start * larger than v_cluster. @@ -335,9 +381,9 @@ static int ocfs2_figure_hole_clusters(struct inode *inode, if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) goto no_more_extents; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + ret = ocfs2_read_block(inode, le64_to_cpu(eb->h_next_leaf_blk), - &next_eb_bh, OCFS2_BH_CACHED, inode); + &next_eb_bh); if (ret) { mlog_errno(ret); goto out; @@ -373,42 +419,28 @@ out: return ret; } -int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, - u32 *p_cluster, u32 *num_clusters, - unsigned int *extent_flags) +static int ocfs2_get_clusters_nocache(struct inode *inode, + struct buffer_head *di_bh, + u32 v_cluster, unsigned int *hole_len, + struct ocfs2_extent_rec *ret_rec, + unsigned int *is_last) { - int ret, i; - unsigned int flags = 0; - struct buffer_head *di_bh = NULL; - struct buffer_head *eb_bh = NULL; + int i, ret, tree_height, len; struct ocfs2_dinode *di; - struct ocfs2_extent_block *eb; + struct ocfs2_extent_block *uninitialized_var(eb); struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec; - u32 coff; - - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - ret = -ERANGE; - mlog_errno(ret); - goto out; - } - - ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster, - num_clusters, extent_flags); - if (ret == 0) - goto out; + struct buffer_head *eb_bh = NULL; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, - &di_bh, OCFS2_BH_CACHED, inode); - if (ret) { - mlog_errno(ret); - goto out; - } + memset(ret_rec, 0, sizeof(*ret_rec)); + if (is_last) + *is_last = 0; di = (struct ocfs2_dinode *) di_bh->b_data; el = &di->id2.i_list; + tree_height = le16_to_cpu(el->l_tree_depth); - if (el->l_tree_depth) { + if (tree_height > 0) { ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); if (ret) { mlog_errno(ret); @@ -431,46 +463,202 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, i = ocfs2_search_extent_list(el, v_cluster); if (i == -1) { /* - * A hole was found. Return some canned values that - * callers can key on. If asked for, num_clusters will - * be populated with the size of the hole. + * Holes can be larger than the maximum size of an + * extent, so we return their lengths in a seperate + * field. */ - *p_cluster = 0; - if (num_clusters) { + if (hole_len) { ret = ocfs2_figure_hole_clusters(inode, el, eb_bh, - v_cluster, - num_clusters); + v_cluster, &len); if (ret) { mlog_errno(ret); goto out; } + + *hole_len = len; + } + goto out_hole; + } + + rec = &el->l_recs[i]; + + BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); + + if (!rec->e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0)", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + *ret_rec = *rec; + + /* + * Checking for last extent is potentially expensive - we + * might have to look at the next leaf over to see if it's + * empty. + * + * The first two checks are to see whether the caller even + * cares for this information, and if the extent is at least + * the last in it's list. + * + * If those hold true, then the extent is last if any of the + * additional conditions hold true: + * - Extent list is in-inode + * - Extent list is right-most + * - Extent list is 2nd to rightmost, with empty right-most + */ + if (is_last) { + if (i == (le16_to_cpu(el->l_next_free_rec) - 1)) { + if (tree_height == 0) + *is_last = 1; + else if (eb->h_blkno == di->i_last_eb_blk) + *is_last = 1; + else if (eb->h_next_leaf_blk == di->i_last_eb_blk) { + ret = ocfs2_last_eb_is_empty(inode, di); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + if (ret == 1) + *is_last = 1; + } + } + } + +out_hole: + ret = 0; +out: + brelse(eb_bh); + return ret; +} + +static void ocfs2_relative_extent_offsets(struct super_block *sb, + u32 v_cluster, + struct ocfs2_extent_rec *rec, + u32 *p_cluster, u32 *num_clusters) + +{ + u32 coff = v_cluster - le32_to_cpu(rec->e_cpos); + + *p_cluster = ocfs2_blocks_to_clusters(sb, le64_to_cpu(rec->e_blkno)); + *p_cluster = *p_cluster + coff; + + if (num_clusters) + *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff; +} + +int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + struct ocfs2_extent_list *el) +{ + int ret = 0, i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec; + u32 coff; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "xattr leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + i = ocfs2_search_extent_list(el, v_cluster); + if (i == -1) { + ret = -EROFS; + mlog_errno(ret); + goto out; } else { rec = &el->l_recs[i]; - BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); if (!rec->e_blkno) { ocfs2_error(inode->i_sb, "Inode %lu has bad extent " - "record (%u, %u, 0)", inode->i_ino, + "record (%u, %u, 0) in xattr", inode->i_ino, le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); ret = -EROFS; goto out; } - coff = v_cluster - le32_to_cpu(rec->e_cpos); - *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, le64_to_cpu(rec->e_blkno)); *p_cluster = *p_cluster + coff; - if (num_clusters) *num_clusters = ocfs2_rec_clusters(el, rec) - coff; + } +out: + if (eb_bh) + brelse(eb_bh); + return ret; +} + +int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + unsigned int *extent_flags) +{ + int ret; + unsigned int uninitialized_var(hole_len), flags = 0; + struct buffer_head *di_bh = NULL; + struct ocfs2_extent_rec rec; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = -ERANGE; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster, + num_clusters, extent_flags); + if (ret == 0) + goto out; + + ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } - flags = rec->e_flags; + ret = ocfs2_get_clusters_nocache(inode, di_bh, v_cluster, &hole_len, + &rec, NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rec.e_blkno == 0ULL) { + /* + * A hole was found. Return some canned values that + * callers can key on. If asked for, num_clusters will + * be populated with the size of the hole. + */ + *p_cluster = 0; + if (num_clusters) { + *num_clusters = hole_len; + } + } else { + ocfs2_relative_extent_offsets(inode->i_sb, v_cluster, &rec, + p_cluster, num_clusters); + flags = rec.e_flags; - ocfs2_extent_map_insert_rec(inode, rec); + ocfs2_extent_map_insert_rec(inode, &rec); } if (extent_flags) @@ -478,7 +666,6 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, out: brelse(di_bh); - brelse(eb_bh); return ret; } @@ -521,3 +708,114 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, out: return ret; } + +static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, + struct fiemap_extent_info *fieinfo, + u64 map_start) +{ + int ret; + unsigned int id_count; + struct ocfs2_dinode *di; + u64 phys; + u32 flags = FIEMAP_EXTENT_DATA_INLINE|FIEMAP_EXTENT_LAST; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + + di = (struct ocfs2_dinode *)di_bh->b_data; + id_count = le16_to_cpu(di->id2.i_data.id_count); + + if (map_start < id_count) { + phys = oi->ip_blkno << inode->i_sb->s_blocksize_bits; + phys += offsetof(struct ocfs2_dinode, id2.i_data.id_data); + + ret = fiemap_fill_next_extent(fieinfo, 0, phys, id_count, + flags); + if (ret < 0) + return ret; + } + + return 0; +} + +#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) + +int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 map_start, u64 map_len) +{ + int ret, is_last; + u32 mapping_end, cpos; + unsigned int hole_size; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u64 len_bytes, phys_bytes, virt_bytes; + struct buffer_head *di_bh = NULL; + struct ocfs2_extent_rec rec; + + ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS); + if (ret) + return ret; + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + + /* + * Handle inline-data separately. + */ + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_fiemap_inline(inode, di_bh, fieinfo, map_start); + goto out_unlock; + } + + cpos = map_start >> osb->s_clustersize_bits; + mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, + map_start + map_len); + mapping_end -= cpos; + is_last = 0; + while (cpos < mapping_end && !is_last) { + u32 fe_flags; + + ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, + &hole_size, &rec, &is_last); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rec.e_blkno == 0ULL) { + cpos += hole_size; + continue; + } + + fe_flags = 0; + if (rec.e_flags & OCFS2_EXT_UNWRITTEN) + fe_flags |= FIEMAP_EXTENT_UNWRITTEN; + if (is_last) + fe_flags |= FIEMAP_EXTENT_LAST; + len_bytes = (u64)le16_to_cpu(rec.e_leaf_clusters) << osb->s_clustersize_bits; + phys_bytes = le64_to_cpu(rec.e_blkno) << osb->sb->s_blocksize_bits; + virt_bytes = (u64)le32_to_cpu(rec.e_cpos) << osb->s_clustersize_bits; + + ret = fiemap_fill_next_extent(fieinfo, virt_bytes, phys_bytes, + len_bytes, fe_flags); + if (ret) + break; + + cpos = le32_to_cpu(rec.e_cpos)+ le16_to_cpu(rec.e_leaf_clusters); + } + + if (ret > 0) + ret = 0; + +out_unlock: + brelse(di_bh); + + up_read(&OCFS2_I(inode)->ip_alloc_sem); + + ocfs2_inode_unlock(inode, 0); +out: + + return ret; +} diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index de91e3e41a2..1c4aa8b06f3 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -50,4 +50,11 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, u64 *ret_count, unsigned int *extent_flags); +int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 map_start, u64 map_len); + +int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + struct ocfs2_extent_list *el); + #endif /* _EXTENT_MAP_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ec2ed15c3da..8d3225a7807 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -55,6 +55,7 @@ #include "mmap.h" #include "suballoc.h" #include "super.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -184,7 +185,7 @@ static int ocfs2_sync_file(struct file *file, goto bail; journal = osb->journal->j_journal; - err = journal_force_commit(journal); + err = jbd2_journal_force_commit(journal); bail: mlog_exit(err); @@ -488,7 +489,7 @@ bail: } /* - * extend allocation only here. + * extend file allocation only here. * we'll update all the disk stuff, and oip->alloc_size * * expect stuff to be locked, a transaction started and enough data / @@ -497,189 +498,25 @@ bail: * Will return -EAGAIN, and a reason if a restart is needed. * If passed in, *reason will always be set, even in error. */ -int ocfs2_do_extend_allocation(struct ocfs2_super *osb, - struct inode *inode, - u32 *logical_offset, - u32 clusters_to_add, - int mark_unwritten, - struct buffer_head *fe_bh, - handle_t *handle, - struct ocfs2_alloc_context *data_ac, - struct ocfs2_alloc_context *meta_ac, - enum ocfs2_alloc_restarted *reason_ret) +int ocfs2_add_inode_data(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct buffer_head *fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) { - int status = 0; - int free_extents; - struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; - enum ocfs2_alloc_restarted reason = RESTART_NONE; - u32 bit_off, num_bits; - u64 block; - u8 flags = 0; - - BUG_ON(!clusters_to_add); - - if (mark_unwritten) - flags = OCFS2_EXT_UNWRITTEN; - - free_extents = ocfs2_num_free_extents(osb, inode, fe); - if (free_extents < 0) { - status = free_extents; - mlog_errno(status); - goto leave; - } - - /* there are two cases which could cause us to EAGAIN in the - * we-need-more-metadata case: - * 1) we haven't reserved *any* - * 2) we are so fragmented, we've needed to add metadata too - * many times. */ - if (!free_extents && !meta_ac) { - mlog(0, "we haven't reserved any metadata!\n"); - status = -EAGAIN; - reason = RESTART_META; - goto leave; - } else if ((!free_extents) - && (ocfs2_alloc_context_bits_left(meta_ac) - < ocfs2_extend_meta_needed(fe))) { - mlog(0, "filesystem is really fragmented...\n"); - status = -EAGAIN; - reason = RESTART_META; - goto leave; - } - - status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, - clusters_to_add, &bit_off, &num_bits); - if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); - goto leave; - } - - BUG_ON(num_bits > clusters_to_add); - - /* reserve our write early -- insert_extent may update the inode */ - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - block = ocfs2_clusters_to_blocks(osb->sb, bit_off); - mlog(0, "Allocating %u clusters at block %u for inode %llu\n", - num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_insert_extent(osb, handle, inode, fe_bh, - *logical_offset, block, num_bits, - flags, meta_ac); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - clusters_to_add -= num_bits; - *logical_offset += num_bits; - - if (clusters_to_add) { - mlog(0, "need to alloc once more, clusters = %u, wanted = " - "%u\n", fe->i_clusters, clusters_to_add); - status = -EAGAIN; - reason = RESTART_TRANS; - } - -leave: - mlog_exit(status); - if (reason_ret) - *reason_ret = reason; - return status; -} - -/* - * For a given allocation, determine which allocators will need to be - * accessed, and lock them, reserving the appropriate number of bits. - * - * Sparse file systems call this from ocfs2_write_begin_nolock() - * and ocfs2_allocate_unwritten_extents(). - * - * File systems which don't support holes call this from - * ocfs2_extend_allocation(). - */ -int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, - u32 clusters_to_add, u32 extents_to_split, - struct ocfs2_alloc_context **data_ac, - struct ocfs2_alloc_context **meta_ac) -{ - int ret = 0, num_free_extents; - unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - - *meta_ac = NULL; - if (data_ac) - *data_ac = NULL; - - BUG_ON(clusters_to_add != 0 && data_ac == NULL); - - mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " - "clusters_to_add = %u, extents_to_split = %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode), - le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split); - - num_free_extents = ocfs2_num_free_extents(osb, inode, di); - if (num_free_extents < 0) { - ret = num_free_extents; - mlog_errno(ret); - goto out; - } - - /* - * Sparse allocation file systems need to be more conservative - * with reserving room for expansion - the actual allocation - * happens while we've got a journal handle open so re-taking - * a cluster lock (because we ran out of room for another - * extent) will violate ordering rules. - * - * Most of the time we'll only be seeing this 1 cluster at a time - * anyway. - * - * Always lock for any unwritten extents - we might want to - * add blocks during a split. - */ - if (!num_free_extents || - (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { - ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto out; - } - } - - if (clusters_to_add == 0) - goto out; - - ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto out; - } - -out: - if (ret) { - if (*meta_ac) { - ocfs2_free_alloc_context(*meta_ac); - *meta_ac = NULL; - } + int ret; + struct ocfs2_extent_tree et; - /* - * We cannot have an error and a non null *data_ac. - */ - } + ocfs2_init_dinode_extent_tree(&et, inode, fe_bh); + ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset, + clusters_to_add, mark_unwritten, + &et, handle, + data_ac, meta_ac, reason_ret); return ret; } @@ -698,6 +535,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, struct ocfs2_alloc_context *meta_ac = NULL; enum ocfs2_alloc_restarted why; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_tree et; mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); @@ -707,8 +545,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, */ BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, - OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); if (status < 0) { mlog_errno(status); goto leave; @@ -724,14 +561,21 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, restart_all: BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); - status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac, - &meta_ac); + mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " + "clusters_to_add = %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters), + clusters_to_add); + ocfs2_init_dinode_extent_tree(&et, inode, bh); + status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, + &data_ac, &meta_ac); if (status) { mlog_errno(status); goto leave; } - credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); + credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list, + clusters_to_add); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); @@ -753,16 +597,16 @@ restarted_transaction: prev_clusters = OCFS2_I(inode)->ip_clusters; - status = ocfs2_do_extend_allocation(osb, - inode, - &logical_start, - clusters_to_add, - mark_unwritten, - bh, - handle, - data_ac, - meta_ac, - &why); + status = ocfs2_add_inode_data(osb, + inode, + &logical_start, + clusters_to_add, + mark_unwritten, + bh, + handle, + data_ac, + meta_ac, + &why); if ((status < 0) && (status != -EAGAIN)) { if (status != -ENOSPC) mlog_errno(status); @@ -789,7 +633,7 @@ restarted_transaction: mlog(0, "restarting transaction.\n"); /* TODO: This can be more intelligent. */ credits = ocfs2_calc_extend_credits(osb->sb, - fe, + &fe->id2.i_list, clusters_to_add); status = ocfs2_extend_trans(handle, credits); if (status < 0) { @@ -826,10 +670,8 @@ leave: restart_func = 0; goto restart_all; } - if (bh) { - brelse(bh); - bh = NULL; - } + brelse(bh); + bh = NULL; mlog_exit(status); return status; @@ -1096,9 +938,15 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) goto bail_unlock; } - if (i_size_read(inode) > attr->ia_size) + if (i_size_read(inode) > attr->ia_size) { + if (ocfs2_should_order_data(inode)) { + status = ocfs2_begin_ordered_truncate(inode, + attr->ia_size); + if (status) + goto bail_unlock; + } status = ocfs2_truncate_file(inode, bh, attr->ia_size); - else + } else status = ocfs2_extend_file(inode, bh, attr->ia_size); if (status < 0) { if (status != -ENOSPC) @@ -1140,8 +988,7 @@ bail_unlock_rw: if (size_change) ocfs2_rw_unlock(inode, 1); bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; @@ -1284,8 +1131,7 @@ static int ocfs2_write_remove_suid(struct inode *inode) struct buffer_head *bh = NULL; struct ocfs2_inode_info *oi = OCFS2_I(inode); - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, oi->ip_blkno, &bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -1311,9 +1157,8 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode, struct buffer_head *di_bh = NULL; if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, &di_bh, - OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, + &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -1394,8 +1239,11 @@ static int __ocfs2_remove_inode_range(struct inode *inode, handle_t *handle; struct ocfs2_alloc_context *meta_ac = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_tree et; - ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac); + ocfs2_init_dinode_extent_tree(&et, inode, di_bh); + + ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); if (ret) { mlog_errno(ret); return ret; @@ -1425,7 +1273,7 @@ static int __ocfs2_remove_inode_range(struct inode *inode, goto out; } - ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac, + ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, dealloc); if (ret) { mlog_errno(ret); @@ -2040,7 +1888,7 @@ out_dio: */ if (old_size != i_size_read(inode) || old_clusters != OCFS2_I(inode)->ip_clusters) { - ret = journal_force_commit(osb->journal->j_journal); + ret = jbd2_journal_force_commit(osb->journal->j_journal); if (ret < 0) written = ret; } @@ -2227,7 +2075,12 @@ const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, .fallocate = ocfs2_fallocate, + .fiemap = ocfs2_fiemap, }; const struct inode_operations ocfs2_special_file_iops = { @@ -2236,6 +2089,10 @@ const struct inode_operations ocfs2_special_file_iops = { .permission = ocfs2_permission, }; +/* + * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with + * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! + */ const struct file_operations ocfs2_fops = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -2250,6 +2107,7 @@ const struct file_operations ocfs2_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, #endif + .lock = ocfs2_lock, .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, .splice_write = ocfs2_file_splice_write, @@ -2266,5 +2124,51 @@ const struct file_operations ocfs2_dops = { #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, #endif + .lock = ocfs2_lock, + .flock = ocfs2_flock, +}; + +/* + * POSIX-lockless variants of our file_operations. + * + * These will be used if the underlying cluster stack does not support + * posix file locking, if the user passes the "localflocks" mount + * option, or if we have a local-only fs. + * + * ocfs2_flock is in here because all stacks handle UNIX file locks, + * so we still want it in the case of no stack support for + * plocks. Internally, it will do the right thing when asked to ignore + * the cluster. + */ +const struct file_operations ocfs2_fops_no_plocks = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .mmap = ocfs2_mmap, + .fsync = ocfs2_sync_file, + .release = ocfs2_file_release, + .open = ocfs2_file_open, + .aio_read = ocfs2_file_aio_read, + .aio_write = ocfs2_file_aio_write, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .flock = ocfs2_flock, + .splice_read = ocfs2_file_splice_read, + .splice_write = ocfs2_file_splice_write, +}; + +const struct file_operations ocfs2_dops_no_plocks = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ocfs2_readdir, + .fsync = ocfs2_sync_file, + .release = ocfs2_dir_release, + .open = ocfs2_dir_open, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif .flock = ocfs2_flock, }; diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 1e27b4d017e..e92382cbca5 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -28,9 +28,12 @@ extern const struct file_operations ocfs2_fops; extern const struct file_operations ocfs2_dops; +extern const struct file_operations ocfs2_fops_no_plocks; +extern const struct file_operations ocfs2_dops_no_plocks; extern const struct inode_operations ocfs2_file_iops; extern const struct inode_operations ocfs2_special_file_iops; struct ocfs2_alloc_context; +enum ocfs2_alloc_restarted; struct ocfs2_file_private { struct file *fp_file; @@ -38,27 +41,18 @@ struct ocfs2_file_private { struct ocfs2_lock_res fp_flock; }; -enum ocfs2_alloc_restarted { - RESTART_NONE = 0, - RESTART_TRANS, - RESTART_META -}; -int ocfs2_do_extend_allocation(struct ocfs2_super *osb, - struct inode *inode, - u32 *logical_offset, - u32 clusters_to_add, - int mark_unwritten, - struct buffer_head *fe_bh, - handle_t *handle, - struct ocfs2_alloc_context *data_ac, - struct ocfs2_alloc_context *meta_ac, - enum ocfs2_alloc_restarted *reason_ret); +int ocfs2_add_inode_data(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct buffer_head *fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret); int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to); -int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, - u32 clusters_to_add, u32 extents_to_split, - struct ocfs2_alloc_context **data_ac, - struct ocfs2_alloc_context **meta_ac); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 7e9e4c79aec..4903688f72a 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -49,6 +49,7 @@ #include "symlink.h" #include "sysfile.h" #include "uptodate.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -219,6 +220,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, struct super_block *sb; struct ocfs2_super *osb; int status = -EINVAL; + int use_plocks = 1; mlog_entry("(0x%p, size:%llu)\n", inode, (unsigned long long)le64_to_cpu(fe->i_size)); @@ -226,6 +228,10 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, sb = inode->i_sb; osb = OCFS2_SB(sb); + if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || + ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) + use_plocks = 0; + /* this means that read_inode cannot create a superblock inode * today. change if needed. */ if (!OCFS2_IS_VALID_DINODE(fe) || @@ -295,13 +301,19 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, switch (inode->i_mode & S_IFMT) { case S_IFREG: - inode->i_fop = &ocfs2_fops; + if (use_plocks) + inode->i_fop = &ocfs2_fops; + else + inode->i_fop = &ocfs2_fops_no_plocks; inode->i_op = &ocfs2_file_iops; i_size_write(inode, le64_to_cpu(fe->i_size)); break; case S_IFDIR: inode->i_op = &ocfs2_dir_iops; - inode->i_fop = &ocfs2_dops; + if (use_plocks) + inode->i_fop = &ocfs2_dops; + else + inode->i_fop = &ocfs2_dops_no_plocks; i_size_write(inode, le64_to_cpu(fe->i_size)); break; case S_IFLNK: @@ -448,8 +460,11 @@ static int ocfs2_read_locked_inode(struct inode *inode, } } - status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, - can_lock ? inode : NULL); + if (can_lock) + status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh, + OCFS2_BH_IGNORE_CACHE); + else + status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); if (status < 0) { mlog_errno(status); goto bail; @@ -522,6 +537,9 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, * data and fast symlinks. */ if (fe->i_clusters) { + if (ocfs2_should_order_data(inode)) + ocfs2_begin_ordered_truncate(inode, 0); + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); @@ -730,6 +748,13 @@ static int ocfs2_wipe_inode(struct inode *inode, goto bail_unlock_dir; } + /*Free extended attribute resources associated with this inode.*/ + status = ocfs2_xattr_remove(inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_dir; + } + status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, orphan_dir_bh); if (status < 0) @@ -1081,6 +1106,8 @@ void ocfs2_clear_inode(struct inode *inode) oi->ip_last_trans = 0; oi->ip_dir_start_lookup = 0; oi->ip_blkno = 0ULL; + jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, + &oi->ip_jinode); bail: mlog_exit_void(); @@ -1107,58 +1134,6 @@ void ocfs2_drop_inode(struct inode *inode) } /* - * TODO: this should probably be merged into ocfs2_get_block - * - * However, you now need to pay attention to the cont_prepare_write() - * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much - * expects never to extend). - */ -struct buffer_head *ocfs2_bread(struct inode *inode, - int block, int *err, int reada) -{ - struct buffer_head *bh = NULL; - int tmperr; - u64 p_blkno; - int readflags = OCFS2_BH_CACHED; - - if (reada) - readflags |= OCFS2_BH_READAHEAD; - - if (((u64)block << inode->i_sb->s_blocksize_bits) >= - i_size_read(inode)) { - BUG_ON(!reada); - return NULL; - } - - down_read(&OCFS2_I(inode)->ip_alloc_sem); - tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, - NULL); - up_read(&OCFS2_I(inode)->ip_alloc_sem); - if (tmperr < 0) { - mlog_errno(tmperr); - goto fail; - } - - tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh, - readflags, inode); - if (tmperr < 0) - goto fail; - - tmperr = 0; - - *err = 0; - return bh; - -fail: - if (bh) { - brelse(bh); - bh = NULL; - } - *err = -EIO; - return NULL; -} - -/* * This is called from our getattr. */ int ocfs2_inode_revalidate(struct dentry *dentry) diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 390a85596aa..2f37af9bcc4 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -40,6 +40,9 @@ struct ocfs2_inode_info /* protects allocation changes on this inode. */ struct rw_semaphore ip_alloc_sem; + /* protects extended attribute changes on this inode */ + struct rw_semaphore ip_xattr_sem; + /* These fields are protected by ip_lock */ spinlock_t ip_lock; u32 ip_open_count; @@ -68,6 +71,7 @@ struct ocfs2_inode_info struct ocfs2_extent_map ip_extent_map; struct inode vfs_inode; + struct jbd2_inode ip_jinode; }; /* @@ -113,8 +117,6 @@ extern struct kmem_cache *ocfs2_inode_cache; extern const struct address_space_operations ocfs2_aops; -struct buffer_head *ocfs2_bread(struct inode *inode, int block, - int *err, int reada); void ocfs2_clear_inode(struct inode *inode); void ocfs2_delete_inode(struct inode *inode); void ocfs2_drop_inode(struct inode *inode); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 7b142f0ce99..9fcd36dcc9a 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -102,8 +102,7 @@ bail_unlock: bail: mutex_unlock(&inode->i_mutex); - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 7a37240f7a3..81e40677eec 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -215,9 +215,9 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb) goto finally; } - journal_lock_updates(journal->j_journal); - status = journal_flush(journal->j_journal); - journal_unlock_updates(journal->j_journal); + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); if (status < 0) { up_write(&journal->j_trans_barrier); mlog_errno(status); @@ -264,7 +264,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) down_read(&osb->journal->j_trans_barrier); - handle = journal_start(journal, max_buffs); + handle = jbd2_journal_start(journal, max_buffs); if (IS_ERR(handle)) { up_read(&osb->journal->j_trans_barrier); @@ -290,7 +290,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb, BUG_ON(!handle); - ret = journal_stop(handle); + ret = jbd2_journal_stop(handle); if (ret < 0) mlog_errno(ret); @@ -304,7 +304,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb, * transaction. extend_trans will either extend the current handle by * nblocks, or commit it and start a new one with nblocks credits. * - * This might call journal_restart() which will commit dirty buffers + * This might call jbd2_journal_restart() which will commit dirty buffers * and then restart the transaction. Before calling * ocfs2_extend_trans(), any changed blocks should have been * dirtied. After calling it, all blocks which need to be changed must @@ -332,7 +332,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) #ifdef CONFIG_OCFS2_DEBUG_FS status = 1; #else - status = journal_extend(handle, nblocks); + status = jbd2_journal_extend(handle, nblocks); if (status < 0) { mlog_errno(status); goto bail; @@ -340,8 +340,10 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) #endif if (status > 0) { - mlog(0, "journal_extend failed, trying journal_restart\n"); - status = journal_restart(handle, nblocks); + mlog(0, + "jbd2_journal_extend failed, trying " + "jbd2_journal_restart\n"); + status = jbd2_journal_restart(handle, nblocks); if (status < 0) { mlog_errno(status); goto bail; @@ -393,11 +395,11 @@ int ocfs2_journal_access(handle_t *handle, switch (type) { case OCFS2_JOURNAL_ACCESS_CREATE: case OCFS2_JOURNAL_ACCESS_WRITE: - status = journal_get_write_access(handle, bh); + status = jbd2_journal_get_write_access(handle, bh); break; case OCFS2_JOURNAL_ACCESS_UNDO: - status = journal_get_undo_access(handle, bh); + status = jbd2_journal_get_undo_access(handle, bh); break; default: @@ -422,7 +424,7 @@ int ocfs2_journal_dirty(handle_t *handle, mlog_entry("(bh->b_blocknr=%llu)\n", (unsigned long long)bh->b_blocknr); - status = journal_dirty_metadata(handle, bh); + status = jbd2_journal_dirty_metadata(handle, bh); if (status < 0) mlog(ML_ERROR, "Could not dirty metadata buffer. " "(bh->b_blocknr=%llu)\n", @@ -432,6 +434,7 @@ int ocfs2_journal_dirty(handle_t *handle, return status; } +#ifdef CONFIG_OCFS2_COMPAT_JBD int ocfs2_journal_dirty_data(handle_t *handle, struct buffer_head *bh) { @@ -443,8 +446,9 @@ int ocfs2_journal_dirty_data(handle_t *handle, return err; } +#endif -#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE) +#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) void ocfs2_set_journal_params(struct ocfs2_super *osb) { @@ -457,9 +461,9 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb) spin_lock(&journal->j_state_lock); journal->j_commit_interval = commit_interval; if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) - journal->j_flags |= JFS_BARRIER; + journal->j_flags |= JBD2_BARRIER; else - journal->j_flags &= ~JFS_BARRIER; + journal->j_flags &= ~JBD2_BARRIER; spin_unlock(&journal->j_state_lock); } @@ -524,14 +528,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters); /* call the kernels journal init function now */ - j_journal = journal_init_inode(inode); + j_journal = jbd2_journal_init_inode(inode); if (j_journal == NULL) { mlog(ML_ERROR, "Linux journal layer error\n"); status = -EINVAL; goto done; } - mlog(0, "Returned from journal_init_inode\n"); + mlog(0, "Returned from jbd2_journal_init_inode\n"); mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen); *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & @@ -550,8 +554,7 @@ done: if (status < 0) { if (inode_lock) ocfs2_inode_unlock(inode, 1); - if (bh != NULL) - brelse(bh); + brelse(bh); if (inode) { OCFS2_I(inode)->ip_open_count--; iput(inode); @@ -639,7 +642,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) if (journal->j_state != OCFS2_JOURNAL_LOADED) goto done; - /* need to inc inode use count as journal_destroy will iput. */ + /* need to inc inode use count - jbd2_journal_destroy will iput. */ if (!igrab(inode)) BUG(); @@ -668,9 +671,9 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); if (ocfs2_mount_local(osb)) { - journal_lock_updates(journal->j_journal); - status = journal_flush(journal->j_journal); - journal_unlock_updates(journal->j_journal); + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); if (status < 0) mlog_errno(status); } @@ -686,7 +689,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) } /* Shutdown the kernel journal system */ - journal_destroy(journal->j_journal); + jbd2_journal_destroy(journal->j_journal); OCFS2_I(inode)->ip_open_count--; @@ -711,15 +714,15 @@ static void ocfs2_clear_journal_error(struct super_block *sb, { int olderr; - olderr = journal_errno(journal); + olderr = jbd2_journal_errno(journal); if (olderr) { mlog(ML_ERROR, "File system error %d recorded in " "journal %u.\n", olderr, slot); mlog(ML_ERROR, "File system on device %s needs checking.\n", sb->s_id); - journal_ack_err(journal); - journal_clear_err(journal); + jbd2_journal_ack_err(journal); + jbd2_journal_clear_err(journal); } } @@ -734,7 +737,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) osb = journal->j_osb; - status = journal_load(journal->j_journal); + status = jbd2_journal_load(journal->j_journal); if (status < 0) { mlog(ML_ERROR, "Failed to load journal!\n"); goto done; @@ -778,7 +781,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) BUG_ON(!journal); - status = journal_wipe(journal->j_journal, full); + status = jbd2_journal_wipe(journal->j_journal, full); if (status < 0) { mlog_errno(status); goto bail; @@ -847,9 +850,8 @@ static int ocfs2_force_read_journal(struct inode *inode) /* We are reading journal data which should not * be put in the uptodate cache */ - status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb), - p_blkno, p_blocks, bhs, 0, - NULL); + status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb), + p_blkno, p_blocks, bhs); if (status < 0) { mlog_errno(status); goto bail; @@ -865,8 +867,7 @@ static int ocfs2_force_read_journal(struct inode *inode) bail: for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) - if (bhs[i]) - brelse(bhs[i]); + brelse(bhs[i]); mlog_exit(status); return status; } @@ -1133,7 +1134,8 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb, } SET_INODE_JOURNAL(inode); - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, bh, 0, inode); + status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh, + OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; @@ -1229,19 +1231,19 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, } mlog(0, "calling journal_init_inode\n"); - journal = journal_init_inode(inode); + journal = jbd2_journal_init_inode(inode); if (journal == NULL) { mlog(ML_ERROR, "Linux journal layer error\n"); status = -EIO; goto done; } - status = journal_load(journal); + status = jbd2_journal_load(journal); if (status < 0) { mlog_errno(status); if (!igrab(inode)) BUG(); - journal_destroy(journal); + jbd2_journal_destroy(journal); goto done; } @@ -1249,9 +1251,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, /* wipe the journal */ mlog(0, "flushing the journal.\n"); - journal_lock_updates(journal); - status = journal_flush(journal); - journal_unlock_updates(journal); + jbd2_journal_lock_updates(journal); + status = jbd2_journal_flush(journal); + jbd2_journal_unlock_updates(journal); if (status < 0) mlog_errno(status); @@ -1272,7 +1274,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, if (!igrab(inode)) BUG(); - journal_destroy(journal); + jbd2_journal_destroy(journal); done: /* drop the lock on this nodes journal */ @@ -1282,8 +1284,7 @@ done: if (inode) iput(inode); - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; @@ -1418,13 +1419,13 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) { unsigned int node_num; int status, i; + u32 gen; struct buffer_head *bh = NULL; struct ocfs2_dinode *di; /* This is called with the super block cluster lock, so we * know that the slot map can't change underneath us. */ - spin_lock(&osb->osb_lock); for (i = 0; i < osb->max_slots; i++) { /* Read journal inode to get the recovery generation */ status = ocfs2_read_journal_inode(osb, i, &bh, NULL); @@ -1433,23 +1434,31 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) goto bail; } di = (struct ocfs2_dinode *)bh->b_data; - osb->slot_recovery_generations[i] = - ocfs2_get_recovery_generation(di); + gen = ocfs2_get_recovery_generation(di); brelse(bh); bh = NULL; + spin_lock(&osb->osb_lock); + osb->slot_recovery_generations[i] = gen; + mlog(0, "Slot %u recovery generation is %u\n", i, osb->slot_recovery_generations[i]); - if (i == osb->slot_num) + if (i == osb->slot_num) { + spin_unlock(&osb->osb_lock); continue; + } status = ocfs2_slot_to_node_num_locked(osb, i, &node_num); - if (status == -ENOENT) + if (status == -ENOENT) { + spin_unlock(&osb->osb_lock); continue; + } - if (__ocfs2_recovery_map_test(osb, node_num)) + if (__ocfs2_recovery_map_test(osb, node_num)) { + spin_unlock(&osb->osb_lock); continue; + } spin_unlock(&osb->osb_lock); /* Ok, we have a slot occupied by another node which @@ -1465,10 +1474,7 @@ int ocfs2_mark_dead_nodes(struct ocfs2_super *osb) mlog_errno(status); goto bail; } - - spin_lock(&osb->osb_lock); } - spin_unlock(&osb->osb_lock); status = 0; bail: diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 2178ebffa05..d4d14e9a3ce 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -27,7 +27,12 @@ #define OCFS2_JOURNAL_H #include <linux/fs.h> -#include <linux/jbd.h> +#ifndef CONFIG_OCFS2_COMPAT_JBD +# include <linux/jbd2.h> +#else +# include <linux/jbd.h> +# include "ocfs2_jbd_compat.h" +#endif enum ocfs2_journal_state { OCFS2_JOURNAL_FREE = 0, @@ -215,8 +220,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode) * buffer. Will have to call ocfs2_journal_dirty once * we've actually dirtied it. Type is one of . or . * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. - * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before - * the current handle commits. + * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before + * the current handle commits. */ /* You must always start_trans with a number of buffs > 0, but it's @@ -268,8 +273,10 @@ int ocfs2_journal_access(handle_t *handle, */ int ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh); +#ifdef CONFIG_OCFS2_COMPAT_JBD int ocfs2_journal_dirty_data(handle_t *handle, struct buffer_head *bh); +#endif /* * Credit Macros: @@ -283,6 +290,9 @@ int ocfs2_journal_dirty_data(handle_t *handle, /* simple file updates like chmod, etc. */ #define OCFS2_INODE_UPDATE_CREDITS 1 +/* extended attribute block update */ +#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 + /* group extend. inode update and last group update. */ #define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) @@ -340,11 +350,23 @@ int ocfs2_journal_dirty_data(handle_t *handle, #define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \ + OCFS2_UNLINK_CREDITS) +/* global bitmap dinode, group desc., relinked group, + * suballocator dinode, group desc., relinked group, + * dinode, xattr block */ +#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \ + + OCFS2_INODE_UPDATE_CREDITS \ + + OCFS2_XATTR_BLOCK_UPDATE_CREDITS) + +/* + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ static inline int ocfs2_calc_extend_credits(struct super_block *sb, - struct ocfs2_dinode *fe, + struct ocfs2_extent_list *root_el, u32 bits_wanted) { - int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks; + int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks; /* bitmap dinode, group desc. + relinked group. */ bitmap_blocks = OCFS2_SUBALLOC_ALLOC; @@ -355,16 +377,16 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb, * however many metadata chunks needed * a remaining suballoc * alloc. */ sysfile_bitmap_blocks = 1 + - (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe); + (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el); /* this does not include *new* metadata blocks, which are - * accounted for in sysfile_bitmap_blocks. fe + + * accounted for in sysfile_bitmap_blocks. root_el + * prev. last_eb_blk + blocks along edge of tree. * calc_symlink_credits passes because we just need 1 * credit for the dinode there. */ - dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth); + extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); - return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks; + return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks; } static inline int ocfs2_calc_symlink_credits(struct super_block *sb) @@ -415,4 +437,16 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, return credits; } +static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) +{ + return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode); +} + +static inline int ocfs2_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode, + new_size); +} + #endif /* OCFS2_JOURNAL_H */ diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 28e492e4ec8..687b28713c3 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -28,6 +28,7 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/bitops.h> +#include <linux/debugfs.h> #define MLOG_MASK_PREFIX ML_DISK_ALLOC #include <cluster/masklog.h> @@ -47,8 +48,6 @@ #define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) -static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb); - static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, @@ -75,24 +74,129 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, struct inode *local_alloc_inode); -static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) +#ifdef CONFIG_OCFS2_FS_STATS + +static int ocfs2_la_debug_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +#define LA_DEBUG_BUF_SZ PAGE_CACHE_SIZE +#define LA_DEBUG_VER 1 +static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + static DEFINE_MUTEX(la_debug_mutex); + struct ocfs2_super *osb = file->private_data; + int written, ret; + char *buf = osb->local_alloc_debug_buf; + + mutex_lock(&la_debug_mutex); + memset(buf, 0, LA_DEBUG_BUF_SZ); + + written = snprintf(buf, LA_DEBUG_BUF_SZ, + "0x%x\t0x%llx\t%u\t%u\t0x%x\n", + LA_DEBUG_VER, + (unsigned long long)osb->la_last_gd, + osb->local_alloc_default_bits, + osb->local_alloc_bits, osb->local_alloc_state); + + ret = simple_read_from_buffer(userbuf, count, ppos, buf, written); + + mutex_unlock(&la_debug_mutex); + return ret; +} + +static const struct file_operations ocfs2_la_debug_fops = { + .open = ocfs2_la_debug_open, + .read = ocfs2_la_debug_read, +}; + +static void ocfs2_init_la_debug(struct ocfs2_super *osb) +{ + osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS); + if (!osb->local_alloc_debug_buf) + return; + + osb->local_alloc_debug = debugfs_create_file("local_alloc_stats", + S_IFREG|S_IRUSR, + osb->osb_debug_root, + osb, + &ocfs2_la_debug_fops); + if (!osb->local_alloc_debug) { + kfree(osb->local_alloc_debug_buf); + osb->local_alloc_debug_buf = NULL; + } +} + +static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb) +{ + if (osb->local_alloc_debug) + debugfs_remove(osb->local_alloc_debug); + + if (osb->local_alloc_debug_buf) + kfree(osb->local_alloc_debug_buf); + + osb->local_alloc_debug_buf = NULL; + osb->local_alloc_debug = NULL; +} +#else /* CONFIG_OCFS2_FS_STATS */ +static void ocfs2_init_la_debug(struct ocfs2_super *osb) +{ + return; +} +static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb) +{ + return; +} +#endif + +static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) { - BUG_ON(osb->s_clustersize_bits > 20); + return (osb->local_alloc_state == OCFS2_LA_THROTTLED || + osb->local_alloc_state == OCFS2_LA_ENABLED); +} - /* Size local alloc windows by the megabyte */ - return osb->local_alloc_size << (20 - osb->s_clustersize_bits); +void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, + unsigned int num_clusters) +{ + spin_lock(&osb->osb_lock); + if (osb->local_alloc_state == OCFS2_LA_DISABLED || + osb->local_alloc_state == OCFS2_LA_THROTTLED) + if (num_clusters >= osb->local_alloc_default_bits) { + cancel_delayed_work(&osb->la_enable_wq); + osb->local_alloc_state = OCFS2_LA_ENABLED; + } + spin_unlock(&osb->osb_lock); +} + +void ocfs2_la_enable_worker(struct work_struct *work) +{ + struct ocfs2_super *osb = + container_of(work, struct ocfs2_super, + la_enable_wq.work); + spin_lock(&osb->osb_lock); + osb->local_alloc_state = OCFS2_LA_ENABLED; + spin_unlock(&osb->osb_lock); } /* * Tell us whether a given allocation should use the local alloc * file. Otherwise, it has to go to the main bitmap. + * + * This function does semi-dirty reads of local alloc size and state! + * This is ok however, as the values are re-checked once under mutex. */ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) { - int la_bits = ocfs2_local_alloc_window_bits(osb); int ret = 0; + int la_bits; + + spin_lock(&osb->osb_lock); + la_bits = osb->local_alloc_bits; - if (osb->local_alloc_state != OCFS2_LA_ENABLED) + if (!ocfs2_la_state_enabled(osb)) goto bail; /* la_bits should be at least twice the size (in clusters) of @@ -106,6 +210,7 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) bail: mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n", osb->local_alloc_state, (unsigned long long)bits, la_bits, ret); + spin_unlock(&osb->osb_lock); return ret; } @@ -120,14 +225,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) mlog_entry_void(); - if (osb->local_alloc_size == 0) + ocfs2_init_la_debug(osb); + + if (osb->local_alloc_bits == 0) goto bail; - if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) { + if (osb->local_alloc_bits >= osb->bitmap_cpg) { mlog(ML_NOTICE, "Requested local alloc window %d is larger " "than max possible %u. Using defaults.\n", - ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1)); - osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; + osb->local_alloc_bits, (osb->bitmap_cpg - 1)); + osb->local_alloc_bits = + ocfs2_megabytes_to_clusters(osb->sb, + OCFS2_DEFAULT_LOCAL_ALLOC_SIZE); } /* read the alloc off disk */ @@ -139,8 +248,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) goto bail; } - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, - &alloc_bh, 0, inode); + status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, + &alloc_bh, OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; @@ -185,13 +294,14 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) bail: if (status < 0) - if (alloc_bh) - brelse(alloc_bh); + brelse(alloc_bh); if (inode) iput(inode); - mlog(0, "Local alloc window bits = %d\n", - ocfs2_local_alloc_window_bits(osb)); + if (status < 0) + ocfs2_shutdown_la_debug(osb); + + mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits); mlog_exit(status); return status; @@ -217,6 +327,11 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) mlog_entry_void(); + cancel_delayed_work(&osb->la_enable_wq); + flush_workqueue(ocfs2_wq); + + ocfs2_shutdown_la_debug(osb); + if (osb->local_alloc_state == OCFS2_LA_UNUSED) goto out; @@ -295,8 +410,7 @@ out_commit: ocfs2_commit_trans(osb, handle); out_unlock: - if (main_bm_bh) - brelse(main_bm_bh); + brelse(main_bm_bh); ocfs2_inode_unlock(main_bm_inode, 1); @@ -345,8 +459,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, mutex_lock(&inode->i_mutex); - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, - &alloc_bh, 0, inode); + status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, + &alloc_bh, OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; @@ -372,8 +486,7 @@ bail: *alloc_copy = NULL; } - if (alloc_bh) - brelse(alloc_bh); + brelse(alloc_bh); if (inode) { mutex_unlock(&inode->i_mutex); @@ -441,8 +554,7 @@ out_unlock: out_mutex: mutex_unlock(&main_bm_inode->i_mutex); - if (main_bm_bh) - brelse(main_bm_bh); + brelse(main_bm_bh); iput(main_bm_inode); @@ -453,8 +565,48 @@ out: return status; } +/* Check to see if the local alloc window is within ac->ac_max_block */ +static int ocfs2_local_alloc_in_range(struct inode *inode, + struct ocfs2_alloc_context *ac, + u32 bits_wanted) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *alloc; + struct ocfs2_local_alloc *la; + int start; + u64 block_off; + + if (!ac->ac_max_block) + return 1; + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); + if (start == -1) { + mlog_errno(-ENOSPC); + return 0; + } + + /* + * Converting (bm_off + start + bits_wanted) to blocks gives us + * the blkno just past our actual allocation. This is perfect + * to compare with ac_max_block. + */ + block_off = ocfs2_clusters_to_blocks(inode->i_sb, + le32_to_cpu(la->la_bm_off) + + start + bits_wanted); + mlog(0, "Checking %llu against %llu\n", + (unsigned long long)block_off, + (unsigned long long)ac->ac_max_block); + if (block_off > ac->ac_max_block) + return 0; + + return 1; +} + /* - * make sure we've got at least bitswanted contiguous bits in the + * make sure we've got at least bits_wanted contiguous bits in the * local alloc. You lose them when you drop i_mutex. * * We will add ourselves to the transaction passed in, but may start @@ -485,16 +637,18 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, mutex_lock(&local_alloc_inode->i_mutex); - if (osb->local_alloc_state != OCFS2_LA_ENABLED) { - status = -ENOSPC; - goto bail; - } - - if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) { - mlog(0, "Asking for more than my max window size!\n"); + /* + * We must double check state and allocator bits because + * another process may have changed them while holding i_mutex. + */ + spin_lock(&osb->osb_lock); + if (!ocfs2_la_state_enabled(osb) || + (bits_wanted > osb->local_alloc_bits)) { + spin_unlock(&osb->osb_lock); status = -ENOSPC; goto bail; } + spin_unlock(&osb->osb_lock); alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; @@ -522,6 +676,36 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, mlog_errno(status); goto bail; } + + /* + * Under certain conditions, the window slide code + * might have reduced the number of bits available or + * disabled the the local alloc entirely. Re-check + * here and return -ENOSPC if necessary. + */ + status = -ENOSPC; + if (!ocfs2_la_state_enabled(osb)) + goto bail; + + free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) - + le32_to_cpu(alloc->id1.bitmap1.i_used); + if (bits_wanted > free_bits) + goto bail; + } + + if (ac->ac_max_block) + mlog(0, "Calling in_range for max block %llu\n", + (unsigned long long)ac->ac_max_block); + + if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac, + bits_wanted)) { + /* + * The window is outside ac->ac_max_block. + * This errno tells the caller to keep localalloc enabled + * but to get the allocation from the main bitmap. + */ + status = -EFBIG; + goto bail; } ac->ac_inode = local_alloc_inode; @@ -789,6 +973,85 @@ bail: return status; } +enum ocfs2_la_event { + OCFS2_LA_EVENT_SLIDE, /* Normal window slide. */ + OCFS2_LA_EVENT_FRAGMENTED, /* The global bitmap has + * enough bits theoretically + * free, but a contiguous + * allocation could not be + * found. */ + OCFS2_LA_EVENT_ENOSPC, /* Global bitmap doesn't have + * enough bits free to satisfy + * our request. */ +}; +#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ) +/* + * Given an event, calculate the size of our next local alloc window. + * + * This should always be called under i_mutex of the local alloc inode + * so that local alloc disabling doesn't race with processes trying to + * use the allocator. + * + * Returns the state which the local alloc was left in. This value can + * be ignored by some paths. + */ +static int ocfs2_recalc_la_window(struct ocfs2_super *osb, + enum ocfs2_la_event event) +{ + unsigned int bits; + int state; + + spin_lock(&osb->osb_lock); + if (osb->local_alloc_state == OCFS2_LA_DISABLED) { + WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED); + goto out_unlock; + } + + /* + * ENOSPC and fragmentation are treated similarly for now. + */ + if (event == OCFS2_LA_EVENT_ENOSPC || + event == OCFS2_LA_EVENT_FRAGMENTED) { + /* + * We ran out of contiguous space in the primary + * bitmap. Drastically reduce the number of bits used + * by local alloc until we have to disable it. + */ + bits = osb->local_alloc_bits >> 1; + if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) { + /* + * By setting state to THROTTLED, we'll keep + * the number of local alloc bits used down + * until an event occurs which would give us + * reason to assume the bitmap situation might + * have changed. + */ + osb->local_alloc_state = OCFS2_LA_THROTTLED; + osb->local_alloc_bits = bits; + } else { + osb->local_alloc_state = OCFS2_LA_DISABLED; + } + queue_delayed_work(ocfs2_wq, &osb->la_enable_wq, + OCFS2_LA_ENABLE_INTERVAL); + goto out_unlock; + } + + /* + * Don't increase the size of the local alloc window until we + * know we might be able to fulfill the request. Otherwise, we + * risk bouncing around the global bitmap during periods of + * low space. + */ + if (osb->local_alloc_state != OCFS2_LA_THROTTLED) + osb->local_alloc_bits = osb->local_alloc_default_bits; + +out_unlock: + state = osb->local_alloc_state; + spin_unlock(&osb->osb_lock); + + return state; +} + static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, struct ocfs2_alloc_context **ac, struct inode **bitmap_inode, @@ -803,12 +1066,21 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, goto bail; } - (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb); +retry_enospc: + (*ac)->ac_bits_wanted = osb->local_alloc_bits; status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + if (status == -ENOSPC) { + if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == + OCFS2_LA_DISABLED) + goto bail; + + ocfs2_free_ac_resource(*ac); + memset(*ac, 0, sizeof(struct ocfs2_alloc_context)); + goto retry_enospc; + } if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); + mlog_errno(status); goto bail; } @@ -849,7 +1121,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, "one\n"); mlog(0, "Allocating %u clusters for a new window.\n", - ocfs2_local_alloc_window_bits(osb)); + osb->local_alloc_bits); /* Instruct the allocation code to try the most recently used * cluster group. We'll re-record the group used this pass @@ -859,9 +1131,36 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, /* we used the generic suballoc reserve function, but we set * everything up nicely, so there's no reason why we can't use * the more specific cluster api to claim bits. */ - status = ocfs2_claim_clusters(osb, handle, ac, - ocfs2_local_alloc_window_bits(osb), + status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits, &cluster_off, &cluster_count); + if (status == -ENOSPC) { +retry_enospc: + /* + * Note: We could also try syncing the journal here to + * allow use of any free bits which the current + * transaction can't give us access to. --Mark + */ + if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) == + OCFS2_LA_DISABLED) + goto bail; + + status = ocfs2_claim_clusters(osb, handle, ac, + osb->local_alloc_bits, + &cluster_off, + &cluster_count); + if (status == -ENOSPC) + goto retry_enospc; + /* + * We only shrunk the *minimum* number of in our + * request - it's entirely possible that the allocator + * might give us more than we asked for. + */ + if (status == 0) { + spin_lock(&osb->osb_lock); + osb->local_alloc_bits = cluster_count; + spin_unlock(&osb->osb_lock); + } + } if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -905,6 +1204,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, mlog_entry_void(); + ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE); + /* This will lock the main bitmap for us. */ status = ocfs2_local_alloc_reserve_for_window(osb, &ac, @@ -976,8 +1277,7 @@ bail: if (handle) ocfs2_commit_trans(osb, handle); - if (main_bm_bh) - brelse(main_bm_bh); + brelse(main_bm_bh); if (main_bm_inode) iput(main_bm_inode); diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h index 3f76631e110..ac5ea9f8665 100644 --- a/fs/ocfs2/localalloc.h +++ b/fs/ocfs2/localalloc.h @@ -52,4 +52,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, u32 *bit_off, u32 *num_bits); +void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, + unsigned int num_clusters); +void ocfs2_la_enable_worker(struct work_struct *work); + #endif /* OCFS2_LOCALALLOC_H */ diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index 203f8714387..544ac624517 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -24,6 +24,7 @@ */ #include <linux/fs.h> +#include <linux/fcntl.h> #define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> @@ -32,6 +33,7 @@ #include "dlmglue.h" #include "file.h" +#include "inode.h" #include "locks.h" static int ocfs2_do_flock(struct file *file, struct inode *inode, @@ -123,3 +125,16 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl) else return ocfs2_do_flock(file, inode, cmd, fl); } + +int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file->f_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + if (__mandatory_lock(inode)) + return -ENOLCK; + + return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); +} diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h index 9743ef2324e..496d488b271 100644 --- a/fs/ocfs2/locks.h +++ b/fs/ocfs2/locks.h @@ -27,5 +27,6 @@ #define OCFS2_LOCKS_H int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl); +int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl); #endif /* OCFS2_LOCKS_H */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index d5d808fe014..485a6aa0ad3 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -60,6 +60,7 @@ #include "symlink.h" #include "sysfile.h" #include "uptodate.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -327,14 +328,9 @@ leave: if (status == -ENOSPC) mlog(0, "Disk is full\n"); - if (new_fe_bh) - brelse(new_fe_bh); - - if (de_bh) - brelse(de_bh); - - if (parent_fe_bh) - brelse(parent_fe_bh); + brelse(new_fe_bh); + brelse(de_bh); + brelse(parent_fe_bh); if ((status < 0) && inode) iput(inode); @@ -647,12 +643,9 @@ out_unlock_inode: out: ocfs2_inode_unlock(dir, 1); - if (de_bh) - brelse(de_bh); - if (fe_bh) - brelse(fe_bh); - if (parent_fe_bh) - brelse(parent_fe_bh); + brelse(de_bh); + brelse(fe_bh); + brelse(parent_fe_bh); mlog_exit(err); @@ -851,17 +844,10 @@ leave: iput(orphan_dir); } - if (fe_bh) - brelse(fe_bh); - - if (dirent_bh) - brelse(dirent_bh); - - if (parent_node_bh) - brelse(parent_node_bh); - - if (orphan_entry_bh) - brelse(orphan_entry_bh); + brelse(fe_bh); + brelse(dirent_bh); + brelse(parent_node_bh); + brelse(orphan_entry_bh); mlog_exit(status); @@ -1372,24 +1358,15 @@ bail: if (new_inode) iput(new_inode); - if (newfe_bh) - brelse(newfe_bh); - if (old_inode_bh) - brelse(old_inode_bh); - if (old_dir_bh) - brelse(old_dir_bh); - if (new_dir_bh) - brelse(new_dir_bh); - if (new_de_bh) - brelse(new_de_bh); - if (old_de_bh) - brelse(old_de_bh); - if (old_inode_de_bh) - brelse(old_inode_de_bh); - if (orphan_entry_bh) - brelse(orphan_entry_bh); - if (insert_entry_bh) - brelse(insert_entry_bh); + brelse(newfe_bh); + brelse(old_inode_bh); + brelse(old_dir_bh); + brelse(new_dir_bh); + brelse(new_de_bh); + brelse(old_de_bh); + brelse(old_inode_de_bh); + brelse(orphan_entry_bh); + brelse(insert_entry_bh); mlog_exit(status); @@ -1492,8 +1469,7 @@ bail: if (bhs) { for(i = 0; i < blocks; i++) - if (bhs[i]) - brelse(bhs[i]); + brelse(bhs[i]); kfree(bhs); } @@ -1598,10 +1574,10 @@ static int ocfs2_symlink(struct inode *dir, u32 offset = 0; inode->i_op = &ocfs2_symlink_inode_operations; - status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0, - new_fe_bh, - handle, data_ac, NULL, - NULL); + status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, + new_fe_bh, + handle, data_ac, NULL, + NULL); if (status < 0) { if (status != -ENOSPC && status != -EINTR) { mlog(ML_ERROR, @@ -1659,12 +1635,9 @@ bail: ocfs2_inode_unlock(dir, 1); - if (new_fe_bh) - brelse(new_fe_bh); - if (parent_fe_bh) - brelse(parent_fe_bh); - if (de_bh) - brelse(de_bh); + brelse(new_fe_bh); + brelse(parent_fe_bh); + brelse(de_bh); if (inode_ac) ocfs2_free_alloc_context(inode_ac); if (data_ac) @@ -1759,8 +1732,7 @@ leave: iput(orphan_dir_inode); } - if (orphan_dir_bh) - brelse(orphan_dir_bh); + brelse(orphan_dir_bh); mlog_exit(status); return status; @@ -1780,10 +1752,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); - status = ocfs2_read_block(osb, + status = ocfs2_read_block(orphan_dir_inode, OCFS2_I(orphan_dir_inode)->ip_blkno, - &orphan_dir_bh, OCFS2_BH_CACHED, - orphan_dir_inode); + &orphan_dir_bh); if (status < 0) { mlog_errno(status); goto leave; @@ -1829,8 +1800,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); leave: - if (orphan_dir_bh) - brelse(orphan_dir_bh); + brelse(orphan_dir_bh); mlog_exit(status); return status; @@ -1898,8 +1868,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, } leave: - if (target_de_bh) - brelse(target_de_bh); + brelse(target_de_bh); mlog_exit(status); return status; @@ -1918,4 +1887,8 @@ const struct inode_operations ocfs2_dir_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, }; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 7f625f2b111..a21a465490c 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -34,7 +34,12 @@ #include <linux/workqueue.h> #include <linux/kref.h> #include <linux/mutex.h> -#include <linux/jbd.h> +#ifndef CONFIG_OCFS2_COMPAT_JBD +# include <linux/jbd2.h> +#else +# include <linux/jbd.h> +# include "ocfs2_jbd_compat.h" +#endif /* For union ocfs2_dlm_lksb */ #include "stackglue.h" @@ -171,9 +176,13 @@ struct ocfs2_alloc_stats enum ocfs2_local_alloc_state { - OCFS2_LA_UNUSED = 0, - OCFS2_LA_ENABLED, - OCFS2_LA_DISABLED + OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for + * this mountpoint. */ + OCFS2_LA_ENABLED, /* Local alloc is in use. */ + OCFS2_LA_THROTTLED, /* Local alloc is in use, but number + * of bits has been reduced. */ + OCFS2_LA_DISABLED /* Local alloc has temporarily been + * disabled. */ }; enum ocfs2_mount_options @@ -184,6 +193,8 @@ enum ocfs2_mount_options OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ + OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ + OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -214,6 +225,7 @@ struct ocfs2_super u32 bitmap_cpg; u8 *uuid; char *uuid_str; + u32 uuid_hash; u8 *vol_label; u64 first_cluster_group_blkno; u32 fs_generation; @@ -241,6 +253,7 @@ struct ocfs2_super int s_sectsize_bits; int s_clustersize; int s_clustersize_bits; + unsigned int s_xattr_inline_size; atomic_t vol_state; struct mutex recovery_lock; @@ -252,11 +265,27 @@ struct ocfs2_super struct ocfs2_journal *journal; unsigned long osb_commit_interval; - int local_alloc_size; - enum ocfs2_local_alloc_state local_alloc_state; + struct delayed_work la_enable_wq; + + /* + * Must hold local alloc i_mutex and osb->osb_lock to change + * local_alloc_bits. Reads can be done under either lock. + */ + unsigned int local_alloc_bits; + unsigned int local_alloc_default_bits; + + enum ocfs2_local_alloc_state local_alloc_state; /* protected + * by osb_lock */ + struct buffer_head *local_alloc_bh; + u64 la_last_gd; +#ifdef CONFIG_OCFS2_FS_STATS + struct dentry *local_alloc_debug; + char *local_alloc_debug_buf; +#endif + /* Next two fields are for local node slot recovery during * mount. */ int dirty; @@ -340,6 +369,13 @@ static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) return 0; } +static inline int ocfs2_supports_xattr(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR) + return 1; + return 0; +} + /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. this also * means that any future flags osb_flags must be protected by spinlock @@ -554,6 +590,14 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) return pages_per_cluster; } +static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb, + unsigned int megs) +{ + BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576); + + return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); +} + static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) { spin_lock(&osb->osb_lock); diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 4f619850ccf..f24ce3d3f95 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -64,6 +64,7 @@ #define OCFS2_INODE_SIGNATURE "INODE01" #define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" #define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" +#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" /* Compatibility flags */ #define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ @@ -90,7 +91,8 @@ | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ - | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK) + | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ + | OCFS2_FEATURE_INCOMPAT_XATTR) #define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN /* @@ -127,10 +129,6 @@ /* Support for data packed into inode blocks */ #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 -/* Support for the extended slot map */ -#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100 - - /* * Support for alternate, userspace cluster stacks. If set, the superblock * field s_cluster_info contains a tag for the alternate stack in use as @@ -142,6 +140,12 @@ */ #define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 +/* Support for the extended slot map */ +#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100 + +/* Support for extended attributes */ +#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 + /* * backup superblock flag is used to indicate that this volume * has backup superblocks. @@ -299,6 +303,12 @@ struct ocfs2_new_group_input { */ #define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8 +/* + * Inline extended attribute size (in bytes) + * The value chosen should be aligned to 16 byte boundaries. + */ +#define OCFS2_MIN_XATTR_INLINE_SIZE 256 + struct ocfs2_system_inode_info { char *si_name; int si_iflags; @@ -563,7 +573,7 @@ struct ocfs2_super_block { /*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts before tunefs required */ __le16 s_tunefs_flag; - __le32 s_reserved1; + __le32 s_uuid_hash; /* hash value of uuid */ __le64 s_first_cluster_group; /* Block offset of 1st cluster * group header */ /*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ @@ -571,7 +581,11 @@ struct ocfs2_super_block { /*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace stack. Only valid with INCOMPAT flag. */ -/*B8*/ __le64 s_reserved2[17]; /* Fill out superblock */ +/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size + for this fs*/ + __le16 s_reserved0; + __le32 s_reserved1; +/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */ /*140*/ /* @@ -621,7 +635,8 @@ struct ocfs2_dinode { belongs to */ __le16 i_suballoc_bit; /* Bit offset in suballocator block group */ -/*10*/ __le32 i_reserved0; +/*10*/ __le16 i_reserved0; + __le16 i_xattr_inline_size; __le32 i_clusters; /* Cluster count */ __le32 i_uid; /* Owner UID */ __le32 i_gid; /* Owning GID */ @@ -640,11 +655,12 @@ struct ocfs2_dinode { __le32 i_atime_nsec; __le32 i_ctime_nsec; __le32 i_mtime_nsec; - __le32 i_attr; +/*70*/ __le32 i_attr; __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL was set in i_flags */ __le16 i_dyn_features; -/*70*/ __le64 i_reserved2[8]; + __le64 i_xattr_loc; +/*80*/ __le64 i_reserved2[7]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ @@ -715,6 +731,136 @@ struct ocfs2_group_desc /*40*/ __u8 bg_bitmap[0]; }; +/* + * On disk extended attribute structure for OCFS2. + */ + +/* + * ocfs2_xattr_entry indicates one extend attribute. + * + * Note that it can be stored in inode, one block or one xattr bucket. + */ +struct ocfs2_xattr_entry { + __le32 xe_name_hash; /* hash value of xattr prefix+suffix. */ + __le16 xe_name_offset; /* byte offset from the 1st etnry in the local + local xattr storage(inode, xattr block or + xattr bucket). */ + __u8 xe_name_len; /* xattr name len, does't include prefix. */ + __u8 xe_type; /* the low 7 bits indicates the name prefix's + * type and the highest 1 bits indicate whether + * the EA is stored in the local storage. */ + __le64 xe_value_size; /* real xattr value length. */ +}; + +/* + * On disk structure for xattr header. + * + * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records in + * the local xattr storage. + */ +struct ocfs2_xattr_header { + __le16 xh_count; /* contains the count of how + many records are in the + local xattr storage. */ + __le16 xh_free_start; /* current offset for storing + xattr. */ + __le16 xh_name_value_len; /* total length of name/value + length in this bucket. */ + __le16 xh_num_buckets; /* bucket nums in one extent + record, only valid in the + first bucket. */ + __le64 xh_csum; + struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ +}; + +/* + * On disk structure for xattr value root. + * + * It is used when one extended attribute's size is larger, and we will save it + * in an outside cluster. It will stored in a b-tree like file content. + */ +struct ocfs2_xattr_value_root { +/*00*/ __le32 xr_clusters; /* clusters covered by xattr value. */ + __le32 xr_reserved0; + __le64 xr_last_eb_blk; /* Pointer to last extent block */ +/*10*/ struct ocfs2_extent_list xr_list; /* Extent record list */ +}; + +/* + * On disk structure for xattr tree root. + * + * It is used when there are too many extended attributes for one file. These + * attributes will be organized and stored in an indexed-btree. + */ +struct ocfs2_xattr_tree_root { +/*00*/ __le32 xt_clusters; /* clusters covered by xattr. */ + __le32 xt_reserved0; + __le64 xt_last_eb_blk; /* Pointer to last extent block */ +/*10*/ struct ocfs2_extent_list xt_list; /* Extent record list */ +}; + +#define OCFS2_XATTR_INDEXED 0x1 +#define OCFS2_HASH_SHIFT 5 +#define OCFS2_XATTR_ROUND 3 +#define OCFS2_XATTR_SIZE(size) (((size) + OCFS2_XATTR_ROUND) & \ + ~(OCFS2_XATTR_ROUND)) + +#define OCFS2_XATTR_BUCKET_SIZE 4096 +#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET (OCFS2_XATTR_BUCKET_SIZE \ + / OCFS2_MIN_BLOCKSIZE) + +/* + * On disk structure for xattr block. + */ +struct ocfs2_xattr_block { +/*00*/ __u8 xb_signature[8]; /* Signature for verification */ + __le16 xb_suballoc_slot; /* Slot suballocator this + block belongs to. */ + __le16 xb_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 xb_fs_generation; /* Must match super block */ +/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */ + __le64 xb_csum; +/*20*/ __le16 xb_flags; /* Indicates whether this block contains + real xattr or a xattr tree. */ + __le16 xb_reserved0; + __le32 xb_reserved1; + __le64 xb_reserved2; +/*30*/ union { + struct ocfs2_xattr_header xb_header; /* xattr header if this + block contains xattr */ + struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this + block cotains xattr + tree. */ + } xb_attrs; +}; + +#define OCFS2_XATTR_ENTRY_LOCAL 0x80 +#define OCFS2_XATTR_TYPE_MASK 0x7F +static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe, + int local) +{ + if (local) + xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL; + else + xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL; +} + +static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe) +{ + return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL; +} + +static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type) +{ + xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK; +} + +static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe) +{ + return xe->xe_type & OCFS2_XATTR_TYPE_MASK; +} + #ifdef __KERNEL__ static inline int ocfs2_fast_symlink_chars(struct super_block *sb) { @@ -728,6 +874,20 @@ static inline int ocfs2_max_inline_data(struct super_block *sb) offsetof(struct ocfs2_dinode, id2.i_data.id_data); } +static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb, + struct ocfs2_dinode *di) +{ + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); + + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data) - + xattrsize; + else + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data); +} + static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) { int size; @@ -738,6 +898,24 @@ static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) return size / sizeof(struct ocfs2_extent_rec); } +static inline int ocfs2_extent_recs_per_inode_with_xattr( + struct super_block *sb, + struct ocfs2_dinode *di) +{ + int size; + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); + + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs) - + xattrsize; + else + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) { int size; @@ -801,6 +979,17 @@ static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index) return 0; } + +static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_root.xt_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} #else static inline int ocfs2_fast_symlink_chars(int blocksize) { @@ -884,6 +1073,17 @@ static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index) return 0; } + +static inline int ocfs2_xattr_recs_per_xb(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_root.xt_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} #endif /* __KERNEL__ */ diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h new file mode 100644 index 00000000000..b91c78f8f55 --- /dev/null +++ b/fs/ocfs2/ocfs2_jbd_compat.h @@ -0,0 +1,82 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_jbd_compat.h + * + * Compatibility defines for JBD. + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_JBD_COMPAT_H +#define OCFS2_JBD_COMPAT_H + +#ifndef CONFIG_OCFS2_COMPAT_JBD +# error Should not have been included +#endif + +struct jbd2_inode { + unsigned int dummy; +}; + +#define JBD2_BARRIER JFS_BARRIER +#define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE + +#define jbd2_journal_ack_err journal_ack_err +#define jbd2_journal_clear_err journal_clear_err +#define jbd2_journal_destroy journal_destroy +#define jbd2_journal_dirty_metadata journal_dirty_metadata +#define jbd2_journal_errno journal_errno +#define jbd2_journal_extend journal_extend +#define jbd2_journal_flush journal_flush +#define jbd2_journal_force_commit journal_force_commit +#define jbd2_journal_get_write_access journal_get_write_access +#define jbd2_journal_get_undo_access journal_get_undo_access +#define jbd2_journal_init_inode journal_init_inode +#define jbd2_journal_invalidatepage journal_invalidatepage +#define jbd2_journal_load journal_load +#define jbd2_journal_lock_updates journal_lock_updates +#define jbd2_journal_restart journal_restart +#define jbd2_journal_start journal_start +#define jbd2_journal_start_commit journal_start_commit +#define jbd2_journal_stop journal_stop +#define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers +#define jbd2_journal_unlock_updates journal_unlock_updates +#define jbd2_journal_wipe journal_wipe +#define jbd2_log_wait_commit log_wait_commit + +static inline int jbd2_journal_file_inode(handle_t *handle, + struct jbd2_inode *inode) +{ + return 0; +} + +static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, + loff_t new_size) +{ + return 0; +} + +static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, + struct inode *inode) +{ + return; +} + +static inline void jbd2_journal_release_jbd_inode(journal_t *journal, + struct jbd2_inode *jinode) +{ + return; +} + + +#endif /* OCFS2_JBD_COMPAT_H */ diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 8166968e901..ffd48db229a 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -200,7 +200,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data) if (cluster > clusters) break; - ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL); + ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); if (ret < 0) { mlog_errno(ret); break; @@ -236,8 +236,8 @@ static void ocfs2_update_super_and_backups(struct inode *inode, * update the superblock last. * It doesn't matter if the write failed. */ - ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO, - &super_bh, 0, NULL); + ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1, + &super_bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -332,8 +332,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) lgd_blkno = ocfs2_which_cluster_group(main_bm_inode, first_new_cluster - 1); - ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED, - main_bm_inode); + ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh); if (ret < 0) { mlog_errno(ret); goto out_unlock; @@ -540,7 +539,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out_unlock; } - ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL); + ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh); if (ret < 0) { mlog(ML_ERROR, "Can't read the group descriptor # %llu " "from the device.", (unsigned long long)input->group); diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index bb5ff8939bf..bdda2d8f850 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -150,8 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb) * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If * this is not true, the read of -1 (UINT64_MAX) will fail. */ - ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0, - si->si_inode); + ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh, + OCFS2_BH_IGNORE_CACHE); if (ret == 0) { spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); @@ -404,7 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, (unsigned long long)blkno); bh = NULL; /* Acquire a fresh bh */ - status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode); + status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh, + OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 353fc35c674..faec2d87935 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -28,6 +28,7 @@ #include "ocfs2.h" /* For struct ocfs2_lock_res */ #include "stackglue.h" +#include <linux/dlm_plock.h> /* * The control protocol starts with a handshake. Until the handshake @@ -746,6 +747,37 @@ static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) { } +static int user_plock(struct ocfs2_cluster_connection *conn, + u64 ino, + struct file *file, + int cmd, + struct file_lock *fl) +{ + /* + * This more or less just demuxes the plock request into any + * one of three dlm calls. + * + * Internally, fs/dlm will pass these to a misc device, which + * a userspace daemon will read and write to. + * + * For now, cancel requests (which happen internally only), + * are turned into unlocks. Most of this function taken from + * gfs2_lock. + */ + + if (cmd == F_CANCELLK) { + cmd = F_SETLK; + fl->fl_type = F_UNLCK; + } + + if (IS_GETLK(cmd)) + return dlm_posix_get(conn->cc_lockspace, ino, file, fl); + else if (fl->fl_type == F_UNLCK) + return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl); + else + return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl); +} + /* * Compare a requested locking protocol version against the current one. * @@ -839,6 +871,7 @@ static struct ocfs2_stack_operations ocfs2_user_plugin_ops = { .dlm_unlock = user_dlm_unlock, .lock_status = user_dlm_lock_status, .lock_lvb = user_dlm_lvb, + .plock = user_plock, .dump_lksb = user_dlm_dump_lksb, }; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 10e149ae5e3..68b668b0e60 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -97,13 +97,14 @@ static int ocfs2_stack_driver_request(const char *stack_name, goto out; } - /* Ok, the stack is pinned */ - p->sp_count++; active_stack = p; - rc = 0; out: + /* If we found it, pin it */ + if (!rc) + active_stack->sp_count++; + spin_unlock(&ocfs2_stack_lock); return rc; } @@ -287,6 +288,26 @@ void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) } EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb); +int ocfs2_stack_supports_plocks(void) +{ + return active_stack && active_stack->sp_ops->plock; +} +EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks); + +/* + * ocfs2_plock() can only be safely called if + * ocfs2_stack_supports_plocks() returned true + */ +int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, + struct file *file, int cmd, struct file_lock *fl) +{ + WARN_ON_ONCE(active_stack->sp_ops->plock == NULL); + if (active_stack->sp_ops->plock) + return active_stack->sp_ops->plock(conn, ino, file, cmd, fl); + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(ocfs2_plock); + int ocfs2_cluster_connect(const char *stack_name, const char *group, int grouplen, diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index db56281dd1b..c571af375ef 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -28,6 +28,10 @@ #include "dlm/dlmapi.h" #include <linux/dlm.h> +/* Needed for plock-related prototypes */ +struct file; +struct file_lock; + /* * dlmconstants.h does not have a LOCAL flag. We hope to remove it * some day, but right now we need it. Let's fake it. This value is larger @@ -187,6 +191,17 @@ struct ocfs2_stack_operations { void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); /* + * Cluster-aware posix locks + * + * This is NULL for stacks which do not support posix locks. + */ + int (*plock)(struct ocfs2_cluster_connection *conn, + u64 ino, + struct file *file, + int cmd, + struct file_lock *fl); + + /* * This is an optoinal debugging hook. If provided, the * stack can dump debugging information about this lock. */ @@ -240,6 +255,10 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb); void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); +int ocfs2_stack_supports_plocks(void); +int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, + struct file *file, int cmd, struct file_lock *fl); + void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index d2d278fb981..c5ff18b46b5 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -62,15 +62,18 @@ static int ocfs2_block_group_fill(handle_t *handle, struct ocfs2_chain_list *cl); static int ocfs2_block_group_alloc(struct ocfs2_super *osb, struct inode *alloc_inode, - struct buffer_head *bh); + struct buffer_head *bh, + u64 max_block); static int ocfs2_cluster_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, + u64 max_block, u16 *bit_off, u16 *bits_found); static int ocfs2_block_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, + u64 max_block, u16 *bit_off, u16 *bits_found); static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac, @@ -110,8 +113,11 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode, u64 data_blkno, u64 *bg_blkno, u16 *bg_bit_off); +static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, + u32 bits_wanted, u64 max_block, + struct ocfs2_alloc_context **ac); -static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) +void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) { struct inode *inode = ac->ac_inode; @@ -124,10 +130,8 @@ static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) iput(inode); ac->ac_inode = NULL; } - if (ac->ac_bh) { - brelse(ac->ac_bh); - ac->ac_bh = NULL; - } + brelse(ac->ac_bh); + ac->ac_bh = NULL; } void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) @@ -276,7 +280,8 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) */ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, struct inode *alloc_inode, - struct buffer_head *bh) + struct buffer_head *bh, + u64 max_block) { int status, credits; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; @@ -294,9 +299,9 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, mlog_entry_void(); cl = &fe->id2.i_chain; - status = ocfs2_reserve_clusters(osb, - le16_to_cpu(cl->cl_cpg), - &ac); + status = ocfs2_reserve_clusters_with_limit(osb, + le16_to_cpu(cl->cl_cpg), + max_block, &ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -394,8 +399,7 @@ bail: if (ac) ocfs2_free_alloc_context(ac); - if (bg_bh) - brelse(bg_bh); + brelse(bg_bh); mlog_exit(status); return status; @@ -469,7 +473,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, goto bail; } - status = ocfs2_block_group_alloc(osb, alloc_inode, bh); + status = ocfs2_block_group_alloc(osb, alloc_inode, bh, + ac->ac_max_block); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -486,16 +491,15 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, get_bh(bh); ac->ac_bh = bh; bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; } -int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, - struct ocfs2_dinode *fe, - struct ocfs2_alloc_context **ac) +int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, + int blocks, + struct ocfs2_alloc_context **ac) { int status; u32 slot; @@ -507,7 +511,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, goto bail; } - (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); + (*ac)->ac_bits_wanted = blocks; (*ac)->ac_which = OCFS2_AC_USE_META; slot = osb->slot_num; (*ac)->ac_group_search = ocfs2_block_group_search; @@ -532,6 +536,15 @@ bail: return status; } +int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, + struct ocfs2_extent_list *root_el, + struct ocfs2_alloc_context **ac) +{ + return ocfs2_reserve_new_metadata_blocks(osb, + ocfs2_extend_meta_needed(root_el), + ac); +} + static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac) { @@ -582,6 +595,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb, (*ac)->ac_group_search = ocfs2_block_group_search; /* + * stat(2) can't handle i_ino > 32bits, so we tell the + * lower levels not to allocate us a block group past that + * limit. The 'inode64' mount option avoids this behavior. + */ + if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64)) + (*ac)->ac_max_block = (u32)~0U; + + /* * slot is set when we successfully steal inode from other nodes. * It is reset in 3 places: * 1. when we flush the truncate log @@ -661,9 +682,9 @@ bail: /* Callers don't need to care which bitmap (local alloc or main) to * use so we figure it out for them, but unfortunately this clutters * things a bit. */ -int ocfs2_reserve_clusters(struct ocfs2_super *osb, - u32 bits_wanted, - struct ocfs2_alloc_context **ac) +static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, + u32 bits_wanted, u64 max_block, + struct ocfs2_alloc_context **ac) { int status; @@ -677,24 +698,20 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb, } (*ac)->ac_bits_wanted = bits_wanted; + (*ac)->ac_max_block = max_block; status = -ENOSPC; if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { status = ocfs2_reserve_local_alloc_bits(osb, bits_wanted, *ac); - if ((status < 0) && (status != -ENOSPC)) { + if (status == -EFBIG) { + /* The local alloc window is outside ac_max_block. + * use the main bitmap. */ + status = -ENOSPC; + } else if ((status < 0) && (status != -ENOSPC)) { mlog_errno(status); goto bail; - } else if (status == -ENOSPC) { - /* reserve_local_bits will return enospc with - * the local alloc inode still locked, so we - * can change this safely here. */ - mlog(0, "Disabling local alloc\n"); - /* We set to OCFS2_LA_DISABLED so that umount - * can clean up what's left of the local - * allocation */ - osb->local_alloc_state = OCFS2_LA_DISABLED; } } @@ -718,6 +735,13 @@ bail: return status; } +int ocfs2_reserve_clusters(struct ocfs2_super *osb, + u32 bits_wanted, + struct ocfs2_alloc_context **ac) +{ + return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac); +} + /* * More or less lifted from ext3. I'll leave their description below: * @@ -1000,11 +1024,14 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg static int ocfs2_cluster_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, + u64 max_block, u16 *bit_off, u16 *bits_found) { int search = -ENOSPC; int ret; + u64 blkoff; struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); u16 tmp_off, tmp_found; unsigned int max_bits, gd_cluster_off; @@ -1037,6 +1064,17 @@ static int ocfs2_cluster_group_search(struct inode *inode, if (ret) return ret; + if (max_block) { + blkoff = ocfs2_clusters_to_blocks(inode->i_sb, + gd_cluster_off + + tmp_off + tmp_found); + mlog(0, "Checking %llu against %llu\n", + (unsigned long long)blkoff, + (unsigned long long)max_block); + if (blkoff > max_block) + return -ENOSPC; + } + /* ocfs2_block_group_find_clear_bits() might * return success, but we still want to return * -ENOSPC unless it found the minimum number @@ -1045,6 +1083,12 @@ static int ocfs2_cluster_group_search(struct inode *inode, *bit_off = tmp_off; *bits_found = tmp_found; search = 0; /* success */ + } else if (tmp_found) { + /* + * Don't show bits which we'll be returning + * for allocation to the local alloc bitmap. + */ + ocfs2_local_alloc_seen_free_bits(osb, tmp_found); } } @@ -1054,19 +1098,31 @@ static int ocfs2_cluster_group_search(struct inode *inode, static int ocfs2_block_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, + u64 max_block, u16 *bit_off, u16 *bits_found) { int ret = -ENOSPC; + u64 blkoff; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; BUG_ON(min_bits != 1); BUG_ON(ocfs2_is_cluster_bitmap(inode)); - if (bg->bg_free_bits_count) + if (bg->bg_free_bits_count) { ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, le16_to_cpu(bg->bg_bits), bit_off, bits_found); + if (!ret && max_block) { + blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off + + *bits_found; + mlog(0, "Checking %llu against %llu\n", + (unsigned long long)blkoff, + (unsigned long long)max_block); + if (blkoff > max_block) + ret = -ENOSPC; + } + } return ret; } @@ -1116,8 +1172,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, struct ocfs2_group_desc *gd; struct inode *alloc_inode = ac->ac_inode; - ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno, - &group_bh, OCFS2_BH_CACHED, alloc_inode); + ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh); if (ret < 0) { mlog_errno(ret); return ret; @@ -1131,7 +1186,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, } ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, - bit_off, &found); + ac->ac_max_block, bit_off, &found); if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); @@ -1186,9 +1241,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, bits_wanted, chain, (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); - status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), + status = ocfs2_read_block(alloc_inode, le64_to_cpu(cl->cl_recs[chain].c_blkno), - &group_bh, OCFS2_BH_CACHED, alloc_inode); + &group_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -1204,21 +1259,20 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, /* for now, the chain search is a bit simplistic. We just use * the 1st group with any empty bits. */ while ((status = ac->ac_group_search(alloc_inode, group_bh, - bits_wanted, min_bits, bit_off, + bits_wanted, min_bits, + ac->ac_max_block, bit_off, &tmp_bits)) == -ENOSPC) { if (!bg->bg_next_group) break; - if (prev_group_bh) { - brelse(prev_group_bh); - prev_group_bh = NULL; - } + brelse(prev_group_bh); + prev_group_bh = NULL; + next_group = le64_to_cpu(bg->bg_next_group); prev_group_bh = group_bh; group_bh = NULL; - status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), - next_group, &group_bh, - OCFS2_BH_CACHED, alloc_inode); + status = ocfs2_read_block(alloc_inode, + next_group, &group_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -1307,10 +1361,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, *bg_blkno = le64_to_cpu(bg->bg_blkno); *bits_left = le16_to_cpu(bg->bg_free_bits_count); bail: - if (group_bh) - brelse(group_bh); - if (prev_group_bh) - brelse(prev_group_bh); + brelse(group_bh); + brelse(prev_group_bh); mlog_exit(status); return status; @@ -1723,7 +1775,6 @@ int ocfs2_free_suballoc_bits(handle_t *handle, { int status = 0; u32 tmp_used; - struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; struct ocfs2_chain_list *cl = &fe->id2.i_chain; struct buffer_head *group_bh = NULL; @@ -1742,8 +1793,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle, (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, (unsigned long long)bg_blkno, start_bit); - status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED, - alloc_inode); + status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -1784,8 +1834,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle, } bail: - if (group_bh) - brelse(group_bh); + brelse(group_bh); mlog_exit(status); return status; @@ -1838,9 +1887,15 @@ int ocfs2_free_clusters(handle_t *handle, status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, bg_start_bit, bg_blkno, num_clusters); - if (status < 0) + if (status < 0) { mlog_errno(status); + goto out; + } + ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb), + num_clusters); + +out: mlog_exit(status); return status; } @@ -1891,3 +1946,84 @@ static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); } } + +/* + * For a given allocation, determine which allocators will need to be + * accessed, and lock them, reserving the appropriate number of bits. + * + * Sparse file systems call this from ocfs2_write_begin_nolock() + * and ocfs2_allocate_unwritten_extents(). + * + * File systems which don't support holes call this from + * ocfs2_extend_allocation(). + */ +int ocfs2_lock_allocators(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters_to_add, u32 extents_to_split, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac) +{ + int ret = 0, num_free_extents; + unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + *meta_ac = NULL; + if (data_ac) + *data_ac = NULL; + + BUG_ON(clusters_to_add != 0 && data_ac == NULL); + + num_free_extents = ocfs2_num_free_extents(osb, inode, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + /* + * Sparse allocation file systems need to be more conservative + * with reserving room for expansion - the actual allocation + * happens while we've got a journal handle open so re-taking + * a cluster lock (because we ran out of room for another + * extent) will violate ordering rules. + * + * Most of the time we'll only be seeing this 1 cluster at a time + * anyway. + * + * Always lock for any unwritten extents - we might want to + * add blocks during a split. + */ + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { + ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + + if (clusters_to_add == 0) + goto out; + + ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + + /* + * We cannot have an error and a non null *data_ac. + */ + } + + return ret; +} diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 544c600662b..4df159d8f45 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -28,10 +28,11 @@ typedef int (group_search_t)(struct inode *, struct buffer_head *, - u32, - u32, - u16 *, - u16 *); + u32, /* bits_wanted */ + u32, /* min_bits */ + u64, /* max_block */ + u16 *, /* *bit_off */ + u16 *); /* *bits_found */ struct ocfs2_alloc_context { struct inode *ac_inode; /* which bitmap are we allocating from? */ @@ -51,6 +52,8 @@ struct ocfs2_alloc_context { group_search_t *ac_group_search; u64 ac_last_group; + u64 ac_max_block; /* Highest block number to allocate. 0 is + is the same as ~0 - unlimited */ }; void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); @@ -59,9 +62,17 @@ static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) return ac->ac_bits_wanted - ac->ac_bits_given; } +/* + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, - struct ocfs2_dinode *fe, + struct ocfs2_extent_list *root_el, struct ocfs2_alloc_context **ac); +int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, + int blocks, + struct ocfs2_alloc_context **ac); int ocfs2_reserve_new_inode(struct ocfs2_super *osb, struct ocfs2_alloc_context **ac); int ocfs2_reserve_clusters(struct ocfs2_super *osb, @@ -147,6 +158,7 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode) * apis above. */ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac); +void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac); /* given a cluster offset, calculate which block group it belongs to * and return that block offset. */ @@ -156,4 +168,8 @@ u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster); int ocfs2_check_group_descriptor(struct super_block *sb, struct ocfs2_dinode *di, struct ocfs2_group_desc *gd); +int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, + u32 clusters_to_add, u32 extents_to_split, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac); #endif /* _CHAINALLOC_H_ */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 88255d3f52b..304b63ac78c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -64,6 +64,7 @@ #include "sysfile.h" #include "uptodate.h" #include "ver.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -154,10 +155,13 @@ enum { Opt_localalloc, Opt_localflocks, Opt_stack, + Opt_user_xattr, + Opt_nouser_xattr, + Opt_inode64, Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_barrier, "barrier=%u"}, {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, @@ -173,6 +177,9 @@ static match_table_t tokens = { {Opt_localalloc, "localalloc=%d"}, {Opt_localflocks, "localflocks"}, {Opt_stack, "cluster_stack=%s"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_inode64, "inode64"}, {Opt_err, NULL} }; @@ -205,10 +212,11 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait) ocfs2_schedule_truncate_log_flush(osb, 0); } - if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) { + if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal, + &target)) { if (wait) - log_wait_commit(OCFS2_SB(sb)->journal->j_journal, - target); + jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal, + target); } return 0; } @@ -325,6 +333,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb) if (!oi) return NULL; + jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); return &oi->vfs_inode; } @@ -406,6 +415,15 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto out; } + /* Probably don't want this on remount; it might + * mess with other nodes */ + if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) && + (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot enable inode64 on remount\n"); + goto out; + } + /* We're going to/from readonly mode. */ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { /* Lock here so the check of HARD_RO and the potential @@ -637,7 +655,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; osb->osb_commit_interval = parsed_options.commit_interval; - osb->local_alloc_size = parsed_options.localalloc_opt; + osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); + osb->local_alloc_bits = osb->local_alloc_default_bits; status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) @@ -743,8 +762,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) return status; read_super_error: - if (bh != NULL) - brelse(bh); + brelse(bh); if (inode) iput(inode); @@ -847,6 +865,12 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_data_writeback: mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; break; + case Opt_user_xattr: + mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; + break; + case Opt_nouser_xattr: + mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR; + break; case Opt_atime_quantum: if (match_int(&args[0], &option)) { status = 0; @@ -873,7 +897,7 @@ static int ocfs2_parse_options(struct super_block *sb, if (option < 0) return 0; if (option == 0) - option = JBD_DEFAULT_MAX_COMMIT_AGE; + option = JBD2_DEFAULT_MAX_COMMIT_AGE; mopt->commit_interval = HZ * option; break; case Opt_localalloc: @@ -918,6 +942,9 @@ static int ocfs2_parse_options(struct super_block *sb, OCFS2_STACK_LABEL_LEN); mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; break; + case Opt_inode64: + mopt->mount_opt |= OCFS2_MOUNT_INODE64; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -938,6 +965,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) { struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb); unsigned long opts = osb->s_mount_opt; + unsigned int local_alloc_megs; if (opts & OCFS2_MOUNT_HB_LOCAL) seq_printf(s, ",_netdev,heartbeat=local"); @@ -970,8 +998,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) seq_printf(s, ",commit=%u", (unsigned) (osb->osb_commit_interval / HZ)); - if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE) - seq_printf(s, ",localalloc=%d", osb->local_alloc_size); + local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); + if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE) + seq_printf(s, ",localalloc=%d", local_alloc_megs); if (opts & OCFS2_MOUNT_LOCALFLOCKS) seq_printf(s, ",localflocks,"); @@ -980,6 +1009,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, osb->osb_cluster_stack); + if (opts & OCFS2_MOUNT_NOUSERXATTR) + seq_printf(s, ",nouser_xattr"); + else + seq_printf(s, ",user_xattr"); + + if (opts & OCFS2_MOUNT_INODE64) + seq_printf(s, ",inode64"); + return 0; } @@ -1132,6 +1169,7 @@ static void ocfs2_inode_init_once(void *data) oi->ip_dir_start_lookup = 0; init_rwsem(&oi->ip_alloc_sem); + init_rwsem(&oi->ip_xattr_sem); mutex_init(&oi->ip_io_mutex); oi->ip_blkno = 0ULL; @@ -1375,6 +1413,7 @@ static int ocfs2_initialize_super(struct super_block *sb, sb->s_fs_info = osb; sb->s_op = &ocfs2_sops; sb->s_export_op = &ocfs2_export_ops; + sb->s_xattr = ocfs2_xattr_handlers; sb->s_time_gran = 1; sb->s_flags |= MS_NOATIME; /* this is needed to support O_LARGEFILE */ @@ -1421,8 +1460,12 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->slot_num = OCFS2_INVALID_SLOT; + osb->s_xattr_inline_size = le16_to_cpu( + di->id2.i_super.s_xattr_inline_size); + osb->local_alloc_state = OCFS2_LA_UNUSED; osb->local_alloc_bh = NULL; + INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker); init_waitqueue_head(&osb->osb_mount_event); @@ -1568,6 +1611,7 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->first_cluster_group_blkno = le64_to_cpu(di->id2.i_super.s_first_cluster_group); osb->fs_generation = le32_to_cpu(di->i_fs_generation); + osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash); mlog(0, "vol_label: %s\n", osb->vol_label); mlog(0, "uuid: %s\n", osb->uuid_str); mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n", diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index ba9dbb51d25..cbd03dfdc7b 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -50,6 +50,7 @@ #include "inode.h" #include "journal.h" #include "symlink.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -83,11 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode, mlog_entry_void(); - status = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, - bh, - OCFS2_BH_CACHED, - inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh); if (status < 0) { mlog_errno(status); link = ERR_PTR(status); @@ -157,8 +154,7 @@ bail: kunmap(page); page_cache_release(page); } - if (bh) - brelse(bh); + brelse(bh); return ERR_PTR(status); } @@ -168,10 +164,18 @@ const struct inode_operations ocfs2_symlink_inode_operations = { .follow_link = ocfs2_follow_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, }; const struct inode_operations ocfs2_fast_symlink_inode_operations = { .readlink = ocfs2_readlink, .follow_link = ocfs2_follow_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, }; diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 4da8851f2b2..187b99ff036 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -53,7 +53,11 @@ #include <linux/highmem.h> #include <linux/buffer_head.h> #include <linux/rbtree.h> -#include <linux/jbd.h> +#ifndef CONFIG_OCFS2_COMPAT_JBD +# include <linux/jbd2.h> +#else +# include <linux/jbd.h> +#endif #define MLOG_MASK_PREFIX ML_UPTODATE @@ -511,14 +515,10 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci, ci->ci_num_cached--; } -/* Called when we remove a chunk of metadata from an inode. We don't - * bother reverting things to an inlined array in the case of a remove - * which moves us back under the limit. */ -void ocfs2_remove_from_cache(struct inode *inode, - struct buffer_head *bh) +static void ocfs2_remove_block_from_cache(struct inode *inode, + sector_t block) { int index; - sector_t block = bh->b_blocknr; struct ocfs2_meta_cache_item *item = NULL; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; @@ -544,6 +544,30 @@ void ocfs2_remove_from_cache(struct inode *inode, kmem_cache_free(ocfs2_uptodate_cachep, item); } +/* + * Called when we remove a chunk of metadata from an inode. We don't + * bother reverting things to an inlined array in the case of a remove + * which moves us back under the limit. + */ +void ocfs2_remove_from_cache(struct inode *inode, + struct buffer_head *bh) +{ + sector_t block = bh->b_blocknr; + + ocfs2_remove_block_from_cache(inode, block); +} + +/* Called when we remove xattr clusters from an inode. */ +void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode, + sector_t block, + u32 c_len) +{ + unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len; + + for (i = 0; i < b_len; i++, block++) + ocfs2_remove_block_from_cache(inode, block); +} + int __init init_ocfs2_uptodate_cache(void) { ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate", diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h index 2e73206059a..531b4b3a0c4 100644 --- a/fs/ocfs2/uptodate.h +++ b/fs/ocfs2/uptodate.h @@ -40,6 +40,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode, struct buffer_head *bh); void ocfs2_remove_from_cache(struct inode *inode, struct buffer_head *bh); +void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode, + sector_t block, + u32 c_len); int ocfs2_buffer_read_ahead(struct inode *inode, struct buffer_head *bh); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c new file mode 100644 index 00000000000..802c4149221 --- /dev/null +++ b/fs/ocfs2/xattr.c @@ -0,0 +1,4832 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * xattr.c + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * CREDITS: + * Lots of code in this file is taken from ext3. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/capability.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/uio.h> +#include <linux/sched.h> +#include <linux/splice.h> +#include <linux/mount.h> +#include <linux/writeback.h> +#include <linux/falloc.h> +#include <linux/sort.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/string.h> + +#define MLOG_MASK_PREFIX ML_XATTR +#include <cluster/masklog.h> + +#include "ocfs2.h" +#include "alloc.h" +#include "dlmglue.h" +#include "file.h" +#include "symlink.h" +#include "sysfile.h" +#include "inode.h" +#include "journal.h" +#include "ocfs2_fs.h" +#include "suballoc.h" +#include "uptodate.h" +#include "buffer_head_io.h" +#include "super.h" +#include "xattr.h" + + +struct ocfs2_xattr_def_value_root { + struct ocfs2_xattr_value_root xv; + struct ocfs2_extent_rec er; +}; + +struct ocfs2_xattr_bucket { + struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET]; + struct ocfs2_xattr_header *xh; +}; + +#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) +#define OCFS2_XATTR_INLINE_SIZE 80 + +static struct ocfs2_xattr_def_value_root def_xv = { + .xv.xr_list.l_count = cpu_to_le16(1), +}; + +struct xattr_handler *ocfs2_xattr_handlers[] = { + &ocfs2_xattr_user_handler, + &ocfs2_xattr_trusted_handler, + NULL +}; + +static struct xattr_handler *ocfs2_xattr_handler_map[] = { + [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, + [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, +}; + +struct ocfs2_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ocfs2_xattr_search { + struct buffer_head *inode_bh; + /* + * xattr_bh point to the block buffer head which has extended attribute + * when extended attribute in inode, xattr_bh is equal to inode_bh. + */ + struct buffer_head *xattr_bh; + struct ocfs2_xattr_header *header; + struct ocfs2_xattr_bucket bucket; + void *base; + void *end; + struct ocfs2_xattr_entry *here; + int not_found; +}; + +static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, + struct ocfs2_xattr_header *xh, + int index, + int *block_off, + int *new_offset); + +static int ocfs2_xattr_index_block_find(struct inode *inode, + struct buffer_head *root_bh, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs); + +static int ocfs2_xattr_tree_list_index_block(struct inode *inode, + struct ocfs2_xattr_tree_root *xt, + char *buffer, + size_t buffer_size); + +static int ocfs2_xattr_create_index_block(struct inode *inode, + struct ocfs2_xattr_search *xs); + +static int ocfs2_xattr_set_entry_index_block(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs); + +static int ocfs2_delete_xattr_index_block(struct inode *inode, + struct buffer_head *xb_bh); + +static inline const char *ocfs2_xattr_prefix(int name_index) +{ + struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < OCFS2_XATTR_MAX) + handler = ocfs2_xattr_handler_map[name_index]; + + return handler ? handler->prefix : NULL; +} + +static u32 ocfs2_xattr_name_hash(struct inode *inode, + const char *name, + int name_len) +{ + /* Get hash value of uuid from super block */ + u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash; + int i; + + /* hash extended attribute name */ + for (i = 0; i < name_len; i++) { + hash = (hash << OCFS2_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^ + *name++; + } + + return hash; +} + +/* + * ocfs2_xattr_hash_entry() + * + * Compute the hash of an extended attribute. + */ +static void ocfs2_xattr_hash_entry(struct inode *inode, + struct ocfs2_xattr_header *header, + struct ocfs2_xattr_entry *entry) +{ + u32 hash = 0; + char *name = (char *)header + le16_to_cpu(entry->xe_name_offset); + + hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len); + entry->xe_name_hash = cpu_to_le32(hash); + + return; +} + +static int ocfs2_xattr_extend_allocation(struct inode *inode, + u32 clusters_to_add, + struct buffer_head *xattr_bh, + struct ocfs2_xattr_value_root *xv) +{ + int status = 0; + int restart_func = 0; + int credits = 0; + handle_t *handle = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + enum ocfs2_alloc_restarted why; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters); + struct ocfs2_extent_tree et; + + mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add); + + ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv); + +restart_all: + + status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, + &data_ac, &meta_ac); + if (status) { + mlog_errno(status); + goto leave; + } + + credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el, + clusters_to_add); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + +restarted_transaction: + status = ocfs2_journal_access(handle, inode, xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + prev_clusters = le32_to_cpu(xv->xr_clusters); + status = ocfs2_add_clusters_in_btree(osb, + inode, + &logical_start, + clusters_to_add, + 0, + &et, + handle, + data_ac, + meta_ac, + &why); + if ((status < 0) && (status != -EAGAIN)) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_dirty(handle, xattr_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters; + + if (why != RESTART_NONE && clusters_to_add) { + if (why == RESTART_META) { + mlog(0, "restarting function.\n"); + restart_func = 1; + } else { + BUG_ON(why != RESTART_TRANS); + + mlog(0, "restarting transaction.\n"); + /* TODO: This can be more intelligent. */ + credits = ocfs2_calc_extend_credits(osb->sb, + et.et_root_el, + clusters_to_add); + status = ocfs2_extend_trans(handle, credits); + if (status < 0) { + /* handle still has to be committed at + * this point. */ + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + goto restarted_transaction; + } + } + +leave: + if (handle) { + ocfs2_commit_trans(osb, handle); + handle = NULL; + } + if (data_ac) { + ocfs2_free_alloc_context(data_ac); + data_ac = NULL; + } + if (meta_ac) { + ocfs2_free_alloc_context(meta_ac); + meta_ac = NULL; + } + if ((!status) && restart_func) { + restart_func = 0; + goto restart_all; + } + + return status; +} + +static int __ocfs2_remove_xattr_range(struct inode *inode, + struct buffer_head *root_bh, + struct ocfs2_xattr_value_root *xv, + u32 cpos, u32 phys_cpos, u32 len, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_extent_tree et; + + ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv); + + ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, + dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + le32_add_cpu(&xv->xr_clusters, -len); + + ret = ocfs2_journal_dirty(handle, root_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + mutex_unlock(&tl_inode->i_mutex); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return ret; +} + +static int ocfs2_xattr_shrink_size(struct inode *inode, + u32 old_clusters, + u32 new_clusters, + struct buffer_head *root_bh, + struct ocfs2_xattr_value_root *xv) +{ + int ret = 0; + u32 trunc_len, cpos, phys_cpos, alloc_size; + u64 block; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_cached_dealloc_ctxt dealloc; + + ocfs2_init_dealloc_ctxt(&dealloc); + + if (old_clusters <= new_clusters) + return 0; + + cpos = new_clusters; + trunc_len = old_clusters - new_clusters; + while (trunc_len) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, + &alloc_size, &xv->xr_list); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (alloc_size > trunc_len) + alloc_size = trunc_len; + + ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos, + phys_cpos, alloc_size, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + ocfs2_remove_xattr_clusters_from_cache(inode, block, + alloc_size); + cpos += alloc_size; + trunc_len -= alloc_size; + } + +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + + return ret; +} + +static int ocfs2_xattr_value_truncate(struct inode *inode, + struct buffer_head *root_bh, + struct ocfs2_xattr_value_root *xv, + int len) +{ + int ret; + u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len); + u32 old_clusters = le32_to_cpu(xv->xr_clusters); + + if (new_clusters == old_clusters) + return 0; + + if (new_clusters > old_clusters) + ret = ocfs2_xattr_extend_allocation(inode, + new_clusters - old_clusters, + root_bh, xv); + else + ret = ocfs2_xattr_shrink_size(inode, + old_clusters, new_clusters, + root_bh, xv); + + return ret; +} + +static int ocfs2_xattr_list_entry(char *buffer, size_t size, + size_t *result, const char *prefix, + const char *name, int name_len) +{ + char *p = buffer + *result; + int prefix_len = strlen(prefix); + int total_len = prefix_len + name_len + 1; + + *result += total_len; + + /* we are just looking for how big our buffer needs to be */ + if (!size) + return 0; + + if (*result > size) + return -ERANGE; + + memcpy(p, prefix, prefix_len); + memcpy(p + prefix_len, name, name_len); + p[prefix_len + name_len] = '\0'; + + return 0; +} + +static int ocfs2_xattr_list_entries(struct inode *inode, + struct ocfs2_xattr_header *header, + char *buffer, size_t buffer_size) +{ + size_t result = 0; + int i, type, ret; + const char *prefix, *name; + + for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; + type = ocfs2_xattr_get_type(entry); + prefix = ocfs2_xattr_prefix(type); + + if (prefix) { + name = (const char *)header + + le16_to_cpu(entry->xe_name_offset); + + ret = ocfs2_xattr_list_entry(buffer, buffer_size, + &result, prefix, name, + entry->xe_name_len); + if (ret) + return ret; + } + } + + return result; +} + +static int ocfs2_xattr_ibody_list(struct inode *inode, + struct ocfs2_dinode *di, + char *buffer, + size_t buffer_size) +{ + struct ocfs2_xattr_header *header = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + int ret = 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) + return ret; + + header = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size); + + return ret; +} + +static int ocfs2_xattr_block_list(struct inode *inode, + struct ocfs2_dinode *di, + char *buffer, + size_t buffer_size) +{ + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + int ret = 0; + + if (!di->i_xattr_loc) + return ret; + + ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + /*Verify the signature of xattr block*/ + if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, + strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { + ret = -EFAULT; + goto cleanup; + } + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; + ret = ocfs2_xattr_list_entries(inode, header, + buffer, buffer_size); + } else { + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + ret = ocfs2_xattr_tree_list_index_block(inode, xt, + buffer, buffer_size); + } +cleanup: + brelse(blk_bh); + + return ret; +} + +ssize_t ocfs2_listxattr(struct dentry *dentry, + char *buffer, + size_t size) +{ + int ret = 0, i_ret = 0, b_ret = 0; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode); + + if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb))) + return -EOPNOTSUPP; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + return ret; + + ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_read(&oi->ip_xattr_sem); + i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size); + if (i_ret < 0) + b_ret = 0; + else { + if (buffer) { + buffer += i_ret; + size -= i_ret; + } + b_ret = ocfs2_xattr_block_list(dentry->d_inode, di, + buffer, size); + if (b_ret < 0) + i_ret = 0; + } + up_read(&oi->ip_xattr_sem); + ocfs2_inode_unlock(dentry->d_inode, 0); + + brelse(di_bh); + + return i_ret + b_ret; +} + +static int ocfs2_xattr_find_entry(int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_xattr_entry *entry; + size_t name_len; + int i, cmp = 1; + + if (name == NULL) + return -EINVAL; + + name_len = strlen(name); + entry = xs->here; + for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) { + cmp = name_index - ocfs2_xattr_get_type(entry); + if (!cmp) + cmp = name_len - entry->xe_name_len; + if (!cmp) + cmp = memcmp(name, (xs->base + + le16_to_cpu(entry->xe_name_offset)), + name_len); + if (cmp == 0) + break; + entry += 1; + } + xs->here = entry; + + return cmp ? -ENODATA : 0; +} + +static int ocfs2_xattr_get_value_outside(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + void *buffer, + size_t len) +{ + u32 cpos, p_cluster, num_clusters, bpc, clusters; + u64 blkno; + int i, ret = 0; + size_t cplen, blocksize; + struct buffer_head *bh = NULL; + struct ocfs2_extent_list *el; + + el = &xv->xr_list; + clusters = le32_to_cpu(xv->xr_clusters); + bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + blocksize = inode->i_sb->s_blocksize; + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + /* Copy ocfs2_xattr_value */ + for (i = 0; i < num_clusters * bpc; i++, blkno++) { + ret = ocfs2_read_block(inode, blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + cplen = len >= blocksize ? blocksize : len; + memcpy(buffer, bh->b_data, cplen); + len -= cplen; + buffer += cplen; + + brelse(bh); + bh = NULL; + if (len == 0) + break; + } + cpos += num_clusters; + } +out: + return ret; +} + +static int ocfs2_xattr_ibody_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct ocfs2_xattr_value_root *xv; + size_t size; + int ret = 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) + return -ENODATA; + + xs->end = (void *)di + inode->i_sb->s_blocksize; + xs->header = (struct ocfs2_xattr_header *) + (xs->end - le16_to_cpu(di->i_xattr_inline_size)); + xs->base = (void *)xs->header; + xs->here = xs->header->xh_entries; + + ret = ocfs2_xattr_find_entry(name_index, name, xs); + if (ret) + return ret; + size = le64_to_cpu(xs->here->xe_value_size); + if (buffer) { + if (size > buffer_size) + return -ERANGE; + if (ocfs2_xattr_is_local(xs->here)) { + memcpy(buffer, (void *)xs->base + + le16_to_cpu(xs->here->xe_name_offset) + + OCFS2_XATTR_SIZE(xs->here->xe_name_len), size); + } else { + xv = (struct ocfs2_xattr_value_root *) + (xs->base + le16_to_cpu( + xs->here->xe_name_offset) + + OCFS2_XATTR_SIZE(xs->here->xe_name_len)); + ret = ocfs2_xattr_get_value_outside(inode, xv, + buffer, size); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + } + } + + return size; +} + +static int ocfs2_xattr_block_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + struct ocfs2_xattr_value_root *xv; + size_t size; + int ret = -ENODATA, name_offset, name_len, block_off, i; + + if (!di->i_xattr_loc) + return ret; + + memset(&xs->bucket, 0, sizeof(xs->bucket)); + + ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + /*Verify the signature of xattr block*/ + if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, + strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { + ret = -EFAULT; + goto cleanup; + } + + xs->xattr_bh = blk_bh; + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + xs->header = &xb->xb_attrs.xb_header; + xs->base = (void *)xs->header; + xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size; + xs->here = xs->header->xh_entries; + + ret = ocfs2_xattr_find_entry(name_index, name, xs); + } else + ret = ocfs2_xattr_index_block_find(inode, blk_bh, + name_index, + name, xs); + + if (ret) + goto cleanup; + size = le64_to_cpu(xs->here->xe_value_size); + if (buffer) { + ret = -ERANGE; + if (size > buffer_size) + goto cleanup; + + name_offset = le16_to_cpu(xs->here->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xs->here->xe_name_len); + i = xs->here - xs->header->xh_entries; + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + ret = ocfs2_xattr_bucket_get_name_value(inode, + xs->bucket.xh, + i, + &block_off, + &name_offset); + xs->base = xs->bucket.bhs[block_off]->b_data; + } + if (ocfs2_xattr_is_local(xs->here)) { + memcpy(buffer, (void *)xs->base + + name_offset + name_len, size); + } else { + xv = (struct ocfs2_xattr_value_root *) + (xs->base + name_offset + name_len); + ret = ocfs2_xattr_get_value_outside(inode, xv, + buffer, size); + if (ret < 0) { + mlog_errno(ret); + goto cleanup; + } + } + } + ret = size; +cleanup: + for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++) + brelse(xs->bucket.bhs[i]); + memset(&xs->bucket, 0, sizeof(xs->bucket)); + + brelse(blk_bh); + return ret; +} + +/* ocfs2_xattr_get() + * + * Copy an extended attribute into the buffer provided. + * Buffer is NULL to compute the size of buffer required. + */ +int ocfs2_xattr_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size) +{ + int ret; + struct ocfs2_dinode *di = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + ret = -ENODATA; + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_read(&oi->ip_xattr_sem); + ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size, &xis); + if (ret == -ENODATA) + ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, + buffer_size, &xbs); + up_read(&oi->ip_xattr_sem); + ocfs2_inode_unlock(inode, 0); + + brelse(di_bh); + + return ret; +} + +static int __ocfs2_xattr_set_value_outside(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + const void *value, + int value_len) +{ + int ret = 0, i, cp_len, credits; + u16 blocksize = inode->i_sb->s_blocksize; + u32 p_cluster, num_clusters; + u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); + u64 blkno; + struct buffer_head *bh = NULL; + handle_t *handle; + + BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); + + credits = clusters * bpc; + handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &xv->xr_list); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + + for (i = 0; i < num_clusters * bpc; i++, blkno++) { + ret = ocfs2_read_block(inode, blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access(handle, + inode, + bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + cp_len = value_len > blocksize ? blocksize : value_len; + memcpy(bh->b_data, value, cp_len); + value_len -= cp_len; + value += cp_len; + if (cp_len < blocksize) + memset(bh->b_data + cp_len, 0, + blocksize - cp_len); + + ret = ocfs2_journal_dirty(handle, bh); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + brelse(bh); + bh = NULL; + + /* + * XXX: do we need to empty all the following + * blocks in this cluster? + */ + if (!value_len) + break; + } + cpos += num_clusters; + } +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + brelse(bh); + + return ret; +} + +static int ocfs2_xattr_cleanup(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + size_t offs) +{ + handle_t *handle = NULL; + int ret = 0; + size_t name_len = strlen(xi->name); + void *val = xs->base + offs; + size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_XATTR_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + /* Decrease xattr count */ + le16_add_cpu(&xs->header->xh_count, -1); + /* Remove the xattr entry and tree root which has already be set*/ + memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry)); + memset(val, 0, size); + + ret = ocfs2_journal_dirty(handle, xs->xattr_bh); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +static int ocfs2_xattr_update_entry(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + size_t offs) +{ + handle_t *handle = NULL; + int ret = 0; + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_XATTR_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + xs->here->xe_name_offset = cpu_to_le16(offs); + xs->here->xe_value_size = cpu_to_le64(xi->value_len); + if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE) + ocfs2_xattr_set_local(xs->here, 1); + else + ocfs2_xattr_set_local(xs->here, 0); + ocfs2_xattr_hash_entry(inode, xs->header, xs->here); + + ret = ocfs2_journal_dirty(handle, xs->xattr_bh); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +/* + * ocfs2_xattr_set_value_outside() + * + * Set large size value in B tree. + */ +static int ocfs2_xattr_set_value_outside(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + size_t offs) +{ + size_t name_len = strlen(xi->name); + void *val = xs->base + offs; + struct ocfs2_xattr_value_root *xv = NULL; + size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + int ret = 0; + + memset(val, 0, size); + memcpy(val, xi->name, name_len); + xv = (struct ocfs2_xattr_value_root *) + (val + OCFS2_XATTR_SIZE(name_len)); + xv->xr_clusters = 0; + xv->xr_last_eb_blk = 0; + xv->xr_list.l_tree_depth = 0; + xv->xr_list.l_count = cpu_to_le16(1); + xv->xr_list.l_next_free_rec = 0; + + ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv, + xi->value_len); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value, + xi->value_len); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + ret = ocfs2_xattr_update_entry(inode, xi, xs, offs); + if (ret < 0) + mlog_errno(ret); + + return ret; +} + +/* + * ocfs2_xattr_set_entry_local() + * + * Set, replace or remove extended attribute in local. + */ +static void ocfs2_xattr_set_entry_local(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_entry *last, + size_t min_offs) +{ + size_t name_len = strlen(xi->name); + int i; + + if (xi->value && xs->not_found) { + /* Insert the new xattr entry. */ + le16_add_cpu(&xs->header->xh_count, 1); + ocfs2_xattr_set_type(last, xi->name_index); + ocfs2_xattr_set_local(last, 1); + last->xe_name_len = name_len; + } else { + void *first_val; + void *val; + size_t offs, size; + + first_val = xs->base + min_offs; + offs = le16_to_cpu(xs->here->xe_name_offset); + val = xs->base + offs; + + if (le64_to_cpu(xs->here->xe_value_size) > + OCFS2_XATTR_INLINE_SIZE) + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_ROOT_SIZE; + else + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); + + if (xi->value && size == OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->value_len)) { + /* The old and the new value have the + same size. Just replace the value. */ + ocfs2_xattr_set_local(xs->here, 1); + xs->here->xe_value_size = cpu_to_le64(xi->value_len); + /* Clear value bytes. */ + memset(val + OCFS2_XATTR_SIZE(name_len), + 0, + OCFS2_XATTR_SIZE(xi->value_len)); + memcpy(val + OCFS2_XATTR_SIZE(name_len), + xi->value, + xi->value_len); + return; + } + /* Remove the old name+value. */ + memmove(first_val + size, first_val, val - first_val); + memset(first_val, 0, size); + xs->here->xe_name_hash = 0; + xs->here->xe_name_offset = 0; + ocfs2_xattr_set_local(xs->here, 1); + xs->here->xe_value_size = 0; + + min_offs += size; + + /* Adjust all value offsets. */ + last = xs->header->xh_entries; + for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) { + size_t o = le16_to_cpu(last->xe_name_offset); + + if (o < offs) + last->xe_name_offset = cpu_to_le16(o + size); + last += 1; + } + + if (!xi->value) { + /* Remove the old entry. */ + last -= 1; + memmove(xs->here, xs->here + 1, + (void *)last - (void *)xs->here); + memset(last, 0, sizeof(struct ocfs2_xattr_entry)); + le16_add_cpu(&xs->header->xh_count, -1); + } + } + if (xi->value) { + /* Insert the new name+value. */ + size_t size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->value_len); + void *val = xs->base + min_offs - size; + + xs->here->xe_name_offset = cpu_to_le16(min_offs - size); + memset(val, 0, size); + memcpy(val, xi->name, name_len); + memcpy(val + OCFS2_XATTR_SIZE(name_len), + xi->value, + xi->value_len); + xs->here->xe_value_size = cpu_to_le64(xi->value_len); + ocfs2_xattr_set_local(xs->here, 1); + ocfs2_xattr_hash_entry(inode, xs->header, xs->here); + } + + return; +} + +/* + * ocfs2_xattr_set_entry() + * + * Set extended attribute entry into inode or block. + * + * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE, + * We first insert tree root(ocfs2_xattr_value_root) with set_entry_local(), + * then set value in B tree with set_value_outside(). + */ +static int ocfs2_xattr_set_entry(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + int flag) +{ + struct ocfs2_xattr_entry *last; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); + size_t size_l = 0; + handle_t *handle = NULL; + int free, i, ret; + struct ocfs2_xattr_info xi_l = { + .name_index = xi->name_index, + .name = xi->name, + .value = xi->value, + .value_len = xi->value_len, + }; + + /* Compute min_offs, last and free space. */ + last = xs->header->xh_entries; + + for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) { + size_t offs = le16_to_cpu(last->xe_name_offset); + if (offs < min_offs) + min_offs = offs; + last += 1; + } + + free = min_offs - ((void *)last - xs->base) - sizeof(__u32); + if (free < 0) + return -EFAULT; + + if (!xs->not_found) { + size_t size = 0; + if (ocfs2_xattr_is_local(xs->here)) + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); + else + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_ROOT_SIZE; + free += (size + sizeof(struct ocfs2_xattr_entry)); + } + /* Check free space in inode or block */ + if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + if (free < sizeof(struct ocfs2_xattr_entry) + + OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_ROOT_SIZE) { + ret = -ENOSPC; + goto out; + } + size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + xi_l.value = (void *)&def_xv; + xi_l.value_len = OCFS2_XATTR_ROOT_SIZE; + } else if (xi->value) { + if (free < sizeof(struct ocfs2_xattr_entry) + + OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->value_len)) { + ret = -ENOSPC; + goto out; + } + } + + if (!xs->not_found) { + /* For existing extended attribute */ + size_t size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); + size_t offs = le16_to_cpu(xs->here->xe_name_offset); + void *val = xs->base + offs; + + if (ocfs2_xattr_is_local(xs->here) && size == size_l) { + /* Replace existing local xattr with tree root */ + ret = ocfs2_xattr_set_value_outside(inode, xi, xs, + offs); + if (ret < 0) + mlog_errno(ret); + goto out; + } else if (!ocfs2_xattr_is_local(xs->here)) { + /* For existing xattr which has value outside */ + struct ocfs2_xattr_value_root *xv = NULL; + xv = (struct ocfs2_xattr_value_root *)(val + + OCFS2_XATTR_SIZE(name_len)); + + if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + /* + * If new value need set outside also, + * first truncate old value to new value, + * then set new value with set_value_outside(). + */ + ret = ocfs2_xattr_value_truncate(inode, + xs->xattr_bh, + xv, + xi->value_len); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_xattr_set_value_outside(inode, + xv, + xi->value, + xi->value_len); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_update_entry(inode, + xi, + xs, + offs); + if (ret < 0) + mlog_errno(ret); + goto out; + } else { + /* + * If new value need set in local, + * just trucate old value to zero. + */ + ret = ocfs2_xattr_value_truncate(inode, + xs->xattr_bh, + xv, + 0); + if (ret < 0) + mlog_errno(ret); + } + } + } + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, xs->inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (!(flag & OCFS2_INLINE_XATTR_FL)) { + /* set extended attribute in external block. */ + ret = ocfs2_extend_trans(handle, + OCFS2_INODE_UPDATE_CREDITS + + OCFS2_XATTR_BLOCK_UPDATE_CREDITS); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + /* + * Set value in local, include set tree root in local. + * This is the first step for value size >INLINE_SIZE. + */ + ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs); + + if (!(flag & OCFS2_INLINE_XATTR_FL)) { + ret = ocfs2_journal_dirty(handle, xs->xattr_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + } + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) && + (flag & OCFS2_INLINE_XATTR_FL)) { + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int xattrsize = osb->s_xattr_inline_size; + + /* + * Adjust extent record count or inline data size + * to reserve space for extended attribute. + */ + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + struct ocfs2_inline_data *idata = &di->id2.i_data; + le16_add_cpu(&idata->id_count, -xattrsize); + } else if (!(ocfs2_inode_is_fast_symlink(inode))) { + struct ocfs2_extent_list *el = &di->id2.i_list; + le16_add_cpu(&el->l_count, -(xattrsize / + sizeof(struct ocfs2_extent_rec))); + } + di->i_xattr_inline_size = cpu_to_le16(xattrsize); + } + /* Update xattr flag */ + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= flag; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + /* Update inode ctime */ + inode->i_ctime = CURRENT_TIME; + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ret = ocfs2_journal_dirty(handle, xs->inode_bh); + if (ret < 0) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + + if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + /* + * Set value outside in B tree. + * This is the second step for value size > INLINE_SIZE. + */ + size_t offs = le16_to_cpu(xs->here->xe_name_offset); + ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs); + if (ret < 0) { + int ret2; + + mlog_errno(ret); + /* + * If set value outside failed, we have to clean + * the junk tree root we have already set in local. + */ + ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs); + if (ret2 < 0) + mlog_errno(ret2); + } + } +out: + return ret; + +} + +static int ocfs2_remove_value_outside(struct inode*inode, + struct buffer_head *bh, + struct ocfs2_xattr_header *header) +{ + int ret = 0, i; + + for (i = 0; i < le16_to_cpu(header->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; + + if (!ocfs2_xattr_is_local(entry)) { + struct ocfs2_xattr_value_root *xv; + void *val; + + val = (void *)header + + le16_to_cpu(entry->xe_name_offset); + xv = (struct ocfs2_xattr_value_root *) + (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); + ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + } + } + + return ret; +} + +static int ocfs2_xattr_ibody_remove(struct inode *inode, + struct buffer_head *di_bh) +{ + + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_xattr_header *header; + int ret; + + header = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + ret = ocfs2_remove_value_outside(inode, di_bh, header); + + return ret; +} + +static int ocfs2_xattr_block_remove(struct inode *inode, + struct buffer_head *blk_bh) +{ + struct ocfs2_xattr_block *xb; + int ret = 0; + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); + ret = ocfs2_remove_value_outside(inode, blk_bh, header); + } else + ret = ocfs2_delete_xattr_index_block(inode, blk_bh); + + return ret; +} + +static int ocfs2_xattr_free_block(struct inode *inode, + u64 block) +{ + struct inode *xb_alloc_inode; + struct buffer_head *xb_alloc_bh = NULL; + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle; + int ret = 0; + u64 blk, bg_blkno; + u16 bit; + + ret = ocfs2_read_block(inode, block, &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + /*Verify the signature of xattr block*/ + if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, + strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { + ret = -EFAULT; + goto out; + } + + ret = ocfs2_xattr_block_remove(inode, blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + blk = le64_to_cpu(xb->xb_blkno); + bit = le16_to_cpu(xb->xb_suballoc_bit); + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + xb_alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(xb->xb_suballoc_slot)); + if (!xb_alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&xb_alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_free_suballoc_bits(handle, xb_alloc_inode, xb_alloc_bh, + bit, bg_blkno, 1); + if (ret < 0) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); +out_unlock: + ocfs2_inode_unlock(xb_alloc_inode, 1); + brelse(xb_alloc_bh); +out_mutex: + mutex_unlock(&xb_alloc_inode->i_mutex); + iput(xb_alloc_inode); +out: + brelse(blk_bh); + return ret; +} + +/* + * ocfs2_xattr_remove() + * + * Free extended attribute resources associated with this inode. + */ +int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + handle_t *handle; + int ret; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return 0; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + return 0; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_ibody_remove(inode, di_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + if (di->i_xattr_loc) { + ret = ocfs2_xattr_free_block(inode, + le64_to_cpu(di->i_xattr_loc)); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + di->i_xattr_loc = 0; + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL); + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +static int ocfs2_xattr_has_space_inline(struct inode *inode, + struct ocfs2_dinode *di) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + unsigned int xattrsize = OCFS2_SB(inode->i_sb)->s_xattr_inline_size; + int free; + + if (xattrsize < OCFS2_MIN_XATTR_INLINE_SIZE) + return 0; + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + struct ocfs2_inline_data *idata = &di->id2.i_data; + free = le16_to_cpu(idata->id_count) - le64_to_cpu(di->i_size); + } else if (ocfs2_inode_is_fast_symlink(inode)) { + free = ocfs2_fast_symlink_chars(inode->i_sb) - + le64_to_cpu(di->i_size); + } else { + struct ocfs2_extent_list *el = &di->id2.i_list; + free = (le16_to_cpu(el->l_count) - + le16_to_cpu(el->l_next_free_rec)) * + sizeof(struct ocfs2_extent_rec); + } + if (free >= xattrsize) + return 1; + + return 0; +} + +/* + * ocfs2_xattr_ibody_find() + * + * Find extended attribute in inode block and + * fill search info into struct ocfs2_xattr_search. + */ +static int ocfs2_xattr_ibody_find(struct inode *inode, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + int ret; + int has_space = 0; + + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) + return 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + down_read(&oi->ip_alloc_sem); + has_space = ocfs2_xattr_has_space_inline(inode, di); + up_read(&oi->ip_alloc_sem); + if (!has_space) + return 0; + } + + xs->xattr_bh = xs->inode_bh; + xs->end = (void *)di + inode->i_sb->s_blocksize; + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) + xs->header = (struct ocfs2_xattr_header *) + (xs->end - le16_to_cpu(di->i_xattr_inline_size)); + else + xs->header = (struct ocfs2_xattr_header *) + (xs->end - OCFS2_SB(inode->i_sb)->s_xattr_inline_size); + xs->base = (void *)xs->header; + xs->here = xs->header->xh_entries; + + /* Find the named attribute. */ + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_find_entry(name_index, name, xs); + if (ret && ret != -ENODATA) + return ret; + xs->not_found = ret; + } + + return 0; +} + +/* + * ocfs2_xattr_ibody_set() + * + * Set, replace or remove an extended attribute into inode block. + * + */ +static int ocfs2_xattr_ibody_set(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + int ret; + + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) + return -ENOSPC; + + down_write(&oi->ip_alloc_sem); + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + if (!ocfs2_xattr_has_space_inline(inode, di)) { + ret = -ENOSPC; + goto out; + } + } + + ret = ocfs2_xattr_set_entry(inode, xi, xs, + (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); +out: + up_write(&oi->ip_alloc_sem); + + return ret; +} + +/* + * ocfs2_xattr_block_find() + * + * Find extended attribute in external block and + * fill search info into struct ocfs2_xattr_search. + */ +static int ocfs2_xattr_block_find(struct inode *inode, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + int ret = 0; + + if (!di->i_xattr_loc) + return ret; + + ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + /*Verify the signature of xattr block*/ + if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, + strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { + ret = -EFAULT; + goto cleanup; + } + + xs->xattr_bh = blk_bh; + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + xs->header = &xb->xb_attrs.xb_header; + xs->base = (void *)xs->header; + xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size; + xs->here = xs->header->xh_entries; + + ret = ocfs2_xattr_find_entry(name_index, name, xs); + } else + ret = ocfs2_xattr_index_block_find(inode, blk_bh, + name_index, + name, xs); + + if (ret && ret != -ENODATA) { + xs->xattr_bh = NULL; + goto cleanup; + } + xs->not_found = ret; + return 0; +cleanup: + brelse(blk_bh); + + return ret; +} + +/* + * When all the xattrs are deleted from index btree, the ocfs2_xattr_tree + * will be erased and ocfs2_xattr_block will have its ocfs2_xattr_header + * re-initialized. + */ +static int ocfs2_restore_xattr_block(struct inode *inode, + struct ocfs2_xattr_search *xs) +{ + int ret; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; + struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list; + u16 xb_flags = le16_to_cpu(xb->xb_flags); + + BUG_ON(!(xb_flags & OCFS2_XATTR_INDEXED) || + le16_to_cpu(el->l_next_free_rec) != 0); + + handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out; + } + + ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - + offsetof(struct ocfs2_xattr_block, xb_attrs)); + + xb->xb_flags = cpu_to_le16(xb_flags & ~OCFS2_XATTR_INDEXED); + + ocfs2_journal_dirty(handle, xs->xattr_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +/* + * ocfs2_xattr_block_set() + * + * Set, replace or remove an extended attribute into external block. + * + */ +static int ocfs2_xattr_block_set(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + struct buffer_head *new_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle = NULL; + struct ocfs2_xattr_block *xblk = NULL; + u16 suballoc_bit_start; + u32 num_got; + u64 first_blkno; + int ret; + + if (!xs->xattr_bh) { + /* + * Alloc one external block for extended attribute + * outside of inode. + */ + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + handle = ocfs2_start_trans(osb, + OCFS2_XATTR_BLOCK_CREATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access(handle, inode, xs->inode_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, + &suballoc_bit_start, &num_got, + &first_blkno); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + new_bh = sb_getblk(inode->i_sb, first_blkno); + ocfs2_set_new_buffer_uptodate(inode, new_bh); + + ret = ocfs2_journal_access(handle, inode, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + /* Initialize ocfs2_xattr_block */ + xs->xattr_bh = new_bh; + xblk = (struct ocfs2_xattr_block *)new_bh->b_data; + memset(xblk, 0, inode->i_sb->s_blocksize); + strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); + xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num); + xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); + xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); + xblk->xb_blkno = cpu_to_le64(first_blkno); + + xs->header = &xblk->xb_attrs.xb_header; + xs->base = (void *)xs->header; + xs->end = (void *)xblk + inode->i_sb->s_blocksize; + xs->here = xs->header->xh_entries; + + + ret = ocfs2_journal_dirty(handle, new_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + di->i_xattr_loc = cpu_to_le64(first_blkno); + ret = ocfs2_journal_dirty(handle, xs->inode_bh); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(osb, handle); +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + if (ret < 0) + return ret; + } else + xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; + + if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { + /* Set extended attribute into external block */ + ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL); + if (!ret || ret != -ENOSPC) + goto end; + + ret = ocfs2_xattr_create_index_block(inode, xs); + if (ret) + goto end; + } + + ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs); + if (!ret && xblk->xb_attrs.xb_root.xt_list.l_next_free_rec == 0) + ret = ocfs2_restore_xattr_block(inode, xs); + +end: + + return ret; +} + +/* + * ocfs2_xattr_set() + * + * Set, replace or remove an extended attribute for this inode. + * value is NULL to remove an existing extended attribute, else either + * create or replace an extended attribute. + */ +int ocfs2_xattr_set(struct inode *inode, + int name_index, + const char *name, + const void *value, + size_t value_len, + int flags) +{ + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + int ret; + u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + struct ocfs2_xattr_info xi = { + .name_index = name_index, + .name = name, + .value = value, + .value_len = value_len, + }; + + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&OCFS2_I(inode)->ip_xattr_sem); + /* + * Scan inode and external block to find the same name + * extended attribute and collect search infomation. + */ + ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis); + if (ret) + goto cleanup; + if (xis.not_found) { + ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs); + if (ret) + goto cleanup; + } + + if (xis.not_found && xbs.not_found) { + ret = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + ret = 0; + if (!value) + goto cleanup; + } else { + ret = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + } + + if (!value) { + /* Remove existing extended attribute */ + if (!xis.not_found) + ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); + else if (!xbs.not_found) + ret = ocfs2_xattr_block_set(inode, &xi, &xbs); + } else { + /* We always try to set extended attribute into inode first*/ + ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); + if (!ret && !xbs.not_found) { + /* + * If succeed and that extended attribute existing in + * external block, then we will remove it. + */ + xi.value = NULL; + xi.value_len = 0; + ret = ocfs2_xattr_block_set(inode, &xi, &xbs); + } else if (ret == -ENOSPC) { + if (di->i_xattr_loc && !xbs.xattr_bh) { + ret = ocfs2_xattr_block_find(inode, name_index, + name, &xbs); + if (ret) + goto cleanup; + } + /* + * If no space in inode, we will set extended attribute + * into external block. + */ + ret = ocfs2_xattr_block_set(inode, &xi, &xbs); + if (ret) + goto cleanup; + if (!xis.not_found) { + /* + * If succeed and that extended attribute + * existing in inode, we will remove it. + */ + xi.value = NULL; + xi.value_len = 0; + ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); + } + } + } +cleanup: + up_write(&OCFS2_I(inode)->ip_xattr_sem); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + brelse(xbs.xattr_bh); + for (i = 0; i < blk_per_bucket; i++) + brelse(xbs.bucket.bhs[i]); + + return ret; +} + +/* + * Find the xattr extent rec which may contains name_hash. + * e_cpos will be the first name hash of the xattr rec. + * el must be the ocfs2_xattr_header.xb_attrs.xb_root.xt_list. + */ +static int ocfs2_xattr_get_rec(struct inode *inode, + u32 name_hash, + u64 *p_blkno, + u32 *e_cpos, + u32 *num_clusters, + struct ocfs2_extent_list *el) +{ + int ret = 0, i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec = NULL; + u64 e_blkno = 0; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "xattr tree block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) <= name_hash) { + e_blkno = le64_to_cpu(rec->e_blkno); + break; + } + } + + if (!e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0) in xattr", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + *p_blkno = le64_to_cpu(rec->e_blkno); + *num_clusters = le16_to_cpu(rec->e_leaf_clusters); + if (e_cpos) + *e_cpos = le32_to_cpu(rec->e_cpos); +out: + brelse(eb_bh); + return ret; +} + +typedef int (xattr_bucket_func)(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para); + +static int ocfs2_find_xe_in_bucket(struct inode *inode, + struct buffer_head *header_bh, + int name_index, + const char *name, + u32 name_hash, + u16 *xe_index, + int *found) +{ + int i, ret = 0, cmp = 1, block_off, new_offset; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)header_bh->b_data; + size_t name_len = strlen(name); + struct ocfs2_xattr_entry *xe = NULL; + struct buffer_head *name_bh = NULL; + char *xe_name; + + /* + * We don't use binary search in the bucket because there + * may be multiple entries with the same name hash. + */ + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + + if (name_hash > le32_to_cpu(xe->xe_name_hash)) + continue; + else if (name_hash < le32_to_cpu(xe->xe_name_hash)) + break; + + cmp = name_index - ocfs2_xattr_get_type(xe); + if (!cmp) + cmp = name_len - xe->xe_name_len; + if (cmp) + continue; + + ret = ocfs2_xattr_bucket_get_name_value(inode, + xh, + i, + &block_off, + &new_offset); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off, + &name_bh); + if (ret) { + mlog_errno(ret); + break; + } + xe_name = name_bh->b_data + new_offset; + + cmp = memcmp(name, xe_name, name_len); + brelse(name_bh); + name_bh = NULL; + + if (cmp == 0) { + *xe_index = i; + *found = 1; + ret = 0; + break; + } + } + + return ret; +} + +/* + * Find the specified xattr entry in a series of buckets. + * This series start from p_blkno and last for num_clusters. + * The ocfs2_xattr_header.xh_num_buckets of the first bucket contains + * the num of the valid buckets. + * + * Return the buffer_head this xattr should reside in. And if the xattr's + * hash is in the gap of 2 buckets, return the lower bucket. + */ +static int ocfs2_xattr_bucket_find(struct inode *inode, + int name_index, + const char *name, + u32 name_hash, + u64 p_blkno, + u32 first_hash, + u32 num_clusters, + struct ocfs2_xattr_search *xs) +{ + int ret, found = 0; + struct buffer_head *bh = NULL; + struct buffer_head *lower_bh = NULL; + struct ocfs2_xattr_header *xh = NULL; + struct ocfs2_xattr_entry *xe = NULL; + u16 index = 0; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int low_bucket = 0, bucket, high_bucket; + u32 last_hash; + u64 blkno; + + ret = ocfs2_read_block(inode, p_blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = (struct ocfs2_xattr_header *)bh->b_data; + high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1; + + while (low_bucket <= high_bucket) { + brelse(bh); + bh = NULL; + bucket = (low_bucket + high_bucket) / 2; + + blkno = p_blkno + bucket * blk_per_bucket; + + ret = ocfs2_read_block(inode, blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = (struct ocfs2_xattr_header *)bh->b_data; + xe = &xh->xh_entries[0]; + if (name_hash < le32_to_cpu(xe->xe_name_hash)) { + high_bucket = bucket - 1; + continue; + } + + /* + * Check whether the hash of the last entry in our + * bucket is larger than the search one. for an empty + * bucket, the last one is also the first one. + */ + if (xh->xh_count) + xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1]; + + last_hash = le32_to_cpu(xe->xe_name_hash); + + /* record lower_bh which may be the insert place. */ + brelse(lower_bh); + lower_bh = bh; + bh = NULL; + + if (name_hash > le32_to_cpu(xe->xe_name_hash)) { + low_bucket = bucket + 1; + continue; + } + + /* the searched xattr should reside in this bucket if exists. */ + ret = ocfs2_find_xe_in_bucket(inode, lower_bh, + name_index, name, name_hash, + &index, &found); + if (ret) { + mlog_errno(ret); + goto out; + } + break; + } + + /* + * Record the bucket we have found. + * When the xattr's hash value is in the gap of 2 buckets, we will + * always set it to the previous bucket. + */ + if (!lower_bh) { + /* + * We can't find any bucket whose first name_hash is less + * than the find name_hash. + */ + BUG_ON(bh->b_blocknr != p_blkno); + lower_bh = bh; + bh = NULL; + } + xs->bucket.bhs[0] = lower_bh; + xs->bucket.xh = (struct ocfs2_xattr_header *) + xs->bucket.bhs[0]->b_data; + lower_bh = NULL; + + xs->header = xs->bucket.xh; + xs->base = xs->bucket.bhs[0]->b_data; + xs->end = xs->base + inode->i_sb->s_blocksize; + + if (found) { + /* + * If we have found the xattr enty, read all the blocks in + * this bucket. + */ + ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1, + blk_per_bucket - 1, &xs->bucket.bhs[1], + 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + xs->here = &xs->header->xh_entries[index]; + mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name, + (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index); + } else + ret = -ENODATA; + +out: + brelse(bh); + brelse(lower_bh); + return ret; +} + +static int ocfs2_xattr_index_block_find(struct inode *inode, + struct buffer_head *root_bh, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + int ret; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)root_bh->b_data; + struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; + struct ocfs2_extent_list *el = &xb_root->xt_list; + u64 p_blkno = 0; + u32 first_hash, num_clusters = 0; + u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); + + if (le16_to_cpu(el->l_next_free_rec) == 0) + return -ENODATA; + + mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n", + name, name_hash, name_index); + + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash, + &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash); + + mlog(0, "find xattr extent rec %u clusters from %llu, the first hash " + "in the rec is %u\n", num_clusters, p_blkno, first_hash); + + ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash, + p_blkno, first_hash, num_clusters, xs); + +out: + return ret; +} + +static int ocfs2_iterate_xattr_buckets(struct inode *inode, + u64 blkno, + u32 clusters, + xattr_bucket_func *func, + void *para) +{ + int i, j, ret = 0; + int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); + u32 num_buckets = clusters * bpc; + struct ocfs2_xattr_bucket bucket; + + memset(&bucket, 0, sizeof(bucket)); + + mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n", + clusters, blkno); + + for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) { + ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, + bucket.bhs, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data; + /* + * The real bucket num in this series of blocks is stored + * in the 1st bucket. + */ + if (i == 0) + num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets); + + mlog(0, "iterating xattr bucket %llu, first hash %u\n", blkno, + le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash)); + if (func) { + ret = func(inode, &bucket, para); + if (ret) { + mlog_errno(ret); + break; + } + } + + for (j = 0; j < blk_per_bucket; j++) + brelse(bucket.bhs[j]); + memset(&bucket, 0, sizeof(bucket)); + } + +out: + for (j = 0; j < blk_per_bucket; j++) + brelse(bucket.bhs[j]); + + return ret; +} + +struct ocfs2_xattr_tree_list { + char *buffer; + size_t buffer_size; + size_t result; +}; + +static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, + struct ocfs2_xattr_header *xh, + int index, + int *block_off, + int *new_offset) +{ + u16 name_offset; + + if (index < 0 || index >= le16_to_cpu(xh->xh_count)) + return -EINVAL; + + name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset); + + *block_off = name_offset >> inode->i_sb->s_blocksize_bits; + *new_offset = name_offset % inode->i_sb->s_blocksize; + + return 0; +} + +static int ocfs2_list_xattr_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int ret = 0, type; + struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para; + int i, block_off, new_offset; + const char *prefix, *name; + + for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i]; + type = ocfs2_xattr_get_type(entry); + prefix = ocfs2_xattr_prefix(type); + + if (prefix) { + ret = ocfs2_xattr_bucket_get_name_value(inode, + bucket->xh, + i, + &block_off, + &new_offset); + if (ret) + break; + + name = (const char *)bucket->bhs[block_off]->b_data + + new_offset; + ret = ocfs2_xattr_list_entry(xl->buffer, + xl->buffer_size, + &xl->result, + prefix, name, + entry->xe_name_len); + if (ret) + break; + } + } + + return ret; +} + +static int ocfs2_xattr_tree_list_index_block(struct inode *inode, + struct ocfs2_xattr_tree_root *xt, + char *buffer, + size_t buffer_size) +{ + struct ocfs2_extent_list *el = &xt->xt_list; + int ret = 0; + u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0; + u64 p_blkno = 0; + struct ocfs2_xattr_tree_list xl = { + .buffer = buffer, + .buffer_size = buffer_size, + .result = 0, + }; + + if (le16_to_cpu(el->l_next_free_rec) == 0) + return 0; + + while (name_hash > 0) { + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, + &e_cpos, &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, + ocfs2_list_xattr_bucket, + &xl); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (e_cpos == 0) + break; + + name_hash = e_cpos - 1; + } + + ret = xl.result; +out: + return ret; +} + +static int cmp_xe(const void *a, const void *b) +{ + const struct ocfs2_xattr_entry *l = a, *r = b; + u32 l_hash = le32_to_cpu(l->xe_name_hash); + u32 r_hash = le32_to_cpu(r->xe_name_hash); + + if (l_hash > r_hash) + return 1; + if (l_hash < r_hash) + return -1; + return 0; +} + +static void swap_xe(void *a, void *b, int size) +{ + struct ocfs2_xattr_entry *l = a, *r = b, tmp; + + tmp = *l; + memcpy(l, r, sizeof(struct ocfs2_xattr_entry)); + memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry)); +} + +/* + * When the ocfs2_xattr_block is filled up, new bucket will be created + * and all the xattr entries will be moved to the new bucket. + * Note: we need to sort the entries since they are not saved in order + * in the ocfs2_xattr_block. + */ +static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, + struct buffer_head *xb_bh, + struct buffer_head *xh_bh, + struct buffer_head *data_bh) +{ + int i, blocksize = inode->i_sb->s_blocksize; + u16 offset, size, off_change; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)xh_bh->b_data; + u16 count = le16_to_cpu(xb_xh->xh_count); + char *target = xh_bh->b_data, *src = xb_bh->b_data; + + mlog(0, "cp xattr from block %llu to bucket %llu\n", + (unsigned long long)xb_bh->b_blocknr, + (unsigned long long)xh_bh->b_blocknr); + + memset(xh_bh->b_data, 0, blocksize); + if (data_bh) + memset(data_bh->b_data, 0, blocksize); + /* + * Since the xe_name_offset is based on ocfs2_xattr_header, + * there is a offset change corresponding to the change of + * ocfs2_xattr_header's position. + */ + off_change = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + xe = &xb_xh->xh_entries[count - 1]; + offset = le16_to_cpu(xe->xe_name_offset) + off_change; + size = blocksize - offset; + + /* copy all the names and values. */ + if (data_bh) + target = data_bh->b_data; + memcpy(target + offset, src + offset, size); + + /* Init new header now. */ + xh->xh_count = xb_xh->xh_count; + xh->xh_num_buckets = cpu_to_le16(1); + xh->xh_name_value_len = cpu_to_le16(size); + xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size); + + /* copy all the entries. */ + target = xh_bh->b_data; + offset = offsetof(struct ocfs2_xattr_header, xh_entries); + size = count * sizeof(struct ocfs2_xattr_entry); + memcpy(target + offset, (char *)xb_xh + offset, size); + + /* Change the xe offset for all the xe because of the move. */ + off_change = OCFS2_XATTR_BUCKET_SIZE - blocksize + + offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + for (i = 0; i < count; i++) + le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change); + + mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n", + offset, size, off_change); + + sort(target + offset, count, sizeof(struct ocfs2_xattr_entry), + cmp_xe, swap_xe); +} + +/* + * After we move xattr from block to index btree, we have to + * update ocfs2_xattr_search to the new xe and base. + * + * When the entry is in xattr block, xattr_bh indicates the storage place. + * While if the entry is in index b-tree, "bucket" indicates the + * real place of the xattr. + */ +static int ocfs2_xattr_update_xattr_search(struct inode *inode, + struct ocfs2_xattr_search *xs, + struct buffer_head *old_bh, + struct buffer_head *new_bh) +{ + int ret = 0; + char *buf = old_bh->b_data; + struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf; + struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header; + int i, blocksize = inode->i_sb->s_blocksize; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + xs->bucket.bhs[0] = new_bh; + get_bh(new_bh); + xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data; + xs->header = xs->bucket.xh; + + xs->base = new_bh->b_data; + xs->end = xs->base + inode->i_sb->s_blocksize; + + if (!xs->not_found) { + if (OCFS2_XATTR_BUCKET_SIZE != blocksize) { + ret = ocfs2_read_blocks(inode, + xs->bucket.bhs[0]->b_blocknr + 1, + blk_per_bucket - 1, &xs->bucket.bhs[1], + 0); + if (ret) { + mlog_errno(ret); + return ret; + } + + i = xs->here - old_xh->xh_entries; + xs->here = &xs->header->xh_entries[i]; + } + } + + return ret; +} + +static int ocfs2_xattr_create_index_block(struct inode *inode, + struct ocfs2_xattr_search *xs) +{ + int ret, credits = OCFS2_SUBALLOC_ALLOC; + u32 bit_off, len; + u64 blkno; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_alloc_context *data_ac; + struct buffer_head *xh_bh = NULL, *data_bh = NULL; + struct buffer_head *xb_bh = xs->xattr_bh; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_tree_root *xr; + u16 xb_flags = le16_to_cpu(xb->xb_flags); + u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + mlog(0, "create xattr index block for %llu\n", + (unsigned long long)xb_bh->b_blocknr); + + BUG_ON(xb_flags & OCFS2_XATTR_INDEXED); + + ret = ocfs2_reserve_clusters(osb, 1, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * XXX: + * We can use this lock for now, and maybe move to a dedicated mutex + * if performance becomes a problem later. + */ + down_write(&oi->ip_alloc_sem); + + /* + * 3 more credits, one for xattr block update, one for the 1st block + * of the new xattr bucket and one for the value/data. + */ + credits += 3; + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_sem; + } + + ret = ocfs2_journal_access(handle, inode, xb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * The bucket may spread in many blocks, and + * we will only touch the 1st block and the last block + * in the whole bucket(one for entry and one for data). + */ + blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); + + mlog(0, "allocate 1 cluster from %llu to xattr block\n", blkno); + + xh_bh = sb_getblk(inode->i_sb, blkno); + if (!xh_bh) { + ret = -EIO; + mlog_errno(ret); + goto out_commit; + } + + ocfs2_set_new_buffer_uptodate(inode, xh_bh); + + ret = ocfs2_journal_access(handle, inode, xh_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (bpb > 1) { + data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1); + if (!data_bh) { + ret = -EIO; + mlog_errno(ret); + goto out_commit; + } + + ocfs2_set_new_buffer_uptodate(inode, data_bh); + + ret = ocfs2_journal_access(handle, inode, data_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh); + + ocfs2_journal_dirty(handle, xh_bh); + if (data_bh) + ocfs2_journal_dirty(handle, data_bh); + + ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh); + + /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */ + memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - + offsetof(struct ocfs2_xattr_block, xb_attrs)); + + xr = &xb->xb_attrs.xb_root; + xr->xt_clusters = cpu_to_le32(1); + xr->xt_last_eb_blk = 0; + xr->xt_list.l_tree_depth = 0; + xr->xt_list.l_count = cpu_to_le16(ocfs2_xattr_recs_per_xb(inode->i_sb)); + xr->xt_list.l_next_free_rec = cpu_to_le16(1); + + xr->xt_list.l_recs[0].e_cpos = 0; + xr->xt_list.l_recs[0].e_blkno = cpu_to_le64(blkno); + xr->xt_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1); + + xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED); + + ret = ocfs2_journal_dirty(handle, xb_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_sem: + up_write(&oi->ip_alloc_sem); + +out: + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + brelse(xh_bh); + brelse(data_bh); + + return ret; +} + +static int cmp_xe_offset(const void *a, const void *b) +{ + const struct ocfs2_xattr_entry *l = a, *r = b; + u32 l_name_offset = le16_to_cpu(l->xe_name_offset); + u32 r_name_offset = le16_to_cpu(r->xe_name_offset); + + if (l_name_offset < r_name_offset) + return 1; + if (l_name_offset > r_name_offset) + return -1; + return 0; +} + +/* + * defrag a xattr bucket if we find that the bucket has some + * holes beteen name/value pairs. + * We will move all the name/value pairs to the end of the bucket + * so that we can spare some space for insertion. + */ +static int ocfs2_defrag_xattr_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket) +{ + int ret, i; + size_t end, offset, len, value_len; + struct ocfs2_xattr_header *xh; + char *entries, *buf, *bucket_buf = NULL; + u64 blkno = bucket->bhs[0]->b_blocknr; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u16 xh_free_start; + size_t blocksize = inode->i_sb->s_blocksize; + handle_t *handle; + struct buffer_head **bhs; + struct ocfs2_xattr_entry *xe; + + bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, + GFP_NOFS); + if (!bhs) + return -ENOMEM; + + ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, 0); + if (ret) + goto out; + + /* + * In order to make the operation more efficient and generic, + * we copy all the blocks into a contiguous memory and do the + * defragment there, so if anything is error, we will not touch + * the real block. + */ + bucket_buf = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS); + if (!bucket_buf) { + ret = -EIO; + goto out; + } + + buf = bucket_buf; + for (i = 0; i < blk_per_bucket; i++, buf += blocksize) + memcpy(buf, bhs[i]->b_data, blocksize); + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) { + ret = ocfs2_journal_access(handle, inode, bhs[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto commit; + } + } + + xh = (struct ocfs2_xattr_header *)bucket_buf; + entries = (char *)xh->xh_entries; + xh_free_start = le16_to_cpu(xh->xh_free_start); + + mlog(0, "adjust xattr bucket in %llu, count = %u, " + "xh_free_start = %u, xh_name_value_len = %u.\n", + blkno, le16_to_cpu(xh->xh_count), xh_free_start, + le16_to_cpu(xh->xh_name_value_len)); + + /* + * sort all the entries by their offset. + * the largest will be the first, so that we can + * move them to the end one by one. + */ + sort(entries, le16_to_cpu(xh->xh_count), + sizeof(struct ocfs2_xattr_entry), + cmp_xe_offset, swap_xe); + + /* Move all name/values to the end of the bucket. */ + xe = xh->xh_entries; + end = OCFS2_XATTR_BUCKET_SIZE; + for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) { + offset = le16_to_cpu(xe->xe_name_offset); + if (ocfs2_xattr_is_local(xe)) + value_len = OCFS2_XATTR_SIZE( + le64_to_cpu(xe->xe_value_size)); + else + value_len = OCFS2_XATTR_ROOT_SIZE; + len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len; + + /* + * We must make sure that the name/value pair + * exist in the same block. So adjust end to + * the previous block end if needed. + */ + if (((end - len) / blocksize != + (end - 1) / blocksize)) + end = end - end % blocksize; + + if (end > offset + len) { + memmove(bucket_buf + end - len, + bucket_buf + offset, len); + xe->xe_name_offset = cpu_to_le16(end - len); + } + + mlog_bug_on_msg(end < offset + len, "Defrag check failed for " + "bucket %llu\n", (unsigned long long)blkno); + + end -= len; + } + + mlog_bug_on_msg(xh_free_start > end, "Defrag check failed for " + "bucket %llu\n", (unsigned long long)blkno); + + if (xh_free_start == end) + goto commit; + + memset(bucket_buf + xh_free_start, 0, end - xh_free_start); + xh->xh_free_start = cpu_to_le16(end); + + /* sort the entries by their name_hash. */ + sort(entries, le16_to_cpu(xh->xh_count), + sizeof(struct ocfs2_xattr_entry), + cmp_xe, swap_xe); + + buf = bucket_buf; + for (i = 0; i < blk_per_bucket; i++, buf += blocksize) { + memcpy(bhs[i]->b_data, buf, blocksize); + ocfs2_journal_dirty(handle, bhs[i]); + } + +commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + + if (bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(bhs[i]); + } + kfree(bhs); + + kfree(bucket_buf); + return ret; +} + +/* + * Move half nums of the xattr bucket in the previous cluster to this new + * cluster. We only touch the last cluster of the previous extend record. + * + * first_bh is the first buffer_head of a series of bucket in the same + * extent rec and header_bh is the header of one bucket in this cluster. + * They will be updated if we move the data header_bh contains to the new + * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster. + */ +static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode, + handle_t *handle, + struct buffer_head **first_bh, + struct buffer_head **header_bh, + u64 new_blkno, + u64 prev_blkno, + u32 num_clusters, + u32 *first_hash) +{ + int i, ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); + int blocksize = inode->i_sb->s_blocksize; + struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL; + struct ocfs2_xattr_header *new_xh; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)((*first_bh)->b_data); + + BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets); + BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize); + + prev_bh = *first_bh; + get_bh(prev_bh); + xh = (struct ocfs2_xattr_header *)prev_bh->b_data; + + prev_blkno += (num_clusters - 1) * bpc + bpc / 2; + + mlog(0, "move half of xattrs in cluster %llu to %llu\n", + prev_blkno, new_blkno); + + /* + * We need to update the 1st half of the new cluster and + * 1 more for the update of the 1st bucket of the previous + * extent record. + */ + credits = bpc / 2 + 1; + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, prev_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) { + old_bh = new_bh = NULL; + new_bh = sb_getblk(inode->i_sb, new_blkno); + if (!new_bh) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + + ocfs2_set_new_buffer_uptodate(inode, new_bh); + + ret = ocfs2_journal_access(handle, inode, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + brelse(new_bh); + goto out; + } + + ret = ocfs2_read_block(inode, prev_blkno, &old_bh); + if (ret < 0) { + mlog_errno(ret); + brelse(new_bh); + goto out; + } + + memcpy(new_bh->b_data, old_bh->b_data, blocksize); + + if (i == 0) { + new_xh = (struct ocfs2_xattr_header *)new_bh->b_data; + new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2); + + if (first_hash) + *first_hash = le32_to_cpu( + new_xh->xh_entries[0].xe_name_hash); + new_first_bh = new_bh; + get_bh(new_first_bh); + } + + ocfs2_journal_dirty(handle, new_bh); + + if (*header_bh == old_bh) { + brelse(*header_bh); + *header_bh = new_bh; + get_bh(*header_bh); + + brelse(*first_bh); + *first_bh = new_first_bh; + get_bh(*first_bh); + } + brelse(new_bh); + brelse(old_bh); + } + + le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2)); + + ocfs2_journal_dirty(handle, prev_bh); +out: + brelse(prev_bh); + brelse(new_first_bh); + return ret; +} + +static int ocfs2_read_xattr_bucket(struct inode *inode, + u64 blkno, + struct buffer_head **bhs, + int new) +{ + int ret = 0; + u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + if (!new) + return ocfs2_read_blocks(inode, blkno, + blk_per_bucket, bhs, 0); + + for (i = 0; i < blk_per_bucket; i++) { + bhs[i] = sb_getblk(inode->i_sb, blkno + i); + if (bhs[i] == NULL) { + ret = -EIO; + mlog_errno(ret); + break; + } + ocfs2_set_new_buffer_uptodate(inode, bhs[i]); + } + + return ret; +} + +/* + * Move half num of the xattrs in old bucket(blk) to new bucket(new_blk). + * first_hash will record the 1st hash of the new bucket. + */ +static int ocfs2_half_xattr_bucket(struct inode *inode, + handle_t *handle, + u64 blk, + u64 new_blk, + u32 *first_hash, + int new_bucket_head) +{ + int ret, i; + u16 count, start, len, name_value_len, xe_len, name_offset; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + struct buffer_head **s_bhs, **t_bhs = NULL; + struct ocfs2_xattr_header *xh; + struct ocfs2_xattr_entry *xe; + int blocksize = inode->i_sb->s_blocksize; + + mlog(0, "move half of xattrs from bucket %llu to %llu\n", + blk, new_blk); + + s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); + if (!s_bhs) + return -ENOMEM; + + ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, s_bhs[0], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); + if (!t_bhs) { + ret = -ENOMEM; + goto out; + } + + ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) { + ret = ocfs2_journal_access(handle, inode, t_bhs[i], + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* copy the whole bucket to the new first. */ + for (i = 0; i < blk_per_bucket; i++) + memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize); + + /* update the new bucket. */ + xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; + count = le16_to_cpu(xh->xh_count); + start = count / 2; + + /* + * Calculate the total name/value len and xh_free_start for + * the old bucket first. + */ + name_offset = OCFS2_XATTR_BUCKET_SIZE; + name_value_len = 0; + for (i = 0; i < start; i++) { + xe = &xh->xh_entries[i]; + xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + if (ocfs2_xattr_is_local(xe)) + xe_len += + OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); + else + xe_len += OCFS2_XATTR_ROOT_SIZE; + name_value_len += xe_len; + if (le16_to_cpu(xe->xe_name_offset) < name_offset) + name_offset = le16_to_cpu(xe->xe_name_offset); + } + + /* + * Now begin the modification to the new bucket. + * + * In the new bucket, We just move the xattr entry to the beginning + * and don't touch the name/value. So there will be some holes in the + * bucket, and they will be removed when ocfs2_defrag_xattr_bucket is + * called. + */ + xe = &xh->xh_entries[start]; + len = sizeof(struct ocfs2_xattr_entry) * (count - start); + mlog(0, "mv xattr entry len %d from %d to %d\n", len, + (int)((char *)xe - (char *)xh), + (int)((char *)xh->xh_entries - (char *)xh)); + memmove((char *)xh->xh_entries, (char *)xe, len); + xe = &xh->xh_entries[count - start]; + len = sizeof(struct ocfs2_xattr_entry) * start; + memset((char *)xe, 0, len); + + le16_add_cpu(&xh->xh_count, -start); + le16_add_cpu(&xh->xh_name_value_len, -name_value_len); + + /* Calculate xh_free_start for the new bucket. */ + xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + if (ocfs2_xattr_is_local(xe)) + xe_len += + OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); + else + xe_len += OCFS2_XATTR_ROOT_SIZE; + if (le16_to_cpu(xe->xe_name_offset) < + le16_to_cpu(xh->xh_free_start)) + xh->xh_free_start = xe->xe_name_offset; + } + + /* set xh->xh_num_buckets for the new xh. */ + if (new_bucket_head) + xh->xh_num_buckets = cpu_to_le16(1); + else + xh->xh_num_buckets = 0; + + for (i = 0; i < blk_per_bucket; i++) { + ocfs2_journal_dirty(handle, t_bhs[i]); + if (ret) + mlog_errno(ret); + } + + /* store the first_hash of the new bucket. */ + if (first_hash) + *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); + + /* + * Now only update the 1st block of the old bucket. + * Please note that the entry has been sorted already above. + */ + xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; + memset(&xh->xh_entries[start], 0, + sizeof(struct ocfs2_xattr_entry) * (count - start)); + xh->xh_count = cpu_to_le16(start); + xh->xh_free_start = cpu_to_le16(name_offset); + xh->xh_name_value_len = cpu_to_le16(name_value_len); + + ocfs2_journal_dirty(handle, s_bhs[0]); + if (ret) + mlog_errno(ret); + +out: + if (s_bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(s_bhs[i]); + } + kfree(s_bhs); + + if (t_bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(t_bhs[i]); + } + kfree(t_bhs); + + return ret; +} + +/* + * Copy xattr from one bucket to another bucket. + * + * The caller must make sure that the journal transaction + * has enough space for journaling. + */ +static int ocfs2_cp_xattr_bucket(struct inode *inode, + handle_t *handle, + u64 s_blkno, + u64 t_blkno, + int t_is_new) +{ + int ret, i; + int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int blocksize = inode->i_sb->s_blocksize; + struct buffer_head **s_bhs, **t_bhs = NULL; + + BUG_ON(s_blkno == t_blkno); + + mlog(0, "cp bucket %llu to %llu, target is %d\n", + s_blkno, t_blkno, t_is_new); + + s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, + GFP_NOFS); + if (!s_bhs) + return -ENOMEM; + + ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0); + if (ret) + goto out; + + t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, + GFP_NOFS); + if (!t_bhs) { + ret = -ENOMEM; + goto out; + } + + ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new); + if (ret) + goto out; + + for (i = 0; i < blk_per_bucket; i++) { + ret = ocfs2_journal_access(handle, inode, t_bhs[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) { + memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize); + ocfs2_journal_dirty(handle, t_bhs[i]); + } + +out: + if (s_bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(s_bhs[i]); + } + kfree(s_bhs); + + if (t_bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(t_bhs[i]); + } + kfree(t_bhs); + + return ret; +} + +/* + * Copy one xattr cluster from src_blk to to_blk. + * The to_blk will become the first bucket header of the cluster, so its + * xh_num_buckets will be initialized as the bucket num in the cluster. + */ +static int ocfs2_cp_xattr_cluster(struct inode *inode, + handle_t *handle, + struct buffer_head *first_bh, + u64 src_blk, + u64 to_blk, + u32 *first_hash) +{ + int i, ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); + struct buffer_head *bh = NULL; + struct ocfs2_xattr_header *xh; + u64 to_blk_start = to_blk; + + mlog(0, "cp xattrs from cluster %llu to %llu\n", src_blk, to_blk); + + /* + * We need to update the new cluster and 1 more for the update of + * the 1st bucket of the previous extent rec. + */ + credits = bpc + 1; + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, first_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < num_buckets; i++) { + ret = ocfs2_cp_xattr_bucket(inode, handle, + src_blk, to_blk, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + } + + /* update the old bucket header. */ + xh = (struct ocfs2_xattr_header *)first_bh->b_data; + le16_add_cpu(&xh->xh_num_buckets, -num_buckets); + + ocfs2_journal_dirty(handle, first_bh); + + /* update the new bucket header. */ + ret = ocfs2_read_block(inode, to_blk_start, &bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = (struct ocfs2_xattr_header *)bh->b_data; + xh->xh_num_buckets = cpu_to_le16(num_buckets); + + ocfs2_journal_dirty(handle, bh); + + if (first_hash) + *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); +out: + brelse(bh); + return ret; +} + +/* + * Move half of the xattrs in this cluster to the new cluster. + * This function should only be called when bucket size == cluster size. + * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead. + */ +static int ocfs2_half_xattr_cluster(struct inode *inode, + handle_t *handle, + u64 prev_blk, + u64 new_blk, + u32 *first_hash) +{ + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int ret, credits = 2 * blk_per_bucket; + + BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); + + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* Move half of the xattr in start_blk to the next bucket. */ + return ocfs2_half_xattr_bucket(inode, handle, prev_blk, + new_blk, first_hash, 1); +} + +/* + * Move some xattrs from the old cluster to the new one since they are not + * contiguous in ocfs2 xattr tree. + * + * new_blk starts a new separate cluster, and we will move some xattrs from + * prev_blk to it. v_start will be set as the first name hash value in this + * new cluster so that it can be used as e_cpos during tree insertion and + * don't collide with our original b-tree operations. first_bh and header_bh + * will also be updated since they will be used in ocfs2_extend_xattr_bucket + * to extend the insert bucket. + * + * The problem is how much xattr should we move to the new one and when should + * we update first_bh and header_bh? + * 1. If cluster size > bucket size, that means the previous cluster has more + * than 1 bucket, so just move half nums of bucket into the new cluster and + * update the first_bh and header_bh if the insert bucket has been moved + * to the new cluster. + * 2. If cluster_size == bucket_size: + * a) If the previous extent rec has more than one cluster and the insert + * place isn't in the last cluster, copy the entire last cluster to the + * new one. This time, we don't need to upate the first_bh and header_bh + * since they will not be moved into the new cluster. + * b) Otherwise, move the bottom half of the xattrs in the last cluster into + * the new one. And we set the extend flag to zero if the insert place is + * moved into the new allocated cluster since no extend is needed. + */ +static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, + handle_t *handle, + struct buffer_head **first_bh, + struct buffer_head **header_bh, + u64 new_blk, + u64 prev_blk, + u32 prev_clusters, + u32 *v_start, + int *extend) +{ + int ret = 0; + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + + mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n", + prev_blk, prev_clusters, new_blk); + + if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) + ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, + handle, + first_bh, + header_bh, + new_blk, + prev_blk, + prev_clusters, + v_start); + else { + u64 last_blk = prev_blk + bpc * (prev_clusters - 1); + + if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk) + ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh, + last_blk, new_blk, + v_start); + else { + ret = ocfs2_half_xattr_cluster(inode, handle, + last_blk, new_blk, + v_start); + + if ((*header_bh)->b_blocknr == last_blk && extend) + *extend = 0; + } + } + + return ret; +} + +/* + * Add a new cluster for xattr storage. + * + * If the new cluster is contiguous with the previous one, it will be + * appended to the same extent record, and num_clusters will be updated. + * If not, we will insert a new extent for it and move some xattrs in + * the last cluster into the new allocated one. + * We also need to limit the maximum size of a btree leaf, otherwise we'll + * lose the benefits of hashing because we'll have to search large leaves. + * So now the maximum size is OCFS2_MAX_XATTR_TREE_LEAF_SIZE(or clustersize, + * if it's bigger). + * + * first_bh is the first block of the previous extent rec and header_bh + * indicates the bucket we will insert the new xattrs. They will be updated + * when the header_bh is moved into the new cluster. + */ +static int ocfs2_add_new_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + struct buffer_head **first_bh, + struct buffer_head **header_bh, + u32 *num_clusters, + u32 prev_cpos, + u64 prev_blkno, + int *extend) +{ + int ret, credits; + u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 prev_clusters = *num_clusters; + u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0; + u64 block; + handle_t *handle = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_tree et; + + mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, " + "previous xattr blkno = %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + prev_cpos, prev_blkno); + + ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); + + ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, + &data_ac, &meta_ac); + if (ret) { + mlog_errno(ret); + goto leave; + } + + credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el, + clusters_to_add); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto leave; + } + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, + clusters_to_add, &bit_off, &num_bits); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto leave; + } + + BUG_ON(num_bits > clusters_to_add); + + block = ocfs2_clusters_to_blocks(osb->sb, bit_off); + mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n", + num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); + + if (prev_blkno + prev_clusters * bpc == block && + (prev_clusters + num_bits) << osb->s_clustersize_bits <= + OCFS2_MAX_XATTR_TREE_LEAF_SIZE) { + /* + * If this cluster is contiguous with the old one and + * adding this new cluster, we don't surpass the limit of + * OCFS2_MAX_XATTR_TREE_LEAF_SIZE, cool. We will let it be + * initialized and used like other buckets in the previous + * cluster. + * So add it as a contiguous one. The caller will handle + * its init process. + */ + v_start = prev_cpos + prev_clusters; + *num_clusters = prev_clusters + num_bits; + mlog(0, "Add contiguous %u clusters to previous extent rec.\n", + num_bits); + } else { + ret = ocfs2_adjust_xattr_cross_cluster(inode, + handle, + first_bh, + header_bh, + block, + prev_blkno, + prev_clusters, + &v_start, + extend); + if (ret) { + mlog_errno(ret); + goto leave; + } + } + + if (handle->h_buffer_credits < credits) { + /* + * The journal has been restarted before, and don't + * have enough space for the insertion, so extend it + * here. + */ + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto leave; + } + } + mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", + num_bits, block, v_start); + ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block, + num_bits, 0, meta_ac); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + ret = ocfs2_journal_dirty(handle, root_bh); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + +leave: + if (handle) + ocfs2_commit_trans(osb, handle); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return ret; +} + +/* + * Extend a new xattr bucket and move xattrs to the end one by one until + * We meet with start_bh. Only move half of the xattrs to the bucket after it. + */ +static int ocfs2_extend_xattr_bucket(struct inode *inode, + struct buffer_head *first_bh, + struct buffer_head *start_bh, + u32 num_clusters) +{ + int ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u64 start_blk = start_bh->b_blocknr, end_blk; + u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb); + handle_t *handle; + struct ocfs2_xattr_header *first_xh = + (struct ocfs2_xattr_header *)first_bh->b_data; + u16 bucket = le16_to_cpu(first_xh->xh_num_buckets); + + mlog(0, "extend xattr bucket in %llu, xattr extend rec starting " + "from %llu, len = %u\n", start_blk, + (unsigned long long)first_bh->b_blocknr, num_clusters); + + BUG_ON(bucket >= num_buckets); + + end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket; + + /* + * We will touch all the buckets after the start_bh(include it). + * Add one more bucket and modify the first_bh. + */ + credits = end_blk - start_blk + 2 * blk_per_bucket + 1; + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, first_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto commit; + } + + while (end_blk != start_blk) { + ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk, + end_blk + blk_per_bucket, 0); + if (ret) + goto commit; + end_blk -= blk_per_bucket; + } + + /* Move half of the xattr in start_blk to the next bucket. */ + ret = ocfs2_half_xattr_bucket(inode, handle, start_blk, + start_blk + blk_per_bucket, NULL, 0); + + le16_add_cpu(&first_xh->xh_num_buckets, 1); + ocfs2_journal_dirty(handle, first_bh); + +commit: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +/* + * Add new xattr bucket in an extent record and adjust the buckets accordingly. + * xb_bh is the ocfs2_xattr_block. + * We will move all the buckets starting from header_bh to the next place. As + * for this one, half num of its xattrs will be moved to the next one. + * + * We will allocate a new cluster if current cluster is full and adjust + * header_bh and first_bh if the insert place is moved to the new cluster. + */ +static int ocfs2_add_new_xattr_bucket(struct inode *inode, + struct buffer_head *xb_bh, + struct buffer_head *header_bh) +{ + struct ocfs2_xattr_header *first_xh = NULL; + struct buffer_head *first_bh = NULL; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; + struct ocfs2_extent_list *el = &xb_root->xt_list; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)header_bh->b_data; + u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); + struct super_block *sb = inode->i_sb; + struct ocfs2_super *osb = OCFS2_SB(sb); + int ret, num_buckets, extend = 1; + u64 p_blkno; + u32 e_cpos, num_clusters; + + mlog(0, "Add new xattr bucket starting form %llu\n", + (unsigned long long)header_bh->b_blocknr); + + /* + * Add refrence for header_bh here because it may be + * changed in ocfs2_add_new_xattr_cluster and we need + * to free it in the end. + */ + get_bh(header_bh); + + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos, + &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_block(inode, p_blkno, &first_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters; + first_xh = (struct ocfs2_xattr_header *)first_bh->b_data; + + if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) { + ret = ocfs2_add_new_xattr_cluster(inode, + xb_bh, + &first_bh, + &header_bh, + &num_clusters, + e_cpos, + p_blkno, + &extend); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (extend) + ret = ocfs2_extend_xattr_bucket(inode, + first_bh, + header_bh, + num_clusters); + if (ret) + mlog_errno(ret); +out: + brelse(first_bh); + brelse(header_bh); + return ret; +} + +static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + int offs) +{ + int block_off = offs >> inode->i_sb->s_blocksize_bits; + + offs = offs % inode->i_sb->s_blocksize; + return bucket->bhs[block_off]->b_data + offs; +} + +/* + * Handle the normal xattr set, including replace, delete and new. + * + * Note: "local" indicates the real data's locality. So we can't + * just its bucket locality by its length. + */ +static void ocfs2_xattr_set_entry_normal(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + u32 name_hash, + int local) +{ + struct ocfs2_xattr_entry *last, *xe; + int name_len = strlen(xi->name); + struct ocfs2_xattr_header *xh = xs->header; + u16 count = le16_to_cpu(xh->xh_count), start; + size_t blocksize = inode->i_sb->s_blocksize; + char *val; + size_t offs, size, new_size; + + last = &xh->xh_entries[count]; + if (!xs->not_found) { + xe = xs->here; + offs = le16_to_cpu(xe->xe_name_offset); + if (ocfs2_xattr_is_local(xe)) + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); + else + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE); + + /* + * If the new value will be stored outside, xi->value has been + * initalized as an empty ocfs2_xattr_value_root, and the same + * goes with xi->value_len, so we can set new_size safely here. + * See ocfs2_xattr_set_in_bucket. + */ + new_size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->value_len); + + le16_add_cpu(&xh->xh_name_value_len, -size); + if (xi->value) { + if (new_size > size) + goto set_new_name_value; + + /* Now replace the old value with new one. */ + if (local) + xe->xe_value_size = cpu_to_le64(xi->value_len); + else + xe->xe_value_size = 0; + + val = ocfs2_xattr_bucket_get_val(inode, + &xs->bucket, offs); + memset(val + OCFS2_XATTR_SIZE(name_len), 0, + size - OCFS2_XATTR_SIZE(name_len)); + if (OCFS2_XATTR_SIZE(xi->value_len) > 0) + memcpy(val + OCFS2_XATTR_SIZE(name_len), + xi->value, xi->value_len); + + le16_add_cpu(&xh->xh_name_value_len, new_size); + ocfs2_xattr_set_local(xe, local); + return; + } else { + /* + * Remove the old entry if there is more than one. + * We don't remove the last entry so that we can + * use it to indicate the hash value of the empty + * bucket. + */ + last -= 1; + le16_add_cpu(&xh->xh_count, -1); + if (xh->xh_count) { + memmove(xe, xe + 1, + (void *)last - (void *)xe); + memset(last, 0, + sizeof(struct ocfs2_xattr_entry)); + } else + xh->xh_free_start = + cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); + + return; + } + } else { + /* find a new entry for insert. */ + int low = 0, high = count - 1, tmp; + struct ocfs2_xattr_entry *tmp_xe; + + while (low <= high && count) { + tmp = (low + high) / 2; + tmp_xe = &xh->xh_entries[tmp]; + + if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash)) + low = tmp + 1; + else if (name_hash < + le32_to_cpu(tmp_xe->xe_name_hash)) + high = tmp - 1; + else { + low = tmp; + break; + } + } + + xe = &xh->xh_entries[low]; + if (low != count) + memmove(xe + 1, xe, (void *)last - (void *)xe); + + le16_add_cpu(&xh->xh_count, 1); + memset(xe, 0, sizeof(struct ocfs2_xattr_entry)); + xe->xe_name_hash = cpu_to_le32(name_hash); + xe->xe_name_len = name_len; + ocfs2_xattr_set_type(xe, xi->name_index); + } + +set_new_name_value: + /* Insert the new name+value. */ + size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len); + + /* + * We must make sure that the name/value pair + * exists in the same block. + */ + offs = le16_to_cpu(xh->xh_free_start); + start = offs - size; + + if (start >> inode->i_sb->s_blocksize_bits != + (offs - 1) >> inode->i_sb->s_blocksize_bits) { + offs = offs - offs % blocksize; + xh->xh_free_start = cpu_to_le16(offs); + } + + val = ocfs2_xattr_bucket_get_val(inode, + &xs->bucket, offs - size); + xe->xe_name_offset = cpu_to_le16(offs - size); + + memset(val, 0, size); + memcpy(val, xi->name, name_len); + memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len); + + xe->xe_value_size = cpu_to_le64(xi->value_len); + ocfs2_xattr_set_local(xe, local); + xs->here = xe; + le16_add_cpu(&xh->xh_free_start, -size); + le16_add_cpu(&xh->xh_name_value_len, size); + + return; +} + +static int ocfs2_xattr_bucket_handle_journal(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_search *xs, + struct buffer_head **bhs, + u16 bh_num) +{ + int ret = 0, off, block_off; + struct ocfs2_xattr_entry *xe = xs->here; + + /* + * First calculate all the blocks we should journal_access + * and journal_dirty. The first block should always be touched. + */ + ret = ocfs2_journal_dirty(handle, bhs[0]); + if (ret) + mlog_errno(ret); + + /* calc the data. */ + off = le16_to_cpu(xe->xe_name_offset); + block_off = off >> inode->i_sb->s_blocksize_bits; + ret = ocfs2_journal_dirty(handle, bhs[block_off]); + if (ret) + mlog_errno(ret); + + return ret; +} + +/* + * Set the xattr entry in the specified bucket. + * The bucket is indicated by xs->bucket and it should have the enough + * space for the xattr insertion. + */ +static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + u32 name_hash, + int local) +{ + int i, ret; + handle_t *handle = NULL; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n", + (unsigned long)xi->value_len, xi->name_index, + (unsigned long long)xs->bucket.bhs[0]->b_blocknr); + + if (!xs->bucket.bhs[1]) { + ret = ocfs2_read_blocks(inode, + xs->bucket.bhs[0]->b_blocknr + 1, + blk_per_bucket - 1, &xs->bucket.bhs[1], + 0); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, blk_per_bucket); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) { + ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local); + + /*Only dirty the blocks we have touched in set xattr. */ + ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs, + xs->bucket.bhs, blk_per_bucket); + if (ret) + mlog_errno(ret); +out: + ocfs2_commit_trans(osb, handle); + + return ret; +} + +static int ocfs2_xattr_value_update_size(struct inode *inode, + struct buffer_head *xe_bh, + struct ocfs2_xattr_entry *xe, + u64 new_size) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle = NULL; + + handle = ocfs2_start_trans(osb, 1); + if (handle == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, xe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + xe->xe_value_size = cpu_to_le64(new_size); + + ret = ocfs2_journal_dirty(handle, xe_bh); + if (ret < 0) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +/* + * Truncate the specified xe_off entry in xattr bucket. + * bucket is indicated by header_bh and len is the new length. + * Both the ocfs2_xattr_value_root and the entry will be updated here. + * + * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed. + */ +static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, + struct buffer_head *header_bh, + int xe_off, + int len) +{ + int ret, offset; + u64 value_blk; + struct buffer_head *value_bh = NULL; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)header_bh->b_data; + size_t blocksize = inode->i_sb->s_blocksize; + + xe = &xh->xh_entries[xe_off]; + + BUG_ON(!xe || ocfs2_xattr_is_local(xe)); + + offset = le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len); + + value_blk = offset / blocksize; + + /* We don't allow ocfs2_xattr_value to be stored in different block. */ + BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize); + value_blk += header_bh->b_blocknr; + + ret = ocfs2_read_block(inode, value_blk, &value_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + xv = (struct ocfs2_xattr_value_root *) + (value_bh->b_data + offset % blocksize); + + mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n", + xe_off, (unsigned long long)header_bh->b_blocknr, len); + ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len); + if (ret) { + mlog_errno(ret); + goto out; + } + +out: + brelse(value_bh); + return ret; +} + +static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode, + struct ocfs2_xattr_search *xs, + int len) +{ + int ret, offset; + struct ocfs2_xattr_entry *xe = xs->here; + struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base; + + BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe)); + + offset = xe - xh->xh_entries; + ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0], + offset, len); + if (ret) + mlog_errno(ret); + + return ret; +} + +static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, + struct ocfs2_xattr_search *xs, + char *val, + int value_len) +{ + int offset; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_xattr_entry *xe = xs->here; + + BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); + + offset = le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len); + + xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); + + return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len); +} + +static int ocfs2_rm_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, + u32 cpos, + u32 len) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)root_bh->b_data; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree et; + + ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); + + ocfs2_init_dealloc_ctxt(&dealloc); + + mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n", + cpos, len, (unsigned long long)blkno); + + ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len); + + ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); + if (handle == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len); + + ret = ocfs2_journal_dirty(handle, root_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_truncate_log_append(osb, handle, blkno, len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + + mutex_unlock(&tl_inode->i_mutex); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + ocfs2_run_deallocs(osb, &dealloc); + + return ret; +} + +static void ocfs2_xattr_bucket_remove_xs(struct inode *inode, + struct ocfs2_xattr_search *xs) +{ + handle_t *handle = NULL; + struct ocfs2_xattr_header *xh = xs->bucket.xh; + struct ocfs2_xattr_entry *last = &xh->xh_entries[ + le16_to_cpu(xh->xh_count) - 1]; + int ret = 0; + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return; + } + + ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* Remove the old entry. */ + memmove(xs->here, xs->here + 1, + (void *)last - (void *)xs->here); + memset(last, 0, sizeof(struct ocfs2_xattr_entry)); + le16_add_cpu(&xh->xh_count, -1); + + ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +} + +/* + * Set the xattr name/value in the bucket specified in xs. + * + * As the new value in xi may be stored in the bucket or in an outside cluster, + * we divide the whole process into 3 steps: + * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket) + * 2. truncate of the outside cluster(ocfs2_xattr_bucket_value_truncate_xs) + * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside) + * 4. If the clusters for the new outside value can't be allocated, we need + * to free the xattr we allocated in set. + */ +static int ocfs2_xattr_set_in_bucket(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + int ret, local = 1; + size_t value_len; + char *val = (char *)xi->value; + struct ocfs2_xattr_entry *xe = xs->here; + u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name, + strlen(xi->name)); + + if (!xs->not_found && !ocfs2_xattr_is_local(xe)) { + /* + * We need to truncate the xattr storage first. + * + * If both the old and new value are stored to + * outside block, we only need to truncate + * the storage and then set the value outside. + * + * If the new value should be stored within block, + * we should free all the outside block first and + * the modification to the xattr block will be done + * by following steps. + */ + if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) + value_len = xi->value_len; + else + value_len = 0; + + ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, + value_len); + if (ret) + goto out; + + if (value_len) + goto set_value_outside; + } + + value_len = xi->value_len; + /* So we have to handle the inside block change now. */ + if (value_len > OCFS2_XATTR_INLINE_SIZE) { + /* + * If the new value will be stored outside of block, + * initalize a new empty value root and insert it first. + */ + local = 0; + xi->value = &def_xv; + xi->value_len = OCFS2_XATTR_ROOT_SIZE; + } + + ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (value_len <= OCFS2_XATTR_INLINE_SIZE) + goto out; + + /* allocate the space now for the outside block storage. */ + ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, + value_len); + if (ret) { + mlog_errno(ret); + + if (xs->not_found) { + /* + * We can't allocate enough clusters for outside + * storage and we have allocated xattr already, + * so need to remove it. + */ + ocfs2_xattr_bucket_remove_xs(inode, xs); + } + goto out; + } + +set_value_outside: + ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len); +out: + return ret; +} + +/* check whether the xattr bucket is filled up with the same hash value. */ +static int ocfs2_check_xattr_bucket_collision(struct inode *inode, + struct ocfs2_xattr_bucket *bucket) +{ + struct ocfs2_xattr_header *xh = bucket->xh; + + if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash == + xh->xh_entries[0].xe_name_hash) { + mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, " + "hash = %u\n", + (unsigned long long)bucket->bhs[0]->b_blocknr, + le32_to_cpu(xh->xh_entries[0].xe_name_hash)); + return -ENOSPC; + } + + return 0; +} + +static int ocfs2_xattr_set_entry_index_block(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_xattr_header *xh; + struct ocfs2_xattr_entry *xe; + u16 count, header_size, xh_free_start; + int i, free, max_free, need, old; + size_t value_size = 0, name_len = strlen(xi->name); + size_t blocksize = inode->i_sb->s_blocksize; + int ret, allocation = 0; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + mlog_entry("Set xattr %s in xattr index block\n", xi->name); + +try_again: + xh = xs->header; + count = le16_to_cpu(xh->xh_count); + xh_free_start = le16_to_cpu(xh->xh_free_start); + header_size = sizeof(struct ocfs2_xattr_header) + + count * sizeof(struct ocfs2_xattr_entry); + max_free = OCFS2_XATTR_BUCKET_SIZE - + le16_to_cpu(xh->xh_name_value_len) - header_size; + + mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size " + "of %u which exceed block size\n", + (unsigned long long)xs->bucket.bhs[0]->b_blocknr, + header_size); + + if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) + value_size = OCFS2_XATTR_ROOT_SIZE; + else if (xi->value) + value_size = OCFS2_XATTR_SIZE(xi->value_len); + + if (xs->not_found) + need = sizeof(struct ocfs2_xattr_entry) + + OCFS2_XATTR_SIZE(name_len) + value_size; + else { + need = value_size + OCFS2_XATTR_SIZE(name_len); + + /* + * We only replace the old value if the new length is smaller + * than the old one. Otherwise we will allocate new space in the + * bucket to store it. + */ + xe = xs->here; + if (ocfs2_xattr_is_local(xe)) + old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); + else + old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE); + + if (old >= value_size) + need = 0; + } + + free = xh_free_start - header_size; + /* + * We need to make sure the new name/value pair + * can exist in the same block. + */ + if (xh_free_start % blocksize < need) + free -= xh_free_start % blocksize; + + mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, " + "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len =" + " %u\n", xs->not_found, + (unsigned long long)xs->bucket.bhs[0]->b_blocknr, + free, need, max_free, le16_to_cpu(xh->xh_free_start), + le16_to_cpu(xh->xh_name_value_len)); + + if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { + if (need <= max_free && + count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { + /* + * We can create the space by defragment. Since only the + * name/value will be moved, the xe shouldn't be changed + * in xs. + */ + ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh_free_start = le16_to_cpu(xh->xh_free_start); + free = xh_free_start - header_size; + if (xh_free_start % blocksize < need) + free -= xh_free_start % blocksize; + + if (free >= need) + goto xattr_set; + + mlog(0, "Can't get enough space for xattr insert by " + "defragment. Need %u bytes, but we have %d, so " + "allocate new bucket for it.\n", need, free); + } + + /* + * We have to add new buckets or clusters and one + * allocation should leave us enough space for insert. + */ + BUG_ON(allocation); + + /* + * We do not allow for overlapping ranges between buckets. And + * the maximum number of collisions we will allow for then is + * one bucket's worth, so check it here whether we need to + * add a new bucket for the insert. + */ + ret = ocfs2_check_xattr_bucket_collision(inode, &xs->bucket); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_add_new_xattr_bucket(inode, + xs->xattr_bh, + xs->bucket.bhs[0]); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) + brelse(xs->bucket.bhs[i]); + + memset(&xs->bucket, 0, sizeof(xs->bucket)); + + ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh, + xi->name_index, + xi->name, xs); + if (ret && ret != -ENODATA) + goto out; + xs->not_found = ret; + allocation = 1; + goto try_again; + } + +xattr_set: + ret = ocfs2_xattr_set_in_bucket(inode, xi, xs); +out: + mlog_exit(ret); + return ret; +} + +static int ocfs2_delete_xattr_in_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int ret = 0; + struct ocfs2_xattr_header *xh = bucket->xh; + u16 i; + struct ocfs2_xattr_entry *xe; + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = ocfs2_xattr_bucket_value_truncate(inode, + bucket->bhs[0], + i, 0); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +static int ocfs2_delete_xattr_index_block(struct inode *inode, + struct buffer_head *xb_bh) +{ + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list; + int ret = 0; + u32 name_hash = UINT_MAX, e_cpos, num_clusters; + u64 p_blkno; + + if (le16_to_cpu(el->l_next_free_rec) == 0) + return 0; + + while (name_hash > 0) { + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, + &e_cpos, &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, + ocfs2_delete_xattr_in_bucket, + NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_rm_xattr_cluster(inode, xb_bh, + p_blkno, e_cpos, num_clusters); + if (ret) { + mlog_errno(ret); + break; + } + + if (e_cpos == 0) + break; + + name_hash = e_cpos - 1; + } + +out: + return ret; +} + +/* + * 'trusted' attributes support + */ + +#define XATTR_TRUSTED_PREFIX "trusted." + +static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, + size_t list_size, const char *name, + size_t name_len) +{ + const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1; + const size_t total_len = prefix_len + name_len + 1; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name, + buffer, size); +} + +static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value, + size, flags); +} + +struct xattr_handler ocfs2_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ocfs2_xattr_trusted_list, + .get = ocfs2_xattr_trusted_get, + .set = ocfs2_xattr_trusted_set, +}; + + +/* + * 'user' attributes support + */ + +#define XATTR_USER_PREFIX "user." + +static size_t ocfs2_xattr_user_list(struct inode *inode, char *list, + size_t list_size, const char *name, + size_t name_len) +{ + const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1; + const size_t total_len = prefix_len + name_len + 1; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_user_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (strcmp(name, "") == 0) + return -EINVAL; + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return -EOPNOTSUPP; + return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name, + buffer, size); +} + +static int ocfs2_xattr_user_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (strcmp(name, "") == 0) + return -EINVAL; + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return -EOPNOTSUPP; + + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value, + size, flags); +} + +struct xattr_handler ocfs2_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ocfs2_xattr_user_list, + .get = ocfs2_xattr_user_get, + .set = ocfs2_xattr_user_set, +}; diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h new file mode 100644 index 00000000000..c25c7c62a05 --- /dev/null +++ b/fs/ocfs2/xattr.h @@ -0,0 +1,68 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * xattr.h + * + * Function prototypes + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_XATTR_H +#define OCFS2_XATTR_H + +#include <linux/init.h> +#include <linux/xattr.h> + +enum ocfs2_xattr_type { + OCFS2_XATTR_INDEX_USER = 1, + OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS, + OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, + OCFS2_XATTR_INDEX_TRUSTED, + OCFS2_XATTR_INDEX_SECURITY, + OCFS2_XATTR_MAX +}; + +extern struct xattr_handler ocfs2_xattr_user_handler; +extern struct xattr_handler ocfs2_xattr_trusted_handler; + +extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); +extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t); +extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *, + size_t, int); +extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh); +extern struct xattr_handler *ocfs2_xattr_handlers[]; + +static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) +{ + return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE; +} + +static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb) +{ + return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits); +} + +static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb) +{ + u16 len = sb->s_blocksize - + offsetof(struct ocfs2_xattr_header, xh_entries); + + return len / sizeof(struct ocfs2_xattr_entry); +} +#endif /* OCFS2_XATTR_H */ diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index d29047b1b9b..cbf047a847c 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -346,7 +346,7 @@ enum { Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_uid, "uid=%u"}, {Opt_gid, "gid=%u"}, {Opt_umask, "umask=%o"}, diff --git a/fs/open.c b/fs/open.c index 07da9359481..5596049863b 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1141,8 +1141,7 @@ EXPORT_SYMBOL(sys_close); asmlinkage long sys_vhangup(void) { if (capable(CAP_SYS_TTY_CONFIG)) { - /* XXX: this needs locking */ - tty_vhangup(current->signal->tty); + tty_vhangup_self(); return 0; } return -EPERM; diff --git a/fs/partitions/acorn.c b/fs/partitions/acorn.c index 3d3e1663147..a97b477ac0f 100644 --- a/fs/partitions/acorn.c +++ b/fs/partitions/acorn.c @@ -275,16 +275,6 @@ adfspart_check_ADFS(struct parsed_partitions *state, struct block_device *bdev) id = data[0x1fc] & 15; put_dev_sector(sect); -#ifdef CONFIG_BLK_DEV_MFM - if (MAJOR(bdev->bd_dev) == MFM_ACORN_MAJOR) { - extern void xd_set_geometry(struct block_device *, - unsigned char, unsigned char, unsigned int); - xd_set_geometry(bdev, dr->secspertrack, heads, 1); - invalidate_bh_lrus(); - truncate_inode_pages(bdev->bd_inode->i_mapping, 0); - } -#endif - /* * Work out start of non-adfs partition. */ diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 7d6b34e201d..cfb0c80690a 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -120,22 +120,21 @@ static int (*check_part[])(struct parsed_partitions *, struct block_device *) = * a pointer to that same buffer (for convenience). */ -char *disk_name(struct gendisk *hd, int part, char *buf) +char *disk_name(struct gendisk *hd, int partno, char *buf) { - if (!part) + if (!partno) snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) - snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, part); + snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); else - snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, part); + snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); return buf; } const char *bdevname(struct block_device *bdev, char *buf) { - int part = MINOR(bdev->bd_dev) - bdev->bd_disk->first_minor; - return disk_name(bdev->bd_disk, part, buf); + return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf); } EXPORT_SYMBOL(bdevname); @@ -169,7 +168,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev) if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); - state->limit = hd->minors; + state->limit = disk_max_parts(hd); i = res = err = 0; while (!res && check_part[i]) { memset(&state->parts, 0, sizeof(state->parts)); @@ -196,6 +195,14 @@ check_partition(struct gendisk *hd, struct block_device *bdev) return ERR_PTR(res); } +static ssize_t part_partition_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->partno); +} + static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -204,21 +211,22 @@ static ssize_t part_start_show(struct device *dev, return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); } -static ssize_t part_size_show(struct device *dev, - struct device_attribute *attr, char *buf) +ssize_t part_size_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); } -static ssize_t part_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) +ssize_t part_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); + int cpu; - preempt_disable(); - part_round_stats(p); - preempt_enable(); + cpu = part_stat_lock(); + part_round_stats(cpu, p); + part_stat_unlock(); return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " @@ -238,17 +246,17 @@ static ssize_t part_stat_show(struct device *dev, } #ifdef CONFIG_FAIL_MAKE_REQUEST -static ssize_t part_fail_show(struct device *dev, - struct device_attribute *attr, char *buf) +ssize_t part_fail_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); return sprintf(buf, "%d\n", p->make_it_fail); } -static ssize_t part_fail_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +ssize_t part_fail_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { struct hd_struct *p = dev_to_part(dev); int i; @@ -260,6 +268,7 @@ static ssize_t part_fail_store(struct device *dev, } #endif +static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); @@ -269,6 +278,7 @@ static struct device_attribute dev_attr_fail = #endif static struct attribute *part_attrs[] = { + &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, &dev_attr_stat.attr, @@ -300,40 +310,34 @@ struct device_type part_type = { .release = part_release, }; -static inline void partition_sysfs_add_subdir(struct hd_struct *p) +static void delete_partition_rcu_cb(struct rcu_head *head) { - struct kobject *k; + struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); - k = kobject_get(&p->dev.kobj); - p->holder_dir = kobject_create_and_add("holders", k); - kobject_put(k); + part->start_sect = 0; + part->nr_sects = 0; + part_stat_set_all(part, 0); + put_device(part_to_dev(part)); } -static inline void disk_sysfs_add_subdirs(struct gendisk *disk) +void delete_partition(struct gendisk *disk, int partno) { - struct kobject *k; + struct disk_part_tbl *ptbl = disk->part_tbl; + struct hd_struct *part; - k = kobject_get(&disk->dev.kobj); - disk->holder_dir = kobject_create_and_add("holders", k); - disk->slave_dir = kobject_create_and_add("slaves", k); - kobject_put(k); -} - -void delete_partition(struct gendisk *disk, int part) -{ - struct hd_struct *p = disk->part[part-1]; - - if (!p) + if (partno >= ptbl->len) return; - if (!p->nr_sects) + + part = ptbl->part[partno]; + if (!part) return; - disk->part[part-1] = NULL; - p->start_sect = 0; - p->nr_sects = 0; - part_stat_set_all(p, 0); - kobject_put(p->holder_dir); - device_del(&p->dev); - put_device(&p->dev); + + blk_free_devt(part_devt(part)); + rcu_assign_pointer(ptbl->part[partno], NULL); + kobject_put(part->holder_dir); + device_del(part_to_dev(part)); + + call_rcu(&part->rcu_head, delete_partition_rcu_cb); } static ssize_t whole_disk_show(struct device *dev, @@ -344,102 +348,132 @@ static ssize_t whole_disk_show(struct device *dev, static DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, whole_disk_show, NULL); -int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags) +int add_partition(struct gendisk *disk, int partno, + sector_t start, sector_t len, int flags) { struct hd_struct *p; + dev_t devt = MKDEV(0, 0); + struct device *ddev = disk_to_dev(disk); + struct device *pdev; + struct disk_part_tbl *ptbl; + const char *dname; int err; + err = disk_expand_part_tbl(disk, partno); + if (err) + return err; + ptbl = disk->part_tbl; + + if (ptbl->part[partno]) + return -EBUSY; + p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) return -ENOMEM; if (!init_part_stats(p)) { err = -ENOMEM; - goto out0; + goto out_free; } + pdev = part_to_dev(p); + p->start_sect = start; p->nr_sects = len; - p->partno = part; - p->policy = disk->policy; + p->partno = partno; + p->policy = get_disk_ro(disk); - if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1])) - snprintf(p->dev.bus_id, BUS_ID_SIZE, - "%sp%d", disk->dev.bus_id, part); + dname = dev_name(ddev); + if (isdigit(dname[strlen(dname) - 1])) + snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno); else - snprintf(p->dev.bus_id, BUS_ID_SIZE, - "%s%d", disk->dev.bus_id, part); + snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno); + + device_initialize(pdev); + pdev->class = &block_class; + pdev->type = &part_type; + pdev->parent = ddev; - device_initialize(&p->dev); - p->dev.devt = MKDEV(disk->major, disk->first_minor + part); - p->dev.class = &block_class; - p->dev.type = &part_type; - p->dev.parent = &disk->dev; - disk->part[part-1] = p; + err = blk_alloc_devt(p, &devt); + if (err) + goto out_free; + pdev->devt = devt; /* delay uevent until 'holders' subdir is created */ - p->dev.uevent_suppress = 1; - err = device_add(&p->dev); + pdev->uevent_suppress = 1; + err = device_add(pdev); if (err) - goto out1; - partition_sysfs_add_subdir(p); - p->dev.uevent_suppress = 0; + goto out_put; + + err = -ENOMEM; + p->holder_dir = kobject_create_and_add("holders", &pdev->kobj); + if (!p->holder_dir) + goto out_del; + + pdev->uevent_suppress = 0; if (flags & ADDPART_FLAG_WHOLEDISK) { - err = device_create_file(&p->dev, &dev_attr_whole_disk); + err = device_create_file(pdev, &dev_attr_whole_disk); if (err) - goto out2; + goto out_del; } + /* everything is up and running, commence */ + INIT_RCU_HEAD(&p->rcu_head); + rcu_assign_pointer(ptbl->part[partno], p); + /* suppress uevent if the disk supresses it */ - if (!disk->dev.uevent_suppress) - kobject_uevent(&p->dev.kobj, KOBJ_ADD); + if (!ddev->uevent_suppress) + kobject_uevent(&pdev->kobj, KOBJ_ADD); return 0; -out2: - device_del(&p->dev); -out1: - put_device(&p->dev); - free_part_stats(p); -out0: +out_free: kfree(p); return err; +out_del: + kobject_put(p->holder_dir); + device_del(pdev); +out_put: + put_device(pdev); + blk_free_devt(devt); + return err; } /* Not exported, helper to add_disk(). */ void register_disk(struct gendisk *disk) { + struct device *ddev = disk_to_dev(disk); struct block_device *bdev; + struct disk_part_iter piter; + struct hd_struct *part; char *s; - int i; - struct hd_struct *p; int err; - disk->dev.parent = disk->driverfs_dev; - disk->dev.devt = MKDEV(disk->major, disk->first_minor); + ddev->parent = disk->driverfs_dev; - strlcpy(disk->dev.bus_id, disk->disk_name, BUS_ID_SIZE); + strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE); /* ewww... some of these buggers have / in the name... */ - s = strchr(disk->dev.bus_id, '/'); + s = strchr(ddev->bus_id, '/'); if (s) *s = '!'; /* delay uevents, until we scanned partition table */ - disk->dev.uevent_suppress = 1; + ddev->uevent_suppress = 1; - if (device_add(&disk->dev)) + if (device_add(ddev)) return; #ifndef CONFIG_SYSFS_DEPRECATED - err = sysfs_create_link(block_depr, &disk->dev.kobj, - kobject_name(&disk->dev.kobj)); + err = sysfs_create_link(block_depr, &ddev->kobj, + kobject_name(&ddev->kobj)); if (err) { - device_del(&disk->dev); + device_del(ddev); return; } #endif - disk_sysfs_add_subdirs(disk); + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); /* No minors to use for partitions */ - if (disk->minors == 1) + if (!disk_partitionable(disk)) goto exit; /* No such device (e.g., media were just removed) */ @@ -458,51 +492,80 @@ void register_disk(struct gendisk *disk) exit: /* announce disk after possible partitions are created */ - disk->dev.uevent_suppress = 0; - kobject_uevent(&disk->dev.kobj, KOBJ_ADD); + ddev->uevent_suppress = 0; + kobject_uevent(&ddev->kobj, KOBJ_ADD); /* announce possible partitions */ - for (i = 1; i < disk->minors; i++) { - p = disk->part[i-1]; - if (!p || !p->nr_sects) - continue; - kobject_uevent(&p->dev.kobj, KOBJ_ADD); - } + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + disk_part_iter_exit(&piter); } int rescan_partitions(struct gendisk *disk, struct block_device *bdev) { + struct disk_part_iter piter; + struct hd_struct *part; struct parsed_partitions *state; - int p, res; + int p, highest, res; if (bdev->bd_part_count) return -EBUSY; res = invalidate_partition(disk, 0); if (res) return res; - bdev->bd_invalidated = 0; - for (p = 1; p < disk->minors; p++) - delete_partition(disk, p); + + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); + while ((part = disk_part_iter_next(&piter))) + delete_partition(disk, part->partno); + disk_part_iter_exit(&piter); + if (disk->fops->revalidate_disk) disk->fops->revalidate_disk(disk); + check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) return 0; if (IS_ERR(state)) /* I/O error reading the partition table */ return -EIO; /* tell userspace that the media / partition table may have changed */ - kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE); + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + + /* Detect the highest partition number and preallocate + * disk->part_tbl. This is an optimization and not strictly + * necessary. + */ + for (p = 1, highest = 0; p < state->limit; p++) + if (state->parts[p].size) + highest = p; + disk_expand_part_tbl(disk, highest); + + /* add partitions */ for (p = 1; p < state->limit; p++) { sector_t size = state->parts[p].size; sector_t from = state->parts[p].from; if (!size) continue; - if (from + size > get_capacity(disk)) { - printk(KERN_ERR " %s: p%d exceeds device capacity\n", - disk->disk_name, p); + if (from >= get_capacity(disk)) { + printk(KERN_WARNING + "%s: p%d ignored, start %llu is behind the end of the disk\n", + disk->disk_name, p, (unsigned long long) from); continue; } + if (from + size > get_capacity(disk)) { + /* + * we can not ignore partitions of broken tables + * created by for example camera firmware, but we + * limit them to the end of the disk to avoid + * creating invalid block devices + */ + printk(KERN_WARNING + "%s: p%d size %llu limited to end of disk\n", + disk->disk_name, p, (unsigned long long) size); + size = get_capacity(disk) - from; + } res = add_partition(disk, p, from, size, state->parts[p].flags); if (res) { printk(KERN_ERR " %s: p%d could not be added: %d\n", @@ -541,25 +604,31 @@ EXPORT_SYMBOL(read_dev_sector); void del_gendisk(struct gendisk *disk) { - int p; + struct disk_part_iter piter; + struct hd_struct *part; /* invalidate stuff */ - for (p = disk->minors - 1; p > 0; p--) { - invalidate_partition(disk, p); - delete_partition(disk, p); + disk_part_iter_init(&piter, disk, + DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); + while ((part = disk_part_iter_next(&piter))) { + invalidate_partition(disk, part->partno); + delete_partition(disk, part->partno); } + disk_part_iter_exit(&piter); + invalidate_partition(disk, 0); - disk->capacity = 0; + blk_free_devt(disk_to_dev(disk)->devt); + set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; unlink_gendisk(disk); - disk_stat_set_all(disk, 0); - disk->stamp = 0; + part_stat_set_all(&disk->part0, 0); + disk->part0.stamp = 0; - kobject_put(disk->holder_dir); + kobject_put(disk->part0.holder_dir); kobject_put(disk->slave_dir); disk->driverfs_dev = NULL; #ifndef CONFIG_SYSFS_DEPRECATED - sysfs_remove_link(block_depr, disk->dev.bus_id); + sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); #endif - device_del(&disk->dev); + device_del(disk_to_dev(disk)); } diff --git a/fs/partitions/check.h b/fs/partitions/check.h index 17ae8ecd9e8..98dbe1a8452 100644 --- a/fs/partitions/check.h +++ b/fs/partitions/check.h @@ -5,15 +5,13 @@ * add_gd_partition adds a partitions details to the devices partition * description. */ -enum { MAX_PART = 256 }; - struct parsed_partitions { char name[BDEVNAME_SIZE]; struct { sector_t from; sector_t size; int flags; - } parts[MAX_PART]; + } parts[DISK_MAX_PARTS]; int next; int limit; }; diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig index 73cd7a418f0..50f8f0600f0 100644 --- a/fs/proc/Kconfig +++ b/fs/proc/Kconfig @@ -57,3 +57,13 @@ config PROC_SYSCTL As it is generally a good thing, you should say Y here unless building a kernel for install/rescue disks or your system is very limited in memory. + +config PROC_PAGE_MONITOR + default y + depends on PROC_FS && MMU + bool "Enable /proc page monitoring" if EMBEDDED + help + Various /proc files exist to monitor process memory utilization: + /proc/pid/smaps, /proc/pid/clear_refs, /proc/pid/pagemap, + /proc/kpagecount, and /proc/kpageflags. Disabling these + interfaces will reduce the size of the kernel by approximately 4kb. diff --git a/fs/proc/array.c b/fs/proc/array.c index 0d6eb33597c..bb9f4b05703 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -86,11 +86,6 @@ #include <asm/processor.h> #include "internal.h" -/* Gcc optimizes away "strlen(x)" for constant x */ -#define ADDBUF(buffer, string) \ -do { memcpy(buffer, string, strlen(string)); \ - buffer += strlen(string); } while (0) - static inline void task_name(struct seq_file *m, struct task_struct *p) { int i; @@ -261,7 +256,6 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) sigemptyset(&ignored); sigemptyset(&caught); - rcu_read_lock(); if (lock_task_sighand(p, &flags)) { pending = p->pending.signal; shpending = p->signal->shared_pending.signal; @@ -272,7 +266,6 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) qlim = p->signal->rlim[RLIMIT_SIGPENDING].rlim_cur; unlock_task_sighand(p, &flags); } - rcu_read_unlock(); seq_printf(m, "Threads:\t%d\n", num_threads); seq_printf(m, "SigQ:\t%lu/%lu\n", qsize, qlim); @@ -337,65 +330,6 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns, return 0; } -/* - * Use precise platform statistics if available: - */ -#ifdef CONFIG_VIRT_CPU_ACCOUNTING -static cputime_t task_utime(struct task_struct *p) -{ - return p->utime; -} - -static cputime_t task_stime(struct task_struct *p) -{ - return p->stime; -} -#else -static cputime_t task_utime(struct task_struct *p) -{ - clock_t utime = cputime_to_clock_t(p->utime), - total = utime + cputime_to_clock_t(p->stime); - u64 temp; - - /* - * Use CFS's precise accounting: - */ - temp = (u64)nsec_to_clock_t(p->se.sum_exec_runtime); - - if (total) { - temp *= utime; - do_div(temp, total); - } - utime = (clock_t)temp; - - p->prev_utime = max(p->prev_utime, clock_t_to_cputime(utime)); - return p->prev_utime; -} - -static cputime_t task_stime(struct task_struct *p) -{ - clock_t stime; - - /* - * Use CFS's precise accounting. (we subtract utime from - * the total, to make sure the total observed by userspace - * grows monotonically - apps rely on that): - */ - stime = nsec_to_clock_t(p->se.sum_exec_runtime) - - cputime_to_clock_t(task_utime(p)); - - if (stime >= 0) - p->prev_stime = max(p->prev_stime, clock_t_to_cputime(stime)); - - return p->prev_stime; -} -#endif - -static cputime_t task_gtime(struct task_struct *p) -{ - return p->gtime; -} - static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task, int whole) { @@ -454,20 +388,20 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, /* add up live thread stats at the group level */ if (whole) { + struct task_cputime cputime; struct task_struct *t = task; do { min_flt += t->min_flt; maj_flt += t->maj_flt; - utime = cputime_add(utime, task_utime(t)); - stime = cputime_add(stime, task_stime(t)); gtime = cputime_add(gtime, task_gtime(t)); t = next_thread(t); } while (t != task); min_flt += sig->min_flt; maj_flt += sig->maj_flt; - utime = cputime_add(utime, sig->utime); - stime = cputime_add(stime, sig->stime); + thread_group_cputime(task, &cputime); + utime = cputime.utime; + stime = cputime.stime; gtime = cputime_add(gtime, sig->gtime); } diff --git a/fs/proc/base.c b/fs/proc/base.c index a28840b11b8..b5918ae8ca7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -148,9 +148,6 @@ static unsigned int pid_entry_count_dirs(const struct pid_entry *entries, return count; } -int maps_protect; -EXPORT_SYMBOL(maps_protect); - static struct fs_struct *get_fs_struct(struct task_struct *task) { struct fs_struct *fs; @@ -164,7 +161,6 @@ static struct fs_struct *get_fs_struct(struct task_struct *task) static int get_nr_threads(struct task_struct *tsk) { - /* Must be called with the rcu_read_lock held */ unsigned long flags; int count = 0; @@ -471,14 +467,10 @@ static int proc_pid_limits(struct task_struct *task, char *buffer) struct rlimit rlim[RLIM_NLIMITS]; - rcu_read_lock(); - if (!lock_task_sighand(task,&flags)) { - rcu_read_unlock(); + if (!lock_task_sighand(task, &flags)) return 0; - } memcpy(rlim, task->signal->rlim, sizeof(struct rlimit) * RLIM_NLIMITS); unlock_task_sighand(task, &flags); - rcu_read_unlock(); /* * print the file header @@ -2443,6 +2435,13 @@ static int proc_tgid_io_accounting(struct task_struct *task, char *buffer) } #endif /* CONFIG_TASK_IO_ACCOUNTING */ +static int proc_pid_personality(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + seq_printf(m, "%08x\n", task->personality); + return 0; +} + /* * Thread groups */ @@ -2459,6 +2458,7 @@ static const struct pid_entry tgid_base_stuff[] = { REG("environ", S_IRUSR, environ), INF("auxv", S_IRUSR, pid_auxv), ONE("status", S_IRUGO, pid_status), + ONE("personality", S_IRUSR, pid_personality), INF("limits", S_IRUSR, pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, pid_sched), @@ -2794,6 +2794,7 @@ static const struct pid_entry tid_base_stuff[] = { REG("environ", S_IRUSR, environ), INF("auxv", S_IRUSR, pid_auxv), ONE("status", S_IRUGO, pid_status), + ONE("personality", S_IRUSR, pid_personality), INF("limits", S_IRUSR, pid_limits), #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, pid_sched), @@ -3088,9 +3089,7 @@ static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct generic_fillattr(inode, stat); if (p) { - rcu_read_lock(); stat->nlink += get_nr_threads(p); - rcu_read_unlock(); put_task_struct(p); } diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 4fb81e9c94e..7821589a17d 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -330,6 +330,7 @@ retry: spin_lock(&proc_inum_lock); ida_remove(&proc_inum_ida, i); spin_unlock(&proc_inum_lock); + return 0; } return PROC_DYNAMIC_FIRST + i; } @@ -546,8 +547,8 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp for (tmp = dir->subdir; tmp; tmp = tmp->next) if (strcmp(tmp->name, dp->name) == 0) { - printk(KERN_WARNING "proc_dir_entry '%s' already " - "registered\n", dp->name); + printk(KERN_WARNING "proc_dir_entry '%s/%s' already registered\n", + dir->name, dp->name); dump_stack(); break; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 8bb03f056c2..c6b4fa7e3b4 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -342,7 +342,7 @@ static int proc_reg_open(struct inode *inode, struct file *file) if (!pde->proc_fops) { spin_unlock(&pde->pde_unload_lock); kfree(pdeo); - return rv; + return -EINVAL; } pde->pde_users++; open = pde->proc_fops->open; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 442202314d5..3bfb7b8747b 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -45,8 +45,6 @@ do { \ extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *); #endif -extern int maps_protect; - extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index ded96986296..59ea42e1ef0 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -24,6 +24,7 @@ #include <linux/tty.h> #include <linux/string.h> #include <linux/mman.h> +#include <linux/quicklist.h> #include <linux/proc_fs.h> #include <linux/ioport.h> #include <linux/mm.h> @@ -44,7 +45,6 @@ #include <linux/blkdev.h> #include <linux/hugetlb.h> #include <linux/jiffies.h> -#include <linux/sysrq.h> #include <linux/vmalloc.h> #include <linux/crash_dump.h> #include <linux/pid_namespace.h> @@ -67,7 +67,6 @@ extern int get_hardware_list(char *); extern int get_stram_list(char *); extern int get_exec_domain_list(char *); -extern int get_dma_list(char *); static int proc_calc_metrics(char *page, char **start, off_t off, int count, int *eof, int len) @@ -182,6 +181,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off, "SReclaimable: %8lu kB\n" "SUnreclaim: %8lu kB\n" "PageTables: %8lu kB\n" +#ifdef CONFIG_QUICKLIST + "Quicklists: %8lu kB\n" +#endif "NFS_Unstable: %8lu kB\n" "Bounce: %8lu kB\n" "WritebackTmp: %8lu kB\n" @@ -214,6 +216,9 @@ static int meminfo_read_proc(char *page, char **start, off_t off, K(global_page_state(NR_SLAB_RECLAIMABLE)), K(global_page_state(NR_SLAB_UNRECLAIMABLE)), K(global_page_state(NR_PAGETABLE)), +#ifdef CONFIG_QUICKLIST + K(quicklist_total_size()), +#endif K(global_page_state(NR_UNSTABLE_NFS)), K(global_page_state(NR_BOUNCE)), K(global_page_state(NR_WRITEBACK_TEMP)), @@ -677,6 +682,7 @@ static int cmdline_read_proc(char *page, char **start, off_t off, return proc_calc_metrics(page, start, off, count, eof, len); } +#ifdef CONFIG_FILE_LOCKING static int locks_open(struct inode *inode, struct file *filp) { return seq_open(filp, &locks_seq_operations); @@ -688,6 +694,7 @@ static const struct file_operations proc_locks_operations = { .llseek = seq_lseek, .release = seq_release, }; +#endif /* CONFIG_FILE_LOCKING */ static int execdomains_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -696,28 +703,6 @@ static int execdomains_read_proc(char *page, char **start, off_t off, return proc_calc_metrics(page, start, off, count, eof, len); } -#ifdef CONFIG_MAGIC_SYSRQ -/* - * writing 'C' to /proc/sysrq-trigger is like sysrq-C - */ -static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf, - size_t count, loff_t *ppos) -{ - if (count) { - char c; - - if (get_user(c, buf)) - return -EFAULT; - __handle_sysrq(c, NULL, 0); - } - return count; -} - -static const struct file_operations proc_sysrq_trigger_operations = { - .write = write_sysrq_trigger, -}; -#endif - #ifdef CONFIG_PROC_PAGE_MONITOR #define KPMSIZE sizeof(u64) #define KPMMASK (KPMSIZE - 1) @@ -881,7 +866,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_PRINTK proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); #endif +#ifdef CONFIG_FILE_LOCKING proc_create("locks", 0, NULL, &proc_locks_operations); +#endif proc_create("devices", 0, NULL, &proc_devinfo_operations); proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); #ifdef CONFIG_BLOCK @@ -924,7 +911,4 @@ void __init proc_misc_init(void) #ifdef CONFIG_PROC_VMCORE proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); #endif -#ifdef CONFIG_MAGIC_SYSRQ - proc_create("sysrq-trigger", S_IWUSR, NULL, &proc_sysrq_trigger_operations); -#endif } diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index f9a8b892718..945a81043ba 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -66,7 +66,7 @@ static struct ctl_table *find_in_table(struct ctl_table *p, struct qstr *name) return NULL; } -struct ctl_table_header *grab_header(struct inode *inode) +static struct ctl_table_header *grab_header(struct inode *inode) { if (PROC_I(inode)->sysctl) return sysctl_head_grab(PROC_I(inode)->sysctl); @@ -395,10 +395,10 @@ static struct dentry_operations proc_sys_dentry_operations = { .d_compare = proc_sys_compare, }; -static struct proc_dir_entry *proc_sys_root; - int proc_sys_init(void) { + struct proc_dir_entry *proc_sys_root; + proc_sys_root = proc_mkdir("sys", NULL); proc_sys_root->proc_iops = &proc_sys_dir_operations; proc_sys_root->proc_fops = &proc_sys_dir_file_operations; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 73d1891ee62..4806830ea2a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -210,9 +210,6 @@ static int show_map(struct seq_file *m, void *v) dev_t dev = 0; int len; - if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) - return -EACCES; - if (file) { struct inode *inode = vma->vm_file->f_path.dentry->d_inode; dev = inode->i_sb->s_dev; @@ -742,22 +739,11 @@ const struct file_operations proc_pagemap_operations = { #ifdef CONFIG_NUMA extern int show_numa_map(struct seq_file *m, void *v); -static int show_numa_map_checked(struct seq_file *m, void *v) -{ - struct proc_maps_private *priv = m->private; - struct task_struct *task = priv->task; - - if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) - return -EACCES; - - return show_numa_map(m, v); -} - static const struct seq_operations proc_pid_numa_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, - .show = show_numa_map_checked + .show = show_numa_map, }; static int numa_maps_open(struct inode *inode, struct file *file) diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 5d84e7121df..219bd79ea89 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -110,11 +110,6 @@ int task_statm(struct mm_struct *mm, int *shared, int *text, static int show_map(struct seq_file *m, void *_vml) { struct vm_list_struct *vml = _vml; - struct proc_maps_private *priv = m->private; - struct task_struct *task = priv->task; - - if (maps_protect && !ptrace_may_access(task, PTRACE_MODE_READ)) - return -EACCES; return nommu_vma_show(m, vml->vma); } diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 9ac0f5e064e..841368b87a2 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -165,14 +165,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, return acc; } -static int open_vmcore(struct inode *inode, struct file *filp) -{ - return 0; -} - const struct file_operations proc_vmcore_operations = { .read = read_vmcore, - .open = open_vmcore, }; static struct vmcore* __init get_new_element(void) diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 52312ec93ff..5145cb9125a 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -58,7 +58,7 @@ const struct inode_operations ramfs_file_inode_operations = { * size 0 on the assumption that it's going to be used for an mmap of shared * memory */ -static int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) +int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) { struct pagevec lru_pvec; unsigned long npages, xpages, loop, limit; diff --git a/fs/readdir.c b/fs/readdir.c index 4e026e5407f..93a7559bbfd 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -80,8 +80,10 @@ static int fillonedir(void * __buf, const char * name, int namlen, loff_t offset if (buf->result) return -EINVAL; d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->result = -EOVERFLOW; return -EOVERFLOW; + } buf->result++; dirent = buf->dirent; if (!access_ok(VERIFY_WRITE, dirent, @@ -155,8 +157,10 @@ static int filldir(void * __buf, const char * name, int namlen, loff_t offset, if (reclen > buf->count) return -EINVAL; d_ino = ino; - if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) + if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { + buf->error = -EOVERFLOW; return -EOVERFLOW; + } dirent = buf->previous; if (dirent) { if (__put_user(offset, &dirent->d_off)) diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index b9dbeeca704..37173fa07d1 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -8,8 +8,6 @@ /* proc info support a la one created by Sizif@Botik.RU for PGC */ -/* $Id: procfs.c,v 1.1.8.2 2001/07/15 17:08:42 god Exp $ */ - #include <linux/module.h> #include <linux/time.h> #include <linux/seq_file.h> @@ -621,7 +619,6 @@ int reiserfs_global_version_in_proc(char *buffer, char **start, #endif /* - * $Log: procfs.c,v $ * Revision 1.1.8.2 2001/07/15 17:08:42 god * . use get_super() in procfs.c * . remove remove_save_link() from reiserfs_do_truncate() diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index bb3cb5b7cdb..ad92461cbfc 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -155,7 +155,7 @@ static struct dentry *get_xa_file_dentry(const struct inode *inode, xadir = open_xa_dir(inode, flags); if (IS_ERR(xadir)) { return ERR_CAST(xadir); - } else if (xadir && !xadir->d_inode) { + } else if (!xadir->d_inode) { dput(xadir); return ERR_PTR(-ENODATA); } diff --git a/fs/seq_file.c b/fs/seq_file.c index 5d54205e486..bd20f7f5a93 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -108,9 +108,9 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) goto Done; } /* we need at least one record in buffer */ + pos = m->index; + p = m->op->start(m, &pos); while (1) { - pos = m->index; - p = m->op->start(m, &pos); err = PTR_ERR(p); if (!p || IS_ERR(p)) break; @@ -119,6 +119,11 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) break; if (unlikely(err)) m->count = 0; + if (unlikely(!m->count)) { + p = m->op->next(m, p, &pos); + m->index = pos; + continue; + } if (m->count < m->size) goto Fill; m->op->stop(m, p); @@ -128,6 +133,8 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) goto Enomem; m->count = 0; m->version = 0; + pos = m->index; + p = m->op->start(m, &pos); } m->op->stop(m, p); m->count = 0; diff --git a/fs/splice.c b/fs/splice.c index 1bbc6f4bb09..a1e701c2715 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -898,6 +898,9 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; + if (unlikely(out->f_flags & O_APPEND)) + return -EINVAL; + ret = rw_verify_area(WRITE, out, ppos, len); if (unlikely(ret < 0)) return ret; diff --git a/fs/sysfs/bin.c b/fs/sysfs/bin.c index 006fc64227d..66f6e58a7e4 100644 --- a/fs/sysfs/bin.c +++ b/fs/sysfs/bin.c @@ -61,6 +61,7 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) int size = dentry->d_inode->i_size; loff_t offs = *off; int count = min_t(size_t, bytes, PAGE_SIZE); + char *temp; if (size) { if (offs > size) @@ -69,23 +70,33 @@ read(struct file *file, char __user *userbuf, size_t bytes, loff_t *off) count = size - offs; } + temp = kmalloc(count, GFP_KERNEL); + if (!temp) + return -ENOMEM; + mutex_lock(&bb->mutex); count = fill_read(dentry, bb->buffer, offs, count); - if (count < 0) - goto out_unlock; + if (count < 0) { + mutex_unlock(&bb->mutex); + goto out_free; + } - if (copy_to_user(userbuf, bb->buffer, count)) { + memcpy(temp, bb->buffer, count); + + mutex_unlock(&bb->mutex); + + if (copy_to_user(userbuf, temp, count)) { count = -EFAULT; - goto out_unlock; + goto out_free; } pr_debug("offs = %lld, *off = %lld, count = %d\n", offs, *off, count); *off = offs + count; - out_unlock: - mutex_unlock(&bb->mutex); + out_free: + kfree(temp); return count; } @@ -118,6 +129,7 @@ static ssize_t write(struct file *file, const char __user *userbuf, int size = dentry->d_inode->i_size; loff_t offs = *off; int count = min_t(size_t, bytes, PAGE_SIZE); + char *temp; if (size) { if (offs > size) @@ -126,19 +138,27 @@ static ssize_t write(struct file *file, const char __user *userbuf, count = size - offs; } - mutex_lock(&bb->mutex); + temp = kmalloc(count, GFP_KERNEL); + if (!temp) + return -ENOMEM; - if (copy_from_user(bb->buffer, userbuf, count)) { + if (copy_from_user(temp, userbuf, count)) { count = -EFAULT; - goto out_unlock; + goto out_free; } + mutex_lock(&bb->mutex); + + memcpy(bb->buffer, temp, count); + count = flush_write(dentry, bb->buffer, offs, count); + mutex_unlock(&bb->mutex); + if (count > 0) *off = offs + count; - out_unlock: - mutex_unlock(&bb->mutex); +out_free: + kfree(temp); return count; } diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index aedaeba82ae..3a05a596e3b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -370,17 +370,17 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, memset(acxt, 0, sizeof(*acxt)); acxt->parent_sd = parent_sd; - /* Lookup parent inode. inode initialization and I_NEW - * clearing are protected by sysfs_mutex. By grabbing it and - * looking up with _nowait variant, inode state can be - * determined reliably. + /* Lookup parent inode. inode initialization is protected by + * sysfs_mutex, so inode existence can be determined by + * looking up inode while holding sysfs_mutex. */ mutex_lock(&sysfs_mutex); - inode = ilookup5_nowait(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test, - parent_sd); + inode = ilookup5(sysfs_sb, parent_sd->s_ino, sysfs_ilookup_test, + parent_sd); + if (inode) { + WARN_ON(inode->i_state & I_NEW); - if (inode && !(inode->i_state & I_NEW)) { /* parent inode available */ acxt->parent_inode = inode; @@ -393,8 +393,7 @@ void sysfs_addrm_start(struct sysfs_addrm_cxt *acxt, mutex_lock(&inode->i_mutex); mutex_lock(&sysfs_mutex); } - } else - iput(inode); + } } /** @@ -636,6 +635,7 @@ struct sysfs_dirent *sysfs_get_dirent(struct sysfs_dirent *parent_sd, return sd; } +EXPORT_SYMBOL_GPL(sysfs_get_dirent); static int create_dir(struct kobject *kobj, struct sysfs_dirent *parent_sd, const char *name, struct sysfs_dirent **p_sd) @@ -829,16 +829,12 @@ int sysfs_rename_dir(struct kobject * kobj, const char *new_name) if (!new_dentry) goto out_unlock; - /* rename kobject and sysfs_dirent */ + /* rename sysfs_dirent */ error = -ENOMEM; new_name = dup_name = kstrdup(new_name, GFP_KERNEL); if (!new_name) goto out_unlock; - error = kobject_set_name(kobj, "%s", new_name); - if (error) - goto out_unlock; - dup_name = sd->s_name; sd->s_name = new_name; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index c9e4e5091da..1f4a3f87726 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -19,10 +19,18 @@ #include <linux/poll.h> #include <linux/list.h> #include <linux/mutex.h> +#include <linux/limits.h> #include <asm/uaccess.h> #include "sysfs.h" +/* used in crash dumps to help with debugging */ +static char last_sysfs_file[PATH_MAX]; +void sysfs_printk_last_file(void) +{ + printk(KERN_EMERG "last sysfs file: %s\n", last_sysfs_file); +} + /* * There's one sysfs_buffer for each open file and one * sysfs_open_dirent for each sysfs_dirent with one or more open @@ -328,6 +336,11 @@ static int sysfs_open_file(struct inode *inode, struct file *file) struct sysfs_buffer *buffer; struct sysfs_ops *ops; int error = -EACCES; + char *p; + + p = d_path(&file->f_path, last_sysfs_file, sizeof(last_sysfs_file)); + if (p) + memmove(last_sysfs_file, p, strlen(p) + 1); /* need attr_sd for attr and ops, its parent for kobj */ if (!sysfs_get_active_two(attr_sd)) @@ -440,7 +453,23 @@ static unsigned int sysfs_poll(struct file *filp, poll_table *wait) return POLLERR|POLLPRI; } -void sysfs_notify(struct kobject *k, char *dir, char *attr) +void sysfs_notify_dirent(struct sysfs_dirent *sd) +{ + struct sysfs_open_dirent *od; + + spin_lock(&sysfs_open_dirent_lock); + + od = sd->s_attr.open; + if (od) { + atomic_inc(&od->event); + wake_up_interruptible(&od->poll); + } + + spin_unlock(&sysfs_open_dirent_lock); +} +EXPORT_SYMBOL_GPL(sysfs_notify_dirent); + +void sysfs_notify(struct kobject *k, const char *dir, const char *attr) { struct sysfs_dirent *sd = k->sd; @@ -450,19 +479,8 @@ void sysfs_notify(struct kobject *k, char *dir, char *attr) sd = sysfs_find_dirent(sd, dir); if (sd && attr) sd = sysfs_find_dirent(sd, attr); - if (sd) { - struct sysfs_open_dirent *od; - - spin_lock(&sysfs_open_dirent_lock); - - od = sd->s_attr.open; - if (od) { - atomic_inc(&od->event); - wake_up_interruptible(&od->poll); - } - - spin_unlock(&sysfs_open_dirent_lock); - } + if (sd) + sysfs_notify_dirent(sd); mutex_unlock(&sysfs_mutex); } diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 14f0023984d..ab343e371d6 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -16,6 +16,7 @@ #include <linux/mount.h> #include <linux/pagemap.h> #include <linux/init.h> +#include <linux/module.h> #include "sysfs.h" @@ -115,3 +116,17 @@ out_err: sysfs_dir_cachep = NULL; goto out; } + +#undef sysfs_get +struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) +{ + return __sysfs_get(sd); +} +EXPORT_SYMBOL_GPL(sysfs_get); + +#undef sysfs_put +void sysfs_put(struct sysfs_dirent *sd) +{ + __sysfs_put(sd); +} +EXPORT_SYMBOL_GPL(sysfs_put); diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index a5db496f71c..93c6d6b27c4 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -124,7 +124,7 @@ int sysfs_create_subdir(struct kobject *kobj, const char *name, struct sysfs_dirent **p_sd); void sysfs_remove_subdir(struct sysfs_dirent *sd); -static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) +static inline struct sysfs_dirent *__sysfs_get(struct sysfs_dirent *sd) { if (sd) { WARN_ON(!atomic_read(&sd->s_count)); @@ -132,12 +132,14 @@ static inline struct sysfs_dirent *sysfs_get(struct sysfs_dirent *sd) } return sd; } +#define sysfs_get(sd) __sysfs_get(sd) -static inline void sysfs_put(struct sysfs_dirent *sd) +static inline void __sysfs_put(struct sysfs_dirent *sd) { if (sd && atomic_dec_and_test(&sd->s_count)) release_sysfs_dirent(sd); } +#define sysfs_put(sd) __sysfs_put(sd) /* * inode.c diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 15409815747..73db464cd08 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -302,18 +302,6 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs) int subtract_lebs; long long available; - /* - * Force the amount available to the total size reported if the used - * space is zero. - */ - if (c->lst.total_used <= UBIFS_INO_NODE_SZ && - c->budg_data_growth + c->budg_dd_growth == 0) { - /* Do the same calculation as for c->block_cnt */ - available = c->main_lebs - 2; - available *= c->leb_size - c->dark_wm; - return available; - } - available = c->main_bytes - c->lst.total_used; /* @@ -714,34 +702,106 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c, } /** - * ubifs_budg_get_free_space - return amount of free space. + * ubifs_reported_space - calculate reported free space. + * @c: the UBIFS file-system description object + * @free: amount of free space + * + * This function calculates amount of free space which will be reported to + * user-space. User-space application tend to expect that if the file-system + * (e.g., via the 'statfs()' call) reports that it has N bytes available, they + * are able to write a file of size N. UBIFS attaches node headers to each data + * node and it has to write indexind nodes as well. This introduces additional + * overhead, and UBIFS it has to report sligtly less free space to meet the + * above expectetion. + * + * This function assumes free space is made up of uncompressed data nodes and + * full index nodes (one per data node, tripled because we always allow enough + * space to write the index thrice). + * + * Note, the calculation is pessimistic, which means that most of the time + * UBIFS reports less space than it actually has. + */ +long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free) +{ + int divisor, factor, f; + + /* + * Reported space size is @free * X, where X is UBIFS block size + * divided by UBIFS block size + all overhead one data block + * introduces. The overhead is the node header + indexing overhead. + * + * Indexing overhead calculations are based on the following formula: + * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number + * of data nodes, f - fanout. Because effective UBIFS fanout is twice + * as less than maximum fanout, we assume that each data node + * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes. + * Note, the multiplier 3 is because UBIFS reseves thrice as more space + * for the index. + */ + f = c->fanout > 3 ? c->fanout >> 1 : 2; + factor = UBIFS_BLOCK_SIZE; + divisor = UBIFS_MAX_DATA_NODE_SZ; + divisor += (c->max_idx_node_sz * 3) / (f - 1); + free *= factor; + do_div(free, divisor); + return free; +} + +/** + * ubifs_get_free_space - return amount of free space. * @c: UBIFS file-system description object * - * This function returns amount of free space on the file-system. + * This function calculates amount of free space to report to user-space. + * + * Because UBIFS may introduce substantial overhead (the index, node headers, + * alighment, wastage at the end of eraseblocks, etc), it cannot report real + * amount of free flash space it has (well, because not all dirty space is + * reclamable, UBIFS does not actually know the real amount). If UBIFS did so, + * it would bread user expectetion about what free space is. Users seem to + * accustomed to assume that if the file-system reports N bytes of free space, + * they would be able to fit a file of N bytes to the FS. This almost works for + * traditional file-systems, because they have way less overhead than UBIFS. + * So, to keep users happy, UBIFS tries to take the overhead into account. */ -long long ubifs_budg_get_free_space(struct ubifs_info *c) +long long ubifs_get_free_space(struct ubifs_info *c) { - int min_idx_lebs, rsvd_idx_lebs; + int min_idx_lebs, rsvd_idx_lebs, lebs; long long available, outstanding, free; - /* Do exactly the same calculations as in 'do_budget_space()' */ spin_lock(&c->space_lock); min_idx_lebs = ubifs_calc_min_idx_lebs(c); + outstanding = c->budg_data_growth + c->budg_dd_growth; - if (min_idx_lebs > c->lst.idx_lebs) - rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; - else - rsvd_idx_lebs = 0; - - if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - - c->lst.taken_empty_lebs) { + /* + * Force the amount available to the total size reported if the used + * space is zero. + */ + if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) { spin_unlock(&c->space_lock); - return 0; + return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT; } available = ubifs_calc_available(c, min_idx_lebs); - outstanding = c->budg_data_growth + c->budg_dd_growth; - c->min_idx_lebs = min_idx_lebs; + + /* + * When reporting free space to user-space, UBIFS guarantees that it is + * possible to write a file of free space size. This means that for + * empty LEBs we may use more precise calculations than + * 'ubifs_calc_available()' is using. Namely, we know that in empty + * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm. + * Thus, amend the available space. + * + * Note, the calculations below are similar to what we have in + * 'do_budget_space()', so refer there for comments. + */ + if (min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + lebs -= rsvd_idx_lebs; + available += lebs * (c->dark_wm - c->leb_overhead); spin_unlock(&c->space_lock); if (available > outstanding) diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index b9cb7747375..d7f7645779f 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -538,7 +538,7 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node) printk(KERN_DEBUG "\t%d orphan inode numbers:\n", n); for (i = 0; i < n; i++) printk(KERN_DEBUG "\t ino %llu\n", - le64_to_cpu(orph->inos[i])); + (unsigned long long)le64_to_cpu(orph->inos[i])); break; } default: diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 5c96f1fb701..526c01ec800 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -426,7 +426,7 @@ static int ubifs_readdir(struct file *file, void *dirent, filldir_t filldir) while (1) { dbg_gen("feed '%s', ino %llu, new f_pos %#x", - dent->name, le64_to_cpu(dent->inum), + dent->name, (unsigned long long)le64_to_cpu(dent->inum), key_hash_flash(c, &dent->key)); ubifs_assert(dent->ch.sqnum > ubifs_inode(dir)->creat_sqnum); @@ -587,7 +587,6 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) if (err) { if (err != -ENOSPC) return err; - err = 0; budgeted = 0; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 4071d1cae29..3d698e2022b 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -793,7 +793,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, int err; struct ubifs_budget_req req; loff_t old_size = inode->i_size, new_size = attr->ia_size; - int offset = new_size & (UBIFS_BLOCK_SIZE - 1); + int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1; struct ubifs_inode *ui = ubifs_inode(inode); dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); @@ -811,8 +811,15 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, /* A funny way to budget for truncation node */ req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ; err = ubifs_budget_space(c, &req); - if (err) - return err; + if (err) { + /* + * Treat truncations to zero as deletion and always allow them, + * just like we do for '->unlink()'. + */ + if (new_size || err != -ENOSPC) + return err; + budgeted = 0; + } err = vmtruncate(inode, new_size); if (err) @@ -869,7 +876,12 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, err = ubifs_jnl_truncate(c, inode, old_size, new_size); mutex_unlock(&ui->ui_mutex); out_budg: - ubifs_release_budget(c, &req); + if (budgeted) + ubifs_release_budget(c, &req); + else { + c->nospace = c->nospace_rp = 0; + smp_wmb(); + } return err; } diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index adee7b5ddea..47814cde240 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -211,14 +211,8 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, * dirty index heap, and it falls-back to LPT scanning if the heaps are empty * or do not have an LEB which satisfies the @min_space criteria. * - * Note: - * o LEBs which have less than dead watermark of dirty space are never picked - * by this function; - * - * Returns zero and the LEB properties of - * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a - * negative error code in case of other failures. The returned LEB is marked as - * "taken". + * Note, LEBs which have less than dead watermark of free + dirty space are + * never picked by this function. * * The additional @pick_free argument controls if this function has to return a * free or freeable LEB if one is present. For example, GC must to set it to %1, @@ -231,6 +225,10 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, * * In addition @pick_free is set to %2 by the recovery process in order to * recover gc_lnum in which case an index LEB must not be returned. + * + * This function returns zero and the LEB properties of found dirty LEB in case + * of success, %-ENOSPC if no dirty LEB was found and a negative error code in + * case of other failures. The returned LEB is marked as "taken". */ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, int min_space, int pick_free) @@ -245,7 +243,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, int lebs, rsvd_idx_lebs = 0; spin_lock(&c->space_lock); - lebs = c->lst.empty_lebs; + lebs = c->lst.empty_lebs + c->idx_gc_cnt; lebs += c->freeable_cnt - c->lst.taken_empty_lebs; /* @@ -317,7 +315,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, lp = idx_lp; if (lp) { - ubifs_assert(lp->dirty >= c->dead_wm); + ubifs_assert(lp->free + lp->dirty >= c->dead_wm); goto found; } @@ -509,7 +507,6 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, rsvd_idx_lebs = 0; lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - c->lst.taken_empty_lebs; - ubifs_assert(lebs + c->lst.idx_lebs >= c->min_idx_lebs); if (rsvd_idx_lebs < lebs) /* * OK to allocate an empty LEB, but we still don't want to go diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index d0f3dac2908..02aba36fe3d 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -334,15 +334,21 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) err = move_nodes(c, sleb); if (err) - goto out; + goto out_inc_seq; err = gc_sync_wbufs(c); if (err) - goto out; + goto out_inc_seq; err = ubifs_change_one_lp(c, lnum, c->leb_size, 0, 0, 0, 0); if (err) - goto out; + goto out_inc_seq; + + /* Allow for races with TNC */ + c->gced_lnum = lnum; + smp_wmb(); + c->gc_seq += 1; + smp_wmb(); if (c->gc_lnum == -1) { c->gc_lnum = lnum; @@ -363,6 +369,14 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) out: ubifs_scan_destroy(sleb); return err; + +out_inc_seq: + /* We may have moved at least some nodes so allow for races with TNC */ + c->gced_lnum = lnum; + smp_wmb(); + c->gc_seq += 1; + smp_wmb(); + goto out; } /** diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index 87dabf9fe74..4c12a9215d7 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -284,38 +284,6 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c, } /** - * ubifs_reported_space - calculate reported free space. - * @c: the UBIFS file-system description object - * @free: amount of free space - * - * This function calculates amount of free space which will be reported to - * user-space. User-space application tend to expect that if the file-system - * (e.g., via the 'statfs()' call) reports that it has N bytes available, they - * are able to write a file of size N. UBIFS attaches node headers to each data - * node and it has to write indexind nodes as well. This introduces additional - * overhead, and UBIFS it has to report sligtly less free space to meet the - * above expectetion. - * - * This function assumes free space is made up of uncompressed data nodes and - * full index nodes (one per data node, doubled because we always allow enough - * space to write the index twice). - * - * Note, the calculation is pessimistic, which means that most of the time - * UBIFS reports less space than it actually has. - */ -static inline long long ubifs_reported_space(const struct ubifs_info *c, - uint64_t free) -{ - int divisor, factor; - - divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz * 3); - factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ; - do_div(free, divisor); - - return free * factor; -} - -/** * ubifs_current_time - round current time to time granularity. * @inode: inode */ @@ -325,4 +293,21 @@ static inline struct timespec ubifs_current_time(struct inode *inode) current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; } +/** + * ubifs_tnc_lookup - look up a file-system node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * + * This function look up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. + */ +static inline int ubifs_tnc_lookup(struct ubifs_info *c, + const union ubifs_key *key, void *node) +{ + return ubifs_tnc_locate(c, key, node, NULL, NULL); +} + #endif /* __UBIFS_MISC_H__ */ diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f71e6b8822c..9a9220333b3 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -370,8 +370,9 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct ubifs_info *c = dentry->d_sb->s_fs_info; unsigned long long free; + __le32 *uuid = (__le32 *)c->uuid; - free = ubifs_budg_get_free_space(c); + free = ubifs_get_free_space(c); dbg_gen("free space %lld bytes (%lld blocks)", free, free >> UBIFS_BLOCK_SHIFT); @@ -386,7 +387,8 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = 0; buf->f_ffree = 0; buf->f_namelen = UBIFS_MAX_NLEN; - + buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); + buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); return 0; } @@ -530,6 +532,12 @@ static int init_constants_early(struct ubifs_info *c) c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); + /* + * Calculate how many bytes would be wasted at the end of LEB if it was + * fully filled with data nodes of maximum size. This is used in + * calculations when reporting free space. + */ + c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ; return 0; } @@ -647,13 +655,11 @@ static int init_constants_late(struct ubifs_info *c) * internally because it does not make much sense for UBIFS, but it is * necessary to report something for the 'statfs()' call. * - * Subtract the LEB reserved for GC and the LEB which is reserved for - * deletions. - * - * Review 'ubifs_calc_available()' if changing this calculation. + * Subtract the LEB reserved for GC, the LEB which is reserved for + * deletions, and assume only one journal head is available. */ - tmp64 = c->main_lebs - 2; - tmp64 *= (uint64_t)c->leb_size - c->dark_wm; + tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1; + tmp64 *= (uint64_t)c->leb_size - c->leb_overhead; tmp64 = ubifs_reported_space(c, tmp64); c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; @@ -842,7 +848,7 @@ enum { Opt_err, }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_fast_unmount, "fast_unmount"}, {Opt_norm_unmount, "norm_unmount"}, {Opt_err, NULL}, @@ -1018,14 +1024,13 @@ static int mount_ubifs(struct ubifs_info *c) goto out_dereg; } + sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, c->vi.vol_id); if (!mounted_read_only) { err = alloc_wbufs(c); if (err) goto out_cbuf; /* Create background thread */ - sprintf(c->bgt_name, BGT_NAME_PATTERN, c->vi.ubi_num, - c->vi.vol_id); c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); if (!c->bgt) c->bgt = ERR_PTR(-EINVAL); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index e909f4a9644..7634c597088 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -506,7 +506,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, if (keys_cmp(c, key, &node_key) != 0) ret = 0; } - if (ret == 0) + if (ret == 0 && c->replaying) dbg_mnt("dangling branch LEB %d:%d len %d, key %s", zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); return ret; @@ -1382,50 +1382,39 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key, } /** - * ubifs_tnc_lookup - look up a file-system node. + * maybe_leb_gced - determine if a LEB may have been garbage collected. * @c: UBIFS file-system description object - * @key: node key to lookup - * @node: the node is returned here + * @lnum: LEB number + * @gc_seq1: garbage collection sequence number * - * This function look up and reads node with key @key. The caller has to make - * sure the @node buffer is large enough to fit the node. Returns zero in case - * of success, %-ENOENT if the node was not found, and a negative error code in - * case of failure. + * This function determines if @lnum may have been garbage collected since + * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise + * %0 is returned. */ -int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, - void *node) +static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1) { - int found, n, err; - struct ubifs_znode *znode; - struct ubifs_zbranch zbr, *zt; + int gc_seq2, gced_lnum; - mutex_lock(&c->tnc_mutex); - found = ubifs_lookup_level0(c, key, &znode, &n); - if (!found) { - err = -ENOENT; - goto out; - } else if (found < 0) { - err = found; - goto out; - } - zt = &znode->zbranch[n]; - if (is_hash_key(c, key)) { - /* - * In this case the leaf node cache gets used, so we pass the - * address of the zbranch and keep the mutex locked - */ - err = tnc_read_node_nm(c, zt, node); - goto out; - } - zbr = znode->zbranch[n]; - mutex_unlock(&c->tnc_mutex); - - err = ubifs_tnc_read_node(c, &zbr, node); - return err; - -out: - mutex_unlock(&c->tnc_mutex); - return err; + gced_lnum = c->gced_lnum; + smp_rmb(); + gc_seq2 = c->gc_seq; + /* Same seq means no GC */ + if (gc_seq1 == gc_seq2) + return 0; + /* Different by more than 1 means we don't know */ + if (gc_seq1 + 1 != gc_seq2) + return 1; + /* + * We have seen the sequence number has increased by 1. Now we need to + * be sure we read the right LEB number, so read it again. + */ + smp_rmb(); + if (gced_lnum != c->gced_lnum) + return 1; + /* Finally we can check lnum */ + if (gced_lnum == lnum) + return 1; + return 0; } /** @@ -1436,16 +1425,19 @@ out: * @lnum: LEB number is returned here * @offs: offset is returned here * - * This function is the same as 'ubifs_tnc_lookup()' but it returns the node - * location also. See 'ubifs_tnc_lookup()'. + * This function look up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. The node location can be returned in @lnum and @offs. */ int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, void *node, int *lnum, int *offs) { - int found, n, err; + int found, n, err, safely = 0, gc_seq1; struct ubifs_znode *znode; struct ubifs_zbranch zbr, *zt; +again: mutex_lock(&c->tnc_mutex); found = ubifs_lookup_level0(c, key, &znode, &n); if (!found) { @@ -1456,24 +1448,43 @@ int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, goto out; } zt = &znode->zbranch[n]; + if (lnum) { + *lnum = zt->lnum; + *offs = zt->offs; + } if (is_hash_key(c, key)) { /* * In this case the leaf node cache gets used, so we pass the * address of the zbranch and keep the mutex locked */ - *lnum = zt->lnum; - *offs = zt->offs; err = tnc_read_node_nm(c, zt, node); goto out; } + if (safely) { + err = ubifs_tnc_read_node(c, zt, node); + goto out; + } + /* Drop the TNC mutex prematurely and race with garbage collection */ zbr = znode->zbranch[n]; + gc_seq1 = c->gc_seq; mutex_unlock(&c->tnc_mutex); - *lnum = zbr.lnum; - *offs = zbr.offs; + if (ubifs_get_wbuf(c, zbr.lnum)) { + /* We do not GC journal heads */ + err = ubifs_tnc_read_node(c, &zbr, node); + return err; + } - err = ubifs_tnc_read_node(c, &zbr, node); - return err; + err = fallible_read_node(c, key, &zbr, node); + if (err <= 0 || maybe_leb_gced(c, zbr.lnum, gc_seq1)) { + /* + * The node may have been GC'ed out from under us so try again + * while keeping the TNC mutex locked. + */ + safely = 1; + goto again; + } + return 0; out: mutex_unlock(&c->tnc_mutex); @@ -1498,7 +1509,6 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, { int found, n, err; struct ubifs_znode *znode; - struct ubifs_zbranch zbr; dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); mutex_lock(&c->tnc_mutex); @@ -1522,11 +1532,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, goto out_unlock; } - zbr = znode->zbranch[n]; - mutex_unlock(&c->tnc_mutex); - - err = tnc_read_node_nm(c, &zbr, node); - return err; + err = tnc_read_node_nm(c, &znode->zbranch[n], node); out_unlock: mutex_unlock(&c->tnc_mutex); diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index bd2121f3426..a9ecbd9af20 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -87,7 +87,7 @@ #define UBIFS_SK_LEN 8 /* Minimum index tree fanout */ -#define UBIFS_MIN_FANOUT 2 +#define UBIFS_MIN_FANOUT 3 /* Maximum number of levels in UBIFS indexing B-tree */ #define UBIFS_MAX_LEVELS 512 diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index d7f706f7a30..17c620b93ee 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -995,6 +995,9 @@ struct ubifs_mount_opts { * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary * @max_inode_sz: maximum possible inode size in bytes * @max_znode_sz: size of znode in bytes + * + * @leb_overhead: how many bytes are wasted in an LEB when it is filled with + * data nodes of maximum size - used in free space reporting * @dead_wm: LEB dead space watermark * @dark_wm: LEB dark space watermark * @block_cnt: count of 4KiB blocks on the FS @@ -1028,6 +1031,8 @@ struct ubifs_mount_opts { * @sbuf: a buffer of LEB size used by GC and replay for scanning * @idx_gc: list of index LEBs that have been garbage collected * @idx_gc_cnt: number of elements on the idx_gc list + * @gc_seq: incremented for every non-index LEB garbage collected + * @gced_lnum: last non-index LEB that was garbage collected * * @infos_list: links all 'ubifs_info' objects * @umount_mutex: serializes shrinker and un-mount @@ -1224,6 +1229,8 @@ struct ubifs_info { int max_idx_node_sz; long long max_inode_sz; int max_znode_sz; + + int leb_overhead; int dead_wm; int dark_wm; int block_cnt; @@ -1257,6 +1264,8 @@ struct ubifs_info { void *sbuf; struct list_head idx_gc; int idx_gc_cnt; + volatile int gc_seq; + volatile int gced_lnum; struct list_head infos_list; struct mutex umount_mutex; @@ -1434,9 +1443,10 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode, struct ubifs_budget_req *req); void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, struct ubifs_budget_req *req); -long long ubifs_budg_get_free_space(struct ubifs_info *c); +long long ubifs_get_free_space(struct ubifs_info *c); int ubifs_calc_min_idx_lebs(struct ubifs_info *c); void ubifs_convert_page_budget(struct ubifs_info *c); +long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free); long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); /* find.c */ @@ -1451,8 +1461,6 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c); /* tnc.c */ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode **zn, int *n); -int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, - void *node); int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, void *node, const struct qstr *nm); int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, diff --git a/fs/udf/file.c b/fs/udf/file.c index 0ed6e146a0d..eb91f3b7032 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -211,6 +211,7 @@ const struct file_operations udf_file_operations = { .release = udf_release_file, .fsync = udf_fsync_file, .splice_read = generic_file_splice_read, + .llseek = generic_file_llseek, }; const struct inode_operations udf_file_inode_operations = { diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index eb9cfa23dc3..a4f2b3ce45b 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -76,11 +76,24 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) *err = -ENOSPC; iinfo = UDF_I(inode); - iinfo->i_unique = 0; - iinfo->i_lenExtents = 0; - iinfo->i_next_alloc_block = 0; - iinfo->i_next_alloc_goal = 0; - iinfo->i_strat4096 = 0; + if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { + iinfo->i_efe = 1; + if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev) + sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE; + iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - + sizeof(struct extendedFileEntry), + GFP_KERNEL); + } else { + iinfo->i_efe = 0; + iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - + sizeof(struct fileEntry), + GFP_KERNEL); + } + if (!iinfo->i_ext.i_data) { + iput(inode); + *err = -ENOMEM; + return NULL; + } block = udf_new_block(dir->i_sb, NULL, dinfo->i_location.partitionReferenceNum, @@ -111,6 +124,7 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) lvhd->uniqueID = cpu_to_le64(uniqueID); mark_buffer_dirty(sbi->s_lvid_bh); } + mutex_unlock(&sbi->s_alloc_mutex); inode->i_mode = mode; inode->i_uid = current->fsuid; if (dir->i_mode & S_ISGID) { @@ -129,25 +143,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) iinfo->i_lenEAttr = 0; iinfo->i_lenAlloc = 0; iinfo->i_use = 0; - if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { - iinfo->i_efe = 1; - if (UDF_VERS_USE_EXTENDED_FE > sbi->s_udfrev) - sbi->s_udfrev = UDF_VERS_USE_EXTENDED_FE; - iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - - sizeof(struct extendedFileEntry), - GFP_KERNEL); - } else { - iinfo->i_efe = 0; - iinfo->i_ext.i_data = kzalloc(inode->i_sb->s_blocksize - - sizeof(struct fileEntry), - GFP_KERNEL); - } - if (!iinfo->i_ext.i_data) { - iput(inode); - *err = -ENOMEM; - mutex_unlock(&sbi->s_alloc_mutex); - return NULL; - } if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_AD_IN_ICB)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_IN_ICB; else if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) @@ -158,7 +153,6 @@ struct inode *udf_new_inode(struct inode *dir, int mode, int *err) iinfo->i_crtime = current_fs_time(inode->i_sb); insert_inode_hash(inode); mark_inode_dirty(inode); - mutex_unlock(&sbi->s_alloc_mutex); if (DQUOT_ALLOC_INODE(inode)) { DQUOT_DROP(inode); diff --git a/fs/udf/super.c b/fs/udf/super.c index 5698bbf83bb..e25e7010627 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -369,7 +369,7 @@ enum { Opt_err, Opt_uforget, Opt_uignore, Opt_gforget, Opt_gignore }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_novrs, "novrs"}, {Opt_nostrict, "nostrict"}, {Opt_bs, "bs=%u"}, diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 3141969b456..e65212dfb60 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -309,7 +309,7 @@ enum { Opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_type_old, "ufstype=old"}, {Opt_type_sunx86, "ufstype=sunx86"}, {Opt_type_sun, "ufstype=sun"}, @@ -1233,7 +1233,7 @@ static int ufs_show_options(struct seq_file *seq, struct vfsmount *vfs) { struct ufs_sb_info *sbi = UFS_SB(vfs->mnt_sb); unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; - struct match_token *tp = tokens; + const struct match_token *tp = tokens; while (tp->token != Opt_onerror_panic && tp->token != mval) ++tp; diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index f42f80a3b1f..a44d68eb50b 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1338,6 +1338,10 @@ __xfs_get_blocks( offset = (xfs_off_t)iblock << inode->i_blkbits; ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; + + if (!create && direct && offset >= i_size_read(inode)) + return 0; + error = xfs_iomap(XFS_I(inode), offset, size, create ? flags : BMAPI_READ, &iomap, &niomap); if (error) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 986061ae1b9..36d5fcd3f59 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1001,12 +1001,13 @@ xfs_buf_iodone_work( * We can get an EOPNOTSUPP to ordered writes. Here we clear the * ordered flag and reissue them. Because we can't tell the higher * layers directly that they should not issue ordered I/O anymore, they - * need to check if the ordered flag was cleared during I/O completion. + * need to check if the _XFS_BARRIER_FAILED flag was set during I/O completion. */ if ((bp->b_error == EOPNOTSUPP) && (bp->b_flags & (XBF_ORDERED|XBF_ASYNC)) == (XBF_ORDERED|XBF_ASYNC)) { XB_TRACE(bp, "ordered_retry", bp->b_iodone); bp->b_flags &= ~XBF_ORDERED; + bp->b_flags |= _XFS_BARRIER_FAILED; xfs_buf_iorequest(bp); } else if (bp->b_iodone) (*(bp->b_iodone))(bp); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index fe010995665..456519a088c 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -85,6 +85,14 @@ typedef enum { * modifications being lost. */ _XBF_PAGE_LOCKED = (1 << 22), + + /* + * If we try a barrier write, but it fails we have to communicate + * this to the upper layers. Unfortunately b_error gets overwritten + * when the buffer is re-issued so we have to add another flag to + * keep this information. + */ + _XFS_BARRIER_FAILED = (1 << 23), } xfs_buf_flags_t; typedef enum { diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 5f60363b934..5311c1acdd4 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -475,6 +475,7 @@ const struct file_operations xfs_invis_file_operations = { const struct file_operations xfs_dir_file_operations = { .read = generic_read_dir, .readdir = xfs_file_readdir, + .llseek = generic_file_llseek, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 91bcd979242..095d271f343 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -355,7 +355,7 @@ xfs_vn_ci_lookup( /* else case-insensitive match... */ dname.name = ci_name.name; dname.len = ci_name.len; - dentry = d_add_ci(VFS_I(ip), dentry, &dname); + dentry = d_add_ci(dentry, VFS_I(ip), &dname); kmem_free(ci_name.name); return dentry; } diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 73c65f19e54..e39013619b2 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -158,7 +158,7 @@ enum { Opt_barrier, Opt_nobarrier, Opt_err }; -static match_table_t tokens = { +static const match_table_t tokens = { {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_err, NULL} @@ -1302,9 +1302,29 @@ xfs_fs_remount( mp->m_flags &= ~XFS_MOUNT_BARRIER; break; default: + /* + * Logically we would return an error here to prevent + * users from believing they might have changed + * mount options using remount which can't be changed. + * + * But unfortunately mount(8) adds all options from + * mtab and fstab to the mount arguments in some cases + * so we can't blindly reject options, but have to + * check for each specified option if it actually + * differs from the currently set option and only + * reject it if that's the case. + * + * Until that is implemented we return success for + * every remount request, and silently ignore all + * options that we can't actually change. + */ +#if 0 printk(KERN_INFO "XFS: mount option \"%s\" not supported for remount\n", p); return -EINVAL; +#else + break; +#endif } } diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 608c30c3f76..002fc2617c8 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -732,6 +732,7 @@ xfs_buf_item_init( bip->bli_item.li_ops = &xfs_buf_item_ops; bip->bli_item.li_mountp = mp; bip->bli_buf = bp; + xfs_buf_hold(bp); bip->bli_format.blf_type = XFS_LI_BUF; bip->bli_format.blf_blkno = (__int64_t)XFS_BUF_ADDR(bp); bip->bli_format.blf_len = (ushort)BTOBB(XFS_BUF_COUNT(bp)); @@ -867,6 +868,21 @@ xfs_buf_item_dirty( return (bip->bli_flags & XFS_BLI_DIRTY); } +STATIC void +xfs_buf_item_free( + xfs_buf_log_item_t *bip) +{ +#ifdef XFS_TRANS_DEBUG + kmem_free(bip->bli_orig); + kmem_free(bip->bli_logged); +#endif /* XFS_TRANS_DEBUG */ + +#ifdef XFS_BLI_TRACE + ktrace_free(bip->bli_trace); +#endif + kmem_zone_free(xfs_buf_item_zone, bip); +} + /* * This is called when the buf log item is no longer needed. It should * free the buf log item associated with the given buffer and clear @@ -887,18 +903,8 @@ xfs_buf_item_relse( (XFS_BUF_IODONE_FUNC(bp) != NULL)) { XFS_BUF_CLR_IODONE_FUNC(bp); } - -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig); - bip->bli_orig = NULL; - kmem_free(bip->bli_logged); - bip->bli_logged = NULL; -#endif /* XFS_TRANS_DEBUG */ - -#ifdef XFS_BLI_TRACE - ktrace_free(bip->bli_trace); -#endif - kmem_zone_free(xfs_buf_item_zone, bip); + xfs_buf_rele(bp); + xfs_buf_item_free(bip); } @@ -1120,6 +1126,7 @@ xfs_buf_iodone( ASSERT(bip->bli_buf == bp); + xfs_buf_rele(bp); mp = bip->bli_item.li_mountp; /* @@ -1136,18 +1143,7 @@ xfs_buf_iodone( * xfs_trans_delete_ail() drops the AIL lock. */ xfs_trans_delete_ail(mp, (xfs_log_item_t *)bip); - -#ifdef XFS_TRANS_DEBUG - kmem_free(bip->bli_orig); - bip->bli_orig = NULL; - kmem_free(bip->bli_logged); - bip->bli_logged = NULL; -#endif /* XFS_TRANS_DEBUG */ - -#ifdef XFS_BLI_TRACE - ktrace_free(bip->bli_trace); -#endif - kmem_zone_free(xfs_buf_item_zone, bip); + xfs_buf_item_free(bip); } #if defined(XFS_BLI_TRACE) diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 760f4c5b516..75b0cd4da0e 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -149,7 +149,14 @@ xfs_swap_extents( sbp = &sxp->sx_stat; - xfs_lock_two_inodes(ip, tip, lock_flags); + /* + * we have to do two separate lock calls here to keep lockdep + * happy. If we try to get all the locks in one call, lock will + * report false positives when we drop the ILOCK and regain them + * below. + */ + xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL); locked = 1; /* Verify that both files have the same format */ diff --git a/fs/xfs/xfs_dmapi.h b/fs/xfs/xfs_dmapi.h index cdc2d3464a1..2813cdd7237 100644 --- a/fs/xfs/xfs_dmapi.h +++ b/fs/xfs/xfs_dmapi.h @@ -18,7 +18,6 @@ #ifndef __XFS_DMAPI_H__ #define __XFS_DMAPI_H__ -#include <linux/version.h> /* Values used to define the on-disk version of dm_attrname_t. All * on-disk attribute names start with the 8-byte string "SGI_DMI_". * diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 00e80df9dd9..dbd9cef852e 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -4118,7 +4118,7 @@ xfs_iext_indirect_to_direct( ASSERT(nextents <= XFS_LINEAR_EXTS); size = nextents * sizeof(xfs_bmbt_rec_t); - xfs_iext_irec_compact_full(ifp); + xfs_iext_irec_compact_pages(ifp); ASSERT(ifp->if_real_bytes == XFS_IEXT_BUFSZ); ep = ifp->if_u1.if_ext_irec->er_extbuf; @@ -4449,8 +4449,7 @@ xfs_iext_irec_remove( * compaction policy is as follows: * * Full Compaction: Extents fit into a single page (or inline buffer) - * Full Compaction: Extents occupy less than 10% of allocated space - * Partial Compaction: Extents occupy > 10% and < 50% of allocated space + * Partial Compaction: Extents occupy less than 50% of allocated space * No Compaction: Extents occupy at least 50% of allocated space */ void @@ -4471,8 +4470,6 @@ xfs_iext_irec_compact( xfs_iext_direct_to_inline(ifp, nextents); } else if (nextents <= XFS_LINEAR_EXTS) { xfs_iext_indirect_to_direct(ifp); - } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 3) { - xfs_iext_irec_compact_full(ifp); } else if (nextents < (nlists * XFS_LINEAR_EXTS) >> 1) { xfs_iext_irec_compact_pages(ifp); } @@ -4496,7 +4493,7 @@ xfs_iext_irec_compact_pages( erp_next = erp + 1; if (erp_next->er_extcount <= (XFS_LINEAR_EXTS - erp->er_extcount)) { - memmove(&erp->er_extbuf[erp->er_extcount], + memcpy(&erp->er_extbuf[erp->er_extcount], erp_next->er_extbuf, erp_next->er_extcount * sizeof(xfs_bmbt_rec_t)); erp->er_extcount += erp_next->er_extcount; @@ -4516,91 +4513,6 @@ xfs_iext_irec_compact_pages( } /* - * Fully compact the extent records managed by the indirection array. - */ -void -xfs_iext_irec_compact_full( - xfs_ifork_t *ifp) /* inode fork pointer */ -{ - xfs_bmbt_rec_host_t *ep, *ep_next; /* extent record pointers */ - xfs_ext_irec_t *erp, *erp_next; /* extent irec pointers */ - int erp_idx = 0; /* extent irec index */ - int ext_avail; /* empty entries in ex list */ - int ext_diff; /* number of exts to add */ - int nlists; /* number of irec's (ex lists) */ - - ASSERT(ifp->if_flags & XFS_IFEXTIREC); - - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - erp = ifp->if_u1.if_ext_irec; - ep = &erp->er_extbuf[erp->er_extcount]; - erp_next = erp + 1; - ep_next = erp_next->er_extbuf; - - while (erp_idx < nlists - 1) { - /* - * Check how many extent records are available in this irec. - * If there is none skip the whole exercise. - */ - ext_avail = XFS_LINEAR_EXTS - erp->er_extcount; - if (ext_avail) { - - /* - * Copy over as many as possible extent records into - * the previous page. - */ - ext_diff = MIN(ext_avail, erp_next->er_extcount); - memcpy(ep, ep_next, ext_diff * sizeof(xfs_bmbt_rec_t)); - erp->er_extcount += ext_diff; - erp_next->er_extcount -= ext_diff; - - /* - * If the next irec is empty now we can simply - * remove it. - */ - if (erp_next->er_extcount == 0) { - /* - * Free page before removing extent record - * so er_extoffs don't get modified in - * xfs_iext_irec_remove. - */ - kmem_free(erp_next->er_extbuf); - erp_next->er_extbuf = NULL; - xfs_iext_irec_remove(ifp, erp_idx + 1); - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - - /* - * If the next irec is not empty move up the content - * that has not been copied to the previous page to - * the beggining of this one. - */ - } else { - memmove(erp_next->er_extbuf, &ep_next[ext_diff], - erp_next->er_extcount * - sizeof(xfs_bmbt_rec_t)); - ep_next = erp_next->er_extbuf; - memset(&ep_next[erp_next->er_extcount], 0, - (XFS_LINEAR_EXTS - - erp_next->er_extcount) * - sizeof(xfs_bmbt_rec_t)); - } - } - - if (erp->er_extcount == XFS_LINEAR_EXTS) { - erp_idx++; - if (erp_idx < nlists) - erp = &ifp->if_u1.if_ext_irec[erp_idx]; - else - break; - } - ep = &erp->er_extbuf[erp->er_extcount]; - erp_next = erp + 1; - ep_next = erp_next->er_extbuf; - } -} - -/* * This is called to update the er_extoff field in the indirection * array when extents have been added or removed from one of the * extent lists. erp_idx contains the irec index to begin updating diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ccba14eb9db..0b02c644355 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -124,16 +124,27 @@ STATIC void xlog_verify_tail_lsn(xlog_t *log, xlog_in_core_t *iclog, STATIC int xlog_iclogs_empty(xlog_t *log); #if defined(XFS_LOG_TRACE) + +#define XLOG_TRACE_LOGGRANT_SIZE 2048 +#define XLOG_TRACE_ICLOG_SIZE 256 + +void +xlog_trace_loggrant_alloc(xlog_t *log) +{ + log->l_grant_trace = ktrace_alloc(XLOG_TRACE_LOGGRANT_SIZE, KM_NOFS); +} + +void +xlog_trace_loggrant_dealloc(xlog_t *log) +{ + ktrace_free(log->l_grant_trace); +} + void xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) { unsigned long cnts; - if (!log->l_grant_trace) { - log->l_grant_trace = ktrace_alloc(2048, KM_NOSLEEP); - if (!log->l_grant_trace) - return; - } /* ticket counts are 1 byte each */ cnts = ((unsigned long)tic->t_ocnt) | ((unsigned long)tic->t_cnt) << 8; @@ -157,10 +168,20 @@ xlog_trace_loggrant(xlog_t *log, xlog_ticket_t *tic, xfs_caddr_t string) } void +xlog_trace_iclog_alloc(xlog_in_core_t *iclog) +{ + iclog->ic_trace = ktrace_alloc(XLOG_TRACE_ICLOG_SIZE, KM_NOFS); +} + +void +xlog_trace_iclog_dealloc(xlog_in_core_t *iclog) +{ + ktrace_free(iclog->ic_trace); +} + +void xlog_trace_iclog(xlog_in_core_t *iclog, uint state) { - if (!iclog->ic_trace) - iclog->ic_trace = ktrace_alloc(256, KM_NOFS); ktrace_enter(iclog->ic_trace, (void *)((unsigned long)state), (void *)((unsigned long)current_pid()), @@ -170,8 +191,15 @@ xlog_trace_iclog(xlog_in_core_t *iclog, uint state) (void *)NULL, (void *)NULL); } #else + +#define xlog_trace_loggrant_alloc(log) +#define xlog_trace_loggrant_dealloc(log) #define xlog_trace_loggrant(log,tic,string) + +#define xlog_trace_iclog_alloc(iclog) +#define xlog_trace_iclog_dealloc(iclog) #define xlog_trace_iclog(iclog,state) + #endif /* XFS_LOG_TRACE */ @@ -1005,11 +1033,12 @@ xlog_iodone(xfs_buf_t *bp) l = iclog->ic_log; /* - * If the ordered flag has been removed by a lower - * layer, it means the underlyin device no longer supports + * If the _XFS_BARRIER_FAILED flag was set by a lower + * layer, it means the underlying device no longer supports * barrier I/O. Warn loudly and turn off barriers. */ - if ((l->l_mp->m_flags & XFS_MOUNT_BARRIER) && !XFS_BUF_ORDERED(bp)) { + if (bp->b_flags & _XFS_BARRIER_FAILED) { + bp->b_flags &= ~_XFS_BARRIER_FAILED; l->l_mp->m_flags &= ~XFS_MOUNT_BARRIER; xfs_fs_cmn_err(CE_WARN, l->l_mp, "xlog_iodone: Barriers are no longer supported" @@ -1231,6 +1260,7 @@ xlog_alloc_log(xfs_mount_t *mp, spin_lock_init(&log->l_grant_lock); sv_init(&log->l_flush_wait, 0, "flush_wait"); + xlog_trace_loggrant_alloc(log); /* log record size must be multiple of BBSIZE; see xlog_rec_header_t */ ASSERT((XFS_BUF_SIZE(bp) & BBMASK) == 0); @@ -1285,6 +1315,8 @@ xlog_alloc_log(xfs_mount_t *mp, sv_init(&iclog->ic_force_wait, SV_DEFAULT, "iclog-force"); sv_init(&iclog->ic_write_wait, SV_DEFAULT, "iclog-write"); + xlog_trace_iclog_alloc(iclog); + iclogp = &iclog->ic_next; } *iclogp = log->l_iclog; /* complete ring */ @@ -1565,11 +1597,7 @@ xlog_dealloc_log(xlog_t *log) sv_destroy(&iclog->ic_force_wait); sv_destroy(&iclog->ic_write_wait); xfs_buf_free(iclog->ic_bp); -#ifdef XFS_LOG_TRACE - if (iclog->ic_trace != NULL) { - ktrace_free(iclog->ic_trace); - } -#endif + xlog_trace_iclog_dealloc(iclog); next_iclog = iclog->ic_next; kmem_free(iclog); iclog = next_iclog; @@ -1578,14 +1606,7 @@ xlog_dealloc_log(xlog_t *log) spinlock_destroy(&log->l_grant_lock); xfs_buf_free(log->l_xbuf); -#ifdef XFS_LOG_TRACE - if (log->l_trace != NULL) { - ktrace_free(log->l_trace); - } - if (log->l_grant_trace != NULL) { - ktrace_free(log->l_grant_trace); - } -#endif + xlog_trace_loggrant_dealloc(log); log->l_mp->m_log = NULL; kmem_free(log); } /* xlog_dealloc_log */ diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index c8a5b22ee3e..e7d8f84443f 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -448,7 +448,6 @@ typedef struct log { int l_grant_write_bytes; #ifdef XFS_LOG_TRACE - struct ktrace *l_trace; struct ktrace *l_grant_trace; #endif diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index aa238c8fbd7..8b6812f66a1 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -1838,6 +1838,12 @@ again: #endif } +/* + * xfs_lock_two_inodes() can only be used to lock one type of lock + * at a time - the iolock or the ilock, but not both at once. If + * we lock both at once, lockdep will report false positives saying + * we have violated locking orders. + */ void xfs_lock_two_inodes( xfs_inode_t *ip0, @@ -1848,6 +1854,8 @@ xfs_lock_two_inodes( int attempts = 0; xfs_log_item_t *lp; + if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) + ASSERT((lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL)) == 0); ASSERT(ip0->i_ino != ip1->i_ino); if (ip0->i_ino > ip1->i_ino) { @@ -3152,6 +3160,13 @@ error1: /* Just cancel transaction */ /* * Zero file bytes between startoff and endoff inclusive. * The iolock is held exclusive and no blocks are buffered. + * + * This function is used by xfs_free_file_space() to zero + * partial blocks when the range to free is not block aligned. + * When unreserving space with boundaries that are not block + * aligned we round up the start and round down the end + * boundaries and then use this function to zero the parts of + * the blocks that got dropped during the rounding. */ STATIC int xfs_zero_remaining_bytes( @@ -3168,6 +3183,17 @@ xfs_zero_remaining_bytes( int nimap; int error = 0; + /* + * Avoid doing I/O beyond eof - it's not necessary + * since nothing can read beyond eof. The space will + * be zeroed when the file is extended anyway. + */ + if (startoff >= ip->i_size) + return 0; + + if (endoff > ip->i_size) + endoff = ip->i_size; + bp = xfs_buf_get_noaddr(mp->m_sb.sb_blocksize, XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp); |