diff options
Diffstat (limited to 'fs')
344 files changed, 10172 insertions, 5187 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 678c02f1ae2..a452ac67fc9 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -224,12 +224,11 @@ static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt) } static void -v9fs_umount_begin(struct vfsmount *vfsmnt, int flags) +v9fs_umount_begin(struct super_block *sb) { - struct v9fs_session_info *v9ses = vfsmnt->mnt_sb->s_fs_info; + struct v9fs_session_info *v9ses = sb->s_fs_info; - if (flags & MNT_FORCE) - v9fs_session_cancel(v9ses); + v9fs_session_cancel(v9ses); } static const struct super_operations v9fs_super_ops = { diff --git a/fs/Kconfig b/fs/Kconfig index 8b18a875867..cf12c403b8c 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -411,7 +411,7 @@ config JFS_STATISTICS to be made available to the user in the /proc/fs/jfs/ directory. config FS_POSIX_ACL -# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs) +# Posix ACL utility routines (for now, only ext2/ext3/jfs/reiserfs/nfs4) # # NOTE: you can implement Posix ACLs without these helpers (XFS does). # Never use this symbol for ifdefs. @@ -1005,7 +1005,8 @@ config TMPFS_POSIX_ACL config HUGETLBFS bool "HugeTLB file system support" - depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || BROKEN + depends on X86 || IA64 || PPC64 || SPARC64 || (SUPERH && MMU) || \ + (S390 && 64BIT) || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read @@ -1664,105 +1665,86 @@ config NFS_V4 If unsure, say N. -config NFS_DIRECTIO - bool "Allow direct I/O on NFS files" - depends on NFS_FS - help - This option enables applications to perform uncached I/O on files - in NFS file systems using the O_DIRECT open() flag. When O_DIRECT - is set for a file, its data is not cached in the system's page - cache. Data is moved to and from user-level application buffers - directly. Unlike local disk-based file systems, NFS O_DIRECT has - no alignment restrictions. - - Unless your program is designed to use O_DIRECT properly, you are - much better off allowing the NFS client to manage data caching for - you. Misusing O_DIRECT can cause poor server performance or network - storms. This kernel build option defaults OFF to avoid exposing - system administrators unwittingly to a potentially hazardous - feature. - - For more details on NFS O_DIRECT, see fs/nfs/direct.c. - - If unsure, say N. This reduces the size of the NFS client, and - causes open() to return EINVAL if a file residing in NFS is - opened with the O_DIRECT flag. - config NFSD tristate "NFS server support" depends on INET select LOCKD select SUNRPC select EXPORTFS - select NFSD_V2_ACL if NFSD_V3_ACL select NFS_ACL_SUPPORT if NFSD_V2_ACL - select NFSD_TCP if NFSD_V4 - select CRYPTO_MD5 if NFSD_V4 - select CRYPTO if NFSD_V4 - select FS_POSIX_ACL if NFSD_V4 - select PROC_FS if NFSD_V4 - select PROC_FS if SUNRPC_GSS - help - If you want your Linux box to act as an NFS *server*, so that other - computers on your local network which support NFS can access certain - directories on your box transparently, you have two options: you can - use the self-contained user space program nfsd, in which case you - should say N here, or you can say Y and use the kernel based NFS - server. The advantage of the kernel based solution is that it is - faster. - - In either case, you will need support software; the respective - locations are given in the file <file:Documentation/Changes> in the - NFS section. - - If you say Y here, you will get support for version 2 of the NFS - protocol (NFSv2). If you also want NFSv3, say Y to the next question - as well. - - Please read the NFS-HOWTO, available from - <http://www.tldp.org/docs.html#howto>. - - To compile the NFS server support as a module, choose M here: the - module will be called nfsd. If unsure, say N. + help + Choose Y here if you want to allow other computers to access + files residing on this system using Sun's Network File System + protocol. To compile the NFS server support as a module, + choose M here: the module will be called nfsd. + + You may choose to use a user-space NFS server instead, in which + case you can choose N here. + + To export local file systems using NFS, you also need to install + user space programs which can be found in the Linux nfs-utils + package, available from http://linux-nfs.org/. More detail about + the Linux NFS server implementation is available via the + exports(5) man page. + + Below you can choose which versions of the NFS protocol are + available to clients mounting the NFS server on this system. + Support for NFS version 2 (RFC 1094) is always available when + CONFIG_NFSD is selected. + + If unsure, say N. config NFSD_V2_ACL bool depends on NFSD config NFSD_V3 - bool "Provide NFSv3 server support" + bool "NFS server support for NFS version 3" depends on NFSD help - If you would like to include the NFSv3 server as well as the NFSv2 - server, say Y here. If unsure, say Y. + This option enables support in your system's NFS server for + version 3 of the NFS protocol (RFC 1813). + + If unsure, say Y. config NFSD_V3_ACL - bool "Provide server support for the NFSv3 ACL protocol extension" + bool "NFS server support for the NFSv3 ACL protocol extension" depends on NFSD_V3 + select NFSD_V2_ACL help - Implement the NFSv3 ACL protocol extension for manipulating POSIX - Access Control Lists on exported file systems. NFS clients should - be compiled with the NFSv3 ACL protocol extension; see the - CONFIG_NFS_V3_ACL option. If unsure, say N. + Solaris NFS servers support an auxiliary NFSv3 ACL protocol that + never became an official part of the NFS version 3 protocol. + This protocol extension allows applications on NFS clients to + manipulate POSIX Access Control Lists on files residing on NFS + servers. NFS servers enforce POSIX ACLs on local files whether + this protocol is available or not. + + This option enables support in your system's NFS server for the + NFSv3 ACL protocol extension allowing NFS clients to manipulate + POSIX ACLs on files exported by your system's NFS server. NFS + clients which support the Solaris NFSv3 ACL protocol can then + access and modify ACLs on your NFS server. + + To store ACLs on your NFS server, you also need to enable ACL- + related CONFIG options for your local file systems of choice. + + If unsure, say N. config NFSD_V4 - bool "Provide NFSv4 server support (EXPERIMENTAL)" - depends on NFSD && NFSD_V3 && EXPERIMENTAL + bool "NFS server support for NFS version 4 (EXPERIMENTAL)" + depends on NFSD && PROC_FS && EXPERIMENTAL + select NFSD_V3 + select FS_POSIX_ACL select RPCSEC_GSS_KRB5 help - If you would like to include the NFSv4 server as well as the NFSv2 - and NFSv3 servers, say Y here. This feature is experimental, and - should only be used if you are interested in helping to test NFSv4. - If unsure, say N. + This option enables support in your system's NFS server for + version 4 of the NFS protocol (RFC 3530). -config NFSD_TCP - bool "Provide NFS server over TCP support" - depends on NFSD - default y - help - If you want your NFS server to support TCP connections, say Y here. - TCP connections usually perform better than the default UDP when - the network is lossy or congested. If unsure, say Y. + To export files using NFSv4, you need to install additional user + space programs which can be found in the Linux nfs-utils package, + available from http://linux-nfs.org/. + + If unsure, say N. config ROOT_NFS bool "Root file system on NFS" @@ -1808,15 +1790,33 @@ config SUNRPC_XPRT_RDMA tristate depends on SUNRPC && INFINIBAND && EXPERIMENTAL default SUNRPC && INFINIBAND + help + This option enables an RPC client transport capability that + allows the NFS client to mount servers via an RDMA-enabled + transport. + + To compile RPC client RDMA transport support as a module, + choose M here: the module will be called xprtrdma. + + If unsure, say N. config SUNRPC_BIND34 bool "Support for rpcbind versions 3 & 4 (EXPERIMENTAL)" depends on SUNRPC && EXPERIMENTAL + default n help - Provides kernel support for querying rpcbind servers via versions 3 - and 4 of the rpcbind protocol. The kernel automatically falls back - to version 2 if a remote rpcbind service does not support versions - 3 or 4. + RPC requests over IPv6 networks require support for larger + addresses when performing an RPC bind. Sun added support for + IPv6 addressing by creating two new versions of the rpcbind + protocol (RFC 1833). + + This option enables support in the kernel RPC client for + querying rpcbind servers via versions 3 and 4 of the rpcbind + protocol. The kernel automatically falls back to version 2 + if a remote rpcbind service does not support versions 3 or 4. + By themselves, these new versions do not provide support for + RPC over IPv6, but the new protocol versions are necessary to + support it. If unsure, say N to get traditional behavior (version 2 rpcbind requests only). @@ -1830,12 +1830,13 @@ config RPCSEC_GSS_KRB5 select CRYPTO_DES select CRYPTO_CBC help - Provides for secure RPC calls by means of a gss-api - mechanism based on Kerberos V5. This is required for - NFSv4. + Choose Y here to enable Secure RPC using the Kerberos version 5 + GSS-API mechanism (RFC 1964). - Note: Requires an auxiliary userspace daemon which may be found on - http://www.citi.umich.edu/projects/nfsv4/ + Secure RPC calls with Kerberos require an auxiliary user-space + daemon which may be found in the Linux nfs-utils package + available from http://linux-nfs.org/. In addition, user-space + Kerberos support should be installed. If unsure, say N. @@ -1849,11 +1850,12 @@ config RPCSEC_GSS_SPKM3 select CRYPTO_CAST5 select CRYPTO_CBC help - Provides for secure RPC calls by means of a gss-api - mechanism based on the SPKM3 public-key mechanism. + Choose Y here to enable Secure RPC using the SPKM3 public key + GSS-API mechansim (RFC 2025). - Note: Requires an auxiliary userspace daemon which may be found on - http://www.citi.umich.edu/projects/nfsv4/ + Secure RPC calls with SPKM3 require an auxiliary userspace + daemon which may be found in the Linux nfs-utils package + available from http://linux-nfs.org/. If unsure, say N. diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 853845abcca..55e8ee1900a 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -41,7 +41,7 @@ config BINFMT_ELF_FDPIC It is also possible to run FDPIC ELF binaries on MMU linux also. config BINFMT_FLAT - tristate "Kernel support for flat binaries" + bool "Kernel support for flat binaries" depends on !MMU help Support uClinux FLAT format binaries. diff --git a/fs/adfs/adfs.h b/fs/adfs/adfs.h index 936f2af39c4..831157502d5 100644 --- a/fs/adfs/adfs.h +++ b/fs/adfs/adfs.h @@ -75,7 +75,7 @@ extern unsigned int adfs_map_free(struct super_block *sb); /* Misc */ void __adfs_error(struct super_block *sb, const char *function, const char *fmt, ...); -#define adfs_error(sb, fmt...) __adfs_error(sb, __FUNCTION__, fmt) +#define adfs_error(sb, fmt...) __adfs_error(sb, __func__, fmt) /* super.c */ diff --git a/fs/adfs/dir_f.c b/fs/adfs/dir_f.c index b9b2b27b68c..ea7df214692 100644 --- a/fs/adfs/dir_f.c +++ b/fs/adfs/dir_f.c @@ -122,9 +122,9 @@ adfs_dir_checkbyte(const struct adfs_dir *dir) ptr.ptr8 = bufoff(bh, i); end.ptr8 = ptr.ptr8 + last - i; - do + do { dircheck = *ptr.ptr8++ ^ ror13(dircheck); - while (ptr.ptr8 < end.ptr8); + } while (ptr.ptr8 < end.ptr8); } /* diff --git a/fs/affs/file.c b/fs/affs/file.c index 6e0c9399200..1a4f092f24e 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -325,8 +325,7 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul pr_debug("AFFS: get_block(%u, %lu)\n", (u32)inode->i_ino, (unsigned long)block); - if (block > (sector_t)0x7fffffffUL) - BUG(); + BUG_ON(block > (sector_t)0x7fffffffUL); if (block >= AFFS_I(inode)->i_blkcnt) { if (block > AFFS_I(inode)->i_blkcnt || !create) @@ -493,8 +492,7 @@ affs_do_readpage_ofs(struct file *file, struct page *page, unsigned from, unsign u32 tmp; pr_debug("AFFS: read_page(%u, %ld, %d, %d)\n", (u32)inode->i_ino, page->index, from, to); - if (from > to || to > PAGE_CACHE_SIZE) - BUG(); + BUG_ON(from > to || to > PAGE_CACHE_SIZE); kmap(page); data = page_address(page); bsize = AFFS_SB(sb)->s_data_blksize; @@ -507,8 +505,7 @@ affs_do_readpage_ofs(struct file *file, struct page *page, unsigned from, unsign if (IS_ERR(bh)) return PTR_ERR(bh); tmp = min(bsize - boff, to - from); - if (from + tmp > to || tmp > bsize) - BUG(); + BUG_ON(from + tmp > to || tmp > bsize); memcpy(data + from, AFFS_DATA(bh) + boff, tmp); affs_brelse(bh); bidx++; @@ -540,10 +537,9 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize) if (IS_ERR(bh)) return PTR_ERR(bh); tmp = min(bsize - boff, newsize - size); - if (boff + tmp > bsize || tmp > bsize) - BUG(); + BUG_ON(boff + tmp > bsize || tmp > bsize); memset(AFFS_DATA(bh) + boff, 0, tmp); - AFFS_DATA_HEAD(bh)->size = cpu_to_be32(be32_to_cpu(AFFS_DATA_HEAD(bh)->size) + tmp); + be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp); affs_fix_checksum(sb, bh); mark_buffer_dirty_inode(bh, inode); size += tmp; @@ -560,8 +556,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize) if (IS_ERR(bh)) goto out; tmp = min(bsize, newsize - size); - if (tmp > bsize) - BUG(); + BUG_ON(tmp > bsize); AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino); AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx); @@ -683,10 +678,9 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, if (IS_ERR(bh)) return PTR_ERR(bh); tmp = min(bsize - boff, to - from); - if (boff + tmp > bsize || tmp > bsize) - BUG(); + BUG_ON(boff + tmp > bsize || tmp > bsize); memcpy(AFFS_DATA(bh) + boff, data + from, tmp); - AFFS_DATA_HEAD(bh)->size = cpu_to_be32(be32_to_cpu(AFFS_DATA_HEAD(bh)->size) + tmp); + be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp); affs_fix_checksum(sb, bh); mark_buffer_dirty_inode(bh, inode); written += tmp; @@ -732,8 +726,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, if (IS_ERR(bh)) goto out; tmp = min(bsize, to - from); - if (tmp > bsize) - BUG(); + BUG_ON(tmp > bsize); memcpy(AFFS_DATA(bh), data + from, tmp); if (buffer_new(bh)) { AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA); diff --git a/fs/affs/super.c b/fs/affs/super.c index d2dc047cb47..01d25d53254 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -199,7 +199,6 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s case Opt_prefix: /* Free any previous prefix */ kfree(*prefix); - *prefix = NULL; *prefix = match_strdup(&args[0]); if (!*prefix) return 0; @@ -233,6 +232,8 @@ parse_options(char *options, uid_t *uid, gid_t *gid, int *mode, int *reserved, s break; case Opt_volume: { char *vol = match_strdup(&args[0]); + if (!vol) + return 0; strlcpy(volume, vol, 32); kfree(vol); break; diff --git a/fs/afs/afs_cm.h b/fs/afs/afs_cm.h index 7b4d4fab4c8..255f5dd6040 100644 --- a/fs/afs/afs_cm.h +++ b/fs/afs/afs_cm.h @@ -24,7 +24,8 @@ enum AFS_CM_Operations { CBGetXStatsVersion = 209, /* get version of extended statistics */ CBGetXStats = 210, /* get contents of extended statistics data */ CBInitCallBackState3 = 213, /* initialise callback state, version 3 */ - CBGetCapabilities = 65538, /* get client capabilities */ + CBProbeUuid = 214, /* check the client hasn't rebooted */ + CBTellMeAboutYourself = 65538, /* get client capabilities */ }; #define AFS_CAP_ERROR_TRANSLATION 0x1 diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 584bb0f9c36..5e1df14e16b 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -20,7 +20,7 @@ DECLARE_RWSEM(afs_proc_cells_sem); LIST_HEAD(afs_proc_cells); -static struct list_head afs_cells = LIST_HEAD_INIT(afs_cells); +static LIST_HEAD(afs_cells); static DEFINE_RWLOCK(afs_cells_lock); static DECLARE_RWSEM(afs_cells_sem); /* add/remove serialisation */ static DECLARE_WAIT_QUEUE_HEAD(afs_cells_freeable_wq); diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index 47b71c8947f..eb765489164 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -26,8 +26,9 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *, struct sk_buff *, bool); static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool); static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool); -static int afs_deliver_cb_get_capabilities(struct afs_call *, struct sk_buff *, - bool); +static int afs_deliver_cb_probe_uuid(struct afs_call *, struct sk_buff *, bool); +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *, + struct sk_buff *, bool); static void afs_cm_destructor(struct afs_call *); /* @@ -71,11 +72,21 @@ static const struct afs_call_type afs_SRXCBProbe = { }; /* - * CB.GetCapabilities operation type + * CB.ProbeUuid operation type */ -static const struct afs_call_type afs_SRXCBGetCapabilites = { - .name = "CB.GetCapabilities", - .deliver = afs_deliver_cb_get_capabilities, +static const struct afs_call_type afs_SRXCBProbeUuid = { + .name = "CB.ProbeUuid", + .deliver = afs_deliver_cb_probe_uuid, + .abort_to_error = afs_abort_to_error, + .destructor = afs_cm_destructor, +}; + +/* + * CB.TellMeAboutYourself operation type + */ +static const struct afs_call_type afs_SRXCBTellMeAboutYourself = { + .name = "CB.TellMeAboutYourself", + .deliver = afs_deliver_cb_tell_me_about_yourself, .abort_to_error = afs_abort_to_error, .destructor = afs_cm_destructor, }; @@ -103,8 +114,8 @@ bool afs_cm_incoming_call(struct afs_call *call) case CBProbe: call->type = &afs_SRXCBProbe; return true; - case CBGetCapabilities: - call->type = &afs_SRXCBGetCapabilites; + case CBTellMeAboutYourself: + call->type = &afs_SRXCBTellMeAboutYourself; return true; default: return false; @@ -393,9 +404,105 @@ static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb, } /* + * allow the fileserver to quickly find out if the fileserver has been rebooted + */ +static void SRXAFSCB_ProbeUuid(struct work_struct *work) +{ + struct afs_call *call = container_of(work, struct afs_call, work); + struct afs_uuid *r = call->request; + + struct { + __be32 match; + } reply; + + _enter(""); + + + if (memcmp(r, &afs_uuid, sizeof(afs_uuid)) == 0) + reply.match = htonl(0); + else + reply.match = htonl(1); + + afs_send_simple_reply(call, &reply, sizeof(reply)); + _leave(""); +} + +/* + * deliver request data to a CB.ProbeUuid call + */ +static int afs_deliver_cb_probe_uuid(struct afs_call *call, struct sk_buff *skb, + bool last) +{ + struct afs_uuid *r; + unsigned loop; + __be32 *b; + int ret; + + _enter("{%u},{%u},%d", call->unmarshall, skb->len, last); + + if (skb->len > 0) + return -EBADMSG; + if (!last) + return 0; + + switch (call->unmarshall) { + case 0: + call->offset = 0; + call->buffer = kmalloc(11 * sizeof(__be32), GFP_KERNEL); + if (!call->buffer) + return -ENOMEM; + call->unmarshall++; + + case 1: + _debug("extract UUID"); + ret = afs_extract_data(call, skb, last, call->buffer, + 11 * sizeof(__be32)); + switch (ret) { + case 0: break; + case -EAGAIN: return 0; + default: return ret; + } + + _debug("unmarshall UUID"); + call->request = kmalloc(sizeof(struct afs_uuid), GFP_KERNEL); + if (!call->request) + return -ENOMEM; + + b = call->buffer; + r = call->request; + r->time_low = ntohl(b[0]); + r->time_mid = ntohl(b[1]); + r->time_hi_and_version = ntohl(b[2]); + r->clock_seq_hi_and_reserved = ntohl(b[3]); + r->clock_seq_low = ntohl(b[4]); + + for (loop = 0; loop < 6; loop++) + r->node[loop] = ntohl(b[loop + 5]); + + call->offset = 0; + call->unmarshall++; + + case 2: + _debug("trailer"); + if (skb->len != 0) + return -EBADMSG; + break; + } + + if (!last) + return 0; + + call->state = AFS_CALL_REPLYING; + + INIT_WORK(&call->work, SRXAFSCB_ProbeUuid); + schedule_work(&call->work); + return 0; +} + +/* * allow the fileserver to ask about the cache manager's capabilities */ -static void SRXAFSCB_GetCapabilities(struct work_struct *work) +static void SRXAFSCB_TellMeAboutYourself(struct work_struct *work) { struct afs_interface *ifs; struct afs_call *call = container_of(work, struct afs_call, work); @@ -456,10 +563,10 @@ static void SRXAFSCB_GetCapabilities(struct work_struct *work) } /* - * deliver request data to a CB.GetCapabilities call + * deliver request data to a CB.TellMeAboutYourself call */ -static int afs_deliver_cb_get_capabilities(struct afs_call *call, - struct sk_buff *skb, bool last) +static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call, + struct sk_buff *skb, bool last) { _enter(",{%u},%d", skb->len, last); @@ -471,7 +578,7 @@ static int afs_deliver_cb_get_capabilities(struct afs_call *call, /* no unmarshalling required */ call->state = AFS_CALL_REPLYING; - INIT_WORK(&call->work, SRXAFSCB_GetCapabilities); + INIT_WORK(&call->work, SRXAFSCB_TellMeAboutYourself); schedule_work(&call->work); return 0; } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index b58af8f18bc..dfda03d4397 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -140,7 +140,7 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page) if (page->index == 0 && qty != ntohs(dbuf->blocks[0].pagehdr.npages)) { printk("kAFS: %s(%lu): wrong number of dir blocks %d!=%hu\n", - __FUNCTION__, dir->i_ino, qty, + __func__, dir->i_ino, qty, ntohs(dbuf->blocks[0].pagehdr.npages)); goto error; } @@ -159,7 +159,7 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page) for (tmp = 0; tmp < qty; tmp++) { if (dbuf->blocks[tmp].pagehdr.magic != AFS_DIR_MAGIC) { printk("kAFS: %s(%lu): bad magic %d/%d is %04hx\n", - __FUNCTION__, dir->i_ino, tmp, qty, + __func__, dir->i_ino, tmp, qty, ntohs(dbuf->blocks[tmp].pagehdr.magic)); goto error; } diff --git a/fs/afs/internal.h b/fs/afs/internal.h index eec41c76de7..7102824ba84 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -757,8 +757,8 @@ void _dbprintk(const char *fmt, ...) { } -#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__) -#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__) +#define kenter(FMT,...) dbgprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) +#define kleave(FMT,...) dbgprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define kdebug(FMT,...) dbgprintk(" "FMT ,##__VA_ARGS__) @@ -791,8 +791,8 @@ do { \ } while (0) #else -#define _enter(FMT,...) _dbprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__) -#define _leave(FMT,...) _dbprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__) +#define _enter(FMT,...) _dbprintk("==> %s("FMT")",__func__ ,##__VA_ARGS__) +#define _leave(FMT,...) _dbprintk("<== %s()"FMT"",__func__ ,##__VA_ARGS__) #define _debug(FMT,...) _dbprintk(" "FMT ,##__VA_ARGS__) #endif diff --git a/fs/afs/proc.c b/fs/afs/proc.c index 846c7615ac9..9f7d1ae7026 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -41,6 +41,7 @@ static const struct file_operations afs_proc_cells_fops = { .write = afs_proc_cells_write, .llseek = seq_lseek, .release = seq_release, + .owner = THIS_MODULE, }; static int afs_proc_rootcell_open(struct inode *inode, struct file *file); @@ -56,7 +57,8 @@ static const struct file_operations afs_proc_rootcell_fops = { .read = afs_proc_rootcell_read, .write = afs_proc_rootcell_write, .llseek = no_llseek, - .release = afs_proc_rootcell_release + .release = afs_proc_rootcell_release, + .owner = THIS_MODULE, }; static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file); @@ -80,6 +82,7 @@ static const struct file_operations afs_proc_cell_volumes_fops = { .read = seq_read, .llseek = seq_lseek, .release = afs_proc_cell_volumes_release, + .owner = THIS_MODULE, }; static int afs_proc_cell_vlservers_open(struct inode *inode, @@ -104,6 +107,7 @@ static const struct file_operations afs_proc_cell_vlservers_fops = { .read = seq_read, .llseek = seq_lseek, .release = afs_proc_cell_vlservers_release, + .owner = THIS_MODULE, }; static int afs_proc_cell_servers_open(struct inode *inode, struct file *file); @@ -127,6 +131,7 @@ static const struct file_operations afs_proc_cell_servers_fops = { .read = seq_read, .llseek = seq_lseek, .release = afs_proc_cell_servers_release, + .owner = THIS_MODULE, }; /* @@ -143,17 +148,13 @@ int afs_proc_init(void) goto error_dir; proc_afs->owner = THIS_MODULE; - p = create_proc_entry("cells", 0, proc_afs); + p = proc_create("cells", 0, proc_afs, &afs_proc_cells_fops); if (!p) goto error_cells; - p->proc_fops = &afs_proc_cells_fops; - p->owner = THIS_MODULE; - p = create_proc_entry("rootcell", 0, proc_afs); + p = proc_create("rootcell", 0, proc_afs, &afs_proc_rootcell_fops); if (!p) goto error_rootcell; - p->proc_fops = &afs_proc_rootcell_fops; - p->owner = THIS_MODULE; _leave(" = 0"); return 0; @@ -395,26 +396,20 @@ int afs_proc_cell_setup(struct afs_cell *cell) if (!cell->proc_dir) goto error_dir; - p = create_proc_entry("servers", 0, cell->proc_dir); + p = proc_create_data("servers", 0, cell->proc_dir, + &afs_proc_cell_servers_fops, cell); if (!p) goto error_servers; - p->proc_fops = &afs_proc_cell_servers_fops; - p->owner = THIS_MODULE; - p->data = cell; - p = create_proc_entry("vlservers", 0, cell->proc_dir); + p = proc_create_data("vlservers", 0, cell->proc_dir, + &afs_proc_cell_vlservers_fops, cell); if (!p) goto error_vlservers; - p->proc_fops = &afs_proc_cell_vlservers_fops; - p->owner = THIS_MODULE; - p->data = cell; - p = create_proc_entry("volumes", 0, cell->proc_dir); + p = proc_create_data("volumes", 0, cell->proc_dir, + &afs_proc_cell_volumes_fops, cell); if (!p) goto error_volumes; - p->proc_fops = &afs_proc_cell_volumes_fops; - p->owner = THIS_MODULE; - p->data = cell; _leave(" = 0"); return 0; @@ -191,6 +191,43 @@ static int aio_setup_ring(struct kioctx *ctx) kunmap_atomic((void *)((unsigned long)__event & PAGE_MASK), km); \ } while(0) + +/* __put_ioctx + * Called when the last user of an aio context has gone away, + * and the struct needs to be freed. + */ +static void __put_ioctx(struct kioctx *ctx) +{ + unsigned nr_events = ctx->max_reqs; + + BUG_ON(ctx->reqs_active); + + cancel_delayed_work(&ctx->wq); + cancel_work_sync(&ctx->wq.work); + aio_free_ring(ctx); + mmdrop(ctx->mm); + ctx->mm = NULL; + pr_debug("__put_ioctx: freeing %p\n", ctx); + kmem_cache_free(kioctx_cachep, ctx); + + if (nr_events) { + spin_lock(&aio_nr_lock); + BUG_ON(aio_nr - nr_events > aio_nr); + aio_nr -= nr_events; + spin_unlock(&aio_nr_lock); + } +} + +#define get_ioctx(kioctx) do { \ + BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ + atomic_inc(&(kioctx)->users); \ +} while (0) +#define put_ioctx(kioctx) do { \ + BUG_ON(atomic_read(&(kioctx)->users) <= 0); \ + if (unlikely(atomic_dec_and_test(&(kioctx)->users))) \ + __put_ioctx(kioctx); \ +} while (0) + /* ioctx_alloc * Allocates and initializes an ioctx. Returns an ERR_PTR if it failed. */ @@ -240,7 +277,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) if (ctx->max_reqs == 0) goto out_cleanup; - /* now link into global list. kludge. FIXME */ + /* now link into global list. */ write_lock(&mm->ioctx_list_lock); ctx->next = mm->ioctx_list; mm->ioctx_list = ctx; @@ -361,32 +398,6 @@ void exit_aio(struct mm_struct *mm) } } -/* __put_ioctx - * Called when the last user of an aio context has gone away, - * and the struct needs to be freed. - */ -void __put_ioctx(struct kioctx *ctx) -{ - unsigned nr_events = ctx->max_reqs; - - BUG_ON(ctx->reqs_active); - - cancel_delayed_work(&ctx->wq); - cancel_work_sync(&ctx->wq.work); - aio_free_ring(ctx); - mmdrop(ctx->mm); - ctx->mm = NULL; - pr_debug("__put_ioctx: freeing %p\n", ctx); - kmem_cache_free(kioctx_cachep, ctx); - - if (nr_events) { - spin_lock(&aio_nr_lock); - BUG_ON(aio_nr - nr_events > aio_nr); - aio_nr -= nr_events; - spin_unlock(&aio_nr_lock); - } -} - /* aio_get_req * Allocate a slot for an aio request. Increments the users count * of the kioctx so that the kioctx stays around until all requests are @@ -542,10 +553,7 @@ int aio_put_req(struct kiocb *req) return ret; } -/* Lookup an ioctx id. ioctx_list is lockless for reads. - * FIXME: this is O(n) and is only suitable for development. - */ -struct kioctx *lookup_ioctx(unsigned long ctx_id) +static struct kioctx *lookup_ioctx(unsigned long ctx_id) { struct kioctx *ioctx; struct mm_struct *mm; @@ -1070,9 +1078,7 @@ static void timeout_func(unsigned long data) static inline void init_timeout(struct aio_timeout *to) { - init_timer(&to->timer); - to->timer.data = (unsigned long)to; - to->timer.function = timeout_func; + setup_timer_on_stack(&to->timer, timeout_func, (unsigned long) to); to->timed_out = 0; to->p = current; } @@ -1166,7 +1172,10 @@ retry: break; if (min_nr <= i) break; - ret = 0; + if (unlikely(ctx->dead)) { + ret = -EINVAL; + break; + } if (to.timed_out) /* Only check after read evt */ break; /* Try to only show up in io wait if there are ops @@ -1202,6 +1211,7 @@ retry: if (timeout) clear_timeout(&to); out: + destroy_timer_on_stack(&to.timer); return i ? i : ret; } @@ -1231,6 +1241,13 @@ static void io_destroy(struct kioctx *ioctx) aio_cancel_all(ioctx); wait_for_all_aios(ioctx); + + /* + * Wake up any waiters. The setting of ctx->dead must be seen + * by other CPUs at this point. Right now, we rely on the + * locking done by the above calls to ensure this consistency. + */ + wake_up(&ioctx->wait); put_ioctx(ioctx); /* once for the lookup */ } @@ -1542,7 +1559,7 @@ static int aio_wake_function(wait_queue_t *wait, unsigned mode, return 1; } -int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, +static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, struct iocb *iocb) { struct kiocb *req; @@ -1583,7 +1600,7 @@ int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, * event using the eventfd_signal() function. */ req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd); - if (unlikely(IS_ERR(req->ki_eventfd))) { + if (IS_ERR(req->ki_eventfd)) { ret = PTR_ERR(req->ki_eventfd); goto out_put_req; } diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index f42be069e08..977ef208c05 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -57,9 +57,6 @@ static struct dentry_operations anon_inodefs_dentry_operations = { * anonymous inode, and a dentry that describe the "class" * of the file * - * @pfd: [out] pointer to the file descriptor - * @dpinode: [out] pointer to the inode - * @pfile: [out] pointer to the file struct * @name: [in] name of the "class" of the new file * @fops [in] file operations for the new file * @priv [in] private data for the new file (will be file's private_data) @@ -68,10 +65,9 @@ static struct dentry_operations anon_inodefs_dentry_operations = { * that do not need to have a full-fledged inode in order to operate correctly. * All the files created with anon_inode_getfd() will share a single inode, * hence saving memory and avoiding code duplication for the file/inode/dentry - * setup. + * setup. Returns new descriptor or -error. */ -int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, - const char *name, const struct file_operations *fops, +int anon_inode_getfd(const char *name, const struct file_operations *fops, void *priv) { struct qstr this; @@ -125,10 +121,7 @@ int anon_inode_getfd(int *pfd, struct inode **pinode, struct file **pfile, fd_install(fd, file); - *pfd = fd; - *pinode = anon_inode_inode; - *pfile = file; - return 0; + return fd; err_dput: dput(dentry); diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 2d4ae40718d..c3d352d7fa9 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -35,7 +35,7 @@ /* #define DEBUG */ #ifdef DEBUG -#define DPRINTK(fmt,args...) do { printk(KERN_DEBUG "pid %d: %s: " fmt "\n" , current->pid , __FUNCTION__ , ##args); } while(0) +#define DPRINTK(fmt,args...) do { printk(KERN_DEBUG "pid %d: %s: " fmt "\n" , current->pid , __func__ , ##args); } while(0) #else #define DPRINTK(fmt,args...) do {} while(0) #endif diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index d96e5c14a9c..894fee54d4d 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -73,8 +73,8 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) status = 0; done: DPRINTK("returning = %d", status); - mntput(mnt); dput(dentry); + mntput(mnt); return status; } @@ -333,7 +333,7 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, /* Can we expire this guy */ if (autofs4_can_expire(dentry, timeout, do_now)) { expired = dentry; - break; + goto found; } goto next; } @@ -352,7 +352,7 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, inf->flags |= AUTOFS_INF_EXPIRING; spin_unlock(&sbi->fs_lock); expired = dentry; - break; + goto found; } spin_unlock(&sbi->fs_lock); /* @@ -363,7 +363,7 @@ static struct dentry *autofs4_expire_indirect(struct super_block *sb, expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); if (expired) { dput(dentry); - break; + goto found; } } next: @@ -371,18 +371,16 @@ next: spin_lock(&dcache_lock); next = next->next; } - - if (expired) { - DPRINTK("returning %p %.*s", - expired, (int)expired->d_name.len, expired->d_name.name); - spin_lock(&dcache_lock); - list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); - spin_unlock(&dcache_lock); - return expired; - } spin_unlock(&dcache_lock); - return NULL; + +found: + DPRINTK("returning %p %.*s", + expired, (int)expired->d_name.len, expired->d_name.name); + spin_lock(&dcache_lock); + list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); + spin_unlock(&dcache_lock); + return expired; } /* Perform an expiry operation */ diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index a54a946a50a..edf5b6bddb5 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -146,17 +146,17 @@ static int autofs4_dir_open(struct inode *inode, struct file *file) if (d_mountpoint(dentry)) { struct file *fp = NULL; - struct vfsmount *fp_mnt = mntget(mnt); - struct dentry *fp_dentry = dget(dentry); + struct path fp_path = { .dentry = dentry, .mnt = mnt }; - if (!autofs4_follow_mount(&fp_mnt, &fp_dentry)) { - dput(fp_dentry); - mntput(fp_mnt); + path_get(&fp_path); + + if (!autofs4_follow_mount(&fp_path.mnt, &fp_path.dentry)) { + path_put(&fp_path); dcache_dir_close(inode, file); goto out; } - fp = dentry_open(fp_dentry, fp_mnt, file->f_flags); + fp = dentry_open(fp_path.dentry, fp_path.mnt, file->f_flags); status = PTR_ERR(fp); if (IS_ERR(fp)) { dcache_dir_close(inode, file); @@ -242,7 +242,8 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); - int status = 0; + struct dentry *new; + int status; /* Block on any pending expiry here; invalidate the dentry when expiration is done to trigger mount request with a new @@ -318,7 +319,28 @@ static int try_to_fill_dentry(struct dentry *dentry, int flags) spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_AUTOFS_PENDING; spin_unlock(&dentry->d_lock); - return status; + + /* + * The dentry that is passed in from lookup may not be the one + * we end up using, as mkdir can create a new one. If this + * happens, and another process tries the lookup at the same time, + * it will set the PENDING flag on this new dentry, but add itself + * to our waitq. Then, if after the lookup succeeds, the first + * process that requested the mount performs another lookup of the + * same directory, it will show up as still pending! So, we need + * to redo the lookup here and clear pending on that dentry. + */ + if (d_unhashed(dentry)) { + new = d_lookup(dentry->d_parent, &dentry->d_name); + if (new) { + spin_lock(&new->d_lock); + new->d_flags &= ~DCACHE_AUTOFS_PENDING; + spin_unlock(&new->d_lock); + dput(new); + } + } + + return 0; } /* For autofs direct mounts the follow link triggers the mount */ @@ -533,9 +555,9 @@ static struct dentry *autofs4_lookup_unhashed(struct autofs_sb_info *sbi, struct goto next; if (d_unhashed(dentry)) { - struct autofs_info *ino = autofs4_dentry_ino(dentry); struct inode *inode = dentry->d_inode; + ino = autofs4_dentry_ino(dentry); list_del_init(&ino->rehash); dget(dentry); /* diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index 1fe28e4754c..75e5955c3f6 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -171,7 +171,7 @@ static int autofs4_getpath(struct autofs_sb_info *sbi, for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) len += tmp->d_name.len + 1; - if (--len > NAME_MAX) { + if (!len || --len > NAME_MAX) { spin_unlock(&dcache_lock); return 0; } diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 82123ff3e1d..e8717de3bab 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -489,9 +489,9 @@ static void befs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) { befs_inode_info *befs_ino = BEFS_I(dentry->d_inode); if (befs_ino->i_flags & BEFS_LONG_SYMLINK) { - char *p = nd_get_link(nd); - if (!IS_ERR(p)) - kfree(p); + char *link = nd_get_link(nd); + if (!IS_ERR(link)) + kfree(link); } } diff --git a/fs/bfs/bfs.h b/fs/bfs/bfs.h index 71faf4d2390..70f5d3a8eed 100644 --- a/fs/bfs/bfs.h +++ b/fs/bfs/bfs.h @@ -42,7 +42,7 @@ static inline struct bfs_inode_info *BFS_I(struct inode *inode) #define printf(format, args...) \ - printk(KERN_ERR "BFS-fs: %s(): " format, __FUNCTION__, ## args) + printk(KERN_ERR "BFS-fs: %s(): " format, __func__, ## args) /* inode.c */ extern struct inode *bfs_iget(struct super_block *sb, unsigned long ino); diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index a1bb2244cac..ba4cddb92f1 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -372,21 +372,17 @@ static int load_aout_binary(struct linux_binprm * bprm, struct pt_regs * regs) flush_icache_range(text_addr, text_addr+ex.a_text+ex.a_data); } else { - static unsigned long error_time, error_time2; if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && - (N_MAGIC(ex) != NMAGIC) && (jiffies-error_time2) > 5*HZ) + (N_MAGIC(ex) != NMAGIC) && printk_ratelimit()) { printk(KERN_NOTICE "executable not page aligned\n"); - error_time2 = jiffies; } - if ((fd_offset & ~PAGE_MASK) != 0 && - (jiffies-error_time) > 5*HZ) + if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit()) { printk(KERN_WARNING "fd_offset is not page aligned. Please convert program: %s\n", bprm->file->f_path.dentry->d_name.name); - error_time = jiffies; } if (!bprm->file->f_op->mmap||((fd_offset & ~PAGE_MASK) != 0)) { @@ -495,15 +491,13 @@ static int load_aout_library(struct file *file) start_addr = ex.a_entry & 0xfffff000; if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) { - static unsigned long error_time; loff_t pos = N_TXTOFF(ex); - if ((jiffies-error_time) > 5*HZ) + if (printk_ratelimit()) { printk(KERN_WARNING "N_TXTOFF is not page aligned. Please convert library: %s\n", file->f_path.dentry->d_name.name); - error_time = jiffies; } down_write(¤t->mm->mmap_sem); do_brk(start_addr, ex.a_text + ex.a_data + ex.a_bss); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 5e1a4fb5cac..b25707fee2c 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -543,7 +543,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) unsigned long interp_load_addr = 0; unsigned long start_code, end_code, start_data, end_data; unsigned long reloc_func_desc = 0; - struct files_struct *files; int executable_stack = EXSTACK_DEFAULT; unsigned long def_flags = 0; struct { @@ -593,20 +592,9 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) goto out_free_ph; } - files = current->files; /* Refcounted so ok */ - retval = unshare_files(); - if (retval < 0) - goto out_free_ph; - if (files == current->files) { - put_files_struct(files); - files = NULL; - } - - /* exec will make our files private anyway, but for the a.out - loader stuff we need to do it earlier */ retval = get_unused_fd(); if (retval < 0) - goto out_free_fh; + goto out_free_ph; get_file(bprm->file); fd_install(elf_exec_fileno = retval, bprm->file); @@ -728,12 +716,6 @@ static int load_elf_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (retval) goto out_free_dentry; - /* Discard our unneeded old files struct */ - if (files) { - put_files_struct(files); - files = NULL; - } - /* OK, This is the point of no return */ current->flags &= ~PF_FORKNOEXEC; current->mm->def_flags = def_flags; @@ -1016,9 +998,6 @@ out_free_interp: kfree(elf_interpreter); out_free_file: sys_close(elf_exec_fileno); -out_free_fh: - if (files) - reset_files_struct(current, files); out_free_ph: kfree(elf_phdata); goto out; @@ -1276,26 +1255,23 @@ static int writenote(struct memelfnote *men, struct file *file, static void fill_elf_header(struct elfhdr *elf, int segs, u16 machine, u32 flags, u8 osabi) { + memset(elf, 0, sizeof(*elf)); + memcpy(elf->e_ident, ELFMAG, SELFMAG); elf->e_ident[EI_CLASS] = ELF_CLASS; elf->e_ident[EI_DATA] = ELF_DATA; elf->e_ident[EI_VERSION] = EV_CURRENT; elf->e_ident[EI_OSABI] = ELF_OSABI; - memset(elf->e_ident+EI_PAD, 0, EI_NIDENT-EI_PAD); elf->e_type = ET_CORE; elf->e_machine = machine; elf->e_version = EV_CURRENT; - elf->e_entry = 0; elf->e_phoff = sizeof(struct elfhdr); - elf->e_shoff = 0; elf->e_flags = flags; elf->e_ehsize = sizeof(struct elfhdr); elf->e_phentsize = sizeof(struct elf_phdr); elf->e_phnum = segs; - elf->e_shentsize = 0; - elf->e_shnum = 0; - elf->e_shstrndx = 0; + return; } @@ -1746,26 +1722,25 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, info->thread_status_size = 0; if (signr) { - struct elf_thread_status *tmp; + struct elf_thread_status *ets; rcu_read_lock(); do_each_thread(g, p) if (current->mm == p->mm && current != p) { - tmp = kzalloc(sizeof(*tmp), GFP_ATOMIC); - if (!tmp) { + ets = kzalloc(sizeof(*ets), GFP_ATOMIC); + if (!ets) { rcu_read_unlock(); return 0; } - tmp->thread = p; - list_add(&tmp->list, &info->thread_list); + ets->thread = p; + list_add(&ets->list, &info->thread_list); } while_each_thread(g, p); rcu_read_unlock(); list_for_each(t, &info->thread_list) { - struct elf_thread_status *tmp; int sz; - tmp = list_entry(t, struct elf_thread_status, list); - sz = elf_dump_thread_status(signr, tmp); + ets = list_entry(t, struct elf_thread_status, list); + sz = elf_dump_thread_status(signr, ets); info->thread_status_size += sz; } } @@ -2021,10 +1996,10 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un for (addr = vma->vm_start; addr < end; addr += PAGE_SIZE) { struct page *page; - struct vm_area_struct *vma; + struct vm_area_struct *tmp_vma; if (get_user_pages(current, current->mm, addr, 1, 0, 1, - &page, &vma) <= 0) { + &page, &tmp_vma) <= 0) { DUMP_SEEK(PAGE_SIZE); } else { if (page == ZERO_PAGE(0)) { @@ -2034,7 +2009,7 @@ static int elf_core_dump(long signr, struct pt_regs *regs, struct file *file, un } } else { void *kaddr; - flush_cache_page(vma, addr, + flush_cache_page(tmp_vma, addr, page_to_pfn(page)); kaddr = kmap(page); if ((size += PAGE_SIZE) > limit || diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 32649f2a165..ddd35d87339 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -136,8 +136,8 @@ static int elf_fdpic_fetch_phdrs(struct elf_fdpic_params *params, retval = kernel_read(file, params->hdr.e_phoff, (char *) params->phdrs, size); - if (retval < 0) - return retval; + if (unlikely(retval != size)) + return retval < 0 ? retval : -ENOEXEC; /* determine stack size for this binary */ phdr = params->phdrs; @@ -218,8 +218,11 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, phdr->p_offset, interpreter_name, phdr->p_filesz); - if (retval < 0) + if (unlikely(retval != phdr->p_filesz)) { + if (retval >= 0) + retval = -ENOEXEC; goto error; + } retval = -ENOENT; if (interpreter_name[phdr->p_filesz - 1] != '\0') @@ -245,8 +248,11 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm, retval = kernel_read(interpreter, 0, bprm->buf, BINPRM_BUF_SIZE); - if (retval < 0) + if (unlikely(retval != BINPRM_BUF_SIZE)) { + if (retval >= 0) + retval = -ENOEXEC; goto error; + } interp_params.hdr = *((struct elfhdr *) bprm->buf); break; diff --git a/fs/binfmt_em86.c b/fs/binfmt_em86.c index f95ae9789c9..f9c88d0c8ce 100644 --- a/fs/binfmt_em86.c +++ b/fs/binfmt_em86.c @@ -43,7 +43,7 @@ static int load_em86(struct linux_binprm *bprm,struct pt_regs *regs) return -ENOEXEC; } - bprm->sh_bang++; /* Well, the bang-shell is implicit... */ + bprm->sh_bang = 1; /* Well, the bang-shell is implicit... */ allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 0498b181dd5..3b40d45a3a1 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -531,7 +531,8 @@ static int load_flat_file(struct linux_binprm * bprm, DBG_FLT("BINFMT_FLAT: ROM mapping of file (we hope)\n"); down_write(¤t->mm->mmap_sem); - textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, MAP_PRIVATE, 0); + textpos = do_mmap(bprm->file, 0, text_len, PROT_READ|PROT_EXEC, + MAP_PRIVATE|MAP_EXECUTABLE, 0); up_write(¤t->mm->mmap_sem); if (!textpos || textpos >= (unsigned long) -4096) { if (!textpos) @@ -932,14 +933,8 @@ static int __init init_flat_binfmt(void) return register_binfmt(&flat_format); } -static void __exit exit_flat_binfmt(void) -{ - unregister_binfmt(&flat_format); -} - /****************************************************************************/ core_initcall(init_flat_binfmt); -module_exit(exit_flat_binfmt); /****************************************************************************/ diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index b53c7e5f41b..7191306367c 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -110,12 +110,17 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) char *iname_addr = iname; int retval; int fd_binary = -1; - struct files_struct *files = NULL; retval = -ENOEXEC; if (!enabled) goto _ret; + retval = -ENOEXEC; + if (bprm->misc_bang) + goto _ret; + + bprm->misc_bang = 1; + /* to keep locking time low, we copy the interpreter string */ read_lock(&entries_lock); fmt = check_file(bprm); @@ -133,21 +138,13 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (fmt->flags & MISC_FMT_OPEN_BINARY) { - files = current->files; - retval = unshare_files(); - if (retval < 0) - goto _ret; - if (files == current->files) { - put_files_struct(files); - files = NULL; - } /* if the binary should be opened on behalf of the * interpreter than keep it open and assign descriptor * to it */ fd_binary = get_unused_fd(); if (fd_binary < 0) { retval = fd_binary; - goto _unshare; + goto _ret; } fd_install(fd_binary, bprm->file); @@ -205,10 +202,6 @@ static int load_misc_binary(struct linux_binprm *bprm, struct pt_regs *regs) if (retval < 0) goto _error; - if (files) { - put_files_struct(files); - files = NULL; - } _ret: return retval; _error: @@ -216,9 +209,6 @@ _error: sys_close(fd_binary); bprm->interp_flags = 0; bprm->interp_data = 0; -_unshare: - if (files) - reset_files_struct(current, files); goto _ret; } diff --git a/fs/binfmt_script.c b/fs/binfmt_script.c index ab33939b12a..9e3963f7ebf 100644 --- a/fs/binfmt_script.c +++ b/fs/binfmt_script.c @@ -29,7 +29,7 @@ static int load_script(struct linux_binprm *bprm,struct pt_regs *regs) * Sorta complicated, but hopefully it will work. -TYT */ - bprm->sh_bang++; + bprm->sh_bang = 1; allow_write_access(bprm->file); fput(bprm->file); bprm->file = NULL; diff --git a/fs/binfmt_som.c b/fs/binfmt_som.c index 14c63527c76..fdc36bfd6a7 100644 --- a/fs/binfmt_som.c +++ b/fs/binfmt_som.c @@ -194,7 +194,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) unsigned long som_entry; struct som_hdr *som_ex; struct som_exec_auxhdr *hpuxhdr; - struct files_struct *files; /* Get the exec-header */ som_ex = (struct som_hdr *) bprm->buf; @@ -221,15 +220,6 @@ load_som_binary(struct linux_binprm * bprm, struct pt_regs * regs) goto out_free; } - files = current->files; /* Refcounted so ok */ - retval = unshare_files(); - if (retval < 0) - goto out_free; - if (files == current->files) { - put_files_struct(files); - files = NULL; - } - retval = get_unused_fd(); if (retval < 0) goto out_free; @@ -937,6 +937,95 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len, return ERR_PTR(-EINVAL); } +static void bio_copy_kern_endio(struct bio *bio, int err) +{ + struct bio_vec *bvec; + const int read = bio_data_dir(bio) == READ; + char *p = bio->bi_private; + int i; + + __bio_for_each_segment(bvec, bio, i, 0) { + char *addr = page_address(bvec->bv_page); + + if (read && !err) + memcpy(p, addr, bvec->bv_len); + + __free_page(bvec->bv_page); + p += bvec->bv_len; + } + + bio_put(bio); +} + +/** + * bio_copy_kern - copy kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to copy + * @len: length in bytes + * @gfp_mask: allocation flags for bio and page allocation + * + * copy the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, + gfp_t gfp_mask, int reading) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + const int nr_pages = end - start; + struct bio *bio; + struct bio_vec *bvec; + int i, ret; + + bio = bio_alloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + while (len) { + struct page *page; + unsigned int bytes = PAGE_SIZE; + + if (bytes > len) + bytes = len; + + page = alloc_page(q->bounce_gfp | gfp_mask); + if (!page) { + ret = -ENOMEM; + goto cleanup; + } + + if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) { + ret = -EINVAL; + goto cleanup; + } + + len -= bytes; + } + + if (!reading) { + void *p = data; + + bio_for_each_segment(bvec, bio, i) { + char *addr = page_address(bvec->bv_page); + + memcpy(addr, p, bvec->bv_len); + p += bvec->bv_len; + } + } + + bio->bi_private = data; + bio->bi_end_io = bio_copy_kern_endio; + return bio; +cleanup: + bio_for_each_segment(bvec, bio, i) + __free_page(bvec->bv_page); + + bio_put(bio); + + return ERR_PTR(ret); +} + /* * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions * for performing direct-IO in BIOs. @@ -1273,6 +1362,7 @@ EXPORT_SYMBOL(bio_get_nr_vecs); EXPORT_SYMBOL(bio_map_user); EXPORT_SYMBOL(bio_unmap_user); EXPORT_SYMBOL(bio_map_kern); +EXPORT_SYMBOL(bio_copy_kern); EXPORT_SYMBOL(bio_pair_release); EXPORT_SYMBOL(bio_split); EXPORT_SYMBOL(bio_split_pool); diff --git a/fs/buffer.c b/fs/buffer.c index 39ff14403d1..a073f3f4f01 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -360,16 +360,19 @@ void invalidate_bdev(struct block_device *bdev) */ static void free_more_memory(void) { - struct zone **zones; - pg_data_t *pgdat; + struct zone *zone; + int nid; wakeup_pdflush(1024); yield(); - for_each_online_pgdat(pgdat) { - zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones; - if (*zones) - try_to_free_pages(zones, 0, GFP_NOFS); + for_each_online_node(nid) { + (void)first_zones_zonelist(node_zonelist(nid, GFP_NOFS), + gfp_zone(GFP_NOFS), NULL, + &zone); + if (zone) + try_to_free_pages(node_zonelist(nid, GFP_NOFS), 0, + GFP_NOFS); } } @@ -1098,7 +1101,7 @@ grow_buffers(struct block_device *bdev, sector_t block, int size) printk(KERN_ERR "%s: requested out-of-range block %llu for " "device %s\n", - __FUNCTION__, (unsigned long long)block, + __func__, (unsigned long long)block, bdevname(bdev, b)); return -EIO; } @@ -2208,8 +2211,8 @@ out: return err; } -int cont_expand_zero(struct file *file, struct address_space *mapping, - loff_t pos, loff_t *bytes) +static int cont_expand_zero(struct file *file, struct address_space *mapping, + loff_t pos, loff_t *bytes) { struct inode *inode = mapping->host; unsigned blocksize = 1 << inode->i_blkbits; @@ -2243,6 +2246,8 @@ int cont_expand_zero(struct file *file, struct address_space *mapping, goto out; BUG_ON(err != len); err = 0; + + balance_dirty_pages_ratelimited(mapping); } /* page covers the boundary, find the boundary offset */ @@ -2323,23 +2328,6 @@ int block_commit_write(struct page *page, unsigned from, unsigned to) return 0; } -int generic_commit_write(struct file *file, struct page *page, - unsigned from, unsigned to) -{ - struct inode *inode = page->mapping->host; - loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; - __block_commit_write(inode,page,from,to); - /* - * No need to use i_size_read() here, the i_size - * cannot change under us because we hold i_mutex. - */ - if (pos > inode->i_size) { - i_size_write(inode, pos); - mark_inode_dirty(inode); - } - return 0; -} - /* * block_page_mkwrite() is not allowed to change the file size as it gets * called from a page fault handler when a page is first dirtied. Hence we must @@ -3180,8 +3168,7 @@ static void recalc_bh_state(void) struct buffer_head *alloc_buffer_head(gfp_t gfp_flags) { - struct buffer_head *ret = kmem_cache_alloc(bh_cachep, - set_migrateflags(gfp_flags, __GFP_RECLAIMABLE)); + struct buffer_head *ret = kmem_cache_alloc(bh_cachep, gfp_flags); if (ret) { INIT_LIST_HEAD(&ret->b_assoc_buffers); get_cpu_var(bh_accounting).nr++; @@ -3311,7 +3298,6 @@ EXPORT_SYMBOL(end_buffer_write_sync); EXPORT_SYMBOL(file_fsync); EXPORT_SYMBOL(fsync_bdev); EXPORT_SYMBOL(generic_block_bmap); -EXPORT_SYMBOL(generic_commit_write); EXPORT_SYMBOL(generic_cont_expand_simple); EXPORT_SYMBOL(init_buffer); EXPORT_SYMBOL(invalidate_bdev); diff --git a/fs/char_dev.c b/fs/char_dev.c index 038674aa88a..68e510b8845 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -55,7 +55,6 @@ static struct char_device_struct { unsigned int baseminor; int minorct; char name[64]; - struct file_operations *fops; struct cdev *cdev; /* will die */ } *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index dbd91461853..05c9da6181c 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -8,7 +8,8 @@ of second share to disconnected server session (autoreconnect on this). Add ability to modify cifs acls for handling chmod (when mounted with cifsacl flag). Fix prefixpath path separator so we can handle mounts with prefixpaths longer than one directory (one path component) when -mounted to Windows servers. +mounted to Windows servers. Fix slow file open when cifsacl +enabled. Version 1.51 ------------ diff --git a/fs/cifs/README b/fs/cifs/README index 50306229b0f..621aa1a8597 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -3,7 +3,14 @@ features such as hierarchical dfs like namespace, hardlinks, locking and more. It was designed to comply with the SNIA CIFS Technical Reference (which supersedes the 1992 X/Open SMB Standard) as well as to perform best practice practical interoperability with Windows 2000, Windows XP, Samba and equivalent -servers. +servers. This code was developed in participation with the Protocol Freedom +Information Foundation. + +Please see + http://protocolfreedom.org/ and + http://samba.org/samba/PFIF/ +for more details. + For questions or bug reports please contact: sfrench@samba.org (sfrench@us.ibm.com) diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 0228ed06069..cc950f69e51 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -468,7 +468,7 @@ cifs_proc_init(void) { struct proc_dir_entry *pde; - proc_fs_cifs = proc_mkdir("cifs", proc_root_fs); + proc_fs_cifs = proc_mkdir("fs/cifs", NULL); if (proc_fs_cifs == NULL) return; @@ -559,7 +559,7 @@ cifs_proc_clean(void) remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); remove_proc_entry("Experimental", proc_fs_cifs); remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); - remove_proc_entry("cifs", proc_root_fs); + remove_proc_entry("fs/cifs", NULL); } static int diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 56c924033b7..95024c066d8 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -23,16 +23,28 @@ #include "dns_resolve.h" #include "cifs_debug.h" -LIST_HEAD(cifs_dfs_automount_list); +static LIST_HEAD(cifs_dfs_automount_list); -/* - * DFS functions -*/ +static void cifs_dfs_expire_automounts(struct work_struct *work); +static DECLARE_DELAYED_WORK(cifs_dfs_automount_task, + cifs_dfs_expire_automounts); +static int cifs_dfs_mountpoint_expiry_timeout = 500 * HZ; + +static void cifs_dfs_expire_automounts(struct work_struct *work) +{ + struct list_head *list = &cifs_dfs_automount_list; + + mark_mounts_for_expiry(list); + if (!list_empty(list)) + schedule_delayed_work(&cifs_dfs_automount_task, + cifs_dfs_mountpoint_expiry_timeout); +} -void dfs_shrink_umount_helper(struct vfsmount *vfsmnt) +void cifs_dfs_release_automount_timer(void) { - mark_mounts_for_expiry(&cifs_dfs_automount_list); - mark_mounts_for_expiry(&cifs_dfs_automount_list); + BUG_ON(!list_empty(&cifs_dfs_automount_list)); + cancel_delayed_work(&cifs_dfs_automount_task); + flush_scheduled_work(); } /** @@ -261,10 +273,11 @@ static int add_mount_helper(struct vfsmount *newmnt, struct nameidata *nd, err = do_add_mount(newmnt, nd, nd->path.mnt->mnt_flags, mntlist); switch (err) { case 0: - dput(nd->path.dentry); - mntput(nd->path.mnt); + path_put(&nd->path); nd->path.mnt = newmnt; nd->path.dentry = dget(newmnt->mnt_root); + schedule_delayed_work(&cifs_dfs_automount_task, + cifs_dfs_mountpoint_expiry_timeout); break; case -EBUSY: /* someone else made a mount here whilst we were busy */ diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 1cb5b0a9f2a..e99d4faf5f0 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -516,7 +516,7 @@ static int parse_sec_desc(struct cifs_ntsd *pntsd, int acl_len, /* Convert permission bits from mode to equivalent CIFS ACL */ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, - int acl_len, struct inode *inode, __u64 nmode) + struct inode *inode, __u64 nmode) { int rc = 0; __u32 dacloffset; @@ -692,14 +692,14 @@ void acl_to_uid_mode(struct inode *inode, const char *path, const __u16 *pfid) int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) { int rc = 0; - __u32 acllen = 0; + __u32 secdesclen = 0; struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ cFYI(DBG2, ("set ACL from mode for %s", path)); /* Get the security descriptor */ - pntsd = get_cifs_acl(&acllen, inode, path, NULL); + pntsd = get_cifs_acl(&secdesclen, inode, path, NULL); /* Add three ACEs for owner, group, everyone getting rid of other ACEs as chmod disables ACEs and set the security descriptor */ @@ -709,20 +709,22 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode) set security descriptor request security descriptor parameters, and secuirty descriptor itself */ - pnntsd = kmalloc(acllen, GFP_KERNEL); + secdesclen = secdesclen < DEFSECDESCLEN ? + DEFSECDESCLEN : secdesclen; + pnntsd = kmalloc(secdesclen, GFP_KERNEL); if (!pnntsd) { cERROR(1, ("Unable to allocate security descriptor")); kfree(pntsd); return (-ENOMEM); } - rc = build_sec_desc(pntsd, pnntsd, acllen, inode, nmode); + rc = build_sec_desc(pntsd, pnntsd, inode, nmode); cFYI(DBG2, ("build_sec_desc rc: %d", rc)); if (!rc) { /* Set the security descriptor */ - rc = set_cifs_acl(pnntsd, acllen, inode, path); + rc = set_cifs_acl(pnntsd, secdesclen, inode, path); cFYI(DBG2, ("set_cifs_acl rc: %d", rc)); } diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index 93a7c3462ea..6c8096cf515 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -27,6 +27,7 @@ #define NUM_SUBAUTHS 5 /* number of sub authority fields */ #define NUM_WK_SIDS 7 /* number of well known sids */ #define SIDNAMELENGTH 20 /* long enough for the ones we care about */ +#define DEFSECDESCLEN 192 /* sec desc len contaiting a dacl with three aces */ #define READ_BIT 0x4 #define WRITE_BIT 0x2 diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index a04b17e5a9d..39c2cbdface 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -466,16 +466,11 @@ static struct quotactl_ops cifs_quotactl_ops = { }; #endif -static void cifs_umount_begin(struct vfsmount *vfsmnt, int flags) +static void cifs_umount_begin(struct super_block *sb) { - struct cifs_sb_info *cifs_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct cifsTconInfo *tcon; - dfs_shrink_umount_helper(vfsmnt); - - if (!(flags & MNT_FORCE)) - return; - cifs_sb = CIFS_SB(vfsmnt->mnt_sb); if (cifs_sb == NULL) return; @@ -1100,6 +1095,7 @@ exit_cifs(void) cFYI(DBG2, ("exit_cifs")); cifs_proc_clean(); #ifdef CONFIG_CIFS_DFS_UPCALL + cifs_dfs_release_automount_timer(); unregister_key_type(&key_type_dns_resolver); #endif #ifdef CONFIG_CIFS_UPCALL diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 68978306c3c..e1dd9f32e1d 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -62,11 +62,9 @@ extern int cifs_setattr(struct dentry *, struct iattr *); extern const struct inode_operations cifs_file_inode_ops; extern const struct inode_operations cifs_symlink_inode_ops; -extern struct list_head cifs_dfs_automount_list; extern struct inode_operations cifs_dfs_referral_inode_operations; - /* Functions related to files and directories */ extern const struct file_operations cifs_file_ops; extern const struct file_operations cifs_file_direct_ops; /* if directio mnt */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 47f79504f57..9f49c2f3582 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -1,7 +1,7 @@ /* * fs/cifs/cifspdu.h * - * Copyright (c) International Business Machines Corp., 2002,2007 + * Copyright (c) International Business Machines Corp., 2002,2008 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -163,7 +163,10 @@ path names in response */ #define SMBFLG2_KNOWS_EAS cpu_to_le16(2) #define SMBFLG2_SECURITY_SIGNATURE cpu_to_le16(4) +#define SMBFLG2_COMPRESSED (8) +#define SMBFLG2_SECURITY_SIGNATURE_REQUIRED (0x10) #define SMBFLG2_IS_LONG_NAME cpu_to_le16(0x40) +#define SMBFLG2_REPARSE_PATH (0x400) #define SMBFLG2_EXT_SEC cpu_to_le16(0x800) #define SMBFLG2_DFS cpu_to_le16(0x1000) #define SMBFLG2_PAGING_IO cpu_to_le16(0x2000) @@ -305,7 +308,7 @@ #define FILE_SHARE_DELETE 0x00000004 #define FILE_SHARE_ALL 0x00000007 -/* CreateDisposition flags */ +/* CreateDisposition flags, similar to CreateAction as well */ #define FILE_SUPERSEDE 0x00000000 #define FILE_OPEN 0x00000001 #define FILE_CREATE 0x00000002 @@ -317,15 +320,25 @@ #define CREATE_NOT_FILE 0x00000001 /* if set must not be file */ #define CREATE_WRITE_THROUGH 0x00000002 #define CREATE_SEQUENTIAL 0x00000004 -#define CREATE_SYNC_ALERT 0x00000010 -#define CREATE_ASYNC_ALERT 0x00000020 +#define CREATE_NO_BUFFER 0x00000008 /* should not buffer on srv */ +#define CREATE_SYNC_ALERT 0x00000010 /* MBZ */ +#define CREATE_ASYNC_ALERT 0x00000020 /* MBZ */ #define CREATE_NOT_DIR 0x00000040 /* if set must not be directory */ +#define CREATE_TREE_CONNECTION 0x00000080 /* should be zero */ +#define CREATE_COMPLETE_IF_OPLK 0x00000100 /* should be zero */ #define CREATE_NO_EA_KNOWLEDGE 0x00000200 -#define CREATE_EIGHT_DOT_THREE 0x00000400 +#define CREATE_EIGHT_DOT_THREE 0x00000400 /* doc says this is obsolete + open for recovery flag - should + be zero */ #define CREATE_RANDOM_ACCESS 0x00000800 #define CREATE_DELETE_ON_CLOSE 0x00001000 #define CREATE_OPEN_BY_ID 0x00002000 +#define CREATE_OPEN_BACKUP_INTN 0x00004000 +#define CREATE_NO_COMPRESSION 0x00008000 +#define CREATE_RESERVE_OPFILTER 0x00100000 /* should be zero */ #define OPEN_REPARSE_POINT 0x00200000 +#define OPEN_NO_RECALL 0x00400000 +#define OPEN_FREE_SPACE_QUERY 0x00800000 /* should be zero */ #define CREATE_OPTIONS_MASK 0x007FFFFF #define CREATE_OPTION_SPECIAL 0x20000000 /* system. NB not sent over wire */ @@ -470,7 +483,7 @@ typedef struct lanman_neg_rsp { typedef struct negotiate_rsp { struct smb_hdr hdr; /* wct = 17 */ - __le16 DialectIndex; + __le16 DialectIndex; /* 0xFFFF = no dialect acceptable */ __u8 SecurityMode; __le16 MaxMpxCount; __le16 MaxNumberVcs; @@ -516,10 +529,11 @@ typedef struct negotiate_rsp { #define CAP_INFOLEVEL_PASSTHRU 0x00002000 #define CAP_LARGE_READ_X 0x00004000 #define CAP_LARGE_WRITE_X 0x00008000 +#define CAP_LWIO 0x00010000 /* support fctl_srv_req_resume_key */ #define CAP_UNIX 0x00800000 -#define CAP_RESERVED 0x02000000 -#define CAP_BULK_TRANSFER 0x20000000 -#define CAP_COMPRESSED_DATA 0x40000000 +#define CAP_COMPRESSED_DATA 0x02000000 +#define CAP_DYNAMIC_REAUTH 0x20000000 +#define CAP_PERSISTENT_HANDLES 0x40000000 #define CAP_EXTENDED_SECURITY 0x80000000 typedef union smb_com_session_setup_andx { @@ -668,9 +682,7 @@ typedef struct smb_com_tconx_req { } __attribute__((packed)) TCONX_REQ; typedef struct smb_com_tconx_rsp { - struct smb_hdr hdr; /* wct = 3 note that Win2000 has sent wct = 7 - in some cases on responses. Four unspecified - words followed OptionalSupport */ + struct smb_hdr hdr; /* wct = 3 , not extended response */ __u8 AndXCommand; __u8 AndXReserved; __le16 AndXOffset; @@ -680,13 +692,48 @@ typedef struct smb_com_tconx_rsp { /* STRING NativeFileSystem */ } __attribute__((packed)) TCONX_RSP; +typedef struct smb_com_tconx_rsp_ext { + struct smb_hdr hdr; /* wct = 7, extended response */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __le16 OptionalSupport; /* see below */ + __le32 MaximalShareAccessRights; + __le32 GuestMaximalShareAccessRights; + __u16 ByteCount; + unsigned char Service[1]; /* always ASCII, not Unicode */ + /* STRING NativeFileSystem */ +} __attribute__((packed)) TCONX_RSP_EXT; + + /* tree connect Flags */ #define DISCONNECT_TID 0x0001 +#define TCON_EXTENDED_SIGNATURES 0x0004 #define TCON_EXTENDED_SECINFO 0x0008 + /* OptionalSupport bits */ #define SMB_SUPPORT_SEARCH_BITS 0x0001 /* "must have" directory search bits (exclusive searches supported) */ #define SMB_SHARE_IS_IN_DFS 0x0002 +#define SMB_CSC_MASK 0x000C +/* CSC flags defined as follows */ +#define SMB_CSC_CACHE_MANUAL_REINT 0x0000 +#define SMB_CSC_CACHE_AUTO_REINT 0x0004 +#define SMB_CSC_CACHE_VDO 0x0008 +#define SMB_CSC_NO_CACHING 0x000C + +#define SMB_UNIQUE_FILE_NAME 0x0010 +#define SMB_EXTENDED_SIGNATURES 0x0020 + +/* services + * + * A: ie disk + * LPT1: ie printer + * IPC ie named pipe + * COMM + * ????? ie any type + * + */ typedef struct smb_com_logoff_andx_req { struct smb_hdr hdr; /* wct = 2 */ @@ -750,6 +797,17 @@ typedef struct smb_com_findclose_req { #define COMM_DEV_TYPE 0x0004 #define UNKNOWN_TYPE 0xFFFF +/* Device Type or File Status Flags */ +#define NO_EAS 0x0001 +#define NO_SUBSTREAMS 0x0002 +#define NO_REPARSETAG 0x0004 +/* following flags can apply if pipe */ +#define ICOUNT_MASK 0x00FF +#define PIPE_READ_MODE 0x0100 +#define NAMED_PIPE_TYPE 0x0400 +#define PIPE_END_POINT 0x0800 +#define BLOCKING_NAMED_PIPE 0x8000 + typedef struct smb_com_open_req { /* also handles create */ struct smb_hdr hdr; /* wct = 24 */ __u8 AndXCommand; @@ -758,7 +816,7 @@ typedef struct smb_com_open_req { /* also handles create */ __u8 Reserved; /* Must Be Zero */ __le16 NameLength; __le32 OpenFlags; - __le32 RootDirectoryFid; + __u32 RootDirectoryFid; __le32 DesiredAccess; __le64 AllocationSize; __le32 FileAttributes; @@ -801,6 +859,32 @@ typedef struct smb_com_open_rsp { __u16 ByteCount; /* bct = 0 */ } __attribute__((packed)) OPEN_RSP; +typedef struct smb_com_open_rsp_ext { + struct smb_hdr hdr; /* wct = 42 but meaningless due to MS bug? */ + __u8 AndXCommand; + __u8 AndXReserved; + __le16 AndXOffset; + __u8 OplockLevel; + __u16 Fid; + __le32 CreateAction; + __le64 CreationTime; + __le64 LastAccessTime; + __le64 LastWriteTime; + __le64 ChangeTime; + __le32 FileAttributes; + __le64 AllocationSize; + __le64 EndOfFile; + __le16 FileType; + __le16 DeviceState; + __u8 DirectoryFlag; + __u8 VolumeGUID[16]; + __u64 FileId; /* note no endian conversion - is opaque UniqueID */ + __le32 MaximalAccessRights; + __le32 GuestMaximalAccessRights; + __u16 ByteCount; /* bct = 0 */ +} __attribute__((packed)) OPEN_RSP_EXT; + + /* format of legacy open request */ typedef struct smb_com_openx_req { struct smb_hdr hdr; /* wct = 15 */ @@ -1703,6 +1787,12 @@ typedef struct smb_com_transaction2_fnext_rsp_parms { #define SMB_QUERY_CIFS_UNIX_INFO 0x200 #define SMB_QUERY_POSIX_FS_INFO 0x201 #define SMB_QUERY_POSIX_WHO_AM_I 0x202 +#define SMB_REQUEST_TRANSPORT_ENCRYPTION 0x203 +#define SMB_QUERY_FS_PROXY 0x204 /* WAFS enabled. Returns structure + FILE_SYSTEM__UNIX_INFO to tell + whether new NTIOCTL available + (0xACE) for WAN friendly SMB + operations to be carried */ #define SMB_QUERY_LABEL_INFO 0x3ea #define SMB_QUERY_FS_QUOTA_INFO 0x3ee #define SMB_QUERY_FS_FULL_SIZE_INFO 0x3ef @@ -1959,7 +2049,10 @@ typedef struct { #define CIFS_UNIX_LARGE_READ_CAP 0x00000040 /* support reads >128K (up to 0xFFFF00 */ #define CIFS_UNIX_LARGE_WRITE_CAP 0x00000080 - +#define CIFS_UNIX_TRANSPORT_ENCRYPTION_CAP 0x00000100 /* can do SPNEGO crypt */ +#define CIFS_UNIX_TRANPSORT_ENCRYPTION_MANDATORY_CAP 0x00000200 /* must do */ +#define CIFS_UNIX_PROXY_CAP 0x00000400 /* Proxy cap: 0xACE ioctl and + QFS PROXY call */ #ifdef CONFIG_CIFS_POSIX /* Can not set pathnames cap yet until we send new posix create SMB since otherwise server can treat such handles opened with older ntcreatex diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 7e5e0e78cd7..50f9fdae19b 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -84,6 +84,7 @@ extern __u16 GetNextMid(struct TCP_Server_Info *server); extern struct oplock_q_entry *AllocOplockQEntry(struct inode *, u16, struct cifsTconInfo *); extern void DeleteOplockQEntry(struct oplock_q_entry *); +extern void DeleteTconOplockQEntries(struct cifsTconInfo *); extern struct timespec cifs_NTtimeToUnix(u64 utc_nanoseconds_since_1601); extern u64 cifs_UnixTimeToNT(struct timespec); extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time); @@ -103,13 +104,7 @@ extern int mode_to_acl(struct inode *inode, const char *path, __u64); extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *, const char *); extern int cifs_umount(struct super_block *, struct cifs_sb_info *); -#ifdef CONFIG_CIFS_DFS_UPCALL -extern void dfs_shrink_umount_helper(struct vfsmount *vfsmnt); -#else -static inline void dfs_shrink_umount_helper(struct vfsmount *vfsmnt) -{ -} -#endif /* DFS_UPCALL */ +extern void cifs_dfs_release_automount_timer(void); void cifs_proc_init(void); void cifs_proc_clean(void); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 30bbe448e26..4728fa982a4 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -165,17 +165,19 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, rc = CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nls_codepage); up(&tcon->ses->sesSem); - /* tell server which Unix caps we support */ - if (tcon->ses->capabilities & CAP_UNIX) - reset_cifs_unix_caps(0 /* no xid */, - tcon, - NULL /* we do not know sb */, - NULL /* no vol info */); /* BB FIXME add code to check if wsize needs update due to negotiated smb buffer size shrinking */ - if (rc == 0) + if (rc == 0) { atomic_inc(&tconInfoReconnectCount); + /* tell server Unix caps we support */ + if (tcon->ses->capabilities & CAP_UNIX) + reset_cifs_unix_caps( + 0 /* no xid */, + tcon, + NULL /* we do not know sb */, + NULL /* no vol info */); + } cFYI(1, ("reconnect tcon rc = %d", rc)); /* Removed call to reopen open files here. @@ -310,17 +312,19 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, rc = CIFSTCon(0, tcon->ses, tcon->treeName, tcon, nls_codepage); up(&tcon->ses->sesSem); - /* tell server which Unix caps we support */ - if (tcon->ses->capabilities & CAP_UNIX) - reset_cifs_unix_caps(0 /* no xid */, - tcon, - NULL /* do not know sb */, - NULL /* no vol info */); /* BB FIXME add code to check if wsize needs update due to negotiated smb buffer size shrinking */ - if (rc == 0) + if (rc == 0) { atomic_inc(&tconInfoReconnectCount); + /* tell server Unix caps we support */ + if (tcon->ses->capabilities & CAP_UNIX) + reset_cifs_unix_caps( + 0 /* no xid */, + tcon, + NULL /* do not know sb */, + NULL /* no vol info */); + } cFYI(1, ("reconnect tcon rc = %d", rc)); /* Removed call to reopen open files here. diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8dbfa97cd18..e1710673016 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -3527,6 +3527,7 @@ cifs_umount(struct super_block *sb, struct cifs_sb_info *cifs_sb) FreeXid(xid); return 0; } + DeleteTconOplockQEntries(cifs_sb->tcon); tconInfoFree(cifs_sb->tcon); if ((ses) && (ses->server)) { /* save off task so we do not refer to ses later */ diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index bc673c8c1e6..e1031b9e2c5 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -161,12 +161,14 @@ static void cifs_unix_info_to_inode(struct inode *inode, spin_unlock(&inode->i_lock); } -static const unsigned char *cifs_get_search_path(struct cifsTconInfo *pTcon, - const char *search_path) +static const unsigned char *cifs_get_search_path(struct cifs_sb_info *cifs_sb, + const char *search_path) { int tree_len; int path_len; + int i; char *tmp_path; + struct cifsTconInfo *pTcon = cifs_sb->tcon; if (!(pTcon->Flags & SMB_SHARE_IS_IN_DFS)) return search_path; @@ -180,6 +182,11 @@ static const unsigned char *cifs_get_search_path(struct cifsTconInfo *pTcon, return search_path; strncpy(tmp_path, pTcon->treeName, tree_len); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) + for (i = 0; i < tree_len; i++) { + if (tmp_path[i] == '\\') + tmp_path[i] = '/'; + } strncpy(tmp_path+tree_len, search_path, path_len); tmp_path[tree_len+path_len] = 0; return tmp_path; @@ -199,7 +206,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, pTcon = cifs_sb->tcon; cFYI(1, ("Getting info on %s", search_path)); - full_path = cifs_get_search_path(pTcon, search_path); + full_path = cifs_get_search_path(cifs_sb, search_path); try_again_CIFSSMBUnixQPathInfo: /* could have done a find first instead but this returns more info */ @@ -402,7 +409,7 @@ int cifs_get_inode_info(struct inode **pinode, return -ENOMEM; pfindData = (FILE_ALL_INFO *)buf; - full_path = cifs_get_search_path(pTcon, search_path); + full_path = cifs_get_search_path(cifs_sb, search_path); try_again_CIFSSMBQPathInfo: /* could do find first instead but this returns more info */ diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 3612d6c0a0b..000ac509c98 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -142,6 +142,24 @@ void DeleteOplockQEntry(struct oplock_q_entry *oplockEntry) kmem_cache_free(cifs_oplock_cachep, oplockEntry); } + +void DeleteTconOplockQEntries(struct cifsTconInfo *tcon) +{ + struct oplock_q_entry *temp; + + if (tcon == NULL) + return; + + spin_lock(&GlobalMid_Lock); + list_for_each_entry(temp, &GlobalOplock_Q, qhead) { + if ((temp->tcon) && (temp->tcon == tcon)) { + list_del(&temp->qhead); + kmem_cache_free(cifs_oplock_cachep, temp); + } + } + spin_unlock(&GlobalMid_Lock); +} + int smb_send(struct socket *ssocket, struct smb_hdr *smb_buffer, unsigned int smb_buf_length, struct sockaddr *sin) diff --git a/fs/coda/coda_linux.c b/fs/coda/coda_linux.c index 95a54253c04..e1c854890f9 100644 --- a/fs/coda/coda_linux.c +++ b/fs/coda/coda_linux.c @@ -134,7 +134,7 @@ void coda_iattr_to_vattr(struct iattr *iattr, struct coda_vattr *vattr) unsigned int valid; /* clean out */ - vattr->va_mode = (umode_t) -1; + vattr->va_mode = -1; vattr->va_uid = (vuid_t) -1; vattr->va_gid = (vgid_t) -1; vattr->va_size = (off_t) -1; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index f89ff083079..3d2580e00a3 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -345,7 +345,7 @@ static int coda_symlink(struct inode *dir_inode, struct dentry *de, } /* destruction routines: unlink, rmdir */ -int coda_unlink(struct inode *dir, struct dentry *de) +static int coda_unlink(struct inode *dir, struct dentry *de) { int error; const char *name = de->d_name.name; @@ -365,7 +365,7 @@ int coda_unlink(struct inode *dir, struct dentry *de) return 0; } -int coda_rmdir(struct inode *dir, struct dentry *de) +static int coda_rmdir(struct inode *dir, struct dentry *de) { const char *name = de->d_name.name; int len = de->d_name.len; @@ -424,7 +424,7 @@ static int coda_rename(struct inode *old_dir, struct dentry *old_dentry, /* file operations for directories */ -int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir) +static int coda_readdir(struct file *coda_file, void *buf, filldir_t filldir) { struct coda_file_info *cfi; struct file *host_file; diff --git a/fs/compat.c b/fs/compat.c index 2ce4456aad3..332a869d2c5 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -24,6 +24,7 @@ #include <linux/fcntl.h> #include <linux/namei.h> #include <linux/file.h> +#include <linux/fdtable.h> #include <linux/vfs.h> #include <linux/ioctl.h> #include <linux/init.h> @@ -1634,7 +1635,7 @@ sticky: return ret; } -#ifdef TIF_RESTORE_SIGMASK +#ifdef HAVE_SET_RESTORE_SIGMASK asmlinkage long compat_sys_pselect7(int n, compat_ulong_t __user *inp, compat_ulong_t __user *outp, compat_ulong_t __user *exp, struct compat_timespec __user *tsp, compat_sigset_t __user *sigmask, @@ -1720,7 +1721,7 @@ sticky: if (sigmask) { memcpy(¤t->saved_sigmask, &sigsaved, sizeof(sigsaved)); - set_thread_flag(TIF_RESTORE_SIGMASK); + set_restore_sigmask(); } } else if (sigmask) sigprocmask(SIG_SETMASK, &sigsaved, NULL); @@ -1791,7 +1792,7 @@ asmlinkage long compat_sys_ppoll(struct pollfd __user *ufds, if (sigmask) { memcpy(¤t->saved_sigmask, &sigsaved, sizeof(sigsaved)); - set_thread_flag(TIF_RESTORE_SIGMASK); + set_restore_sigmask(); } ret = -ERESTARTNOHAND; } else if (sigmask) @@ -1825,7 +1826,7 @@ sticky: return ret; } -#endif /* TIF_RESTORE_SIGMASK */ +#endif /* HAVE_SET_RESTORE_SIGMASK */ #if defined(CONFIG_NFSD) || defined(CONFIG_NFSD_MODULE) /* Stuff for NFS server syscalls... */ @@ -2080,7 +2081,7 @@ long asmlinkage compat_sys_nfsservctl(int cmd, void *notused, void *notused2) #ifdef CONFIG_EPOLL -#ifdef TIF_RESTORE_SIGMASK +#ifdef HAVE_SET_RESTORE_SIGMASK asmlinkage long compat_sys_epoll_pwait(int epfd, struct compat_epoll_event __user *events, int maxevents, int timeout, @@ -2117,14 +2118,14 @@ asmlinkage long compat_sys_epoll_pwait(int epfd, if (err == -EINTR) { memcpy(¤t->saved_sigmask, &sigsaved, sizeof(sigsaved)); - set_thread_flag(TIF_RESTORE_SIGMASK); + set_restore_sigmask(); } else sigprocmask(SIG_SETMASK, &sigsaved, NULL); } return err; } -#endif /* TIF_RESTORE_SIGMASK */ +#endif /* HAVE_SET_RESTORE_SIGMASK */ #endif /* CONFIG_EPOLL */ diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index c6e72aebd16..97dba0d9234 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1046,14 +1046,14 @@ static int vt_check(struct file *file) struct inode *inode = file->f_path.dentry->d_inode; struct vc_data *vc; - if (file->f_op->ioctl != tty_ioctl) + if (file->f_op->unlocked_ioctl != tty_ioctl) return -EINVAL; tty = (struct tty_struct *)file->private_data; if (tty_paranoia_check(tty, inode, "tty_ioctl")) return -EINVAL; - if (tty->driver->ioctl != vt_ioctl) + if (tty->ops->ioctl != vt_ioctl) return -EINVAL; vc = (struct vc_data *)tty->driver_data; diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 397cb503a18..2b6cb23dd14 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -115,7 +115,7 @@ configfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *pp goto out; } pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", - __FUNCTION__, count, *ppos, buffer->page); + __func__, count, *ppos, buffer->page); retval = simple_read_from_buffer(buf, count, ppos, buffer->page, buffer->count); out: diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 4c1ebff778e..b9a1d810346 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -47,7 +47,7 @@ static const struct address_space_operations configfs_aops = { static struct backing_dev_info configfs_backing_dev_info = { .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; static const struct inode_operations configfs_inode_operations ={ diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index de3b31d0a37..8421cea7d8c 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -92,7 +92,7 @@ static int configfs_fill_super(struct super_block *sb, void *data, int silent) root = d_alloc_root(inode); if (!root) { - pr_debug("%s: could not get root dentry!\n",__FUNCTION__); + pr_debug("%s: could not get root dentry!\n",__func__); iput(inode); return -ENOMEM; } diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 78929ea84ff..2a731ef5f30 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -210,13 +210,13 @@ static int configfs_get_target_path(struct config_item * item, struct config_ite if (size > PATH_MAX) return -ENAMETOOLONG; - pr_debug("%s: depth = %d, size = %d\n", __FUNCTION__, depth, size); + pr_debug("%s: depth = %d, size = %d\n", __func__, depth, size); for (s = path; depth--; s += 3) strcpy(s,"../"); fill_item_path(target, path, size); - pr_debug("%s: path = '%s'\n", __FUNCTION__, path); + pr_debug("%s: path = '%s'\n", __func__, path); return 0; } diff --git a/fs/dcache.c b/fs/dcache.c index 43455776711..3ee588d5f58 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1746,12 +1746,21 @@ shouldnt_be_hashed: goto shouldnt_be_hashed; } +static int prepend(char **buffer, int *buflen, const char *str, + int namelen) +{ + *buflen -= namelen; + if (*buflen < 0) + return -ENAMETOOLONG; + *buffer -= namelen; + memcpy(*buffer, str, namelen); + return 0; +} + /** * d_path - return the path of a dentry - * @dentry: dentry to report - * @vfsmnt: vfsmnt to which the dentry belongs - * @root: root dentry - * @rootmnt: vfsmnt to which the root dentry belongs + * @path: the dentry/vfsmount to report + * @root: root vfsmnt/dentry (may be modified by this function) * @buffer: buffer to return value in * @buflen: buffer length * @@ -1761,23 +1770,22 @@ shouldnt_be_hashed: * Returns the buffer or an error code if the path was too long. * * "buflen" should be positive. Caller holds the dcache_lock. + * + * If path is not reachable from the supplied root, then the value of + * root is changed (without modifying refcounts). */ -static char *__d_path(struct dentry *dentry, struct vfsmount *vfsmnt, - struct path *root, char *buffer, int buflen) +char *__d_path(const struct path *path, struct path *root, + char *buffer, int buflen) { + struct dentry *dentry = path->dentry; + struct vfsmount *vfsmnt = path->mnt; char * end = buffer+buflen; char * retval; - int namelen; - - *--end = '\0'; - buflen--; - if (!IS_ROOT(dentry) && d_unhashed(dentry)) { - buflen -= 10; - end -= 10; - if (buflen < 0) + + prepend(&end, &buflen, "\0", 1); + if (!IS_ROOT(dentry) && d_unhashed(dentry) && + (prepend(&end, &buflen, " (deleted)", 10) != 0)) goto Elong; - memcpy(end, " (deleted)", 10); - } if (buflen < 1) goto Elong; @@ -1804,13 +1812,10 @@ static char *__d_path(struct dentry *dentry, struct vfsmount *vfsmnt, } parent = dentry->d_parent; prefetch(parent); - namelen = dentry->d_name.len; - buflen -= namelen + 1; - if (buflen < 0) + if ((prepend(&end, &buflen, dentry->d_name.name, + dentry->d_name.len) != 0) || + (prepend(&end, &buflen, "/", 1) != 0)) goto Elong; - end -= namelen; - memcpy(end, dentry->d_name.name, namelen); - *--end = '/'; retval = end; dentry = parent; } @@ -1818,12 +1823,12 @@ static char *__d_path(struct dentry *dentry, struct vfsmount *vfsmnt, return retval; global_root: - namelen = dentry->d_name.len; - buflen -= namelen; - if (buflen < 0) + retval += 1; /* hit the slash */ + if (prepend(&retval, &buflen, dentry->d_name.name, + dentry->d_name.len) != 0) goto Elong; - retval -= namelen-1; /* hit the slash */ - memcpy(retval, dentry->d_name.name, namelen); + root->mnt = vfsmnt; + root->dentry = dentry; return retval; Elong: return ERR_PTR(-ENAMETOOLONG); @@ -1846,6 +1851,7 @@ char *d_path(struct path *path, char *buf, int buflen) { char *res; struct path root; + struct path tmp; /* * We have various synthetic filesystems that never get mounted. On @@ -1859,10 +1865,11 @@ char *d_path(struct path *path, char *buf, int buflen) read_lock(¤t->fs->lock); root = current->fs->root; - path_get(¤t->fs->root); + path_get(&root); read_unlock(¤t->fs->lock); spin_lock(&dcache_lock); - res = __d_path(path->dentry, path->mnt, &root, buf, buflen); + tmp = root; + res = __d_path(path, &tmp, buf, buflen); spin_unlock(&dcache_lock); path_put(&root); return res; @@ -1890,6 +1897,48 @@ char *dynamic_dname(struct dentry *dentry, char *buffer, int buflen, } /* + * Write full pathname from the root of the filesystem into the buffer. + */ +char *dentry_path(struct dentry *dentry, char *buf, int buflen) +{ + char *end = buf + buflen; + char *retval; + + spin_lock(&dcache_lock); + prepend(&end, &buflen, "\0", 1); + if (!IS_ROOT(dentry) && d_unhashed(dentry) && + (prepend(&end, &buflen, "//deleted", 9) != 0)) + goto Elong; + if (buflen < 1) + goto Elong; + /* Get '/' right */ + retval = end-1; + *retval = '/'; + + for (;;) { + struct dentry *parent; + if (IS_ROOT(dentry)) + break; + + parent = dentry->d_parent; + prefetch(parent); + + if ((prepend(&end, &buflen, dentry->d_name.name, + dentry->d_name.len) != 0) || + (prepend(&end, &buflen, "/", 1) != 0)) + goto Elong; + + retval = end; + dentry = parent; + } + spin_unlock(&dcache_lock); + return retval; +Elong: + spin_unlock(&dcache_lock); + return ERR_PTR(-ENAMETOOLONG); +} + +/* * NOTE! The user-level library version returns a * character pointer. The kernel system call just * returns the length of the buffer filled (which @@ -1918,9 +1967,9 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size) read_lock(¤t->fs->lock); pwd = current->fs->pwd; - path_get(¤t->fs->pwd); + path_get(&pwd); root = current->fs->root; - path_get(¤t->fs->root); + path_get(&root); read_unlock(¤t->fs->lock); error = -ENOENT; @@ -1928,9 +1977,10 @@ asmlinkage long sys_getcwd(char __user *buf, unsigned long size) spin_lock(&dcache_lock); if (pwd.dentry->d_parent == pwd.dentry || !d_unhashed(pwd.dentry)) { unsigned long len; + struct path tmp = root; char * cwd; - cwd = __d_path(pwd.dentry, pwd.mnt, &root, page, PAGE_SIZE); + cwd = __d_path(&pwd, &tmp, page, PAGE_SIZE); spin_unlock(&dcache_lock); error = PTR_ERR(cwd); diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index fddffe4851f..159a5efd6a8 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -9,7 +9,7 @@ * 2 as published by the Free Software Foundation. * * debugfs is for people to use instead of /proc or /sys. - * See Documentation/DocBook/kernel-api for more details. + * See Documentation/DocBook/filesystems for more details. * */ diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index f120e120787..285b64a8b06 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -17,6 +17,8 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/tty.h> +#include <linux/mutex.h> +#include <linux/idr.h> #include <linux/devpts_fs.h> #include <linux/parser.h> #include <linux/fsnotify.h> @@ -26,6 +28,10 @@ #define DEVPTS_DEFAULT_MODE 0600 +extern int pty_limit; /* Config limit on Unix98 ptys */ +static DEFINE_IDR(allocated_ptys); +static DEFINE_MUTEX(allocated_ptys_lock); + static struct vfsmount *devpts_mnt; static struct dentry *devpts_root; @@ -171,9 +177,44 @@ static struct dentry *get_node(int num) return lookup_one_len(s, root, sprintf(s, "%d", num)); } +int devpts_new_index(void) +{ + int index; + int idr_ret; + +retry: + if (!idr_pre_get(&allocated_ptys, GFP_KERNEL)) { + return -ENOMEM; + } + + mutex_lock(&allocated_ptys_lock); + idr_ret = idr_get_new(&allocated_ptys, NULL, &index); + if (idr_ret < 0) { + mutex_unlock(&allocated_ptys_lock); + if (idr_ret == -EAGAIN) + goto retry; + return -EIO; + } + + if (index >= pty_limit) { + idr_remove(&allocated_ptys, index); + mutex_unlock(&allocated_ptys_lock); + return -EIO; + } + mutex_unlock(&allocated_ptys_lock); + return index; +} + +void devpts_kill_index(int idx) +{ + mutex_lock(&allocated_ptys_lock); + idr_remove(&allocated_ptys, idx); + mutex_unlock(&allocated_ptys_lock); +} + int devpts_pty_new(struct tty_struct *tty) { - int number = tty->index; + int number = tty->index; /* tty layer puts index from devpts_new_index() in here */ struct tty_driver *driver = tty->driver; dev_t device = MKDEV(driver->major, driver->minor_start+number); struct dentry *dentry; diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index b64e55e0515..499e16759e9 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -200,7 +200,7 @@ int __init dlm_lockspace_init(void) dlm_kset = kset_create_and_add("dlm", NULL, kernel_kobj); if (!dlm_kset) { - printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__); + printk(KERN_WARNING "%s: can not create kset\n", __func__); return -ENOMEM; } return 0; diff --git a/fs/dnotify.c b/fs/dnotify.c index 28d01ed66de..676073b8dda 100644 --- a/fs/dnotify.c +++ b/fs/dnotify.c @@ -20,6 +20,7 @@ #include <linux/init.h> #include <linux/spinlock.h> #include <linux/slab.h> +#include <linux/fdtable.h> int dir_notify_enable __read_mostly = 1; @@ -66,6 +67,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) struct dnotify_struct **prev; struct inode *inode; fl_owner_t id = current->files; + struct file *f; int error = 0; if ((arg & ~DN_MULTISHOT) == 0) { @@ -92,6 +94,15 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) prev = &odn->dn_next; } + rcu_read_lock(); + f = fcheck(fd); + rcu_read_unlock(); + /* we'd lost the race with close(), sod off silently */ + /* note that inode->i_lock prevents reordering problems + * between accesses to descriptor table and ->i_dnotify */ + if (f != filp) + goto out_free; + error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); if (error) goto out_free; diff --git a/fs/dquot.c b/fs/dquot.c index 41b9dbd68b0..dfba1623ccc 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -289,7 +289,15 @@ static void wait_on_dquot(struct dquot *dquot) mutex_unlock(&dquot->dq_lock); } -#define mark_dquot_dirty(dquot) ((dquot)->dq_sb->dq_op->mark_dirty(dquot)) +static inline int dquot_dirty(struct dquot *dquot) +{ + return test_bit(DQ_MOD_B, &dquot->dq_flags); +} + +static inline int mark_dquot_dirty(struct dquot *dquot) +{ + return dquot->dq_sb->dq_op->mark_dirty(dquot); +} int dquot_mark_dquot_dirty(struct dquot *dquot) { @@ -1441,31 +1449,43 @@ static inline void set_enable_flags(struct quota_info *dqopt, int type) switch (type) { case USRQUOTA: dqopt->flags |= DQUOT_USR_ENABLED; + dqopt->flags &= ~DQUOT_USR_SUSPENDED; break; case GRPQUOTA: dqopt->flags |= DQUOT_GRP_ENABLED; + dqopt->flags &= ~DQUOT_GRP_SUSPENDED; break; } } -static inline void reset_enable_flags(struct quota_info *dqopt, int type) +static inline void reset_enable_flags(struct quota_info *dqopt, int type, + int remount) { switch (type) { case USRQUOTA: dqopt->flags &= ~DQUOT_USR_ENABLED; + if (remount) + dqopt->flags |= DQUOT_USR_SUSPENDED; + else + dqopt->flags &= ~DQUOT_USR_SUSPENDED; break; case GRPQUOTA: dqopt->flags &= ~DQUOT_GRP_ENABLED; + if (remount) + dqopt->flags |= DQUOT_GRP_SUSPENDED; + else + dqopt->flags &= ~DQUOT_GRP_SUSPENDED; break; } } + /* * Turn quota off on a device. type == -1 ==> quotaoff for all types (umount) */ -int vfs_quota_off(struct super_block *sb, int type) +int vfs_quota_off(struct super_block *sb, int type, int remount) { - int cnt; + int cnt, ret = 0; struct quota_info *dqopt = sb_dqopt(sb); struct inode *toputinode[MAXQUOTAS]; @@ -1475,9 +1495,17 @@ int vfs_quota_off(struct super_block *sb, int type) toputinode[cnt] = NULL; if (type != -1 && cnt != type) continue; + /* If we keep inodes of quota files after remount and quotaoff + * is called, drop kept inodes. */ + if (!remount && sb_has_quota_suspended(sb, cnt)) { + iput(dqopt->files[cnt]); + dqopt->files[cnt] = NULL; + reset_enable_flags(dqopt, cnt, 0); + continue; + } if (!sb_has_quota_enabled(sb, cnt)) continue; - reset_enable_flags(dqopt, cnt); + reset_enable_flags(dqopt, cnt, remount); /* Note: these are blocking operations */ drop_dquot_ref(sb, cnt); @@ -1493,7 +1521,8 @@ int vfs_quota_off(struct super_block *sb, int type) put_quota_format(dqopt->info[cnt].dqi_format); toputinode[cnt] = dqopt->files[cnt]; - dqopt->files[cnt] = NULL; + if (!remount) + dqopt->files[cnt] = NULL; dqopt->info[cnt].dqi_flags = 0; dqopt->info[cnt].dqi_igrace = 0; dqopt->info[cnt].dqi_bgrace = 0; @@ -1523,12 +1552,19 @@ int vfs_quota_off(struct super_block *sb, int type) mutex_unlock(&toputinode[cnt]->i_mutex); mark_inode_dirty(toputinode[cnt]); } - iput(toputinode[cnt]); mutex_unlock(&dqopt->dqonoff_mutex); + /* On remount RO, we keep the inode pointer so that we + * can reenable quota on the subsequent remount RW. + * But we have better not keep inode pointer when there + * is pending delete on the quota file... */ + if (!remount) + iput(toputinode[cnt]); + else if (!toputinode[cnt]->i_nlink) + ret = -EBUSY; } if (sb->s_bdev) invalidate_bdev(sb->s_bdev); - return 0; + return ret; } /* @@ -1566,7 +1602,8 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) invalidate_bdev(sb->s_bdev); mutex_lock(&inode->i_mutex); mutex_lock(&dqopt->dqonoff_mutex); - if (sb_has_quota_enabled(sb, type)) { + if (sb_has_quota_enabled(sb, type) || + sb_has_quota_suspended(sb, type)) { error = -EBUSY; goto out_lock; } @@ -1589,6 +1626,7 @@ static int vfs_quota_on_inode(struct inode *inode, int type, int format_id) dqopt->ops[type] = fmt->qf_ops; dqopt->info[type].dqi_format = fmt; + dqopt->info[type].dqi_fmt_id = format_id; INIT_LIST_HEAD(&dqopt->info[type].dqi_dirty_list); mutex_lock(&dqopt->dqio_mutex); if ((error = dqopt->ops[type]->read_file_info(sb, type)) < 0) { @@ -1624,12 +1662,41 @@ out_fmt: return error; } +/* Reenable quotas on remount RW */ +static int vfs_quota_on_remount(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + struct inode *inode; + int ret; + + mutex_lock(&dqopt->dqonoff_mutex); + if (!sb_has_quota_suspended(sb, type)) { + mutex_unlock(&dqopt->dqonoff_mutex); + return 0; + } + BUG_ON(sb_has_quota_enabled(sb, type)); + + inode = dqopt->files[type]; + dqopt->files[type] = NULL; + reset_enable_flags(dqopt, type, 0); + mutex_unlock(&dqopt->dqonoff_mutex); + + ret = vfs_quota_on_inode(inode, type, dqopt->info[type].dqi_fmt_id); + iput(inode); + + return ret; +} + /* Actual function called from quotactl() */ -int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path) +int vfs_quota_on(struct super_block *sb, int type, int format_id, char *path, + int remount) { struct nameidata nd; int error; + if (remount) + return vfs_quota_on_remount(sb, type); + error = path_lookup(path, LOOKUP_FOLLOW, &nd); if (error < 0) return error; @@ -1709,10 +1776,19 @@ int vfs_get_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *d } /* Generic routine for setting common part of quota structure */ -static void do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) +static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) { struct mem_dqblk *dm = &dquot->dq_dqb; int check_blim = 0, check_ilim = 0; + struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; + + if ((di->dqb_valid & QIF_BLIMITS && + (di->dqb_bhardlimit > dqi->dqi_maxblimit || + di->dqb_bsoftlimit > dqi->dqi_maxblimit)) || + (di->dqb_valid & QIF_ILIMITS && + (di->dqb_ihardlimit > dqi->dqi_maxilimit || + di->dqb_isoftlimit > dqi->dqi_maxilimit))) + return -ERANGE; spin_lock(&dq_data_lock); if (di->dqb_valid & QIF_SPACE) { @@ -1744,7 +1820,7 @@ static void do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) clear_bit(DQ_BLKS_B, &dquot->dq_flags); } else if (!(di->dqb_valid & QIF_BTIME)) /* Set grace only if user hasn't provided his own... */ - dm->dqb_btime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_bgrace; + dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; } if (check_ilim) { if (!dm->dqb_isoftlimit || dm->dqb_curinodes < dm->dqb_isoftlimit) { @@ -1752,7 +1828,7 @@ static void do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) clear_bit(DQ_INODES_B, &dquot->dq_flags); } else if (!(di->dqb_valid & QIF_ITIME)) /* Set grace only if user hasn't provided his own... */ - dm->dqb_itime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; + dm->dqb_itime = get_seconds() + dqi->dqi_igrace; } if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || dm->dqb_isoftlimit) clear_bit(DQ_FAKE_B, &dquot->dq_flags); @@ -1760,21 +1836,24 @@ static void do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) set_bit(DQ_FAKE_B, &dquot->dq_flags); spin_unlock(&dq_data_lock); mark_dquot_dirty(dquot); + + return 0; } int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di) { struct dquot *dquot; + int rc; mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); if (!(dquot = dqget(sb, id, type))) { mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return -ESRCH; } - do_set_dqblk(dquot, di); + rc = do_set_dqblk(dquot, di); dqput(dquot); mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); - return 0; + return rc; } /* Generic routine for getting common part of quota file information */ diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 59375efcf39..3e5637fc377 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -14,18 +14,26 @@ int sysctl_drop_caches; static void drop_pagecache_sb(struct super_block *sb) { - struct inode *inode; + struct inode *inode, *toput_inode = NULL; spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { if (inode->i_state & (I_FREEING|I_WILL_FREE)) continue; + if (inode->i_mapping->nrpages == 0) + continue; + __iget(inode); + spin_unlock(&inode_lock); __invalidate_mapping_pages(inode->i_mapping, 0, -1, true); + iput(toput_inode); + toput_inode = inode; + spin_lock(&inode_lock); } spin_unlock(&inode_lock); + iput(toput_inode); } -void drop_pagecache(void) +static void drop_pagecache(void) { struct super_block *sb; @@ -45,7 +53,7 @@ restart: spin_unlock(&sb_lock); } -void drop_slab(void) +static void drop_slab(void) { int nr_objects; diff --git a/fs/ecryptfs/Makefile b/fs/ecryptfs/Makefile index 76885701551..1e34a7fd488 100644 --- a/fs/ecryptfs/Makefile +++ b/fs/ecryptfs/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_ECRYPT_FS) += ecryptfs.o -ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o debug.o +ecryptfs-objs := dentry.o file.o inode.o main.o super.o mmap.o read_write.o crypto.o keystore.o messaging.o netlink.o miscdev.o debug.o diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index a066e109ad9..cd62d75b2cc 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -119,21 +119,21 @@ static int ecryptfs_calculate_md5(char *dst, if (rc) { printk(KERN_ERR "%s: Error initializing crypto hash; rc = [%d]\n", - __FUNCTION__, rc); + __func__, rc); goto out; } rc = crypto_hash_update(&desc, &sg, len); if (rc) { printk(KERN_ERR "%s: Error updating crypto hash; rc = [%d]\n", - __FUNCTION__, rc); + __func__, rc); goto out; } rc = crypto_hash_final(&desc, dst); if (rc) { printk(KERN_ERR "%s: Error finalizing crypto hash; rc = [%d]\n", - __FUNCTION__, rc); + __func__, rc); goto out; } out: @@ -437,7 +437,7 @@ static int ecryptfs_encrypt_extent(struct page *enc_extent_page, if (rc < 0) { printk(KERN_ERR "%s: Error attempting to encrypt page with " "page->index = [%ld], extent_offset = [%ld]; " - "rc = [%d]\n", __FUNCTION__, page->index, extent_offset, + "rc = [%d]\n", __func__, page->index, extent_offset, rc); goto out; } @@ -487,7 +487,7 @@ int ecryptfs_encrypt_page(struct page *page) 0, PAGE_CACHE_SIZE); if (rc) printk(KERN_ERR "%s: Error attempting to copy " - "page at index [%ld]\n", __FUNCTION__, + "page at index [%ld]\n", __func__, page->index); goto out; } @@ -508,7 +508,7 @@ int ecryptfs_encrypt_page(struct page *page) extent_offset); if (rc) { printk(KERN_ERR "%s: Error encrypting extent; " - "rc = [%d]\n", __FUNCTION__, rc); + "rc = [%d]\n", __func__, rc); goto out; } ecryptfs_lower_offset_for_extent( @@ -569,7 +569,7 @@ static int ecryptfs_decrypt_extent(struct page *page, if (rc < 0) { printk(KERN_ERR "%s: Error attempting to decrypt to page with " "page->index = [%ld], extent_offset = [%ld]; " - "rc = [%d]\n", __FUNCTION__, page->index, extent_offset, + "rc = [%d]\n", __func__, page->index, extent_offset, rc); goto out; } @@ -622,7 +622,7 @@ int ecryptfs_decrypt_page(struct page *page) ecryptfs_inode); if (rc) printk(KERN_ERR "%s: Error attempting to copy " - "page at index [%ld]\n", __FUNCTION__, + "page at index [%ld]\n", __func__, page->index); goto out; } @@ -656,7 +656,7 @@ int ecryptfs_decrypt_page(struct page *page) extent_offset); if (rc) { printk(KERN_ERR "%s: Error encrypting extent; " - "rc = [%d]\n", __FUNCTION__, rc); + "rc = [%d]\n", __func__, rc); goto out; } } @@ -1215,7 +1215,7 @@ int ecryptfs_read_and_validate_header_region(char *data, ecryptfs_inode); if (rc) { printk(KERN_ERR "%s: Error reading header region; rc = [%d]\n", - __FUNCTION__, rc); + __func__, rc); goto out; } if (!contains_ecryptfs_marker(data + ECRYPTFS_FILE_SIZE_BYTES)) { @@ -1246,7 +1246,6 @@ ecryptfs_write_header_metadata(char *virt, (*written) = 6; } -struct kmem_cache *ecryptfs_header_cache_0; struct kmem_cache *ecryptfs_header_cache_1; struct kmem_cache *ecryptfs_header_cache_2; @@ -1320,7 +1319,7 @@ ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat, 0, crypt_stat->num_header_bytes_at_front); if (rc) printk(KERN_ERR "%s: Error attempting to write header " - "information to lower file; rc = [%d]\n", __FUNCTION__, + "information to lower file; rc = [%d]\n", __func__, rc); return rc; } @@ -1365,14 +1364,14 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) } } else { printk(KERN_WARNING "%s: Encrypted flag not set\n", - __FUNCTION__); + __func__); rc = -EINVAL; goto out; } /* Released in this function */ virt = kzalloc(crypt_stat->num_header_bytes_at_front, GFP_KERNEL); if (!virt) { - printk(KERN_ERR "%s: Out of memory\n", __FUNCTION__); + printk(KERN_ERR "%s: Out of memory\n", __func__); rc = -ENOMEM; goto out; } @@ -1380,7 +1379,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) ecryptfs_dentry); if (unlikely(rc)) { printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n", - __FUNCTION__, rc); + __func__, rc); goto out_free; } if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) @@ -1391,7 +1390,7 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) ecryptfs_dentry, virt); if (rc) { printk(KERN_ERR "%s: Error writing metadata out to lower file; " - "rc = [%d]\n", __FUNCTION__, rc); + "rc = [%d]\n", __func__, rc); goto out_free; } out_free: @@ -1585,7 +1584,7 @@ int ecryptfs_read_metadata(struct dentry *ecryptfs_dentry) if (!page_virt) { rc = -ENOMEM; printk(KERN_ERR "%s: Unable to allocate page_virt\n", - __FUNCTION__); + __func__); goto out; } rc = ecryptfs_read_lower(page_virt, 0, crypt_stat->extent_size, diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 5007f788da0..951ee33a022 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -4,7 +4,7 @@ * * Copyright (C) 1997-2003 Erez Zadok * Copyright (C) 2001-2003 Stony Brook University - * Copyright (C) 2004-2007 International Business Machines Corp. + * Copyright (C) 2004-2008 International Business Machines Corp. * Author(s): Michael A. Halcrow <mahalcro@us.ibm.com> * Trevor S. Highland <trevor.highland@gmail.com> * Tyler Hicks <tyhicks@ou.edu> @@ -34,6 +34,7 @@ #include <linux/namei.h> #include <linux/scatterlist.h> #include <linux/hash.h> +#include <linux/nsproxy.h> /* Version verification for shared data structures w/ userspace */ #define ECRYPTFS_VERSION_MAJOR 0x00 @@ -49,11 +50,13 @@ #define ECRYPTFS_VERSIONING_POLICY 0x00000008 #define ECRYPTFS_VERSIONING_XATTR 0x00000010 #define ECRYPTFS_VERSIONING_MULTKEY 0x00000020 +#define ECRYPTFS_VERSIONING_DEVMISC 0x00000040 #define ECRYPTFS_VERSIONING_MASK (ECRYPTFS_VERSIONING_PASSPHRASE \ | ECRYPTFS_VERSIONING_PLAINTEXT_PASSTHROUGH \ | ECRYPTFS_VERSIONING_PUBKEY \ | ECRYPTFS_VERSIONING_XATTR \ - | ECRYPTFS_VERSIONING_MULTKEY) + | ECRYPTFS_VERSIONING_MULTKEY \ + | ECRYPTFS_VERSIONING_DEVMISC) #define ECRYPTFS_MAX_PASSWORD_LENGTH 64 #define ECRYPTFS_MAX_PASSPHRASE_BYTES ECRYPTFS_MAX_PASSWORD_LENGTH #define ECRYPTFS_SALT_SIZE 8 @@ -73,17 +76,14 @@ #define ECRYPTFS_DEFAULT_MSG_CTX_ELEMS 32 #define ECRYPTFS_DEFAULT_SEND_TIMEOUT HZ #define ECRYPTFS_MAX_MSG_CTX_TTL (HZ*3) -#define ECRYPTFS_NLMSG_HELO 100 -#define ECRYPTFS_NLMSG_QUIT 101 -#define ECRYPTFS_NLMSG_REQUEST 102 -#define ECRYPTFS_NLMSG_RESPONSE 103 #define ECRYPTFS_MAX_PKI_NAME_BYTES 16 #define ECRYPTFS_DEFAULT_NUM_USERS 4 #define ECRYPTFS_MAX_NUM_USERS 32768 #define ECRYPTFS_TRANSPORT_NETLINK 0 #define ECRYPTFS_TRANSPORT_CONNECTOR 1 #define ECRYPTFS_TRANSPORT_RELAYFS 2 -#define ECRYPTFS_DEFAULT_TRANSPORT ECRYPTFS_TRANSPORT_NETLINK +#define ECRYPTFS_TRANSPORT_MISCDEV 3 +#define ECRYPTFS_DEFAULT_TRANSPORT ECRYPTFS_TRANSPORT_MISCDEV #define ECRYPTFS_XATTR_NAME "user.ecryptfs" #define RFC2440_CIPHER_DES3_EDE 0x02 @@ -366,32 +366,63 @@ struct ecryptfs_auth_tok_list_item { }; struct ecryptfs_message { + /* Can never be greater than ecryptfs_message_buf_len */ + /* Used to find the parent msg_ctx */ + /* Inherits from msg_ctx->index */ u32 index; u32 data_len; u8 data[]; }; struct ecryptfs_msg_ctx { -#define ECRYPTFS_MSG_CTX_STATE_FREE 0x0001 -#define ECRYPTFS_MSG_CTX_STATE_PENDING 0x0002 -#define ECRYPTFS_MSG_CTX_STATE_DONE 0x0003 - u32 state; - unsigned int index; - unsigned int counter; +#define ECRYPTFS_MSG_CTX_STATE_FREE 0x01 +#define ECRYPTFS_MSG_CTX_STATE_PENDING 0x02 +#define ECRYPTFS_MSG_CTX_STATE_DONE 0x03 +#define ECRYPTFS_MSG_CTX_STATE_NO_REPLY 0x04 + u8 state; +#define ECRYPTFS_MSG_HELO 100 +#define ECRYPTFS_MSG_QUIT 101 +#define ECRYPTFS_MSG_REQUEST 102 +#define ECRYPTFS_MSG_RESPONSE 103 + u8 type; + u32 index; + /* Counter converts to a sequence number. Each message sent + * out for which we expect a response has an associated + * sequence number. The response must have the same sequence + * number as the counter for the msg_stc for the message to be + * valid. */ + u32 counter; + size_t msg_size; struct ecryptfs_message *msg; struct task_struct *task; struct list_head node; + struct list_head daemon_out_list; struct mutex mux; }; extern unsigned int ecryptfs_transport; -struct ecryptfs_daemon_id { - pid_t pid; - uid_t uid; - struct hlist_node id_chain; +struct ecryptfs_daemon; + +struct ecryptfs_daemon { +#define ECRYPTFS_DAEMON_IN_READ 0x00000001 +#define ECRYPTFS_DAEMON_IN_POLL 0x00000002 +#define ECRYPTFS_DAEMON_ZOMBIE 0x00000004 +#define ECRYPTFS_DAEMON_MISCDEV_OPEN 0x00000008 + u32 flags; + u32 num_queued_msg_ctx; + struct pid *pid; + uid_t euid; + struct user_namespace *user_ns; + struct task_struct *task; + struct mutex mux; + struct list_head msg_ctx_out_queue; + wait_queue_head_t wait; + struct hlist_node euid_chain; }; +extern struct mutex ecryptfs_daemon_hash_mux; + static inline struct ecryptfs_file_info * ecryptfs_file_to_private(struct file *file) { @@ -500,7 +531,7 @@ ecryptfs_set_dentry_lower_mnt(struct dentry *dentry, struct vfsmount *lower_mnt) } #define ecryptfs_printk(type, fmt, arg...) \ - __ecryptfs_printk(type "%s: " fmt, __FUNCTION__, ## arg); + __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); void __ecryptfs_printk(const char *fmt, ...); extern const struct file_operations ecryptfs_main_fops; @@ -581,10 +612,13 @@ int ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags); int ecryptfs_read_xattr_region(char *page_virt, struct inode *ecryptfs_inode); -int ecryptfs_process_helo(unsigned int transport, uid_t uid, pid_t pid); -int ecryptfs_process_quit(uid_t uid, pid_t pid); -int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t uid, - pid_t pid, u32 seq); +int ecryptfs_process_helo(unsigned int transport, uid_t euid, + struct user_namespace *user_ns, struct pid *pid); +int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns, + struct pid *pid); +int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, + struct user_namespace *user_ns, struct pid *pid, + u32 seq); int ecryptfs_send_message(unsigned int transport, char *data, int data_len, struct ecryptfs_msg_ctx **msg_ctx); int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, @@ -593,14 +627,14 @@ int ecryptfs_init_messaging(unsigned int transport); void ecryptfs_release_messaging(unsigned int transport); int ecryptfs_send_netlink(char *data, int data_len, - struct ecryptfs_msg_ctx *msg_ctx, u16 msg_type, - u16 msg_flags, pid_t daemon_pid); + struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, + u16 msg_flags, struct pid *daemon_pid); int ecryptfs_init_netlink(void); void ecryptfs_release_netlink(void); int ecryptfs_send_connector(char *data, int data_len, - struct ecryptfs_msg_ctx *msg_ctx, u16 msg_type, - u16 msg_flags, pid_t daemon_pid); + struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, + u16 msg_flags, struct pid *daemon_pid); int ecryptfs_init_connector(void); void ecryptfs_release_connector(void); void @@ -642,5 +676,21 @@ int ecryptfs_read_lower_page_segment(struct page *page_for_ecryptfs, size_t offset_in_page, size_t size, struct inode *ecryptfs_inode); struct page *ecryptfs_get_locked_page(struct file *file, loff_t index); +int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon); +int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, + struct user_namespace *user_ns); +int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, + size_t *length_size); +int ecryptfs_write_packet_length(char *dest, size_t size, + size_t *packet_size_length); +int ecryptfs_init_ecryptfs_miscdev(void); +void ecryptfs_destroy_ecryptfs_miscdev(void); +int ecryptfs_send_miscdev(char *data, size_t data_size, + struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, + u16 msg_flags, struct ecryptfs_daemon *daemon); +void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx); +int +ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, + struct user_namespace *user_ns, struct pid *pid); #endif /* #ifndef ECRYPTFS_KERNEL_H */ diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 2b8f5ed4ade..2258b8f654a 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -195,7 +195,9 @@ static int ecryptfs_open(struct inode *inode, struct file *file) file, ecryptfs_inode_to_private(inode)->lower_file); if (S_ISDIR(ecryptfs_dentry->d_inode->i_mode)) { ecryptfs_printk(KERN_DEBUG, "This is a directory\n"); + mutex_lock(&crypt_stat->cs_mutex); crypt_stat->flags &= ~(ECRYPTFS_ENCRYPTED); + mutex_unlock(&crypt_stat->cs_mutex); rc = 0; goto out; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index e2386115210..0a1397335a8 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -111,7 +111,7 @@ ecryptfs_do_create(struct inode *directory_inode, lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); lower_dir_dentry = lock_parent(lower_dentry); - if (unlikely(IS_ERR(lower_dir_dentry))) { + if (IS_ERR(lower_dir_dentry)) { ecryptfs_printk(KERN_ERR, "Error locking directory of " "dentry\n"); rc = PTR_ERR(lower_dir_dentry); @@ -121,7 +121,7 @@ ecryptfs_do_create(struct inode *directory_inode, ecryptfs_dentry, mode, nd); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " - "rc = [%d]\n", __FUNCTION__, rc); + "rc = [%d]\n", __func__, rc); goto out_lock; } rc = ecryptfs_interpose(lower_dentry, ecryptfs_dentry, @@ -908,7 +908,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) if (ia->ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) ia->ia_valid &= ~ATTR_MODE; + mutex_lock(&lower_dentry->d_inode->i_mutex); rc = notify_change(lower_dentry, ia); + mutex_unlock(&lower_dentry->d_inode->i_mutex); out: fsstack_copy_attr_all(inode, lower_inode, NULL); return rc; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 682b1b2482c..e82b457180b 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -65,7 +65,7 @@ static int process_request_key_err(long err_code) } /** - * parse_packet_length + * ecryptfs_parse_packet_length * @data: Pointer to memory containing length at offset * @size: This function writes the decoded size to this memory * address; zero on error @@ -73,8 +73,8 @@ static int process_request_key_err(long err_code) * * Returns zero on success; non-zero on error */ -static int parse_packet_length(unsigned char *data, size_t *size, - size_t *length_size) +int ecryptfs_parse_packet_length(unsigned char *data, size_t *size, + size_t *length_size) { int rc = 0; @@ -105,7 +105,7 @@ out: } /** - * write_packet_length + * ecryptfs_write_packet_length * @dest: The byte array target into which to write the length. Must * have at least 5 bytes allocated. * @size: The length to write. @@ -114,8 +114,8 @@ out: * * Returns zero on success; non-zero on error. */ -static int write_packet_length(char *dest, size_t size, - size_t *packet_size_length) +int ecryptfs_write_packet_length(char *dest, size_t size, + size_t *packet_size_length) { int rc = 0; @@ -162,8 +162,8 @@ write_tag_64_packet(char *signature, struct ecryptfs_session_key *session_key, goto out; } message[i++] = ECRYPTFS_TAG_64_PACKET_TYPE; - rc = write_packet_length(&message[i], ECRYPTFS_SIG_SIZE_HEX, - &packet_size_len); + rc = ecryptfs_write_packet_length(&message[i], ECRYPTFS_SIG_SIZE_HEX, + &packet_size_len); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 64 packet " "header; cannot generate packet length\n"); @@ -172,8 +172,9 @@ write_tag_64_packet(char *signature, struct ecryptfs_session_key *session_key, i += packet_size_len; memcpy(&message[i], signature, ECRYPTFS_SIG_SIZE_HEX); i += ECRYPTFS_SIG_SIZE_HEX; - rc = write_packet_length(&message[i], session_key->encrypted_key_size, - &packet_size_len); + rc = ecryptfs_write_packet_length(&message[i], + session_key->encrypted_key_size, + &packet_size_len); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 64 packet " "header; cannot generate packet length\n"); @@ -225,7 +226,7 @@ parse_tag_65_packet(struct ecryptfs_session_key *session_key, u8 *cipher_code, rc = -EIO; goto out; } - rc = parse_packet_length(&data[i], &m_size, &data_len); + rc = ecryptfs_parse_packet_length(&data[i], &m_size, &data_len); if (rc) { ecryptfs_printk(KERN_WARNING, "Error parsing packet length; " "rc = [%d]\n", rc); @@ -304,8 +305,8 @@ write_tag_66_packet(char *signature, u8 cipher_code, goto out; } message[i++] = ECRYPTFS_TAG_66_PACKET_TYPE; - rc = write_packet_length(&message[i], ECRYPTFS_SIG_SIZE_HEX, - &packet_size_len); + rc = ecryptfs_write_packet_length(&message[i], ECRYPTFS_SIG_SIZE_HEX, + &packet_size_len); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet " "header; cannot generate packet length\n"); @@ -315,8 +316,8 @@ write_tag_66_packet(char *signature, u8 cipher_code, memcpy(&message[i], signature, ECRYPTFS_SIG_SIZE_HEX); i += ECRYPTFS_SIG_SIZE_HEX; /* The encrypted key includes 1 byte cipher code and 2 byte checksum */ - rc = write_packet_length(&message[i], crypt_stat->key_size + 3, - &packet_size_len); + rc = ecryptfs_write_packet_length(&message[i], crypt_stat->key_size + 3, + &packet_size_len); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 66 packet " "header; cannot generate packet length\n"); @@ -357,20 +358,25 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec, /* verify that everything through the encrypted FEK size is present */ if (message_len < 4) { rc = -EIO; + printk(KERN_ERR "%s: message_len is [%Zd]; minimum acceptable " + "message length is [%d]\n", __func__, message_len, 4); goto out; } if (data[i++] != ECRYPTFS_TAG_67_PACKET_TYPE) { - ecryptfs_printk(KERN_ERR, "Type should be ECRYPTFS_TAG_67\n"); rc = -EIO; + printk(KERN_ERR "%s: Type should be ECRYPTFS_TAG_67\n", + __func__); goto out; } if (data[i++]) { - ecryptfs_printk(KERN_ERR, "Status indicator has non zero value" - " [%d]\n", data[i-1]); rc = -EIO; + printk(KERN_ERR "%s: Status indicator has non zero " + "value [%d]\n", __func__, data[i-1]); + goto out; } - rc = parse_packet_length(&data[i], &key_rec->enc_key_size, &data_len); + rc = ecryptfs_parse_packet_length(&data[i], &key_rec->enc_key_size, + &data_len); if (rc) { ecryptfs_printk(KERN_WARNING, "Error parsing packet length; " "rc = [%d]\n", rc); @@ -378,17 +384,17 @@ parse_tag_67_packet(struct ecryptfs_key_record *key_rec, } i += data_len; if (message_len < (i + key_rec->enc_key_size)) { - ecryptfs_printk(KERN_ERR, "message_len [%d]; max len is [%d]\n", - message_len, (i + key_rec->enc_key_size)); rc = -EIO; + printk(KERN_ERR "%s: message_len [%Zd]; max len is [%Zd]\n", + __func__, message_len, (i + key_rec->enc_key_size)); goto out; } if (key_rec->enc_key_size > ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES) { - ecryptfs_printk(KERN_ERR, "Encrypted key_size [%d] larger than " - "the maximum key size [%d]\n", - key_rec->enc_key_size, - ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); rc = -EIO; + printk(KERN_ERR "%s: Encrypted key_size [%Zd] larger than " + "the maximum key size [%d]\n", __func__, + key_rec->enc_key_size, + ECRYPTFS_MAX_ENCRYPTED_KEY_BYTES); goto out; } memcpy(key_rec->enc_key, &data[i], key_rec->enc_key_size); @@ -445,7 +451,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, rc = write_tag_64_packet(auth_tok_sig, &(auth_tok->session_key), &netlink_message, &netlink_message_length); if (rc) { - ecryptfs_printk(KERN_ERR, "Failed to write tag 64 packet"); + ecryptfs_printk(KERN_ERR, "Failed to write tag 64 packet\n"); goto out; } rc = ecryptfs_send_message(ecryptfs_transport, netlink_message, @@ -570,8 +576,8 @@ parse_tag_1_packet(struct ecryptfs_crypt_stat *crypt_stat, goto out; } (*new_auth_tok) = &auth_tok_list_item->auth_tok; - rc = parse_packet_length(&data[(*packet_size)], &body_size, - &length_size); + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], &body_size, + &length_size); if (rc) { printk(KERN_WARNING "Error parsing packet length; " "rc = [%d]\n", rc); @@ -704,8 +710,8 @@ parse_tag_3_packet(struct ecryptfs_crypt_stat *crypt_stat, goto out; } (*new_auth_tok) = &auth_tok_list_item->auth_tok; - rc = parse_packet_length(&data[(*packet_size)], &body_size, - &length_size); + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], &body_size, + &length_size); if (rc) { printk(KERN_WARNING "Error parsing packet length; rc = [%d]\n", rc); @@ -852,8 +858,8 @@ parse_tag_11_packet(unsigned char *data, unsigned char *contents, rc = -EINVAL; goto out; } - rc = parse_packet_length(&data[(*packet_size)], &body_size, - &length_size); + rc = ecryptfs_parse_packet_length(&data[(*packet_size)], &body_size, + &length_size); if (rc) { printk(KERN_WARNING "Invalid tag 11 packet format\n"); goto out; @@ -1405,8 +1411,8 @@ write_tag_1_packet(char *dest, size_t *remaining_bytes, auth_tok->token.private_key.key_size; rc = pki_encrypt_session_key(auth_tok, crypt_stat, key_rec); if (rc) { - ecryptfs_printk(KERN_ERR, "Failed to encrypt session key " - "via a pki"); + printk(KERN_ERR "Failed to encrypt session key via a key " + "module; rc = [%d]\n", rc); goto out; } if (ecryptfs_verbosity > 0) { @@ -1430,8 +1436,9 @@ encrypted_session_key_set: goto out; } dest[(*packet_size)++] = ECRYPTFS_TAG_1_PACKET_TYPE; - rc = write_packet_length(&dest[(*packet_size)], (max_packet_size - 4), - &packet_size_length); + rc = ecryptfs_write_packet_length(&dest[(*packet_size)], + (max_packet_size - 4), + &packet_size_length); if (rc) { ecryptfs_printk(KERN_ERR, "Error generating tag 1 packet " "header; cannot generate packet length\n"); @@ -1489,8 +1496,9 @@ write_tag_11_packet(char *dest, size_t *remaining_bytes, char *contents, goto out; } dest[(*packet_length)++] = ECRYPTFS_TAG_11_PACKET_TYPE; - rc = write_packet_length(&dest[(*packet_length)], - (max_packet_size - 4), &packet_size_length); + rc = ecryptfs_write_packet_length(&dest[(*packet_length)], + (max_packet_size - 4), + &packet_size_length); if (rc) { printk(KERN_ERR "Error generating tag 11 packet header; cannot " "generate packet length. rc = [%d]\n", rc); @@ -1682,8 +1690,9 @@ encrypted_session_key_set: dest[(*packet_size)++] = ECRYPTFS_TAG_3_PACKET_TYPE; /* Chop off the Tag 3 identifier(1) and Tag 3 packet size(3) * to get the number of octets in the actual Tag 3 packet */ - rc = write_packet_length(&dest[(*packet_size)], (max_packet_size - 4), - &packet_size_length); + rc = ecryptfs_write_packet_length(&dest[(*packet_size)], + (max_packet_size - 4), + &packet_size_length); if (rc) { printk(KERN_ERR "Error generating tag 3 packet header; cannot " "generate packet length. rc = [%d]\n", rc); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index d25ac9500a9..d603631601e 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -219,7 +219,7 @@ int ecryptfs_interpose(struct dentry *lower_dentry, struct dentry *dentry, if (rc) { printk(KERN_ERR "%s: Error attempting to initialize the " "persistent file for the dentry with name [%s]; " - "rc = [%d]\n", __FUNCTION__, dentry->d_name.name, rc); + "rc = [%d]\n", __func__, dentry->d_name.name, rc); goto out; } out: diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index 9cc2aec27b0..1b5c20058ac 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -1,7 +1,7 @@ /** * eCryptfs: Linux filesystem encryption layer * - * Copyright (C) 2004-2006 International Business Machines Corp. + * Copyright (C) 2004-2008 International Business Machines Corp. * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com> * Tyler Hicks <tyhicks@ou.edu> * @@ -20,19 +20,21 @@ * 02111-1307, USA. */ #include <linux/sched.h> +#include <linux/user_namespace.h> +#include <linux/nsproxy.h> #include "ecryptfs_kernel.h" static LIST_HEAD(ecryptfs_msg_ctx_free_list); static LIST_HEAD(ecryptfs_msg_ctx_alloc_list); static struct mutex ecryptfs_msg_ctx_lists_mux; -static struct hlist_head *ecryptfs_daemon_id_hash; -static struct mutex ecryptfs_daemon_id_hash_mux; +static struct hlist_head *ecryptfs_daemon_hash; +struct mutex ecryptfs_daemon_hash_mux; static int ecryptfs_hash_buckets; #define ecryptfs_uid_hash(uid) \ hash_long((unsigned long)uid, ecryptfs_hash_buckets) -static unsigned int ecryptfs_msg_counter; +static u32 ecryptfs_msg_counter; static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr; /** @@ -40,9 +42,10 @@ static struct ecryptfs_msg_ctx *ecryptfs_msg_ctx_arr; * @msg_ctx: The context that was acquired from the free list * * Acquires a context element from the free list and locks the mutex - * on the context. Returns zero on success; non-zero on error or upon - * failure to acquire a free context element. Be sure to lock the - * list mutex before calling. + * on the context. Sets the msg_ctx task to current. Returns zero on + * success; non-zero on error or upon failure to acquire a free + * context element. Must be called with ecryptfs_msg_ctx_lists_mux + * held. */ static int ecryptfs_acquire_free_msg_ctx(struct ecryptfs_msg_ctx **msg_ctx) { @@ -50,11 +53,11 @@ static int ecryptfs_acquire_free_msg_ctx(struct ecryptfs_msg_ctx **msg_ctx) int rc; if (list_empty(&ecryptfs_msg_ctx_free_list)) { - ecryptfs_printk(KERN_WARNING, "The eCryptfs free " - "context list is empty. It may be helpful to " - "specify the ecryptfs_message_buf_len " - "parameter to be greater than the current " - "value of [%d]\n", ecryptfs_message_buf_len); + printk(KERN_WARNING "%s: The eCryptfs free " + "context list is empty. It may be helpful to " + "specify the ecryptfs_message_buf_len " + "parameter to be greater than the current " + "value of [%d]\n", __func__, ecryptfs_message_buf_len); rc = -ENOMEM; goto out; } @@ -75,8 +78,7 @@ out: * ecryptfs_msg_ctx_free_to_alloc * @msg_ctx: The context to move from the free list to the alloc list * - * Be sure to lock the list mutex and the context mutex before - * calling. + * Must be called with ecryptfs_msg_ctx_lists_mux held. */ static void ecryptfs_msg_ctx_free_to_alloc(struct ecryptfs_msg_ctx *msg_ctx) { @@ -89,36 +91,39 @@ static void ecryptfs_msg_ctx_free_to_alloc(struct ecryptfs_msg_ctx *msg_ctx) * ecryptfs_msg_ctx_alloc_to_free * @msg_ctx: The context to move from the alloc list to the free list * - * Be sure to lock the list mutex and the context mutex before - * calling. + * Must be called with ecryptfs_msg_ctx_lists_mux held. */ -static void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx) +void ecryptfs_msg_ctx_alloc_to_free(struct ecryptfs_msg_ctx *msg_ctx) { list_move(&(msg_ctx->node), &ecryptfs_msg_ctx_free_list); if (msg_ctx->msg) kfree(msg_ctx->msg); + msg_ctx->msg = NULL; msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_FREE; } /** - * ecryptfs_find_daemon_id - * @uid: The user id which maps to the desired daemon id - * @id: If return value is zero, points to the desired daemon id - * pointer + * ecryptfs_find_daemon_by_euid + * @euid: The effective user id which maps to the desired daemon id + * @user_ns: The namespace in which @euid applies + * @daemon: If return value is zero, points to the desired daemon pointer * - * Search the hash list for the given user id. Returns zero if the - * user id exists in the list; non-zero otherwise. The daemon id hash - * mutex should be held before calling this function. + * Must be called with ecryptfs_daemon_hash_mux held. + * + * Search the hash list for the given user id. + * + * Returns zero if the user id exists in the list; non-zero otherwise. */ -static int ecryptfs_find_daemon_id(uid_t uid, struct ecryptfs_daemon_id **id) +int ecryptfs_find_daemon_by_euid(struct ecryptfs_daemon **daemon, uid_t euid, + struct user_namespace *user_ns) { struct hlist_node *elem; int rc; - hlist_for_each_entry(*id, elem, - &ecryptfs_daemon_id_hash[ecryptfs_uid_hash(uid)], - id_chain) { - if ((*id)->uid == uid) { + hlist_for_each_entry(*daemon, elem, + &ecryptfs_daemon_hash[ecryptfs_uid_hash(euid)], + euid_chain) { + if ((*daemon)->euid == euid && (*daemon)->user_ns == user_ns) { rc = 0; goto out; } @@ -128,181 +133,325 @@ out: return rc; } -static int ecryptfs_send_raw_message(unsigned int transport, u16 msg_type, - pid_t pid) +static int +ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len, + u8 msg_type, struct ecryptfs_msg_ctx **msg_ctx); + +/** + * ecryptfs_send_raw_message + * @transport: Transport type + * @msg_type: Message type + * @daemon: Daemon struct for recipient of message + * + * A raw message is one that does not include an ecryptfs_message + * struct. It simply has a type. + * + * Must be called with ecryptfs_daemon_hash_mux held. + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_send_raw_message(unsigned int transport, u8 msg_type, + struct ecryptfs_daemon *daemon) { + struct ecryptfs_msg_ctx *msg_ctx; int rc; switch(transport) { case ECRYPTFS_TRANSPORT_NETLINK: - rc = ecryptfs_send_netlink(NULL, 0, NULL, msg_type, 0, pid); + rc = ecryptfs_send_netlink(NULL, 0, NULL, msg_type, 0, + daemon->pid); + break; + case ECRYPTFS_TRANSPORT_MISCDEV: + rc = ecryptfs_send_message_locked(transport, NULL, 0, msg_type, + &msg_ctx); + if (rc) { + printk(KERN_ERR "%s: Error whilst attempting to send " + "message via procfs; rc = [%d]\n", __func__, rc); + goto out; + } + /* Raw messages are logically context-free (e.g., no + * reply is expected), so we set the state of the + * ecryptfs_msg_ctx object to indicate that it should + * be freed as soon as the transport sends out the message. */ + mutex_lock(&msg_ctx->mux); + msg_ctx->state = ECRYPTFS_MSG_CTX_STATE_NO_REPLY; + mutex_unlock(&msg_ctx->mux); break; case ECRYPTFS_TRANSPORT_CONNECTOR: case ECRYPTFS_TRANSPORT_RELAYFS: default: rc = -ENOSYS; } +out: + return rc; +} + +/** + * ecryptfs_spawn_daemon - Create and initialize a new daemon struct + * @daemon: Pointer to set to newly allocated daemon struct + * @euid: Effective user id for the daemon + * @user_ns: The namespace in which @euid applies + * @pid: Process id for the daemon + * + * Must be called ceremoniously while in possession of + * ecryptfs_sacred_daemon_hash_mux + * + * Returns zero on success; non-zero otherwise + */ +int +ecryptfs_spawn_daemon(struct ecryptfs_daemon **daemon, uid_t euid, + struct user_namespace *user_ns, struct pid *pid) +{ + int rc = 0; + + (*daemon) = kzalloc(sizeof(**daemon), GFP_KERNEL); + if (!(*daemon)) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " + "GFP_KERNEL memory\n", __func__, sizeof(**daemon)); + goto out; + } + (*daemon)->euid = euid; + (*daemon)->user_ns = get_user_ns(user_ns); + (*daemon)->pid = get_pid(pid); + (*daemon)->task = current; + mutex_init(&(*daemon)->mux); + INIT_LIST_HEAD(&(*daemon)->msg_ctx_out_queue); + init_waitqueue_head(&(*daemon)->wait); + (*daemon)->num_queued_msg_ctx = 0; + hlist_add_head(&(*daemon)->euid_chain, + &ecryptfs_daemon_hash[ecryptfs_uid_hash(euid)]); +out: return rc; } /** * ecryptfs_process_helo * @transport: The underlying transport (netlink, etc.) - * @uid: The user ID owner of the message + * @euid: The user ID owner of the message + * @user_ns: The namespace in which @euid applies * @pid: The process ID for the userspace program that sent the * message * - * Adds the uid and pid values to the daemon id hash. If a uid + * Adds the euid and pid values to the daemon euid hash. If an euid * already has a daemon pid registered, the daemon will be - * unregistered before the new daemon id is put into the hash list. - * Returns zero after adding a new daemon id to the hash list; + * unregistered before the new daemon is put into the hash list. + * Returns zero after adding a new daemon to the hash list; * non-zero otherwise. */ -int ecryptfs_process_helo(unsigned int transport, uid_t uid, pid_t pid) +int ecryptfs_process_helo(unsigned int transport, uid_t euid, + struct user_namespace *user_ns, struct pid *pid) { - struct ecryptfs_daemon_id *new_id; - struct ecryptfs_daemon_id *old_id; + struct ecryptfs_daemon *new_daemon; + struct ecryptfs_daemon *old_daemon; int rc; - mutex_lock(&ecryptfs_daemon_id_hash_mux); - new_id = kmalloc(sizeof(*new_id), GFP_KERNEL); - if (!new_id) { - rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, "Failed to allocate memory; unable " - "to register daemon [%d] for user [%d]\n", - pid, uid); - goto unlock; - } - if (!ecryptfs_find_daemon_id(uid, &old_id)) { + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_find_daemon_by_euid(&old_daemon, euid, user_ns); + if (rc != 0) { printk(KERN_WARNING "Received request from user [%d] " - "to register daemon [%d]; unregistering daemon " - "[%d]\n", uid, pid, old_id->pid); - hlist_del(&old_id->id_chain); - rc = ecryptfs_send_raw_message(transport, ECRYPTFS_NLMSG_QUIT, - old_id->pid); + "to register daemon [0x%p]; unregistering daemon " + "[0x%p]\n", euid, pid, old_daemon->pid); + rc = ecryptfs_send_raw_message(transport, ECRYPTFS_MSG_QUIT, + old_daemon); if (rc) printk(KERN_WARNING "Failed to send QUIT " - "message to daemon [%d]; rc = [%d]\n", - old_id->pid, rc); - kfree(old_id); + "message to daemon [0x%p]; rc = [%d]\n", + old_daemon->pid, rc); + hlist_del(&old_daemon->euid_chain); + kfree(old_daemon); } - new_id->uid = uid; - new_id->pid = pid; - hlist_add_head(&new_id->id_chain, - &ecryptfs_daemon_id_hash[ecryptfs_uid_hash(uid)]); - rc = 0; -unlock: - mutex_unlock(&ecryptfs_daemon_id_hash_mux); + rc = ecryptfs_spawn_daemon(&new_daemon, euid, user_ns, pid); + if (rc) + printk(KERN_ERR "%s: The gods are displeased with this attempt " + "to create a new daemon object for euid [%d]; pid " + "[0x%p]; rc = [%d]\n", __func__, euid, pid, rc); + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** + * ecryptfs_exorcise_daemon - Destroy the daemon struct + * + * Must be called ceremoniously while in possession of + * ecryptfs_daemon_hash_mux and the daemon's own mux. + */ +int ecryptfs_exorcise_daemon(struct ecryptfs_daemon *daemon) +{ + struct ecryptfs_msg_ctx *msg_ctx, *msg_ctx_tmp; + int rc = 0; + + mutex_lock(&daemon->mux); + if ((daemon->flags & ECRYPTFS_DAEMON_IN_READ) + || (daemon->flags & ECRYPTFS_DAEMON_IN_POLL)) { + rc = -EBUSY; + printk(KERN_WARNING "%s: Attempt to destroy daemon with pid " + "[0x%p], but it is in the midst of a read or a poll\n", + __func__, daemon->pid); + mutex_unlock(&daemon->mux); + goto out; + } + list_for_each_entry_safe(msg_ctx, msg_ctx_tmp, + &daemon->msg_ctx_out_queue, daemon_out_list) { + list_del(&msg_ctx->daemon_out_list); + daemon->num_queued_msg_ctx--; + printk(KERN_WARNING "%s: Warning: dropping message that is in " + "the out queue of a dying daemon\n", __func__); + ecryptfs_msg_ctx_alloc_to_free(msg_ctx); + } + hlist_del(&daemon->euid_chain); + if (daemon->task) + wake_up_process(daemon->task); + if (daemon->pid) + put_pid(daemon->pid); + if (daemon->user_ns) + put_user_ns(daemon->user_ns); + mutex_unlock(&daemon->mux); + memset(daemon, 0, sizeof(*daemon)); + kfree(daemon); +out: return rc; } /** * ecryptfs_process_quit - * @uid: The user ID owner of the message + * @euid: The user ID owner of the message + * @user_ns: The namespace in which @euid applies * @pid: The process ID for the userspace program that sent the * message * - * Deletes the corresponding daemon id for the given uid and pid, if + * Deletes the corresponding daemon for the given euid and pid, if * it is the registered that is requesting the deletion. Returns zero - * after deleting the desired daemon id; non-zero otherwise. + * after deleting the desired daemon; non-zero otherwise. */ -int ecryptfs_process_quit(uid_t uid, pid_t pid) +int ecryptfs_process_quit(uid_t euid, struct user_namespace *user_ns, + struct pid *pid) { - struct ecryptfs_daemon_id *id; + struct ecryptfs_daemon *daemon; int rc; - mutex_lock(&ecryptfs_daemon_id_hash_mux); - if (ecryptfs_find_daemon_id(uid, &id)) { + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_find_daemon_by_euid(&daemon, euid, user_ns); + if (rc || !daemon) { rc = -EINVAL; - ecryptfs_printk(KERN_ERR, "Received request from user [%d] to " - "unregister unrecognized daemon [%d]\n", uid, - pid); - goto unlock; + printk(KERN_ERR "Received request from user [%d] to " + "unregister unrecognized daemon [0x%p]\n", euid, pid); + goto out_unlock; } - if (id->pid != pid) { - rc = -EINVAL; - ecryptfs_printk(KERN_WARNING, "Received request from user [%d] " - "with pid [%d] to unregister daemon [%d]\n", - uid, pid, id->pid); - goto unlock; - } - hlist_del(&id->id_chain); - kfree(id); - rc = 0; -unlock: - mutex_unlock(&ecryptfs_daemon_id_hash_mux); + rc = ecryptfs_exorcise_daemon(daemon); +out_unlock: + mutex_unlock(&ecryptfs_daemon_hash_mux); return rc; } /** * ecryptfs_process_reponse * @msg: The ecryptfs message received; the caller should sanity check - * msg->data_len + * msg->data_len and free the memory * @pid: The process ID of the userspace application that sent the * message - * @seq: The sequence number of the message + * @seq: The sequence number of the message; must match the sequence + * number for the existing message context waiting for this + * response + * + * Processes a response message after sending an operation request to + * userspace. Some other process is awaiting this response. Before + * sending out its first communications, the other process allocated a + * msg_ctx from the ecryptfs_msg_ctx_arr at a particular index. The + * response message contains this index so that we can copy over the + * response message into the msg_ctx that the process holds a + * reference to. The other process is going to wake up, check to see + * that msg_ctx->state == ECRYPTFS_MSG_CTX_STATE_DONE, and then + * proceed to read off and process the response message. Returns zero + * upon delivery to desired context element; non-zero upon delivery + * failure or error. * - * Processes a response message after sending a operation request to - * userspace. Returns zero upon delivery to desired context element; - * non-zero upon delivery failure or error. + * Returns zero on success; non-zero otherwise */ -int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t uid, - pid_t pid, u32 seq) +int ecryptfs_process_response(struct ecryptfs_message *msg, uid_t euid, + struct user_namespace *user_ns, struct pid *pid, + u32 seq) { - struct ecryptfs_daemon_id *id; + struct ecryptfs_daemon *daemon; struct ecryptfs_msg_ctx *msg_ctx; - int msg_size; + size_t msg_size; + struct nsproxy *nsproxy; + struct user_namespace *current_user_ns; int rc; if (msg->index >= ecryptfs_message_buf_len) { rc = -EINVAL; - ecryptfs_printk(KERN_ERR, "Attempt to reference " - "context buffer at index [%d]; maximum " - "allowable is [%d]\n", msg->index, - (ecryptfs_message_buf_len - 1)); + printk(KERN_ERR "%s: Attempt to reference " + "context buffer at index [%d]; maximum " + "allowable is [%d]\n", __func__, msg->index, + (ecryptfs_message_buf_len - 1)); goto out; } msg_ctx = &ecryptfs_msg_ctx_arr[msg->index]; mutex_lock(&msg_ctx->mux); - if (ecryptfs_find_daemon_id(msg_ctx->task->euid, &id)) { + mutex_lock(&ecryptfs_daemon_hash_mux); + rcu_read_lock(); + nsproxy = task_nsproxy(msg_ctx->task); + if (nsproxy == NULL) { rc = -EBADMSG; - ecryptfs_printk(KERN_WARNING, "User [%d] received a " - "message response from process [%d] but does " - "not have a registered daemon\n", - msg_ctx->task->euid, pid); + printk(KERN_ERR "%s: Receiving process is a zombie. Dropping " + "message.\n", __func__); + rcu_read_unlock(); + mutex_unlock(&ecryptfs_daemon_hash_mux); goto wake_up; } - if (msg_ctx->task->euid != uid) { + current_user_ns = nsproxy->user_ns; + rc = ecryptfs_find_daemon_by_euid(&daemon, msg_ctx->task->euid, + current_user_ns); + rcu_read_unlock(); + mutex_unlock(&ecryptfs_daemon_hash_mux); + if (rc) { + rc = -EBADMSG; + printk(KERN_WARNING "%s: User [%d] received a " + "message response from process [0x%p] but does " + "not have a registered daemon\n", __func__, + msg_ctx->task->euid, pid); + goto wake_up; + } + if (msg_ctx->task->euid != euid) { rc = -EBADMSG; - ecryptfs_printk(KERN_WARNING, "Received message from user " - "[%d]; expected message from user [%d]\n", - uid, msg_ctx->task->euid); + printk(KERN_WARNING "%s: Received message from user " + "[%d]; expected message from user [%d]\n", __func__, + euid, msg_ctx->task->euid); goto unlock; } - if (id->pid != pid) { + if (current_user_ns != user_ns) { rc = -EBADMSG; - ecryptfs_printk(KERN_ERR, "User [%d] received a " - "message response from an unrecognized " - "process [%d]\n", msg_ctx->task->euid, pid); + printk(KERN_WARNING "%s: Received message from user_ns " + "[0x%p]; expected message from user_ns [0x%p]\n", + __func__, user_ns, nsproxy->user_ns); + goto unlock; + } + if (daemon->pid != pid) { + rc = -EBADMSG; + printk(KERN_ERR "%s: User [%d] sent a message response " + "from an unrecognized process [0x%p]\n", + __func__, msg_ctx->task->euid, pid); goto unlock; } if (msg_ctx->state != ECRYPTFS_MSG_CTX_STATE_PENDING) { rc = -EINVAL; - ecryptfs_printk(KERN_WARNING, "Desired context element is not " - "pending a response\n"); + printk(KERN_WARNING "%s: Desired context element is not " + "pending a response\n", __func__); goto unlock; } else if (msg_ctx->counter != seq) { rc = -EINVAL; - ecryptfs_printk(KERN_WARNING, "Invalid message sequence; " - "expected [%d]; received [%d]\n", - msg_ctx->counter, seq); + printk(KERN_WARNING "%s: Invalid message sequence; " + "expected [%d]; received [%d]\n", __func__, + msg_ctx->counter, seq); goto unlock; } - msg_size = sizeof(*msg) + msg->data_len; + msg_size = (sizeof(*msg) + msg->data_len); msg_ctx->msg = kmalloc(msg_size, GFP_KERNEL); if (!msg_ctx->msg) { rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, "Failed to allocate memory\n"); + printk(KERN_ERR "%s: Failed to allocate [%Zd] bytes of " + "GFP_KERNEL memory\n", __func__, msg_size); goto unlock; } memcpy(msg_ctx->msg, msg, msg_size); @@ -317,34 +466,38 @@ out: } /** - * ecryptfs_send_message + * ecryptfs_send_message_locked * @transport: The transport over which to send the message (i.e., * netlink) * @data: The data to send * @data_len: The length of data * @msg_ctx: The message context allocated for the send + * + * Must be called with ecryptfs_daemon_hash_mux held. + * + * Returns zero on success; non-zero otherwise */ -int ecryptfs_send_message(unsigned int transport, char *data, int data_len, - struct ecryptfs_msg_ctx **msg_ctx) +static int +ecryptfs_send_message_locked(unsigned int transport, char *data, int data_len, + u8 msg_type, struct ecryptfs_msg_ctx **msg_ctx) { - struct ecryptfs_daemon_id *id; + struct ecryptfs_daemon *daemon; int rc; - mutex_lock(&ecryptfs_daemon_id_hash_mux); - if (ecryptfs_find_daemon_id(current->euid, &id)) { - mutex_unlock(&ecryptfs_daemon_id_hash_mux); + rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid, + current->nsproxy->user_ns); + if (rc || !daemon) { rc = -ENOTCONN; - ecryptfs_printk(KERN_ERR, "User [%d] does not have a daemon " - "registered\n", current->euid); + printk(KERN_ERR "%s: User [%d] does not have a daemon " + "registered\n", __func__, current->euid); goto out; } - mutex_unlock(&ecryptfs_daemon_id_hash_mux); mutex_lock(&ecryptfs_msg_ctx_lists_mux); rc = ecryptfs_acquire_free_msg_ctx(msg_ctx); if (rc) { mutex_unlock(&ecryptfs_msg_ctx_lists_mux); - ecryptfs_printk(KERN_WARNING, "Could not claim a free " - "context element\n"); + printk(KERN_WARNING "%s: Could not claim a free " + "context element\n", __func__); goto out; } ecryptfs_msg_ctx_free_to_alloc(*msg_ctx); @@ -352,23 +505,50 @@ int ecryptfs_send_message(unsigned int transport, char *data, int data_len, mutex_unlock(&ecryptfs_msg_ctx_lists_mux); switch (transport) { case ECRYPTFS_TRANSPORT_NETLINK: - rc = ecryptfs_send_netlink(data, data_len, *msg_ctx, - ECRYPTFS_NLMSG_REQUEST, 0, id->pid); + rc = ecryptfs_send_netlink(data, data_len, *msg_ctx, msg_type, + 0, daemon->pid); + break; + case ECRYPTFS_TRANSPORT_MISCDEV: + rc = ecryptfs_send_miscdev(data, data_len, *msg_ctx, msg_type, + 0, daemon); break; case ECRYPTFS_TRANSPORT_CONNECTOR: case ECRYPTFS_TRANSPORT_RELAYFS: default: rc = -ENOSYS; } - if (rc) { - printk(KERN_ERR "Error attempting to send message to userspace " - "daemon; rc = [%d]\n", rc); - } + if (rc) + printk(KERN_ERR "%s: Error attempting to send message to " + "userspace daemon; rc = [%d]\n", __func__, rc); out: return rc; } /** + * ecryptfs_send_message + * @transport: The transport over which to send the message (i.e., + * netlink) + * @data: The data to send + * @data_len: The length of data + * @msg_ctx: The message context allocated for the send + * + * Grabs ecryptfs_daemon_hash_mux. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_send_message(unsigned int transport, char *data, int data_len, + struct ecryptfs_msg_ctx **msg_ctx) +{ + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_send_message_locked(transport, data, data_len, + ECRYPTFS_MSG_REQUEST, msg_ctx); + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** * ecryptfs_wait_for_response * @msg_ctx: The context that was assigned when sending a message * @msg: The incoming message from userspace; not set if rc != 0 @@ -377,7 +557,7 @@ out: * of time exceeds ecryptfs_message_wait_timeout. If zero is * returned, msg will point to a valid message from userspace; a * non-zero value is returned upon failure to receive a message or an - * error occurs. + * error occurs. Callee must free @msg on success. */ int ecryptfs_wait_for_response(struct ecryptfs_msg_ctx *msg_ctx, struct ecryptfs_message **msg) @@ -413,32 +593,32 @@ int ecryptfs_init_messaging(unsigned int transport) if (ecryptfs_number_of_users > ECRYPTFS_MAX_NUM_USERS) { ecryptfs_number_of_users = ECRYPTFS_MAX_NUM_USERS; - ecryptfs_printk(KERN_WARNING, "Specified number of users is " - "too large, defaulting to [%d] users\n", - ecryptfs_number_of_users); + printk(KERN_WARNING "%s: Specified number of users is " + "too large, defaulting to [%d] users\n", __func__, + ecryptfs_number_of_users); } - mutex_init(&ecryptfs_daemon_id_hash_mux); - mutex_lock(&ecryptfs_daemon_id_hash_mux); + mutex_init(&ecryptfs_daemon_hash_mux); + mutex_lock(&ecryptfs_daemon_hash_mux); ecryptfs_hash_buckets = 1; while (ecryptfs_number_of_users >> ecryptfs_hash_buckets) ecryptfs_hash_buckets++; - ecryptfs_daemon_id_hash = kmalloc(sizeof(struct hlist_head) - * ecryptfs_hash_buckets, GFP_KERNEL); - if (!ecryptfs_daemon_id_hash) { + ecryptfs_daemon_hash = kmalloc((sizeof(struct hlist_head) + * ecryptfs_hash_buckets), GFP_KERNEL); + if (!ecryptfs_daemon_hash) { rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, "Failed to allocate memory\n"); - mutex_unlock(&ecryptfs_daemon_id_hash_mux); + printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); + mutex_unlock(&ecryptfs_daemon_hash_mux); goto out; } for (i = 0; i < ecryptfs_hash_buckets; i++) - INIT_HLIST_HEAD(&ecryptfs_daemon_id_hash[i]); - mutex_unlock(&ecryptfs_daemon_id_hash_mux); - + INIT_HLIST_HEAD(&ecryptfs_daemon_hash[i]); + mutex_unlock(&ecryptfs_daemon_hash_mux); ecryptfs_msg_ctx_arr = kmalloc((sizeof(struct ecryptfs_msg_ctx) - * ecryptfs_message_buf_len), GFP_KERNEL); + * ecryptfs_message_buf_len), + GFP_KERNEL); if (!ecryptfs_msg_ctx_arr) { rc = -ENOMEM; - ecryptfs_printk(KERN_ERR, "Failed to allocate memory\n"); + printk(KERN_ERR "%s: Failed to allocate memory\n", __func__); goto out; } mutex_init(&ecryptfs_msg_ctx_lists_mux); @@ -446,6 +626,7 @@ int ecryptfs_init_messaging(unsigned int transport) ecryptfs_msg_counter = 0; for (i = 0; i < ecryptfs_message_buf_len; i++) { INIT_LIST_HEAD(&ecryptfs_msg_ctx_arr[i].node); + INIT_LIST_HEAD(&ecryptfs_msg_ctx_arr[i].daemon_out_list); mutex_init(&ecryptfs_msg_ctx_arr[i].mux); mutex_lock(&ecryptfs_msg_ctx_arr[i].mux); ecryptfs_msg_ctx_arr[i].index = i; @@ -464,6 +645,11 @@ int ecryptfs_init_messaging(unsigned int transport) if (rc) ecryptfs_release_messaging(transport); break; + case ECRYPTFS_TRANSPORT_MISCDEV: + rc = ecryptfs_init_ecryptfs_miscdev(); + if (rc) + ecryptfs_release_messaging(transport); + break; case ECRYPTFS_TRANSPORT_CONNECTOR: case ECRYPTFS_TRANSPORT_RELAYFS: default: @@ -488,27 +674,37 @@ void ecryptfs_release_messaging(unsigned int transport) kfree(ecryptfs_msg_ctx_arr); mutex_unlock(&ecryptfs_msg_ctx_lists_mux); } - if (ecryptfs_daemon_id_hash) { + if (ecryptfs_daemon_hash) { struct hlist_node *elem; - struct ecryptfs_daemon_id *id; + struct ecryptfs_daemon *daemon; int i; - mutex_lock(&ecryptfs_daemon_id_hash_mux); + mutex_lock(&ecryptfs_daemon_hash_mux); for (i = 0; i < ecryptfs_hash_buckets; i++) { - hlist_for_each_entry(id, elem, - &ecryptfs_daemon_id_hash[i], - id_chain) { - hlist_del(elem); - kfree(id); + int rc; + + hlist_for_each_entry(daemon, elem, + &ecryptfs_daemon_hash[i], + euid_chain) { + rc = ecryptfs_exorcise_daemon(daemon); + if (rc) + printk(KERN_ERR "%s: Error whilst " + "attempting to destroy daemon; " + "rc = [%d]. Dazed and confused, " + "but trying to continue.\n", + __func__, rc); } } - kfree(ecryptfs_daemon_id_hash); - mutex_unlock(&ecryptfs_daemon_id_hash_mux); + kfree(ecryptfs_daemon_hash); + mutex_unlock(&ecryptfs_daemon_hash_mux); } switch(transport) { case ECRYPTFS_TRANSPORT_NETLINK: ecryptfs_release_netlink(); break; + case ECRYPTFS_TRANSPORT_MISCDEV: + ecryptfs_destroy_ecryptfs_miscdev(); + break; case ECRYPTFS_TRANSPORT_CONNECTOR: case ECRYPTFS_TRANSPORT_RELAYFS: default: diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c new file mode 100644 index 00000000000..788995efd1d --- /dev/null +++ b/fs/ecryptfs/miscdev.c @@ -0,0 +1,598 @@ +/** + * eCryptfs: Linux filesystem encryption layer + * + * Copyright (C) 2008 International Business Machines Corp. + * Author(s): Michael A. Halcrow <mhalcrow@us.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA + * 02111-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/hash.h> +#include <linux/random.h> +#include <linux/miscdevice.h> +#include <linux/poll.h> +#include <linux/wait.h> +#include <linux/module.h> +#include "ecryptfs_kernel.h" + +static atomic_t ecryptfs_num_miscdev_opens; + +/** + * ecryptfs_miscdev_poll + * @file: dev file (ignored) + * @pt: dev poll table (ignored) + * + * Returns the poll mask + */ +static unsigned int +ecryptfs_miscdev_poll(struct file *file, poll_table *pt) +{ + struct ecryptfs_daemon *daemon; + unsigned int mask = 0; + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + /* TODO: Just use file->private_data? */ + rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid, + current->nsproxy->user_ns); + BUG_ON(rc || !daemon); + mutex_lock(&daemon->mux); + mutex_unlock(&ecryptfs_daemon_hash_mux); + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { + printk(KERN_WARNING "%s: Attempt to poll on zombified " + "daemon\n", __func__); + goto out_unlock_daemon; + } + if (daemon->flags & ECRYPTFS_DAEMON_IN_READ) + goto out_unlock_daemon; + if (daemon->flags & ECRYPTFS_DAEMON_IN_POLL) + goto out_unlock_daemon; + daemon->flags |= ECRYPTFS_DAEMON_IN_POLL; + mutex_unlock(&daemon->mux); + poll_wait(file, &daemon->wait, pt); + mutex_lock(&daemon->mux); + if (!list_empty(&daemon->msg_ctx_out_queue)) + mask |= POLLIN | POLLRDNORM; +out_unlock_daemon: + daemon->flags &= ~ECRYPTFS_DAEMON_IN_POLL; + mutex_unlock(&daemon->mux); + return mask; +} + +/** + * ecryptfs_miscdev_open + * @inode: inode of miscdev handle (ignored) + * @file: file for miscdev handle (ignored) + * + * Returns zero on success; non-zero otherwise + */ +static int +ecryptfs_miscdev_open(struct inode *inode, struct file *file) +{ + struct ecryptfs_daemon *daemon = NULL; + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = try_module_get(THIS_MODULE); + if (rc == 0) { + rc = -EIO; + printk(KERN_ERR "%s: Error attempting to increment module use " + "count; rc = [%d]\n", __func__, rc); + goto out_unlock_daemon_list; + } + rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid, + current->nsproxy->user_ns); + if (rc || !daemon) { + rc = ecryptfs_spawn_daemon(&daemon, current->euid, + current->nsproxy->user_ns, + task_pid(current)); + if (rc) { + printk(KERN_ERR "%s: Error attempting to spawn daemon; " + "rc = [%d]\n", __func__, rc); + goto out_module_put_unlock_daemon_list; + } + } + mutex_lock(&daemon->mux); + if (daemon->pid != task_pid(current)) { + rc = -EINVAL; + printk(KERN_ERR "%s: pid [0x%p] has registered with euid [%d], " + "but pid [0x%p] has attempted to open the handle " + "instead\n", __func__, daemon->pid, daemon->euid, + task_pid(current)); + goto out_unlock_daemon; + } + if (daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN) { + rc = -EBUSY; + printk(KERN_ERR "%s: Miscellaneous device handle may only be " + "opened once per daemon; pid [0x%p] already has this " + "handle open\n", __func__, daemon->pid); + goto out_unlock_daemon; + } + daemon->flags |= ECRYPTFS_DAEMON_MISCDEV_OPEN; + atomic_inc(&ecryptfs_num_miscdev_opens); +out_unlock_daemon: + mutex_unlock(&daemon->mux); +out_module_put_unlock_daemon_list: + if (rc) + module_put(THIS_MODULE); +out_unlock_daemon_list: + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** + * ecryptfs_miscdev_release + * @inode: inode of fs/ecryptfs/euid handle (ignored) + * @file: file for fs/ecryptfs/euid handle (ignored) + * + * This keeps the daemon registered until the daemon sends another + * ioctl to fs/ecryptfs/ctl or until the kernel module unregisters. + * + * Returns zero on success; non-zero otherwise + */ +static int +ecryptfs_miscdev_release(struct inode *inode, struct file *file) +{ + struct ecryptfs_daemon *daemon = NULL; + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid, + current->nsproxy->user_ns); + BUG_ON(rc || !daemon); + mutex_lock(&daemon->mux); + BUG_ON(daemon->pid != task_pid(current)); + BUG_ON(!(daemon->flags & ECRYPTFS_DAEMON_MISCDEV_OPEN)); + daemon->flags &= ~ECRYPTFS_DAEMON_MISCDEV_OPEN; + atomic_dec(&ecryptfs_num_miscdev_opens); + mutex_unlock(&daemon->mux); + rc = ecryptfs_exorcise_daemon(daemon); + if (rc) { + printk(KERN_CRIT "%s: Fatal error whilst attempting to " + "shut down daemon; rc = [%d]. Please report this " + "bug.\n", __func__, rc); + BUG(); + } + module_put(THIS_MODULE); + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** + * ecryptfs_send_miscdev + * @data: Data to send to daemon; may be NULL + * @data_size: Amount of data to send to daemon + * @msg_ctx: Message context, which is used to handle the reply. If + * this is NULL, then we do not expect a reply. + * @msg_type: Type of message + * @msg_flags: Flags for message + * @daemon: eCryptfs daemon object + * + * Add msg_ctx to queue and then, if it exists, notify the blocked + * miscdevess about the data being available. Must be called with + * ecryptfs_daemon_hash_mux held. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_send_miscdev(char *data, size_t data_size, + struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, + u16 msg_flags, struct ecryptfs_daemon *daemon) +{ + int rc = 0; + + mutex_lock(&msg_ctx->mux); + if (data) { + msg_ctx->msg = kmalloc((sizeof(*msg_ctx->msg) + data_size), + GFP_KERNEL); + if (!msg_ctx->msg) { + rc = -ENOMEM; + printk(KERN_ERR "%s: Out of memory whilst attempting " + "to kmalloc(%Zd, GFP_KERNEL)\n", __func__, + (sizeof(*msg_ctx->msg) + data_size)); + goto out_unlock; + } + } else + msg_ctx->msg = NULL; + msg_ctx->msg->index = msg_ctx->index; + msg_ctx->msg->data_len = data_size; + msg_ctx->type = msg_type; + if (data) { + memcpy(msg_ctx->msg->data, data, data_size); + msg_ctx->msg_size = (sizeof(*msg_ctx->msg) + data_size); + } else + msg_ctx->msg_size = 0; + mutex_lock(&daemon->mux); + list_add_tail(&msg_ctx->daemon_out_list, &daemon->msg_ctx_out_queue); + daemon->num_queued_msg_ctx++; + wake_up_interruptible(&daemon->wait); + mutex_unlock(&daemon->mux); +out_unlock: + mutex_unlock(&msg_ctx->mux); + return rc; +} + +/** + * ecryptfs_miscdev_read - format and send message from queue + * @file: fs/ecryptfs/euid miscdevfs handle (ignored) + * @buf: User buffer into which to copy the next message on the daemon queue + * @count: Amount of space available in @buf + * @ppos: Offset in file (ignored) + * + * Pulls the most recent message from the daemon queue, formats it for + * being sent via a miscdevfs handle, and copies it into @buf + * + * Returns the number of bytes copied into the user buffer + */ +static ssize_t +ecryptfs_miscdev_read(struct file *file, char __user *buf, size_t count, + loff_t *ppos) +{ + struct ecryptfs_daemon *daemon; + struct ecryptfs_msg_ctx *msg_ctx; + size_t packet_length_size; + u32 counter_nbo; + char packet_length[3]; + size_t i; + size_t total_length; + int rc; + + mutex_lock(&ecryptfs_daemon_hash_mux); + /* TODO: Just use file->private_data? */ + rc = ecryptfs_find_daemon_by_euid(&daemon, current->euid, + current->nsproxy->user_ns); + BUG_ON(rc || !daemon); + mutex_lock(&daemon->mux); + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { + rc = 0; + printk(KERN_WARNING "%s: Attempt to read from zombified " + "daemon\n", __func__); + goto out_unlock_daemon; + } + if (daemon->flags & ECRYPTFS_DAEMON_IN_READ) { + rc = 0; + goto out_unlock_daemon; + } + /* This daemon will not go away so long as this flag is set */ + daemon->flags |= ECRYPTFS_DAEMON_IN_READ; + mutex_unlock(&ecryptfs_daemon_hash_mux); +check_list: + if (list_empty(&daemon->msg_ctx_out_queue)) { + mutex_unlock(&daemon->mux); + rc = wait_event_interruptible( + daemon->wait, !list_empty(&daemon->msg_ctx_out_queue)); + mutex_lock(&daemon->mux); + if (rc < 0) { + rc = 0; + goto out_unlock_daemon; + } + } + if (daemon->flags & ECRYPTFS_DAEMON_ZOMBIE) { + rc = 0; + goto out_unlock_daemon; + } + if (list_empty(&daemon->msg_ctx_out_queue)) { + /* Something else jumped in since the + * wait_event_interruptable() and removed the + * message from the queue; try again */ + goto check_list; + } + BUG_ON(current->euid != daemon->euid); + BUG_ON(current->nsproxy->user_ns != daemon->user_ns); + BUG_ON(task_pid(current) != daemon->pid); + msg_ctx = list_first_entry(&daemon->msg_ctx_out_queue, + struct ecryptfs_msg_ctx, daemon_out_list); + BUG_ON(!msg_ctx); + mutex_lock(&msg_ctx->mux); + if (msg_ctx->msg) { + rc = ecryptfs_write_packet_length(packet_length, + msg_ctx->msg_size, + &packet_length_size); + if (rc) { + rc = 0; + printk(KERN_WARNING "%s: Error writing packet length; " + "rc = [%d]\n", __func__, rc); + goto out_unlock_msg_ctx; + } + } else { + packet_length_size = 0; + msg_ctx->msg_size = 0; + } + /* miscdevfs packet format: + * Octet 0: Type + * Octets 1-4: network byte order msg_ctx->counter + * Octets 5-N0: Size of struct ecryptfs_message to follow + * Octets N0-N1: struct ecryptfs_message (including data) + * + * Octets 5-N1 not written if the packet type does not + * include a message */ + total_length = (1 + 4 + packet_length_size + msg_ctx->msg_size); + if (count < total_length) { + rc = 0; + printk(KERN_WARNING "%s: Only given user buffer of " + "size [%Zd], but we need [%Zd] to read the " + "pending message\n", __func__, count, total_length); + goto out_unlock_msg_ctx; + } + i = 0; + buf[i++] = msg_ctx->type; + counter_nbo = cpu_to_be32(msg_ctx->counter); + memcpy(&buf[i], (char *)&counter_nbo, 4); + i += 4; + if (msg_ctx->msg) { + memcpy(&buf[i], packet_length, packet_length_size); + i += packet_length_size; + rc = copy_to_user(&buf[i], msg_ctx->msg, msg_ctx->msg_size); + if (rc) { + printk(KERN_ERR "%s: copy_to_user returned error " + "[%d]\n", __func__, rc); + goto out_unlock_msg_ctx; + } + i += msg_ctx->msg_size; + } + rc = i; + list_del(&msg_ctx->daemon_out_list); + kfree(msg_ctx->msg); + msg_ctx->msg = NULL; + /* We do not expect a reply from the userspace daemon for any + * message type other than ECRYPTFS_MSG_REQUEST */ + if (msg_ctx->type != ECRYPTFS_MSG_REQUEST) + ecryptfs_msg_ctx_alloc_to_free(msg_ctx); +out_unlock_msg_ctx: + mutex_unlock(&msg_ctx->mux); +out_unlock_daemon: + daemon->flags &= ~ECRYPTFS_DAEMON_IN_READ; + mutex_unlock(&daemon->mux); + return rc; +} + +/** + * ecryptfs_miscdev_helo + * @euid: effective user id of miscdevess sending helo packet + * @user_ns: The namespace in which @euid applies + * @pid: miscdevess id of miscdevess sending helo packet + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_miscdev_helo(uid_t euid, struct user_namespace *user_ns, + struct pid *pid) +{ + int rc; + + rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_MISCDEV, euid, user_ns, + pid); + if (rc) + printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc); + return rc; +} + +/** + * ecryptfs_miscdev_quit + * @euid: effective user id of miscdevess sending quit packet + * @user_ns: The namespace in which @euid applies + * @pid: miscdevess id of miscdevess sending quit packet + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_miscdev_quit(uid_t euid, struct user_namespace *user_ns, + struct pid *pid) +{ + int rc; + + rc = ecryptfs_process_quit(euid, user_ns, pid); + if (rc) + printk(KERN_WARNING + "Error processing QUIT message; rc = [%d]\n", rc); + return rc; +} + +/** + * ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon + * @data: Bytes comprising struct ecryptfs_message + * @data_size: sizeof(struct ecryptfs_message) + data len + * @euid: Effective user id of miscdevess sending the miscdev response + * @user_ns: The namespace in which @euid applies + * @pid: Miscdevess id of miscdevess sending the miscdev response + * @seq: Sequence number for miscdev response packet + * + * Returns zero on success; non-zero otherwise + */ +static int ecryptfs_miscdev_response(char *data, size_t data_size, + uid_t euid, struct user_namespace *user_ns, + struct pid *pid, u32 seq) +{ + struct ecryptfs_message *msg = (struct ecryptfs_message *)data; + int rc; + + if ((sizeof(*msg) + msg->data_len) != data_size) { + printk(KERN_WARNING "%s: (sizeof(*msg) + msg->data_len) = " + "[%Zd]; data_size = [%Zd]. Invalid packet.\n", __func__, + (sizeof(*msg) + msg->data_len), data_size); + rc = -EINVAL; + goto out; + } + rc = ecryptfs_process_response(msg, euid, user_ns, pid, seq); + if (rc) + printk(KERN_ERR + "Error processing response message; rc = [%d]\n", rc); +out: + return rc; +} + +/** + * ecryptfs_miscdev_write - handle write to daemon miscdev handle + * @file: File for misc dev handle (ignored) + * @buf: Buffer containing user data + * @count: Amount of data in @buf + * @ppos: Pointer to offset in file (ignored) + * + * miscdevfs packet format: + * Octet 0: Type + * Octets 1-4: network byte order msg_ctx->counter (0's for non-response) + * Octets 5-N0: Size of struct ecryptfs_message to follow + * Octets N0-N1: struct ecryptfs_message (including data) + * + * Returns the number of bytes read from @buf + */ +static ssize_t +ecryptfs_miscdev_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + u32 counter_nbo, seq; + size_t packet_size, packet_size_length, i; + ssize_t sz = 0; + char *data; + int rc; + + if (count == 0) + goto out; + data = kmalloc(count, GFP_KERNEL); + if (!data) { + printk(KERN_ERR "%s: Out of memory whilst attempting to " + "kmalloc([%Zd], GFP_KERNEL)\n", __func__, count); + goto out; + } + rc = copy_from_user(data, buf, count); + if (rc) { + printk(KERN_ERR "%s: copy_from_user returned error [%d]\n", + __func__, rc); + goto out_free; + } + sz = count; + i = 0; + switch (data[i++]) { + case ECRYPTFS_MSG_RESPONSE: + if (count < (1 + 4 + 1 + sizeof(struct ecryptfs_message))) { + printk(KERN_WARNING "%s: Minimum acceptable packet " + "size is [%Zd], but amount of data written is " + "only [%Zd]. Discarding response packet.\n", + __func__, + (1 + 4 + 1 + sizeof(struct ecryptfs_message)), + count); + goto out_free; + } + memcpy((char *)&counter_nbo, &data[i], 4); + seq = be32_to_cpu(counter_nbo); + i += 4; + rc = ecryptfs_parse_packet_length(&data[i], &packet_size, + &packet_size_length); + if (rc) { + printk(KERN_WARNING "%s: Error parsing packet length; " + "rc = [%d]\n", __func__, rc); + goto out_free; + } + i += packet_size_length; + if ((1 + 4 + packet_size_length + packet_size) != count) { + printk(KERN_WARNING "%s: (1 + packet_size_length([%Zd])" + " + packet_size([%Zd]))([%Zd]) != " + "count([%Zd]). Invalid packet format.\n", + __func__, packet_size_length, packet_size, + (1 + packet_size_length + packet_size), count); + goto out_free; + } + rc = ecryptfs_miscdev_response(&data[i], packet_size, + current->euid, + current->nsproxy->user_ns, + task_pid(current), seq); + if (rc) + printk(KERN_WARNING "%s: Failed to deliver miscdev " + "response to requesting operation; rc = [%d]\n", + __func__, rc); + break; + case ECRYPTFS_MSG_HELO: + rc = ecryptfs_miscdev_helo(current->euid, + current->nsproxy->user_ns, + task_pid(current)); + if (rc) { + printk(KERN_ERR "%s: Error attempting to process " + "helo from pid [0x%p]; rc = [%d]\n", __func__, + task_pid(current), rc); + goto out_free; + } + break; + case ECRYPTFS_MSG_QUIT: + rc = ecryptfs_miscdev_quit(current->euid, + current->nsproxy->user_ns, + task_pid(current)); + if (rc) { + printk(KERN_ERR "%s: Error attempting to process " + "quit from pid [0x%p]; rc = [%d]\n", __func__, + task_pid(current), rc); + goto out_free; + } + break; + default: + ecryptfs_printk(KERN_WARNING, "Dropping miscdev " + "message of unrecognized type [%d]\n", + data[0]); + break; + } +out_free: + kfree(data); +out: + return sz; +} + + +static const struct file_operations ecryptfs_miscdev_fops = { + .open = ecryptfs_miscdev_open, + .poll = ecryptfs_miscdev_poll, + .read = ecryptfs_miscdev_read, + .write = ecryptfs_miscdev_write, + .release = ecryptfs_miscdev_release, +}; + +static struct miscdevice ecryptfs_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "ecryptfs", + .fops = &ecryptfs_miscdev_fops +}; + +/** + * ecryptfs_init_ecryptfs_miscdev + * + * Messages sent to the userspace daemon from the kernel are placed on + * a queue associated with the daemon. The next read against the + * miscdev handle by that daemon will return the oldest message placed + * on the message queue for the daemon. + * + * Returns zero on success; non-zero otherwise + */ +int ecryptfs_init_ecryptfs_miscdev(void) +{ + int rc; + + atomic_set(&ecryptfs_num_miscdev_opens, 0); + mutex_lock(&ecryptfs_daemon_hash_mux); + rc = misc_register(&ecryptfs_miscdev); + if (rc) + printk(KERN_ERR "%s: Failed to register miscellaneous device " + "for communications with userspace daemons; rc = [%d]\n", + __func__, rc); + mutex_unlock(&ecryptfs_daemon_hash_mux); + return rc; +} + +/** + * ecryptfs_destroy_ecryptfs_miscdev + * + * All of the daemons must be exorcised prior to calling this + * function. + */ +void ecryptfs_destroy_ecryptfs_miscdev(void) +{ + BUG_ON(atomic_read(&ecryptfs_num_miscdev_opens) != 0); + misc_deregister(&ecryptfs_miscdev); +} diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 6df1debdccc..2b6fe1e6e8b 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -153,7 +153,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, flush_dcache_page(page); if (rc) { printk(KERN_ERR "%s: Error reading xattr " - "region; rc = [%d]\n", __FUNCTION__, rc); + "region; rc = [%d]\n", __func__, rc); goto out; } } else { @@ -169,7 +169,7 @@ ecryptfs_copy_up_encrypted_with_header(struct page *page, if (rc) { printk(KERN_ERR "%s: Error attempting to read " "extent at offset [%lld] in the lower " - "file; rc = [%d]\n", __FUNCTION__, + "file; rc = [%d]\n", __func__, lower_offset, rc); goto out; } @@ -212,7 +212,7 @@ static int ecryptfs_readpage(struct file *file, struct page *page) "the encrypted content from the lower " "file whilst inserting the metadata " "from the xattr into the header; rc = " - "[%d]\n", __FUNCTION__, rc); + "[%d]\n", __func__, rc); goto out; } @@ -293,7 +293,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, if (rc) { printk(KERN_ERR "%s: Error attemping to read " "lower page segment; rc = [%d]\n", - __FUNCTION__, rc); + __func__, rc); ClearPageUptodate(page); goto out; } else @@ -308,7 +308,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, "from the lower file whilst " "inserting the metadata from " "the xattr into the header; rc " - "= [%d]\n", __FUNCTION__, rc); + "= [%d]\n", __func__, rc); ClearPageUptodate(page); goto out; } @@ -320,7 +320,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, if (rc) { printk(KERN_ERR "%s: Error reading " "page; rc = [%d]\n", - __FUNCTION__, rc); + __func__, rc); ClearPageUptodate(page); goto out; } @@ -331,7 +331,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, if (rc) { printk(KERN_ERR "%s: Error decrypting page " "at index [%ld]; rc = [%d]\n", - __FUNCTION__, page->index, rc); + __func__, page->index, rc); ClearPageUptodate(page); goto out; } @@ -348,7 +348,7 @@ static int ecryptfs_prepare_write(struct file *file, struct page *page, if (rc) { printk(KERN_ERR "%s: Error on attempt to " "truncate to (higher) offset [%lld];" - " rc = [%d]\n", __FUNCTION__, + " rc = [%d]\n", __func__, prev_page_end_size, rc); goto out; } @@ -389,7 +389,7 @@ static int ecryptfs_write_inode_size_to_header(struct inode *ecryptfs_inode) kfree(file_size_virt); if (rc) printk(KERN_ERR "%s: Error writing file size to header; " - "rc = [%d]\n", __FUNCTION__, rc); + "rc = [%d]\n", __func__, rc); out: return rc; } diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c index f638a698dc5..e0abad62b39 100644 --- a/fs/ecryptfs/netlink.c +++ b/fs/ecryptfs/netlink.c @@ -44,8 +44,8 @@ static struct sock *ecryptfs_nl_sock; * upon sending the message; non-zero upon error. */ int ecryptfs_send_netlink(char *data, int data_len, - struct ecryptfs_msg_ctx *msg_ctx, u16 msg_type, - u16 msg_flags, pid_t daemon_pid) + struct ecryptfs_msg_ctx *msg_ctx, u8 msg_type, + u16 msg_flags, struct pid *daemon_pid) { struct sk_buff *skb; struct nlmsghdr *nlh; @@ -60,7 +60,7 @@ int ecryptfs_send_netlink(char *data, int data_len, ecryptfs_printk(KERN_ERR, "Failed to allocate socket buffer\n"); goto out; } - nlh = NLMSG_PUT(skb, daemon_pid, msg_ctx ? msg_ctx->counter : 0, + nlh = NLMSG_PUT(skb, pid_nr(daemon_pid), msg_ctx ? msg_ctx->counter : 0, msg_type, payload_len); nlh->nlmsg_flags = msg_flags; if (msg_ctx && payload_len) { @@ -69,7 +69,7 @@ int ecryptfs_send_netlink(char *data, int data_len, msg->data_len = data_len; memcpy(msg->data, data, data_len); } - rc = netlink_unicast(ecryptfs_nl_sock, skb, daemon_pid, 0); + rc = netlink_unicast(ecryptfs_nl_sock, skb, pid_nr(daemon_pid), 0); if (rc < 0) { ecryptfs_printk(KERN_ERR, "Failed to send eCryptfs netlink " "message; rc = [%d]\n", rc); @@ -99,6 +99,7 @@ static int ecryptfs_process_nl_response(struct sk_buff *skb) { struct nlmsghdr *nlh = nlmsg_hdr(skb); struct ecryptfs_message *msg = NLMSG_DATA(nlh); + struct pid *pid; int rc; if (skb->len - NLMSG_HDRLEN - sizeof(*msg) != msg->data_len) { @@ -107,8 +108,10 @@ static int ecryptfs_process_nl_response(struct sk_buff *skb) "incorrectly specified data length\n"); goto out; } - rc = ecryptfs_process_response(msg, NETLINK_CREDS(skb)->uid, - NETLINK_CREDS(skb)->pid, nlh->nlmsg_seq); + pid = find_get_pid(NETLINK_CREDS(skb)->pid); + rc = ecryptfs_process_response(msg, NETLINK_CREDS(skb)->uid, NULL, + pid, nlh->nlmsg_seq); + put_pid(pid); if (rc) printk(KERN_ERR "Error processing response message; rc = [%d]\n", rc); @@ -126,11 +129,13 @@ out: */ static int ecryptfs_process_nl_helo(struct sk_buff *skb) { + struct pid *pid; int rc; + pid = find_get_pid(NETLINK_CREDS(skb)->pid); rc = ecryptfs_process_helo(ECRYPTFS_TRANSPORT_NETLINK, - NETLINK_CREDS(skb)->uid, - NETLINK_CREDS(skb)->pid); + NETLINK_CREDS(skb)->uid, NULL, pid); + put_pid(pid); if (rc) printk(KERN_WARNING "Error processing HELO; rc = [%d]\n", rc); return rc; @@ -147,10 +152,12 @@ static int ecryptfs_process_nl_helo(struct sk_buff *skb) */ static int ecryptfs_process_nl_quit(struct sk_buff *skb) { + struct pid *pid; int rc; - rc = ecryptfs_process_quit(NETLINK_CREDS(skb)->uid, - NETLINK_CREDS(skb)->pid); + pid = find_get_pid(NETLINK_CREDS(skb)->pid); + rc = ecryptfs_process_quit(NETLINK_CREDS(skb)->uid, NULL, pid); + put_pid(pid); if (rc) printk(KERN_WARNING "Error processing QUIT message; rc = [%d]\n", rc); @@ -176,20 +183,20 @@ static void ecryptfs_receive_nl_message(struct sk_buff *skb) goto free; } switch (nlh->nlmsg_type) { - case ECRYPTFS_NLMSG_RESPONSE: + case ECRYPTFS_MSG_RESPONSE: if (ecryptfs_process_nl_response(skb)) { ecryptfs_printk(KERN_WARNING, "Failed to " "deliver netlink response to " "requesting operation\n"); } break; - case ECRYPTFS_NLMSG_HELO: + case ECRYPTFS_MSG_HELO: if (ecryptfs_process_nl_helo(skb)) { ecryptfs_printk(KERN_WARNING, "Failed to " "fulfill HELO request\n"); } break; - case ECRYPTFS_NLMSG_QUIT: + case ECRYPTFS_MSG_QUIT: if (ecryptfs_process_nl_quit(skb)) { ecryptfs_printk(KERN_WARNING, "Failed to " "fulfill QUIT request\n"); diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 0c4928623bb..ebf55150be5 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -55,7 +55,7 @@ int ecryptfs_write_lower(struct inode *ecryptfs_inode, char *data, set_fs(fs_save); if (octets_written < 0) { printk(KERN_ERR "%s: octets_written = [%td]; " - "expected [%td]\n", __FUNCTION__, octets_written, size); + "expected [%td]\n", __func__, octets_written, size); rc = -EINVAL; } mutex_unlock(&inode_info->lower_file_mutex); @@ -153,7 +153,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, rc = PTR_ERR(ecryptfs_page); printk(KERN_ERR "%s: Error getting page at " "index [%ld] from eCryptfs inode " - "mapping; rc = [%d]\n", __FUNCTION__, + "mapping; rc = [%d]\n", __func__, ecryptfs_page_idx, rc); goto out; } @@ -165,7 +165,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, if (rc) { printk(KERN_ERR "%s: Error decrypting " "page; rc = [%d]\n", - __FUNCTION__, rc); + __func__, rc); ClearPageUptodate(ecryptfs_page); page_cache_release(ecryptfs_page); goto out; @@ -202,7 +202,7 @@ int ecryptfs_write(struct file *ecryptfs_file, char *data, loff_t offset, page_cache_release(ecryptfs_page); if (rc) { printk(KERN_ERR "%s: Error encrypting " - "page; rc = [%d]\n", __FUNCTION__, rc); + "page; rc = [%d]\n", __func__, rc); goto out; } pos += num_bytes; @@ -254,7 +254,7 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size, set_fs(fs_save); if (octets_read < 0) { printk(KERN_ERR "%s: octets_read = [%td]; " - "expected [%td]\n", __FUNCTION__, octets_read, size); + "expected [%td]\n", __func__, octets_read, size); rc = -EINVAL; } mutex_unlock(&inode_info->lower_file_mutex); @@ -327,7 +327,7 @@ int ecryptfs_read(char *data, loff_t offset, size_t size, printk(KERN_ERR "%s: Attempt to read data past the end of the " "file; offset = [%lld]; size = [%td]; " "ecryptfs_file_size = [%lld]\n", - __FUNCTION__, offset, size, ecryptfs_file_size); + __func__, offset, size, ecryptfs_file_size); goto out; } pos = offset; @@ -345,14 +345,14 @@ int ecryptfs_read(char *data, loff_t offset, size_t size, rc = PTR_ERR(ecryptfs_page); printk(KERN_ERR "%s: Error getting page at " "index [%ld] from eCryptfs inode " - "mapping; rc = [%d]\n", __FUNCTION__, + "mapping; rc = [%d]\n", __func__, ecryptfs_page_idx, rc); goto out; } rc = ecryptfs_decrypt_page(ecryptfs_page); if (rc) { printk(KERN_ERR "%s: Error decrypting " - "page; rc = [%d]\n", __FUNCTION__, rc); + "page; rc = [%d]\n", __func__, rc); ClearPageUptodate(ecryptfs_page); page_cache_release(ecryptfs_page); goto out; diff --git a/fs/eventfd.c b/fs/eventfd.c index a9f130cd50a..343942deeec 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -200,10 +200,8 @@ struct file *eventfd_fget(int fd) asmlinkage long sys_eventfd(unsigned int count) { - int error, fd; + int fd; struct eventfd_ctx *ctx; - struct file *file; - struct inode *inode; ctx = kmalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) @@ -216,12 +214,9 @@ asmlinkage long sys_eventfd(unsigned int count) * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - error = anon_inode_getfd(&fd, &inode, &file, "[eventfd]", - &eventfd_fops, ctx); - if (!error) - return fd; - - kfree(ctx); - return error; + fd = anon_inode_getfd("[eventfd]", &eventfd_fops, ctx); + if (fd < 0) + kfree(ctx); + return fd; } diff --git a/fs/eventpoll.c b/fs/eventpoll.c index a415f42d32c..990c01d2d66 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -257,25 +257,6 @@ static inline int ep_cmp_ffd(struct epoll_filefd *p1, (p1->file < p2->file ? -1 : p1->fd - p2->fd)); } -/* Special initialization for the RB tree node to detect linkage */ -static inline void ep_rb_initnode(struct rb_node *n) -{ - rb_set_parent(n, n); -} - -/* Removes a node from the RB tree and marks it for a fast is-linked check */ -static inline void ep_rb_erase(struct rb_node *n, struct rb_root *r) -{ - rb_erase(n, r); - rb_set_parent(n, n); -} - -/* Fast check to verify that the item is linked to the main RB tree */ -static inline int ep_rb_linked(struct rb_node *n) -{ - return rb_parent(n) != n; -} - /* Tells us if the item is currently linked */ static inline int ep_is_linked(struct list_head *p) { @@ -283,13 +264,13 @@ static inline int ep_is_linked(struct list_head *p) } /* Get the "struct epitem" from a wait queue pointer */ -static inline struct epitem * ep_item_from_wait(wait_queue_t *p) +static inline struct epitem *ep_item_from_wait(wait_queue_t *p) { return container_of(p, struct eppoll_entry, wait)->base; } /* Get the "struct epitem" from an epoll queue wrapper */ -static inline struct epitem * ep_item_from_epqueue(poll_table *p) +static inline struct epitem *ep_item_from_epqueue(poll_table *p) { return container_of(p, struct ep_pqueue, pt)->epi; } @@ -411,8 +392,7 @@ static int ep_remove(struct eventpoll *ep, struct epitem *epi) list_del_init(&epi->fllink); spin_unlock(&file->f_ep_lock); - if (ep_rb_linked(&epi->rbn)) - ep_rb_erase(&epi->rbn, &ep->rbr); + rb_erase(&epi->rbn, &ep->rbr); spin_lock_irqsave(&ep->lock, flags); if (ep_is_linked(&epi->rdllink)) @@ -728,7 +708,6 @@ static int ep_insert(struct eventpoll *ep, struct epoll_event *event, goto error_return; /* Item initialization follow here ... */ - ep_rb_initnode(&epi->rbn); INIT_LIST_HEAD(&epi->rdllink); INIT_LIST_HEAD(&epi->fllink); INIT_LIST_HEAD(&epi->pwqlist); @@ -1071,8 +1050,6 @@ asmlinkage long sys_epoll_create(int size) { int error, fd = -1; struct eventpoll *ep; - struct inode *inode; - struct file *file; DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d)\n", current, size)); @@ -1082,29 +1059,24 @@ asmlinkage long sys_epoll_create(int size) * structure ( "struct eventpoll" ). */ error = -EINVAL; - if (size <= 0 || (error = ep_alloc(&ep)) != 0) + if (size <= 0 || (error = ep_alloc(&ep)) < 0) { + fd = error; goto error_return; + } /* * Creates all the items needed to setup an eventpoll file. That is, - * a file structure, and inode and a free file descriptor. + * a file structure and a free file descriptor. */ - error = anon_inode_getfd(&fd, &inode, &file, "[eventpoll]", - &eventpoll_fops, ep); - if (error) - goto error_free; + fd = anon_inode_getfd("[eventpoll]", &eventpoll_fops, ep); + if (fd < 0) + ep_free(ep); +error_return: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", current, size, fd)); return fd; - -error_free: - ep_free(ep); -error_return: - DNPRINTK(3, (KERN_INFO "[%p] eventpoll: sys_epoll_create(%d) = %d\n", - current, size, error)); - return error; } /* @@ -1262,7 +1234,7 @@ error_return: return error; } -#ifdef TIF_RESTORE_SIGMASK +#ifdef HAVE_SET_RESTORE_SIGMASK /* * Implement the event wait interface for the eventpoll file. It is the kernel @@ -1300,7 +1272,7 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, if (error == -EINTR) { memcpy(¤t->saved_sigmask, &sigsaved, sizeof(sigsaved)); - set_thread_flag(TIF_RESTORE_SIGMASK); + set_restore_sigmask(); } else sigprocmask(SIG_SETMASK, &sigsaved, NULL); } @@ -1308,7 +1280,7 @@ asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events, return error; } -#endif /* #ifdef TIF_RESTORE_SIGMASK */ +#endif /* HAVE_SET_RESTORE_SIGMASK */ static int __init eventpoll_init(void) { @@ -1330,4 +1302,3 @@ static int __init eventpoll_init(void) return 0; } fs_initcall(eventpoll_init); - diff --git a/fs/exec.c b/fs/exec.c index 54a0a557b67..aeaa9791d8b 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -24,6 +24,7 @@ #include <linux/slab.h> #include <linux/file.h> +#include <linux/fdtable.h> #include <linux/mman.h> #include <linux/a.out.h> #include <linux/stat.h> @@ -735,6 +736,7 @@ static int exec_mmap(struct mm_struct *mm) tsk->active_mm = mm; activate_mm(active_mm, mm); task_unlock(tsk); + mm_update_next_owner(mm); arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); @@ -765,9 +767,7 @@ static int de_thread(struct task_struct *tsk) /* * Kill all other threads in the thread group. - * We must hold tasklist_lock to call zap_other_threads. */ - read_lock(&tasklist_lock); spin_lock_irq(lock); if (signal_group_exit(sig)) { /* @@ -775,21 +775,10 @@ static int de_thread(struct task_struct *tsk) * return so that the signal is processed. */ spin_unlock_irq(lock); - read_unlock(&tasklist_lock); return -EAGAIN; } - - /* - * child_reaper ignores SIGKILL, change it now. - * Reparenting needs write_lock on tasklist_lock, - * so it is safe to do it under read_lock. - */ - if (unlikely(tsk->group_leader == task_child_reaper(tsk))) - task_active_pid_ns(tsk)->child_reaper = tsk; - sig->group_exit_task = tsk; zap_other_threads(tsk); - read_unlock(&tasklist_lock); /* Account for the thread group leader hanging around: */ count = thread_group_leader(tsk) ? 1 : 2; @@ -810,7 +799,7 @@ static int de_thread(struct task_struct *tsk) if (!thread_group_leader(tsk)) { leader = tsk->group_leader; - sig->notify_count = -1; + sig->notify_count = -1; /* for exit_notify() */ for (;;) { write_lock_irq(&tasklist_lock); if (likely(leader->exit_state)) @@ -820,6 +809,8 @@ static int de_thread(struct task_struct *tsk) schedule(); } + if (unlikely(task_child_reaper(tsk) == leader)) + task_active_pid_ns(tsk)->child_reaper = tsk; /* * The only record we have of the real-time age of a * process, regardless of execs it's done, is start_time. @@ -953,7 +944,6 @@ int flush_old_exec(struct linux_binprm * bprm) { char * name; int i, ch, retval; - struct files_struct *files; char tcomm[sizeof(current->comm)]; /* @@ -964,27 +954,18 @@ int flush_old_exec(struct linux_binprm * bprm) if (retval) goto out; - /* - * Make sure we have private file handles. Ask the - * fork helper to do the work for us and the exit - * helper to do the cleanup of the old one. - */ - files = current->files; /* refcounted so safe to hold */ - retval = unshare_files(); - if (retval) - goto out; + set_mm_exe_file(bprm->mm, bprm->file); + /* * Release all of the old mmap stuff */ retval = exec_mmap(bprm->mm); if (retval) - goto mmap_failed; + goto out; bprm->mm = NULL; /* We're using it now */ /* This is the point of no return */ - put_files_struct(files); - current->sas_ss_sp = current->sas_ss_size = 0; if (current->euid == current->uid && current->egid == current->gid) @@ -1034,8 +1015,6 @@ int flush_old_exec(struct linux_binprm * bprm) return 0; -mmap_failed: - reset_files_struct(current, files); out: return retval; } @@ -1282,13 +1261,17 @@ int do_execve(char * filename, { struct linux_binprm *bprm; struct file *file; - unsigned long env_p; + struct files_struct *displaced; int retval; + retval = unshare_files(&displaced); + if (retval) + goto out_ret; + retval = -ENOMEM; bprm = kzalloc(sizeof(*bprm), GFP_KERNEL); if (!bprm) - goto out_ret; + goto out_files; file = open_exec(filename); retval = PTR_ERR(file); @@ -1330,11 +1313,9 @@ int do_execve(char * filename, if (retval < 0) goto out; - env_p = bprm->p; retval = copy_strings(bprm->argc, argv, bprm); if (retval < 0) goto out; - bprm->argv_len = env_p - bprm->p; retval = search_binary_handler(bprm,regs); if (retval >= 0) { @@ -1343,6 +1324,8 @@ int do_execve(char * filename, security_bprm_free(bprm); acct_update_integrals(current); kfree(bprm); + if (displaced) + put_files_struct(displaced); return retval; } @@ -1363,6 +1346,9 @@ out_file: out_kfree: kfree(bprm); +out_files: + if (displaced) + reset_files_struct(displaced); out_ret: return retval; } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 109ab5e44ec..cc91227d3bb 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -150,12 +150,12 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir) if (IS_ERR(ppd)) { err = PTR_ERR(ppd); dprintk("%s: get_parent of %ld failed, err %d\n", - __FUNCTION__, pd->d_inode->i_ino, err); + __func__, pd->d_inode->i_ino, err); dput(pd); break; } - dprintk("%s: find name of %lu in %lu\n", __FUNCTION__, + dprintk("%s: find name of %lu in %lu\n", __func__, pd->d_inode->i_ino, ppd->d_inode->i_ino); err = exportfs_get_name(mnt, ppd, nbuf, pd); if (err) { @@ -168,14 +168,14 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir) continue; break; } - dprintk("%s: found name: %s\n", __FUNCTION__, nbuf); + dprintk("%s: found name: %s\n", __func__, nbuf); mutex_lock(&ppd->d_inode->i_mutex); npd = lookup_one_len(nbuf, ppd, strlen(nbuf)); mutex_unlock(&ppd->d_inode->i_mutex); if (IS_ERR(npd)) { err = PTR_ERR(npd); dprintk("%s: lookup failed: %d\n", - __FUNCTION__, err); + __func__, err); dput(ppd); dput(pd); break; @@ -188,7 +188,7 @@ reconnect_path(struct vfsmount *mnt, struct dentry *target_dir) if (npd == pd) noprogress = 0; else - printk("%s: npd != pd\n", __FUNCTION__); + printk("%s: npd != pd\n", __func__); dput(npd); dput(ppd); if (IS_ROOT(pd)) { diff --git a/fs/ext2/balloc.c b/fs/ext2/balloc.c index e7b2bafa1dd..10bb02c3f25 100644 --- a/fs/ext2/balloc.c +++ b/fs/ext2/balloc.c @@ -106,7 +106,7 @@ static int ext2_valid_block_bitmap(struct super_block *sb, return 1; err_out: - ext2_error(sb, __FUNCTION__, + ext2_error(sb, __func__, "Invalid block bitmap - " "block_group = %d, block = %lu", block_group, bitmap_blk); @@ -132,7 +132,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { - ext2_error(sb, __FUNCTION__, + ext2_error(sb, __func__, "Cannot read block bitmap - " "block_group = %d, block_bitmap = %u", block_group, le32_to_cpu(desc->bg_block_bitmap)); @@ -143,17 +143,18 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) if (bh_submit_read(bh) < 0) { brelse(bh); - ext2_error(sb, __FUNCTION__, + ext2_error(sb, __func__, "Cannot read block bitmap - " "block_group = %d, block_bitmap = %u", block_group, le32_to_cpu(desc->bg_block_bitmap)); return NULL; } - if (!ext2_valid_block_bitmap(sb, desc, block_group, bh)) { - brelse(bh); - return NULL; - } + ext2_valid_block_bitmap(sb, desc, block_group, bh); + /* + * file system mounted not to panic on error, continue with corrupt + * bitmap + */ return bh; } @@ -245,11 +246,10 @@ restart: prev = rsv; } printk("Window map complete.\n"); - if (bad) - BUG(); + BUG_ON(bad); } #define rsv_window_dump(root, verbose) \ - __rsv_window_dump((root), (verbose), __FUNCTION__) + __rsv_window_dump((root), (verbose), __func__) #else #define rsv_window_dump(root, verbose) do {} while (0) #endif @@ -548,7 +548,7 @@ do_more: for (i = 0, group_freed = 0; i < count; i++) { if (!ext2_clear_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, bitmap_bh->b_data)) { - ext2_error(sb, __FUNCTION__, + ext2_error(sb, __func__, "bit already cleared for block %lu", block + i); } else { group_freed++; @@ -1381,7 +1381,12 @@ allocated: "Allocating block in system zone - " "blocks from "E2FSBLK", length %lu", ret_block, num); - goto out; + /* + * ext2_try_to_allocate marked the blocks we allocated as in + * use. So we may want to selectively mark some of the blocks + * as free + */ + goto retry_alloc; } performed_allocation = 1; diff --git a/fs/ext2/dir.c b/fs/ext2/dir.c index 8dededd80fe..a78c6b4af06 100644 --- a/fs/ext2/dir.c +++ b/fs/ext2/dir.c @@ -41,8 +41,8 @@ static inline __le16 ext2_rec_len_to_disk(unsigned len) { if (len == (1 << 16)) return cpu_to_le16(EXT2_MAX_REC_LEN); - else if (len > (1 << 16)) - BUG(); + else + BUG_ON(len > (1 << 16)); return cpu_to_le16(len); } @@ -295,11 +295,11 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) struct page *page = ext2_get_page(inode, n); if (IS_ERR(page)) { - ext2_error(sb, __FUNCTION__, + ext2_error(sb, __func__, "bad page in #%lu", inode->i_ino); filp->f_pos += PAGE_CACHE_SIZE - offset; - return -EIO; + return PTR_ERR(page); } kaddr = page_address(page); if (unlikely(need_revalidate)) { @@ -314,7 +314,7 @@ ext2_readdir (struct file * filp, void * dirent, filldir_t filldir) limit = kaddr + ext2_last_byte(inode, n) - EXT2_DIR_REC_LEN(1); for ( ;(char*)de <= limit; de = ext2_next_entry(de)) { if (de->rec_len == 0) { - ext2_error(sb, __FUNCTION__, + ext2_error(sb, __func__, "zero-length directory entry"); ext2_put_page(page); return -EIO; @@ -381,7 +381,7 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir, kaddr += ext2_last_byte(dir, n) - reclen; while ((char *) de <= kaddr) { if (de->rec_len == 0) { - ext2_error(dir->i_sb, __FUNCTION__, + ext2_error(dir->i_sb, __func__, "zero-length directory entry"); ext2_put_page(page); goto out; @@ -396,7 +396,7 @@ struct ext2_dir_entry_2 * ext2_find_entry (struct inode * dir, n = 0; /* next page is past the blocks we've got */ if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) { - ext2_error(dir->i_sb, __FUNCTION__, + ext2_error(dir->i_sb, __func__, "dir %lu size %lld exceeds block count %llu", dir->i_ino, dir->i_size, (unsigned long long)dir->i_blocks); @@ -506,7 +506,7 @@ int ext2_add_link (struct dentry *dentry, struct inode *inode) goto got_it; } if (de->rec_len == 0) { - ext2_error(dir->i_sb, __FUNCTION__, + ext2_error(dir->i_sb, __func__, "zero-length directory entry"); err = -EIO; goto out_unlock; @@ -578,7 +578,7 @@ int ext2_delete_entry (struct ext2_dir_entry_2 * dir, struct page * page ) while ((char*)de < (char*)dir) { if (de->rec_len == 0) { - ext2_error(inode->i_sb, __FUNCTION__, + ext2_error(inode->i_sb, __func__, "zero-length directory entry"); err = -EIO; goto out; @@ -670,7 +670,7 @@ int ext2_empty_dir (struct inode * inode) while ((char *)de <= kaddr) { if (de->rec_len == 0) { - ext2_error(inode->i_sb, __FUNCTION__, + ext2_error(inode->i_sb, __func__, "zero-length directory entry"); printk("kaddr=%p, de=%p\n", kaddr, de); goto not_empty; diff --git a/fs/ext2/ialloc.c b/fs/ext2/ialloc.c index 08f647d8188..f5974134676 100644 --- a/fs/ext2/ialloc.c +++ b/fs/ext2/ialloc.c @@ -75,11 +75,9 @@ static void ext2_release_inode(struct super_block *sb, int group, int dir) } spin_lock(sb_bgl_lock(EXT2_SB(sb), group)); - desc->bg_free_inodes_count = - cpu_to_le16(le16_to_cpu(desc->bg_free_inodes_count) + 1); + le16_add_cpu(&desc->bg_free_inodes_count, 1); if (dir) - desc->bg_used_dirs_count = - cpu_to_le16(le16_to_cpu(desc->bg_used_dirs_count) - 1); + le16_add_cpu(&desc->bg_used_dirs_count, -1); spin_unlock(sb_bgl_lock(EXT2_SB(sb), group)); if (dir) percpu_counter_dec(&EXT2_SB(sb)->s_dirs_counter); @@ -539,13 +537,11 @@ got: percpu_counter_inc(&sbi->s_dirs_counter); spin_lock(sb_bgl_lock(sbi, group)); - gdp->bg_free_inodes_count = - cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1); + le16_add_cpu(&gdp->bg_free_inodes_count, -1); if (S_ISDIR(mode)) { if (sbi->s_debts[group] < 255) sbi->s_debts[group]++; - gdp->bg_used_dirs_count = - cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1); + le16_add_cpu(&gdp->bg_used_dirs_count, 1); } else { if (sbi->s_debts[group]) sbi->s_debts[group]--; diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index b8a2990bab8..384fc0d1dd7 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -254,13 +254,13 @@ no_block: * Caller must make sure that @ind is valid and will stay that way. */ -static unsigned long ext2_find_near(struct inode *inode, Indirect *ind) +static ext2_fsblk_t ext2_find_near(struct inode *inode, Indirect *ind) { struct ext2_inode_info *ei = EXT2_I(inode); __le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data; __le32 *p; - unsigned long bg_start; - unsigned long colour; + ext2_fsblk_t bg_start; + ext2_fsblk_t colour; /* Try to find previous block */ for (p = ind->p - 1; p >= start; p--) @@ -275,8 +275,7 @@ static unsigned long ext2_find_near(struct inode *inode, Indirect *ind) * It is going to be refered from inode itself? OK, just put it into * the same cylinder group then. */ - bg_start = (ei->i_block_group * EXT2_BLOCKS_PER_GROUP(inode->i_sb)) + - le32_to_cpu(EXT2_SB(inode->i_sb)->s_es->s_first_data_block); + bg_start = ext2_group_first_block_no(inode->i_sb, ei->i_block_group); colour = (current->pid % 16) * (EXT2_BLOCKS_PER_GROUP(inode->i_sb) / 16); return bg_start + colour; @@ -291,8 +290,8 @@ static unsigned long ext2_find_near(struct inode *inode, Indirect *ind) * Returns preferred place for a block (the goal). */ -static inline int ext2_find_goal(struct inode *inode, long block, - Indirect *partial) +static inline ext2_fsblk_t ext2_find_goal(struct inode *inode, long block, + Indirect *partial) { struct ext2_block_alloc_info *block_i; @@ -796,7 +795,7 @@ const struct address_space_operations ext2_aops = { const struct address_space_operations ext2_aops_xip = { .bmap = ext2_bmap, - .get_xip_page = ext2_get_xip_page, + .get_xip_mem = ext2_get_xip_mem, }; const struct address_space_operations ext2_nobh_aops = { diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 088b011bb97..ef50cbc792d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -51,8 +51,7 @@ void ext2_error (struct super_block * sb, const char * function, if (!(sb->s_flags & MS_RDONLY)) { sbi->s_mount_state |= EXT2_ERROR_FS; - es->s_state = - cpu_to_le16(le16_to_cpu(es->s_state) | EXT2_ERROR_FS); + es->s_state |= cpu_to_le16(EXT2_ERROR_FS); ext2_sync_super(sb, es); } @@ -90,7 +89,7 @@ void ext2_update_dynamic_rev(struct super_block *sb) if (le32_to_cpu(es->s_rev_level) > EXT2_GOOD_OLD_REV) return; - ext2_warning(sb, __FUNCTION__, + ext2_warning(sb, __func__, "updating to rev %d because of new feature flag, " "running e2fsck is recommended", EXT2_DYNAMIC_REV); @@ -604,7 +603,7 @@ static int ext2_setup_super (struct super_block * sb, "running e2fsck is recommended\n"); if (!le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT2_DFL_MAX_MNT_COUNT); - es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1); + le16_add_cpu(&es->s_mnt_count, 1); ext2_write_super(sb); if (test_opt (sb, DEBUG)) printk ("[EXT II FS %s, %s, bs=%lu, fs=%lu, gc=%lu, " @@ -622,13 +621,13 @@ static int ext2_check_descriptors(struct super_block *sb) { int i; struct ext2_sb_info *sbi = EXT2_SB(sb); - unsigned long first_block = le32_to_cpu(sbi->s_es->s_first_data_block); - unsigned long last_block; ext2_debug ("Checking group descriptors"); for (i = 0; i < sbi->s_groups_count; i++) { struct ext2_group_desc *gdp = ext2_get_group_desc(sb, i, NULL); + ext2_fsblk_t first_block = ext2_group_first_block_no(sb, i); + ext2_fsblk_t last_block; if (i == sbi->s_groups_count - 1) last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1; @@ -664,7 +663,6 @@ static int ext2_check_descriptors(struct super_block *sb) i, (unsigned long) le32_to_cpu(gdp->bg_inode_table)); return 0; } - first_block += EXT2_BLOCKS_PER_GROUP(sb); } return 1; } @@ -721,10 +719,9 @@ static unsigned long descriptor_loc(struct super_block *sb, int nr) { struct ext2_sb_info *sbi = EXT2_SB(sb); - unsigned long bg, first_data_block, first_meta_bg; + unsigned long bg, first_meta_bg; int has_super = 0; - first_data_block = le32_to_cpu(sbi->s_es->s_first_data_block); first_meta_bg = le32_to_cpu(sbi->s_es->s_first_meta_bg); if (!EXT2_HAS_INCOMPAT_FEATURE(sb, EXT2_FEATURE_INCOMPAT_META_BG) || @@ -733,7 +730,8 @@ static unsigned long descriptor_loc(struct super_block *sb, bg = sbi->s_desc_per_block * nr; if (ext2_bg_has_super(sb, bg)) has_super = 1; - return (first_data_block + has_super + (bg * sbi->s_blocks_per_group)); + + return ext2_group_first_block_no(sb, bg) + has_super; } static int ext2_fill_super(struct super_block *sb, void *data, int silent) @@ -1062,7 +1060,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount3; } if (EXT2_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_HAS_JOURNAL)) - ext2_warning(sb, __FUNCTION__, + ext2_warning(sb, __func__, "mounting ext3 filesystem as ext2"); ext2_setup_super (sb, es, sb->s_flags & MS_RDONLY); return 0; @@ -1126,10 +1124,9 @@ void ext2_write_super (struct super_block * sb) if (!(sb->s_flags & MS_RDONLY)) { es = EXT2_SB(sb)->s_es; - if (le16_to_cpu(es->s_state) & EXT2_VALID_FS) { + if (es->s_state & cpu_to_le16(EXT2_VALID_FS)) { ext2_debug ("setting valid to 0\n"); - es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & - ~EXT2_VALID_FS); + es->s_state &= cpu_to_le16(~EXT2_VALID_FS); es->s_free_blocks_count = cpu_to_le32(ext2_count_free_blocks(sb)); es->s_free_inodes_count = cpu_to_le32(ext2_count_free_inodes(sb)); es->s_mtime = cpu_to_le32(get_seconds()); @@ -1180,7 +1177,7 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != (old_mount_opt & EXT2_MOUNT_XIP)) && invalidate_inodes(sb)) - ext2_warning(sb, __FUNCTION__, "busy inodes while remounting "\ + ext2_warning(sb, __func__, "busy inodes while remounting "\ "xip remain in cache (no functional problem)"); if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; diff --git a/fs/ext2/xattr.c b/fs/ext2/xattr.c index a99d46f3b26..987a5261cc2 100644 --- a/fs/ext2/xattr.c +++ b/fs/ext2/xattr.c @@ -646,8 +646,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, unlock_buffer(new_bh); goto cleanup; } - HDR(new_bh)->h_refcount = cpu_to_le32(1 + - le32_to_cpu(HDR(new_bh)->h_refcount)); + le32_add_cpu(&HDR(new_bh)->h_refcount, 1); ea_bdebug(new_bh, "refcount now=%d", le32_to_cpu(HDR(new_bh)->h_refcount)); } @@ -660,10 +659,8 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, ext2_xattr_cache_insert(new_bh); } else { /* We need to allocate a new block */ - int goal = le32_to_cpu(EXT2_SB(sb)->s_es-> - s_first_data_block) + - EXT2_I(inode)->i_block_group * - EXT2_BLOCKS_PER_GROUP(sb); + ext2_fsblk_t goal = ext2_group_first_block_no(sb, + EXT2_I(inode)->i_block_group); int block = ext2_new_block(inode, goal, &error); if (error) goto cleanup; @@ -731,8 +728,7 @@ ext2_xattr_set2(struct inode *inode, struct buffer_head *old_bh, bforget(old_bh); } else { /* Decrement the refcount only. */ - HDR(old_bh)->h_refcount = cpu_to_le32( - le32_to_cpu(HDR(old_bh)->h_refcount) - 1); + le32_add_cpu(&HDR(old_bh)->h_refcount, -1); if (ce) mb_cache_entry_release(ce); DQUOT_FREE_BLOCK(inode, 1); @@ -789,8 +785,7 @@ ext2_xattr_delete_inode(struct inode *inode) bforget(bh); unlock_buffer(bh); } else { - HDR(bh)->h_refcount = cpu_to_le32( - le32_to_cpu(HDR(bh)->h_refcount) - 1); + le32_add_cpu(&HDR(bh)->h_refcount, -1); if (ce) mb_cache_entry_release(ce); ea_bdebug(bh, "refcount now=%d", diff --git a/fs/ext2/xip.c b/fs/ext2/xip.c index ca7f0031238..4fb94c20041 100644 --- a/fs/ext2/xip.c +++ b/fs/ext2/xip.c @@ -15,24 +15,28 @@ #include "xip.h" static inline int -__inode_direct_access(struct inode *inode, sector_t sector, - unsigned long *data) +__inode_direct_access(struct inode *inode, sector_t block, + void **kaddr, unsigned long *pfn) { - BUG_ON(!inode->i_sb->s_bdev->bd_disk->fops->direct_access); - return inode->i_sb->s_bdev->bd_disk->fops - ->direct_access(inode->i_sb->s_bdev,sector,data); + struct block_device *bdev = inode->i_sb->s_bdev; + struct block_device_operations *ops = bdev->bd_disk->fops; + sector_t sector; + + sector = block * (PAGE_SIZE / 512); /* ext2 block to bdev sector */ + + BUG_ON(!ops->direct_access); + return ops->direct_access(bdev, sector, kaddr, pfn); } static inline int -__ext2_get_sector(struct inode *inode, sector_t offset, int create, +__ext2_get_block(struct inode *inode, pgoff_t pgoff, int create, sector_t *result) { struct buffer_head tmp; int rc; memset(&tmp, 0, sizeof(struct buffer_head)); - rc = ext2_get_block(inode, offset/ (PAGE_SIZE/512), &tmp, - create); + rc = ext2_get_block(inode, pgoff, &tmp, create); *result = tmp.b_blocknr; /* did we get a sparse block (hole in the file)? */ @@ -45,15 +49,15 @@ __ext2_get_sector(struct inode *inode, sector_t offset, int create, } int -ext2_clear_xip_target(struct inode *inode, int block) +ext2_clear_xip_target(struct inode *inode, sector_t block) { - sector_t sector = block * (PAGE_SIZE/512); - unsigned long data; + void *kaddr; + unsigned long pfn; int rc; - rc = __inode_direct_access(inode, sector, &data); + rc = __inode_direct_access(inode, block, &kaddr, &pfn); if (!rc) - clear_page((void*)data); + clear_page(kaddr); return rc; } @@ -64,30 +68,23 @@ void ext2_xip_verify_sb(struct super_block *sb) if ((sbi->s_mount_opt & EXT2_MOUNT_XIP) && !sb->s_bdev->bd_disk->fops->direct_access) { sbi->s_mount_opt &= (~EXT2_MOUNT_XIP); - ext2_warning(sb, __FUNCTION__, + ext2_warning(sb, __func__, "ignoring xip option - not supported by bdev"); } } -struct page * -ext2_get_xip_page(struct address_space *mapping, sector_t offset, - int create) +int ext2_get_xip_mem(struct address_space *mapping, pgoff_t pgoff, int create, + void **kmem, unsigned long *pfn) { int rc; - unsigned long data; - sector_t sector; + sector_t block; /* first, retrieve the sector number */ - rc = __ext2_get_sector(mapping->host, offset, create, §or); + rc = __ext2_get_block(mapping->host, pgoff, create, &block); if (rc) - goto error; + return rc; /* retrieve address of the target data */ - rc = __inode_direct_access - (mapping->host, sector * (PAGE_SIZE/512), &data); - if (!rc) - return virt_to_page(data); - - error: - return ERR_PTR(rc); + rc = __inode_direct_access(mapping->host, block, kmem, pfn); + return rc; } diff --git a/fs/ext2/xip.h b/fs/ext2/xip.h index aa85331d6c5..18b34d2f31b 100644 --- a/fs/ext2/xip.h +++ b/fs/ext2/xip.h @@ -7,19 +7,20 @@ #ifdef CONFIG_EXT2_FS_XIP extern void ext2_xip_verify_sb (struct super_block *); -extern int ext2_clear_xip_target (struct inode *, int); +extern int ext2_clear_xip_target (struct inode *, sector_t); static inline int ext2_use_xip (struct super_block *sb) { struct ext2_sb_info *sbi = EXT2_SB(sb); return (sbi->s_mount_opt & EXT2_MOUNT_XIP); } -struct page* ext2_get_xip_page (struct address_space *, sector_t, int); -#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_page) +int ext2_get_xip_mem(struct address_space *, pgoff_t, int, + void **, unsigned long *); +#define mapping_is_xip(map) unlikely(map->a_ops->get_xip_mem) #else #define mapping_is_xip(map) 0 #define ext2_xip_verify_sb(sb) do { } while (0) #define ext2_use_xip(sb) 0 #define ext2_clear_xip_target(inode, chain) 0 -#define ext2_get_xip_page NULL +#define ext2_get_xip_mem NULL #endif diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index da0cb2c0e43..92fd0338a6e 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -117,7 +117,7 @@ static int ext3_valid_block_bitmap(struct super_block *sb, return 1; err_out: - ext3_error(sb, __FUNCTION__, + ext3_error(sb, __func__, "Invalid block bitmap - " "block_group = %d, block = %lu", block_group, bitmap_blk); @@ -147,7 +147,7 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) bitmap_blk = le32_to_cpu(desc->bg_block_bitmap); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { - ext3_error(sb, __FUNCTION__, + ext3_error(sb, __func__, "Cannot read block bitmap - " "block_group = %d, block_bitmap = %u", block_group, le32_to_cpu(desc->bg_block_bitmap)); @@ -158,16 +158,17 @@ read_block_bitmap(struct super_block *sb, unsigned int block_group) if (bh_submit_read(bh) < 0) { brelse(bh); - ext3_error(sb, __FUNCTION__, + ext3_error(sb, __func__, "Cannot read block bitmap - " "block_group = %d, block_bitmap = %u", block_group, le32_to_cpu(desc->bg_block_bitmap)); return NULL; } - if (!ext3_valid_block_bitmap(sb, desc, block_group, bh)) { - brelse(bh); - return NULL; - } + ext3_valid_block_bitmap(sb, desc, block_group, bh); + /* + * file system mounted not to panic on error, continue with corrupt + * bitmap + */ return bh; } /* @@ -232,11 +233,10 @@ restart: prev = rsv; } printk("Window map complete.\n"); - if (bad) - BUG(); + BUG_ON(bad); } #define rsv_window_dump(root, verbose) \ - __rsv_window_dump((root), (verbose), __FUNCTION__) + __rsv_window_dump((root), (verbose), __func__) #else #define rsv_window_dump(root, verbose) do {} while (0) #endif @@ -618,7 +618,7 @@ do_more: if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, bitmap_bh->b_data)) { jbd_unlock_bh_state(bitmap_bh); - ext3_error(sb, __FUNCTION__, + ext3_error(sb, __func__, "bit already cleared for block "E3FSBLK, block + i); jbd_lock_bh_state(bitmap_bh); @@ -1642,7 +1642,11 @@ allocated: "Allocating block in system zone - " "blocks from "E3FSBLK", length %lu", ret_block, num); - goto out; + /* + * claim_block() marked the blocks we allocated as in use. So we + * may want to selectively mark some of the blocks as free. + */ + goto retry_alloc; } performed_allocation = 1; @@ -1668,7 +1672,7 @@ allocated: if (ext3_test_bit(grp_alloc_blk+i, bh2jh(bitmap_bh)->b_committed_data)) { printk("%s: block was unexpectedly set in " - "b_committed_data\n", __FUNCTION__); + "b_committed_data\n", __func__); } } } diff --git a/fs/ext3/ext3_jbd.c b/fs/ext3/ext3_jbd.c index e1f91fd26a9..d401f148d74 100644 --- a/fs/ext3/ext3_jbd.c +++ b/fs/ext3/ext3_jbd.c @@ -9,7 +9,7 @@ int __ext3_journal_get_undo_access(const char *where, handle_t *handle, { int err = journal_get_undo_access(handle, bh); if (err) - ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext3_journal_abort_handle(where, __func__, bh, handle,err); return err; } @@ -18,7 +18,7 @@ int __ext3_journal_get_write_access(const char *where, handle_t *handle, { int err = journal_get_write_access(handle, bh); if (err) - ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext3_journal_abort_handle(where, __func__, bh, handle,err); return err; } @@ -27,7 +27,7 @@ int __ext3_journal_forget(const char *where, handle_t *handle, { int err = journal_forget(handle, bh); if (err) - ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext3_journal_abort_handle(where, __func__, bh, handle,err); return err; } @@ -36,7 +36,7 @@ int __ext3_journal_revoke(const char *where, handle_t *handle, { int err = journal_revoke(handle, blocknr, bh); if (err) - ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext3_journal_abort_handle(where, __func__, bh, handle,err); return err; } @@ -45,7 +45,7 @@ int __ext3_journal_get_create_access(const char *where, { int err = journal_get_create_access(handle, bh); if (err) - ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext3_journal_abort_handle(where, __func__, bh, handle,err); return err; } @@ -54,6 +54,6 @@ int __ext3_journal_dirty_metadata(const char *where, { int err = journal_dirty_metadata(handle, bh); if (err) - ext3_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext3_journal_abort_handle(where, __func__, bh, handle,err); return err; } diff --git a/fs/ext3/fsync.c b/fs/ext3/fsync.c index a588e23841d..d33634119e1 100644 --- a/fs/ext3/fsync.c +++ b/fs/ext3/fsync.c @@ -72,6 +72,9 @@ int ext3_sync_file(struct file * file, struct dentry *dentry, int datasync) goto out; } + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + goto out; + /* * The VFS has written the file data. If the inode is unaltered * then we need not start a commit. diff --git a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c index 96dd5573e49..77126821b2e 100644 --- a/fs/ext3/ialloc.c +++ b/fs/ext3/ialloc.c @@ -644,7 +644,7 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) /* Error cases - e2fsck has already cleaned up for us */ if (ino > max_ino) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "bad orphan ino %lu! e2fsck was run?", ino); goto error; } @@ -653,7 +653,7 @@ struct inode *ext3_orphan_get(struct super_block *sb, unsigned long ino) bit = (ino - 1) % EXT3_INODES_PER_GROUP(sb); bitmap_bh = read_inode_bitmap(sb, block_group); if (!bitmap_bh) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "inode bitmap error for orphan %lu", ino); goto error; } @@ -678,7 +678,7 @@ iget_failed: err = PTR_ERR(inode); inode = NULL; bad_orphan: - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "bad orphan inode %lu! e2fsck was run?", ino); printk(KERN_NOTICE "ext3_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index c683609b0e3..6ae4ecf3ce4 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -95,7 +95,7 @@ int ext3_forget(handle_t *handle, int is_metadata, struct inode *inode, BUFFER_TRACE(bh, "call ext3_journal_revoke"); err = ext3_journal_revoke(handle, blocknr, bh); if (err) - ext3_abort(inode->i_sb, __FUNCTION__, + ext3_abort(inode->i_sb, __func__, "error %d when attempting revoke", err); BUFFER_TRACE(bh, "exit"); return err; @@ -1190,7 +1190,7 @@ int ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh) { int err = journal_dirty_data(handle, bh); if (err) - ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__, + ext3_journal_abort_handle(__func__, __func__, bh, handle, err); return err; } @@ -1261,10 +1261,11 @@ static int ext3_ordered_write_end(struct file *file, new_i_size = pos + copied; if (new_i_size > EXT3_I(inode)->i_disksize) EXT3_I(inode)->i_disksize = new_i_size; - copied = ext3_generic_write_end(file, mapping, pos, len, copied, + ret2 = ext3_generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (copied < 0) - ret = copied; + copied = ret2; + if (ret2 < 0) + ret = ret2; } ret2 = ext3_journal_stop(handle); if (!ret) @@ -1289,10 +1290,11 @@ static int ext3_writeback_write_end(struct file *file, if (new_i_size > EXT3_I(inode)->i_disksize) EXT3_I(inode)->i_disksize = new_i_size; - copied = ext3_generic_write_end(file, mapping, pos, len, copied, + ret2 = ext3_generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (copied < 0) - ret = copied; + copied = ret2; + if (ret2 < 0) + ret = ret2; ret2 = ext3_journal_stop(handle); if (!ret) @@ -2454,11 +2456,10 @@ out_stop: static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, unsigned long ino, struct ext3_iloc *iloc) { - unsigned long desc, group_desc, block_group; + unsigned long block_group; unsigned long offset; ext3_fsblk_t block; - struct buffer_head *bh; - struct ext3_group_desc * gdp; + struct ext3_group_desc *gdp; if (!ext3_valid_inum(sb, ino)) { /* @@ -2470,27 +2471,15 @@ static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb, } block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb); - if (block_group >= EXT3_SB(sb)->s_groups_count) { - ext3_error(sb,"ext3_get_inode_block","group >= groups count"); - return 0; - } - smp_rmb(); - group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb); - desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1); - bh = EXT3_SB(sb)->s_group_desc[group_desc]; - if (!bh) { - ext3_error (sb, "ext3_get_inode_block", - "Descriptor not loaded"); + gdp = ext3_get_group_desc(sb, block_group, NULL); + if (!gdp) return 0; - } - - gdp = (struct ext3_group_desc *)bh->b_data; /* * Figure out the offset within the block group inode table */ offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) * EXT3_INODE_SIZE(sb); - block = le32_to_cpu(gdp[desc].bg_inode_table) + + block = le32_to_cpu(gdp->bg_inode_table) + (offset >> EXT3_BLOCK_SIZE_BITS(sb)); iloc->block_group = block_group; @@ -3214,7 +3203,7 @@ void ext3_dirty_inode(struct inode *inode) current_handle->h_transaction != handle->h_transaction) { /* This task has a transaction open against a different fs */ printk(KERN_EMERG "%s: transactions do not match!\n", - __FUNCTION__); + __func__); } else { jbd_debug(5, "marking dirty. outer handle=%p\n", current_handle); diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index dec3e0d88ab..0b8cf80154f 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -57,10 +57,15 @@ static struct buffer_head *ext3_append(handle_t *handle, *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - if ((bh = ext3_bread(handle, inode, *block, 1, err))) { + bh = ext3_bread(handle, inode, *block, 1, err); + if (bh) { inode->i_size += inode->i_sb->s_blocksize; EXT3_I(inode)->i_disksize = inode->i_size; - ext3_journal_get_write_access(handle,bh); + *err = ext3_journal_get_write_access(handle, bh); + if (*err) { + brelse(bh); + bh = NULL; + } } return bh; } @@ -356,7 +361,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY) { - ext3_warning(dir->i_sb, __FUNCTION__, + ext3_warning(dir->i_sb, __func__, "Unrecognised inode hash code %d", root->info.hash_version); brelse(bh); @@ -370,7 +375,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, hash = hinfo->hash; if (root->info.unused_flags & 1) { - ext3_warning(dir->i_sb, __FUNCTION__, + ext3_warning(dir->i_sb, __func__, "Unimplemented inode hash flags: %#06x", root->info.unused_flags); brelse(bh); @@ -379,7 +384,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, } if ((indirect = root->info.indirect_levels) > 1) { - ext3_warning(dir->i_sb, __FUNCTION__, + ext3_warning(dir->i_sb, __func__, "Unimplemented inode hash depth: %#06x", root->info.indirect_levels); brelse(bh); @@ -392,7 +397,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { - ext3_warning(dir->i_sb, __FUNCTION__, + ext3_warning(dir->i_sb, __func__, "dx entry: limit != root limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -404,7 +409,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { - ext3_warning(dir->i_sb, __FUNCTION__, + ext3_warning(dir->i_sb, __func__, "dx entry: no count or count > limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -449,7 +454,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, goto fail2; at = entries = ((struct dx_node *) bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit (dir)) { - ext3_warning(dir->i_sb, __FUNCTION__, + ext3_warning(dir->i_sb, __func__, "dx entry: limit != node limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -465,7 +470,7 @@ fail2: } fail: if (*err == ERR_BAD_DX_DIR) - ext3_warning(dir->i_sb, __FUNCTION__, + ext3_warning(dir->i_sb, __func__, "Corrupt dir inode %ld, running e2fsck is " "recommended.", dir->i_ino); return NULL; @@ -913,7 +918,7 @@ restart: wait_on_buffer(bh); if (!buffer_uptodate(bh)) { /* read error, skip block & hope for the best */ - ext3_error(sb, __FUNCTION__, "reading directory #%lu " + ext3_error(sb, __func__, "reading directory #%lu " "offset %lu", dir->i_ino, block); brelse(bh); goto next; @@ -1005,7 +1010,7 @@ static struct buffer_head * ext3_dx_find_entry(struct dentry *dentry, retval = ext3_htree_next_block(dir, hash, frame, frames, NULL); if (retval < 0) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "error reading index page in directory #%lu", dir->i_ino); *err = retval; @@ -1530,7 +1535,7 @@ static int ext3_dx_add_entry(handle_t *handle, struct dentry *dentry, if (levels && (dx_get_count(frames->entries) == dx_get_limit(frames->entries))) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Directory index full!"); err = -ENOSPC; goto cleanup; @@ -1832,11 +1837,11 @@ static int empty_dir (struct inode * inode) if (inode->i_size < EXT3_DIR_REC_LEN(1) + EXT3_DIR_REC_LEN(2) || !(bh = ext3_bread (NULL, inode, 0, 0, &err))) { if (err) - ext3_error(inode->i_sb, __FUNCTION__, + ext3_error(inode->i_sb, __func__, "error %d reading directory #%lu offset 0", err, inode->i_ino); else - ext3_warning(inode->i_sb, __FUNCTION__, + ext3_warning(inode->i_sb, __func__, "bad directory (dir #%lu) - no data block", inode->i_ino); return 1; @@ -1865,7 +1870,7 @@ static int empty_dir (struct inode * inode) offset >> EXT3_BLOCK_SIZE_BITS(sb), 0, &err); if (!bh) { if (err) - ext3_error(sb, __FUNCTION__, + ext3_error(sb, __func__, "error %d reading directory" " #%lu offset %lu", err, inode->i_ino, offset); @@ -2318,6 +2323,8 @@ static int ext3_rename (struct inode * old_dir, struct dentry *old_dentry, EXT3_FEATURE_INCOMPAT_FILETYPE)) new_de->file_type = old_de->file_type; new_dir->i_version++; + new_dir->i_ctime = new_dir->i_mtime = CURRENT_TIME_SEC; + ext3_mark_inode_dirty(handle, new_dir); BUFFER_TRACE(new_bh, "call ext3_journal_dirty_metadata"); ext3_journal_dirty_metadata(handle, new_bh); brelse(new_bh); diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 0e97b6e07cb..28cfd0b4052 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -48,60 +48,60 @@ static int verify_group_input(struct super_block *sb, free_blocks_count, input->reserved_blocks); if (group != sbi->s_groups_count) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Cannot add at group %u (only %lu groups)", input->group, sbi->s_groups_count); else if ((start - le32_to_cpu(es->s_first_data_block)) % EXT3_BLOCKS_PER_GROUP(sb)) - ext3_warning(sb, __FUNCTION__, "Last group not full"); + ext3_warning(sb, __func__, "Last group not full"); else if (input->reserved_blocks > input->blocks_count / 5) - ext3_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)", + ext3_warning(sb, __func__, "Reserved blocks too high (%u)", input->reserved_blocks); else if (free_blocks_count < 0) - ext3_warning(sb, __FUNCTION__, "Bad blocks count %u", + ext3_warning(sb, __func__, "Bad blocks count %u", input->blocks_count); else if (!(bh = sb_bread(sb, end - 1))) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Cannot read last block ("E3FSBLK")", end - 1); else if (outside(input->block_bitmap, start, end)) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Block bitmap not in group (block %u)", input->block_bitmap); else if (outside(input->inode_bitmap, start, end)) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Inode bitmap not in group (block %u)", input->inode_bitmap); else if (outside(input->inode_table, start, end) || outside(itend - 1, start, end)) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Inode table not in group (blocks %u-"E3FSBLK")", input->inode_table, itend - 1); else if (input->inode_bitmap == input->block_bitmap) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Block bitmap same as inode bitmap (%u)", input->block_bitmap); else if (inside(input->block_bitmap, input->inode_table, itend)) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Block bitmap (%u) in inode table (%u-"E3FSBLK")", input->block_bitmap, input->inode_table, itend-1); else if (inside(input->inode_bitmap, input->inode_table, itend)) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Inode bitmap (%u) in inode table (%u-"E3FSBLK")", input->inode_bitmap, input->inode_table, itend-1); else if (inside(input->block_bitmap, start, metaend)) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Block bitmap (%u) in GDT table" " ("E3FSBLK"-"E3FSBLK")", input->block_bitmap, start, metaend - 1); else if (inside(input->inode_bitmap, start, metaend)) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Inode bitmap (%u) in GDT table" " ("E3FSBLK"-"E3FSBLK")", input->inode_bitmap, start, metaend - 1); else if (inside(input->inode_table, start, metaend) || inside(itend - 1, start, metaend)) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Inode table (%u-"E3FSBLK") overlaps" "GDT table ("E3FSBLK"-"E3FSBLK")", input->inode_table, itend - 1, start, metaend - 1); @@ -386,7 +386,7 @@ static int verify_reserved_gdb(struct super_block *sb, while ((grp = ext3_list_backups(sb, &three, &five, &seven)) < end) { if (le32_to_cpu(*p++) != grp * EXT3_BLOCKS_PER_GROUP(sb) + blk){ - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "reserved GDT "E3FSBLK " missing grp %d ("E3FSBLK")", blk, grp, @@ -440,7 +440,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, */ if (EXT3_SB(sb)->s_sbh->b_blocknr != le32_to_cpu(EXT3_SB(sb)->s_es->s_first_data_block)) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "won't resize using backup superblock at %llu", (unsigned long long)EXT3_SB(sb)->s_sbh->b_blocknr); return -EPERM; @@ -464,7 +464,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, data = (__le32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT3_ADDR_PER_BLOCK(sb)]) != gdblock) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "new group %u GDT block "E3FSBLK" not reserved", input->group, gdblock); err = -EINVAL; @@ -488,7 +488,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, GFP_NOFS); if (!n_group_desc) { err = -ENOMEM; - ext3_warning (sb, __FUNCTION__, + ext3_warning (sb, __func__, "not enough memory for %lu groups", gdb_num + 1); goto exit_inode; } @@ -586,7 +586,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, /* Get each reserved primary GDT block and verify it holds backups */ for (res = 0; res < reserved_gdb; res++, blk++) { if (le32_to_cpu(*data) != blk) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "reserved block "E3FSBLK " not at offset %ld", blk, @@ -730,7 +730,7 @@ static void update_backups(struct super_block *sb, */ exit_err: if (err) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "can't update backup for group %d (err %d), " "forcing fsck on next reboot", group, err); sbi->s_mount_state &= ~EXT3_VALID_FS; @@ -770,33 +770,33 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) if (gdb_off == 0 && !EXT3_HAS_RO_COMPAT_FEATURE(sb, EXT3_FEATURE_RO_COMPAT_SPARSE_SUPER)) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Can't resize non-sparse filesystem further"); return -EPERM; } if (le32_to_cpu(es->s_blocks_count) + input->blocks_count < le32_to_cpu(es->s_blocks_count)) { - ext3_warning(sb, __FUNCTION__, "blocks_count overflow\n"); + ext3_warning(sb, __func__, "blocks_count overflow\n"); return -EINVAL; } if (le32_to_cpu(es->s_inodes_count) + EXT3_INODES_PER_GROUP(sb) < le32_to_cpu(es->s_inodes_count)) { - ext3_warning(sb, __FUNCTION__, "inodes_count overflow\n"); + ext3_warning(sb, __func__, "inodes_count overflow\n"); return -EINVAL; } if (reserved_gdb || gdb_off == 0) { if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_RESIZE_INODE)){ - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "No reserved GDT blocks, can't resize"); return -EPERM; } inode = ext3_iget(sb, EXT3_RESIZE_INO); if (IS_ERR(inode)) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "Error opening resize inode"); return PTR_ERR(inode); } @@ -825,7 +825,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) lock_super(sb); if (input->group != sbi->s_groups_count) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "multiple resizers run on filesystem!"); err = -EBUSY; goto exit_journal; @@ -988,13 +988,13 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, " too large to resize to %lu blocks safely\n", sb->s_id, n_blocks_count); if (sizeof(sector_t) < 8) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "CONFIG_LBD not enabled\n"); return -EINVAL; } if (n_blocks_count < o_blocks_count) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "can't shrink FS - resize aborted"); return -EBUSY; } @@ -1004,7 +1004,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, EXT3_BLOCKS_PER_GROUP(sb); if (last == 0) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "need to use ext2online to resize further"); return -EPERM; } @@ -1012,7 +1012,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, add = EXT3_BLOCKS_PER_GROUP(sb) - last; if (o_blocks_count + add < o_blocks_count) { - ext3_warning(sb, __FUNCTION__, "blocks_count overflow"); + ext3_warning(sb, __func__, "blocks_count overflow"); return -EINVAL; } @@ -1020,7 +1020,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, add = n_blocks_count - o_blocks_count; if (o_blocks_count + add < n_blocks_count) - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "will only finish group ("E3FSBLK " blocks, %u new)", o_blocks_count + add, add); @@ -1028,7 +1028,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, /* See if the device is actually as big as what was requested */ bh = sb_bread(sb, o_blocks_count + add -1); if (!bh) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "can't read last block, resize aborted"); return -ENOSPC; } @@ -1040,22 +1040,23 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, handle = ext3_journal_start_sb(sb, 3); if (IS_ERR(handle)) { err = PTR_ERR(handle); - ext3_warning(sb, __FUNCTION__, "error %d on journal start",err); + ext3_warning(sb, __func__, "error %d on journal start",err); goto exit_put; } lock_super(sb); if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "multiple resizers run on filesystem!"); unlock_super(sb); + ext3_journal_stop(handle); err = -EBUSY; goto exit_put; } if ((err = ext3_journal_get_write_access(handle, EXT3_SB(sb)->s_sbh))) { - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "error %d on journal write access", err); unlock_super(sb); ext3_journal_stop(handle); diff --git a/fs/ext3/super.c b/fs/ext3/super.c index ad536066408..fe3119a71ad 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -84,7 +84,7 @@ handle_t *ext3_journal_start_sb(struct super_block *sb, int nblocks) * take the FS itself readonly cleanly. */ journal = EXT3_SB(sb)->s_journal; if (is_journal_aborted(journal)) { - ext3_abort(sb, __FUNCTION__, + ext3_abort(sb, __func__, "Detected aborted journal"); return ERR_PTR(-EROFS); } @@ -304,7 +304,7 @@ void ext3_update_dynamic_rev(struct super_block *sb) if (le32_to_cpu(es->s_rev_level) > EXT3_GOOD_OLD_REV) return; - ext3_warning(sb, __FUNCTION__, + ext3_warning(sb, __func__, "updating to rev %d because of new feature flag, " "running e2fsck is recommended", EXT3_DYNAMIC_REV); @@ -685,7 +685,8 @@ static int ext3_acquire_dquot(struct dquot *dquot); static int ext3_release_dquot(struct dquot *dquot); static int ext3_mark_dquot_dirty(struct dquot *dquot); static int ext3_write_info(struct super_block *sb, int type); -static int ext3_quota_on(struct super_block *sb, int type, int format_id, char *path); +static int ext3_quota_on(struct super_block *sb, int type, int format_id, + char *path, int remount); static int ext3_quota_on_mount(struct super_block *sb, int type); static ssize_t ext3_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); @@ -1096,6 +1097,9 @@ clear_qf_name: case Opt_quota: case Opt_usrquota: case Opt_grpquota: + printk(KERN_ERR + "EXT3-fs: quota options not supported.\n"); + break; case Opt_usrjquota: case Opt_grpjquota: case Opt_offusrjquota: @@ -1103,7 +1107,7 @@ clear_qf_name: case Opt_jqfmt_vfsold: case Opt_jqfmt_vfsv0: printk(KERN_ERR - "EXT3-fs: journalled quota options not " + "EXT3-fs: journaled quota options not " "supported.\n"); break; case Opt_noquota: @@ -1218,7 +1222,7 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, inconsistencies, to force a fsck at reboot. But for a plain journaled filesystem we can keep it set as valid forever! :) */ - es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT3_VALID_FS); + es->s_state &= cpu_to_le16(~EXT3_VALID_FS); #endif if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT3_DFL_MAX_MNT_COUNT); @@ -1253,14 +1257,14 @@ static int ext3_setup_super(struct super_block *sb, struct ext3_super_block *es, static int ext3_check_descriptors(struct super_block *sb) { struct ext3_sb_info *sbi = EXT3_SB(sb); - ext3_fsblk_t first_block = le32_to_cpu(sbi->s_es->s_first_data_block); - ext3_fsblk_t last_block; int i; ext3_debug ("Checking group descriptors"); for (i = 0; i < sbi->s_groups_count; i++) { struct ext3_group_desc *gdp = ext3_get_group_desc(sb, i, NULL); + ext3_fsblk_t first_block = ext3_group_first_block_no(sb, i); + ext3_fsblk_t last_block; if (i == sbi->s_groups_count - 1) last_block = le32_to_cpu(sbi->s_es->s_blocks_count) - 1; @@ -1299,7 +1303,6 @@ static int ext3_check_descriptors(struct super_block *sb) le32_to_cpu(gdp->bg_inode_table)); return 0; } - first_block += EXT3_BLOCKS_PER_GROUP(sb); } sbi->s_es->s_free_blocks_count=cpu_to_le32(ext3_count_free_blocks(sb)); @@ -1387,7 +1390,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, if (inode->i_nlink) { printk(KERN_DEBUG "%s: truncating inode %lu to %Ld bytes\n", - __FUNCTION__, inode->i_ino, inode->i_size); + __func__, inode->i_ino, inode->i_size); jbd_debug(2, "truncating inode %lu to %Ld bytes\n", inode->i_ino, inode->i_size); ext3_truncate(inode); @@ -1395,7 +1398,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, } else { printk(KERN_DEBUG "%s: deleting unreferenced inode %lu\n", - __FUNCTION__, inode->i_ino); + __func__, inode->i_ino); jbd_debug(2, "deleting unreferenced inode %lu\n", inode->i_ino); nr_orphans++; @@ -1415,7 +1418,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, /* Turn quotas off */ for (i = 0; i < MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) - vfs_quota_off(sb, i); + vfs_quota_off(sb, i, 0); } #endif sb->s_flags = s_flags; /* Restore MS_RDONLY status */ @@ -2298,9 +2301,9 @@ static void ext3_clear_journal_err(struct super_block * sb, char nbuf[16]; errstr = ext3_decode_error(sb, j_errno, nbuf); - ext3_warning(sb, __FUNCTION__, "Filesystem error recorded " + ext3_warning(sb, __func__, "Filesystem error recorded " "from previous mount: %s", errstr); - ext3_warning(sb, __FUNCTION__, "Marking fs in need of " + ext3_warning(sb, __func__, "Marking fs in need of " "filesystem check."); EXT3_SB(sb)->s_mount_state |= EXT3_ERROR_FS; @@ -2427,7 +2430,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) } if (sbi->s_mount_opt & EXT3_MOUNT_ABORT) - ext3_abort(sb, __FUNCTION__, "Abort forced by user"); + ext3_abort(sb, __func__, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((sbi->s_mount_opt & EXT3_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); @@ -2639,8 +2642,14 @@ static int ext3_dquot_drop(struct inode *inode) /* We may delete quota structure so we need to reserve enough blocks */ handle = ext3_journal_start(inode, 2*EXT3_QUOTA_DEL_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) + if (IS_ERR(handle)) { + /* + * We call dquot_drop() anyway to at least release references + * to quota structures so that umount does not hang. + */ + dquot_drop(inode); return PTR_ERR(handle); + } ret = dquot_drop(inode); err = ext3_journal_stop(handle); if (!ret) @@ -2743,17 +2752,17 @@ static int ext3_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int ext3_quota_on(struct super_block *sb, int type, int format_id, - char *path) + char *path, int remount) { int err; struct nameidata nd; if (!test_opt(sb, QUOTA)) return -EINVAL; - /* Not journalling quota? */ - if (!EXT3_SB(sb)->s_qf_names[USRQUOTA] && - !EXT3_SB(sb)->s_qf_names[GRPQUOTA]) - return vfs_quota_on(sb, type, format_id, path); + /* Not journalling quota or remount? */ + if ((!EXT3_SB(sb)->s_qf_names[USRQUOTA] && + !EXT3_SB(sb)->s_qf_names[GRPQUOTA]) || remount) + return vfs_quota_on(sb, type, format_id, path, remount); err = path_lookup(path, LOOKUP_FOLLOW, &nd); if (err) return err; @@ -2762,13 +2771,13 @@ static int ext3_quota_on(struct super_block *sb, int type, int format_id, path_put(&nd.path); return -EXDEV; } - /* Quotafile not of fs root? */ + /* Quotafile not in fs root? */ if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) printk(KERN_WARNING "EXT3-fs: Quota file not on filesystem root. " "Journalled quota will not work.\n"); path_put(&nd.path); - return vfs_quota_on(sb, type, format_id, path); + return vfs_quota_on(sb, type, format_id, path, remount); } /* Read data from quotafile - avoid pagecache and such because we cannot afford diff --git a/fs/ext3/xattr.c b/fs/ext3/xattr.c index 42856541e9a..d4a4f0e9ff6 100644 --- a/fs/ext3/xattr.c +++ b/fs/ext3/xattr.c @@ -99,6 +99,8 @@ static struct buffer_head *ext3_xattr_cache_find(struct inode *, struct mb_cache_entry **); static void ext3_xattr_rehash(struct ext3_xattr_header *, struct ext3_xattr_entry *); +static int ext3_xattr_list(struct inode *inode, char *buffer, + size_t buffer_size); static struct mb_cache *ext3_xattr_cache; @@ -232,7 +234,7 @@ ext3_xattr_block_get(struct inode *inode, int name_index, const char *name, ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext3_xattr_check_block(bh)) { -bad_block: ext3_error(inode->i_sb, __FUNCTION__, +bad_block: ext3_error(inode->i_sb, __func__, "inode %lu: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); error = -EIO; @@ -374,7 +376,7 @@ ext3_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext3_xattr_check_block(bh)) { - ext3_error(inode->i_sb, __FUNCTION__, + ext3_error(inode->i_sb, __func__, "inode %lu: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); error = -EIO; @@ -427,7 +429,7 @@ cleanup: * Returns a negative error number on failure, or the number of bytes * used / required on success. */ -int +static int ext3_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) { int i_error, b_error; @@ -649,7 +651,7 @@ ext3_xattr_block_find(struct inode *inode, struct ext3_xattr_info *i, atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); if (ext3_xattr_check_block(bs->bh)) { - ext3_error(sb, __FUNCTION__, + ext3_error(sb, __func__, "inode %lu: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); error = -EIO; @@ -797,10 +799,8 @@ inserted: get_bh(new_bh); } else { /* We need to allocate a new block */ - ext3_fsblk_t goal = le32_to_cpu( - EXT3_SB(sb)->s_es->s_first_data_block) + - (ext3_fsblk_t)EXT3_I(inode)->i_block_group * - EXT3_BLOCKS_PER_GROUP(sb); + ext3_fsblk_t goal = ext3_group_first_block_no(sb, + EXT3_I(inode)->i_block_group); ext3_fsblk_t block = ext3_new_block(handle, inode, goal, &error); if (error) @@ -852,7 +852,7 @@ cleanup_dquot: goto cleanup; bad_block: - ext3_error(inode->i_sb, __FUNCTION__, + ext3_error(inode->i_sb, __func__, "inode %lu: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); goto cleanup; @@ -1081,14 +1081,14 @@ ext3_xattr_delete_inode(handle_t *handle, struct inode *inode) goto cleanup; bh = sb_bread(inode->i_sb, EXT3_I(inode)->i_file_acl); if (!bh) { - ext3_error(inode->i_sb, __FUNCTION__, + ext3_error(inode->i_sb, __func__, "inode %lu: block "E3FSBLK" read error", inode->i_ino, EXT3_I(inode)->i_file_acl); goto cleanup; } if (BHDR(bh)->h_magic != cpu_to_le32(EXT3_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) { - ext3_error(inode->i_sb, __FUNCTION__, + ext3_error(inode->i_sb, __func__, "inode %lu: bad block "E3FSBLK, inode->i_ino, EXT3_I(inode)->i_file_acl); goto cleanup; @@ -1215,7 +1215,7 @@ again: } bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { - ext3_error(inode->i_sb, __FUNCTION__, + ext3_error(inode->i_sb, __func__, "inode %lu: block %lu read error", inode->i_ino, (unsigned long) ce->e_block); } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= diff --git a/fs/ext3/xattr.h b/fs/ext3/xattr.h index 6b1ae1c6182..148a4dfc82a 100644 --- a/fs/ext3/xattr.h +++ b/fs/ext3/xattr.h @@ -67,7 +67,6 @@ extern struct xattr_handler ext3_xattr_security_handler; extern ssize_t ext3_listxattr(struct dentry *, char *, size_t); extern int ext3_xattr_get(struct inode *, int, const char *, void *, size_t); -extern int ext3_xattr_list(struct inode *, char *, size_t); extern int ext3_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern int ext3_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); @@ -89,12 +88,6 @@ ext3_xattr_get(struct inode *inode, int name_index, const char *name, } static inline int -ext3_xattr_list(struct inode *inode, void *buffer, size_t size) -{ - return -EOPNOTSUPP; -} - -static inline int ext3_xattr_set(struct inode *inode, int name_index, const char *name, const void *value, size_t size, int flags) { diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index a8bae8cd1d5..3c8dab880d9 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -9,8 +9,8 @@ #include <linux/slab.h> #include <linux/capability.h> #include <linux/fs.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs.h> +#include "ext4_jbd2.h" +#include "ext4.h" #include "xattr.h" #include "acl.h" @@ -37,7 +37,7 @@ ext4_acl_from_disk(const void *value, size_t size) return ERR_PTR(-EINVAL); if (count == 0) return NULL; - acl = posix_acl_alloc(count, GFP_KERNEL); + acl = posix_acl_alloc(count, GFP_NOFS); if (!acl) return ERR_PTR(-ENOMEM); for (n=0; n < count; n++) { @@ -91,7 +91,7 @@ ext4_acl_to_disk(const struct posix_acl *acl, size_t *size) *size = ext4_acl_size(acl->a_count); ext_acl = kmalloc(sizeof(ext4_acl_header) + acl->a_count * - sizeof(ext4_acl_entry), GFP_KERNEL); + sizeof(ext4_acl_entry), GFP_NOFS); if (!ext_acl) return ERR_PTR(-ENOMEM); ext_acl->a_version = cpu_to_le32(EXT4_ACL_VERSION); @@ -187,7 +187,7 @@ ext4_get_acl(struct inode *inode, int type) } retval = ext4_xattr_get(inode, name_index, "", NULL, 0); if (retval > 0) { - value = kmalloc(retval, GFP_KERNEL); + value = kmalloc(retval, GFP_NOFS); if (!value) return ERR_PTR(-ENOMEM); retval = ext4_xattr_get(inode, name_index, "", value, retval); @@ -335,7 +335,7 @@ ext4_init_acl(handle_t *handle, struct inode *inode, struct inode *dir) if (error) goto cleanup; } - clone = posix_acl_clone(acl, GFP_KERNEL); + clone = posix_acl_clone(acl, GFP_NOFS); error = -ENOMEM; if (!clone) goto cleanup; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 0737e05ba3d..da994374ec3 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -15,12 +15,12 @@ #include <linux/capability.h> #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> - +#include "ext4.h" +#include "ext4_jbd2.h" #include "group.h" + /* * balloc.c contains the blocks allocation and deallocation routines */ @@ -48,7 +48,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) { - unsigned long start; int bit, bit_max; unsigned free_blocks, group_blocks; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -59,7 +58,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "Checksum bad for group %lu\n", block_group); gdp->bg_free_blocks_count = 0; gdp->bg_free_inodes_count = 0; @@ -106,11 +105,12 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, free_blocks = group_blocks - bit_max; if (bh) { + ext4_fsblk_t start; + for (bit = 0; bit < bit_max; bit++) ext4_set_bit(bit, bh->b_data); - start = block_group * EXT4_BLOCKS_PER_GROUP(sb) + - le32_to_cpu(sbi->s_es->s_first_data_block); + start = ext4_group_first_block_no(sb, block_group); /* Set bits for block and inode bitmaps, and inode table */ ext4_set_bit(ext4_block_bitmap(sb, gdp) - start, bh->b_data); @@ -235,7 +235,7 @@ static int ext4_valid_block_bitmap(struct super_block *sb, return 1; err_out: - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "Invalid block bitmap - " "block_group = %d, block = %llu", block_group, bitmap_blk); @@ -264,7 +264,7 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group) bitmap_blk = ext4_block_bitmap(sb, desc); bh = sb_getblk(sb, bitmap_blk); if (unlikely(!bh)) { - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "Cannot read block bitmap - " "block_group = %d, block_bitmap = %llu", (int)block_group, (unsigned long long)bitmap_blk); @@ -281,7 +281,7 @@ read_block_bitmap(struct super_block *sb, ext4_group_t block_group) } if (bh_submit_read(bh) < 0) { put_bh(bh); - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "Cannot read block bitmap - " "block_group = %d, block_bitmap = %llu", (int)block_group, (unsigned long long)bitmap_blk); @@ -360,7 +360,7 @@ restart: BUG(); } #define rsv_window_dump(root, verbose) \ - __rsv_window_dump((root), (verbose), __FUNCTION__) + __rsv_window_dump((root), (verbose), __func__) #else #define rsv_window_dump(root, verbose) do {} while (0) #endif @@ -740,7 +740,7 @@ do_more: if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), bit + i, bitmap_bh->b_data)) { jbd_unlock_bh_state(bitmap_bh); - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "bit already cleared for block %llu", (ext4_fsblk_t)(block + i)); jbd_lock_bh_state(bitmap_bh); @@ -752,9 +752,7 @@ do_more: jbd_unlock_bh_state(bitmap_bh); spin_lock(sb_bgl_lock(sbi, block_group)); - desc->bg_free_blocks_count = - cpu_to_le16(le16_to_cpu(desc->bg_free_blocks_count) + - group_freed); + le16_add_cpu(&desc->bg_free_blocks_count, group_freed); desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_add(&sbi->s_freeblocks_counter, count); @@ -1798,7 +1796,7 @@ allocated: if (ext4_test_bit(grp_alloc_blk+i, bh2jh(bitmap_bh)->b_committed_data)) { printk("%s: block was unexpectedly set in " - "b_committed_data\n", __FUNCTION__); + "b_committed_data\n", __func__); } } } @@ -1823,8 +1821,7 @@ allocated: spin_lock(sb_bgl_lock(sbi, group_no)); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); - gdp->bg_free_blocks_count = - cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count)-num); + le16_add_cpu(&gdp->bg_free_blocks_count, -num); gdp->bg_checksum = ext4_group_desc_csum(sbi, group_no, gdp); spin_unlock(sb_bgl_lock(sbi, group_no)); percpu_counter_sub(&sbi->s_freeblocks_counter, num); diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 420554f8f79..d37ea675045 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -9,7 +9,7 @@ #include <linux/buffer_head.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> +#include "ext4.h" #ifdef EXT4FS_DEBUG diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 2c23bade9aa..2bf0331ea19 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -23,10 +23,10 @@ #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> #include <linux/buffer_head.h> #include <linux/slab.h> #include <linux/rbtree.h> +#include "ext4.h" static unsigned char ext4_filetype_table[] = { DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK @@ -42,7 +42,7 @@ const struct file_operations ext4_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .readdir = ext4_readdir, /* we take BKL. needed?*/ - .ioctl = ext4_ioctl, /* BKL held */ + .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h new file mode 100644 index 00000000000..8158083f7ac --- /dev/null +++ b/fs/ext4/ext4.h @@ -0,0 +1,1205 @@ +/* + * ext4.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_H +#define _EXT4_H + +#include <linux/types.h> +#include <linux/blkdev.h> +#include <linux/magic.h> +#include "ext4_i.h" + +/* + * The second extended filesystem constants/structures + */ + +/* + * Define EXT4FS_DEBUG to produce debug messages + */ +#undef EXT4FS_DEBUG + +/* + * Define EXT4_RESERVATION to reserve data blocks for expanding files + */ +#define EXT4_DEFAULT_RESERVE_BLOCKS 8 +/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */ +#define EXT4_MAX_RESERVE_BLOCKS 1027 +#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0 + +/* + * Debug code + */ +#ifdef EXT4FS_DEBUG +#define ext4_debug(f, a...) \ + do { \ + printk (KERN_DEBUG "EXT4-fs DEBUG (%s, %d): %s:", \ + __FILE__, __LINE__, __FUNCTION__); \ + printk (KERN_DEBUG f, ## a); \ + } while (0) +#else +#define ext4_debug(f, a...) do {} while (0) +#endif + +#define EXT4_MULTIBLOCK_ALLOCATOR 1 + +/* prefer goal again. length */ +#define EXT4_MB_HINT_MERGE 1 +/* blocks already reserved */ +#define EXT4_MB_HINT_RESERVED 2 +/* metadata is being allocated */ +#define EXT4_MB_HINT_METADATA 4 +/* first blocks in the file */ +#define EXT4_MB_HINT_FIRST 8 +/* search for the best chunk */ +#define EXT4_MB_HINT_BEST 16 +/* data is being allocated */ +#define EXT4_MB_HINT_DATA 32 +/* don't preallocate (for tails) */ +#define EXT4_MB_HINT_NOPREALLOC 64 +/* allocate for locality group */ +#define EXT4_MB_HINT_GROUP_ALLOC 128 +/* allocate goal blocks or none */ +#define EXT4_MB_HINT_GOAL_ONLY 256 +/* goal is meaningful */ +#define EXT4_MB_HINT_TRY_GOAL 512 + +struct ext4_allocation_request { + /* target inode for block we're allocating */ + struct inode *inode; + /* logical block in target inode */ + ext4_lblk_t logical; + /* phys. target (a hint) */ + ext4_fsblk_t goal; + /* the closest logical allocated block to the left */ + ext4_lblk_t lleft; + /* phys. block for ^^^ */ + ext4_fsblk_t pleft; + /* the closest logical allocated block to the right */ + ext4_lblk_t lright; + /* phys. block for ^^^ */ + ext4_fsblk_t pright; + /* how many blocks we want to allocate */ + unsigned long len; + /* flags. see above EXT4_MB_HINT_* */ + unsigned long flags; +}; + +/* + * Special inodes numbers + */ +#define EXT4_BAD_INO 1 /* Bad blocks inode */ +#define EXT4_ROOT_INO 2 /* Root inode */ +#define EXT4_BOOT_LOADER_INO 5 /* Boot loader inode */ +#define EXT4_UNDEL_DIR_INO 6 /* Undelete directory inode */ +#define EXT4_RESIZE_INO 7 /* Reserved group descriptors inode */ +#define EXT4_JOURNAL_INO 8 /* Journal inode */ + +/* First non-reserved inode for old ext4 filesystems */ +#define EXT4_GOOD_OLD_FIRST_INO 11 + +/* + * Maximal count of links to a file + */ +#define EXT4_LINK_MAX 65000 + +/* + * Macro-instructions used to manage several block sizes + */ +#define EXT4_MIN_BLOCK_SIZE 1024 +#define EXT4_MAX_BLOCK_SIZE 65536 +#define EXT4_MIN_BLOCK_LOG_SIZE 10 +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE(s) ((s)->s_blocksize) +#else +# define EXT4_BLOCK_SIZE(s) (EXT4_MIN_BLOCK_SIZE << (s)->s_log_block_size) +#endif +#define EXT4_ADDR_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / sizeof (__u32)) +#ifdef __KERNEL__ +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_blocksize_bits) +#else +# define EXT4_BLOCK_SIZE_BITS(s) ((s)->s_log_block_size + 10) +#endif +#ifdef __KERNEL__ +#define EXT4_ADDR_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_addr_per_block_bits) +#define EXT4_INODE_SIZE(s) (EXT4_SB(s)->s_inode_size) +#define EXT4_FIRST_INO(s) (EXT4_SB(s)->s_first_ino) +#else +#define EXT4_INODE_SIZE(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_INODE_SIZE : \ + (s)->s_inode_size) +#define EXT4_FIRST_INO(s) (((s)->s_rev_level == EXT4_GOOD_OLD_REV) ? \ + EXT4_GOOD_OLD_FIRST_INO : \ + (s)->s_first_ino) +#endif +#define EXT4_BLOCK_ALIGN(size, blkbits) ALIGN((size), (1 << (blkbits))) + +/* + * Structure of a blocks group descriptor + */ +struct ext4_group_desc +{ + __le32 bg_block_bitmap_lo; /* Blocks bitmap block */ + __le32 bg_inode_bitmap_lo; /* Inodes bitmap block */ + __le32 bg_inode_table_lo; /* Inodes table block */ + __le16 bg_free_blocks_count; /* Free blocks count */ + __le16 bg_free_inodes_count; /* Free inodes count */ + __le16 bg_used_dirs_count; /* Directories count */ + __le16 bg_flags; /* EXT4_BG_flags (INODE_UNINIT, etc) */ + __u32 bg_reserved[2]; /* Likely block/inode bitmap checksum */ + __le16 bg_itable_unused; /* Unused inodes count */ + __le16 bg_checksum; /* crc16(sb_uuid+group+desc) */ + __le32 bg_block_bitmap_hi; /* Blocks bitmap block MSB */ + __le32 bg_inode_bitmap_hi; /* Inodes bitmap block MSB */ + __le32 bg_inode_table_hi; /* Inodes table block MSB */ + __le16 bg_free_blocks_count_hi;/* Free blocks count MSB */ + __le16 bg_free_inodes_count_hi;/* Free inodes count MSB */ + __le16 bg_used_dirs_count_hi; /* Directories count MSB */ + __le16 bg_itable_unused_hi; /* Unused inodes count MSB */ + __u32 bg_reserved2[3]; +}; + +#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */ +#define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ +#define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ + +#ifdef __KERNEL__ +#include "ext4_sb.h" +#endif +/* + * Macro-instructions used to manage group descriptors + */ +#define EXT4_MIN_DESC_SIZE 32 +#define EXT4_MIN_DESC_SIZE_64BIT 64 +#define EXT4_MAX_DESC_SIZE EXT4_MIN_BLOCK_SIZE +#define EXT4_DESC_SIZE(s) (EXT4_SB(s)->s_desc_size) +#ifdef __KERNEL__ +# define EXT4_BLOCKS_PER_GROUP(s) (EXT4_SB(s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_SB(s)->s_desc_per_block) +# define EXT4_INODES_PER_GROUP(s) (EXT4_SB(s)->s_inodes_per_group) +# define EXT4_DESC_PER_BLOCK_BITS(s) (EXT4_SB(s)->s_desc_per_block_bits) +#else +# define EXT4_BLOCKS_PER_GROUP(s) ((s)->s_blocks_per_group) +# define EXT4_DESC_PER_BLOCK(s) (EXT4_BLOCK_SIZE(s) / EXT4_DESC_SIZE(s)) +# define EXT4_INODES_PER_GROUP(s) ((s)->s_inodes_per_group) +#endif + +/* + * Constants relative to the data blocks + */ +#define EXT4_NDIR_BLOCKS 12 +#define EXT4_IND_BLOCK EXT4_NDIR_BLOCKS +#define EXT4_DIND_BLOCK (EXT4_IND_BLOCK + 1) +#define EXT4_TIND_BLOCK (EXT4_DIND_BLOCK + 1) +#define EXT4_N_BLOCKS (EXT4_TIND_BLOCK + 1) + +/* + * Inode flags + */ +#define EXT4_SECRM_FL 0x00000001 /* Secure deletion */ +#define EXT4_UNRM_FL 0x00000002 /* Undelete */ +#define EXT4_COMPR_FL 0x00000004 /* Compress file */ +#define EXT4_SYNC_FL 0x00000008 /* Synchronous updates */ +#define EXT4_IMMUTABLE_FL 0x00000010 /* Immutable file */ +#define EXT4_APPEND_FL 0x00000020 /* writes to file may only append */ +#define EXT4_NODUMP_FL 0x00000040 /* do not dump file */ +#define EXT4_NOATIME_FL 0x00000080 /* do not update atime */ +/* Reserved for compression usage... */ +#define EXT4_DIRTY_FL 0x00000100 +#define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ +#define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ +#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */ +/* End compression flags --- maybe not all used */ +#define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ +#define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ +#define EXT4_JOURNAL_DATA_FL 0x00004000 /* file data should be journaled */ +#define EXT4_NOTAIL_FL 0x00008000 /* file tail should not be merged */ +#define EXT4_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ +#define EXT4_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define EXT4_HUGE_FILE_FL 0x00040000 /* Set to each huge file */ +#define EXT4_EXTENTS_FL 0x00080000 /* Inode uses extents */ +#define EXT4_EXT_MIGRATE 0x00100000 /* Inode is migrating */ +#define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ + +#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x000380FF /* User modifiable flags */ + +/* + * Inode dynamic state flags + */ +#define EXT4_STATE_JDATA 0x00000001 /* journaled data exists */ +#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */ +#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */ +#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ + +/* Used to pass group descriptor data when online resize is done */ +struct ext4_new_group_input { + __u32 group; /* Group number for this data */ + __u64 block_bitmap; /* Absolute block number of block bitmap */ + __u64 inode_bitmap; /* Absolute block number of inode bitmap */ + __u64 inode_table; /* Absolute block number of inode table start */ + __u32 blocks_count; /* Total number of blocks in this group */ + __u16 reserved_blocks; /* Number of reserved blocks in this group */ + __u16 unused; +}; + +/* The struct ext4_new_group_input in kernel space, with free_blocks_count */ +struct ext4_new_group_data { + __u32 group; + __u64 block_bitmap; + __u64 inode_bitmap; + __u64 inode_table; + __u32 blocks_count; + __u16 reserved_blocks; + __u16 unused; + __u32 free_blocks_count; +}; + +/* + * Following is used by preallocation code to tell get_blocks() that we + * want uninitialzed extents. + */ +#define EXT4_CREATE_UNINITIALIZED_EXT 2 + +/* + * ioctl commands + */ +#define EXT4_IOC_GETFLAGS FS_IOC_GETFLAGS +#define EXT4_IOC_SETFLAGS FS_IOC_SETFLAGS +#define EXT4_IOC_GETVERSION _IOR('f', 3, long) +#define EXT4_IOC_SETVERSION _IOW('f', 4, long) +#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long) +#define EXT4_IOC_GROUP_ADD _IOW('f', 8,struct ext4_new_group_input) +#define EXT4_IOC_GETVERSION_OLD FS_IOC_GETVERSION +#define EXT4_IOC_SETVERSION_OLD FS_IOC_SETVERSION +#ifdef CONFIG_JBD2_DEBUG +#define EXT4_IOC_WAIT_FOR_READONLY _IOR('f', 99, long) +#endif +#define EXT4_IOC_GETRSVSZ _IOR('f', 5, long) +#define EXT4_IOC_SETRSVSZ _IOW('f', 6, long) +#define EXT4_IOC_MIGRATE _IO('f', 7) + +/* + * ioctl commands in 32 bit emulation + */ +#define EXT4_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define EXT4_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define EXT4_IOC32_GETVERSION _IOR('f', 3, int) +#define EXT4_IOC32_SETVERSION _IOW('f', 4, int) +#define EXT4_IOC32_GETRSVSZ _IOR('f', 5, int) +#define EXT4_IOC32_SETRSVSZ _IOW('f', 6, int) +#define EXT4_IOC32_GROUP_EXTEND _IOW('f', 7, unsigned int) +#ifdef CONFIG_JBD2_DEBUG +#define EXT4_IOC32_WAIT_FOR_READONLY _IOR('f', 99, int) +#endif +#define EXT4_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION +#define EXT4_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION + + +/* + * Mount options + */ +struct ext4_mount_options { + unsigned long s_mount_opt; + uid_t s_resuid; + gid_t s_resgid; + unsigned long s_commit_interval; +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; +#endif +}; + +/* + * Structure of an inode on the disk + */ +struct ext4_inode { + __le16 i_mode; /* File mode */ + __le16 i_uid; /* Low 16 bits of Owner Uid */ + __le32 i_size_lo; /* Size in bytes */ + __le32 i_atime; /* Access time */ + __le32 i_ctime; /* Inode Change time */ + __le32 i_mtime; /* Modification time */ + __le32 i_dtime; /* Deletion Time */ + __le16 i_gid; /* Low 16 bits of Group Id */ + __le16 i_links_count; /* Links count */ + __le32 i_blocks_lo; /* Blocks count */ + __le32 i_flags; /* File flags */ + union { + struct { + __le32 l_i_version; + } linux1; + struct { + __u32 h_i_translator; + } hurd1; + struct { + __u32 m_i_reserved1; + } masix1; + } osd1; /* OS dependent 1 */ + __le32 i_block[EXT4_N_BLOCKS];/* Pointers to blocks */ + __le32 i_generation; /* File version (for NFS) */ + __le32 i_file_acl_lo; /* File ACL */ + __le32 i_size_high; + __le32 i_obso_faddr; /* Obsoleted fragment address */ + union { + struct { + __le16 l_i_blocks_high; /* were l_i_reserved1 */ + __le16 l_i_file_acl_high; + __le16 l_i_uid_high; /* these 2 fields */ + __le16 l_i_gid_high; /* were reserved2[0] */ + __u32 l_i_reserved2; + } linux2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __u16 h_i_mode_high; + __u16 h_i_uid_high; + __u16 h_i_gid_high; + __u32 h_i_author; + } hurd2; + struct { + __le16 h_i_reserved1; /* Obsoleted fragment number/size which are removed in ext4 */ + __le16 m_i_file_acl_high; + __u32 m_i_reserved2[2]; + } masix2; + } osd2; /* OS dependent 2 */ + __le16 i_extra_isize; + __le16 i_pad1; + __le32 i_ctime_extra; /* extra Change time (nsec << 2 | epoch) */ + __le32 i_mtime_extra; /* extra Modification time(nsec << 2 | epoch) */ + __le32 i_atime_extra; /* extra Access time (nsec << 2 | epoch) */ + __le32 i_crtime; /* File Creation time */ + __le32 i_crtime_extra; /* extra FileCreationtime (nsec << 2 | epoch) */ + __le32 i_version_hi; /* high 32 bits for 64-bit version */ +}; + + +#define EXT4_EPOCH_BITS 2 +#define EXT4_EPOCH_MASK ((1 << EXT4_EPOCH_BITS) - 1) +#define EXT4_NSEC_MASK (~0UL << EXT4_EPOCH_BITS) + +/* + * Extended fields will fit into an inode if the filesystem was formatted + * with large inodes (-I 256 or larger) and there are not currently any EAs + * consuming all of the available space. For new inodes we always reserve + * enough space for the kernel's known extended fields, but for inodes + * created with an old kernel this might not have been the case. None of + * the extended inode fields is critical for correct filesystem operation. + * This macro checks if a certain field fits in the inode. Note that + * inode-size = GOOD_OLD_INODE_SIZE + i_extra_isize + */ +#define EXT4_FITS_IN_INODE(ext4_inode, einode, field) \ + ((offsetof(typeof(*ext4_inode), field) + \ + sizeof((ext4_inode)->field)) \ + <= (EXT4_GOOD_OLD_INODE_SIZE + \ + (einode)->i_extra_isize)) \ + +static inline __le32 ext4_encode_extra_time(struct timespec *time) +{ + return cpu_to_le32((sizeof(time->tv_sec) > 4 ? + time->tv_sec >> 32 : 0) | + ((time->tv_nsec << 2) & EXT4_NSEC_MASK)); +} + +static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra) +{ + if (sizeof(time->tv_sec) > 4) + time->tv_sec |= (__u64)(le32_to_cpu(extra) & EXT4_EPOCH_MASK) + << 32; + time->tv_nsec = (le32_to_cpu(extra) & EXT4_NSEC_MASK) >> 2; +} + +#define EXT4_INODE_SET_XTIME(xtime, inode, raw_inode) \ +do { \ + (raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(inode)->xtime); \ +} while (0) + +#define EXT4_EINODE_SET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (raw_inode)->xtime = cpu_to_le32((einode)->xtime.tv_sec); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + (raw_inode)->xtime ## _extra = \ + ext4_encode_extra_time(&(einode)->xtime); \ +} while (0) + +#define EXT4_INODE_GET_XTIME(xtime, inode, raw_inode) \ +do { \ + (inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) \ + ext4_decode_extra_time(&(inode)->xtime, \ + raw_inode->xtime ## _extra); \ +} while (0) + +#define EXT4_EINODE_GET_XTIME(xtime, einode, raw_inode) \ +do { \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime)) \ + (einode)->xtime.tv_sec = \ + (signed)le32_to_cpu((raw_inode)->xtime); \ + if (EXT4_FITS_IN_INODE(raw_inode, einode, xtime ## _extra)) \ + ext4_decode_extra_time(&(einode)->xtime, \ + raw_inode->xtime ## _extra); \ +} while (0) + +#define i_disk_version osd1.linux1.l_i_version + +#if defined(__KERNEL__) || defined(__linux__) +#define i_reserved1 osd1.linux1.l_i_reserved1 +#define i_file_acl_high osd2.linux2.l_i_file_acl_high +#define i_blocks_high osd2.linux2.l_i_blocks_high +#define i_uid_low i_uid +#define i_gid_low i_gid +#define i_uid_high osd2.linux2.l_i_uid_high +#define i_gid_high osd2.linux2.l_i_gid_high +#define i_reserved2 osd2.linux2.l_i_reserved2 + +#elif defined(__GNU__) + +#define i_translator osd1.hurd1.h_i_translator +#define i_uid_high osd2.hurd2.h_i_uid_high +#define i_gid_high osd2.hurd2.h_i_gid_high +#define i_author osd2.hurd2.h_i_author + +#elif defined(__masix__) + +#define i_reserved1 osd1.masix1.m_i_reserved1 +#define i_file_acl_high osd2.masix2.m_i_file_acl_high +#define i_reserved2 osd2.masix2.m_i_reserved2 + +#endif /* defined(__KERNEL__) || defined(__linux__) */ + +/* + * File system states + */ +#define EXT4_VALID_FS 0x0001 /* Unmounted cleanly */ +#define EXT4_ERROR_FS 0x0002 /* Errors detected */ +#define EXT4_ORPHAN_FS 0x0004 /* Orphans being recovered */ + +/* + * Misc. filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + +/* + * Mount flags + */ +#define EXT4_MOUNT_CHECK 0x00001 /* Do mount-time checks */ +#define EXT4_MOUNT_OLDALLOC 0x00002 /* Don't use the new Orlov allocator */ +#define EXT4_MOUNT_GRPID 0x00004 /* Create files with directory's group */ +#define EXT4_MOUNT_DEBUG 0x00008 /* Some debugging messages */ +#define EXT4_MOUNT_ERRORS_CONT 0x00010 /* Continue on errors */ +#define EXT4_MOUNT_ERRORS_RO 0x00020 /* Remount fs ro on errors */ +#define EXT4_MOUNT_ERRORS_PANIC 0x00040 /* Panic on errors */ +#define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ +#define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#define EXT4_MOUNT_ABORT 0x00200 /* Fatal error detected */ +#define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ +#define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ +#define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ +#define EXT4_MOUNT_WRITEBACK_DATA 0x00C00 /* No data ordering */ +#define EXT4_MOUNT_UPDATE_JOURNAL 0x01000 /* Update the journal format */ +#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */ +#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */ +#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ +#define EXT4_MOUNT_RESERVATION 0x10000 /* Preallocation */ +#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ +#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */ +#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ +#define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ +#define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ +#define EXT4_MOUNT_EXTENTS 0x400000 /* Extents support */ +#define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ +#define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ +#define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ +#define EXT4_MOUNT_MBALLOC 0x4000000 /* Buddy allocation support */ +/* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ +#ifndef _LINUX_EXT2_FS_H +#define clear_opt(o, opt) o &= ~EXT4_MOUNT_##opt +#define set_opt(o, opt) o |= EXT4_MOUNT_##opt +#define test_opt(sb, opt) (EXT4_SB(sb)->s_mount_opt & \ + EXT4_MOUNT_##opt) +#else +#define EXT2_MOUNT_NOLOAD EXT4_MOUNT_NOLOAD +#define EXT2_MOUNT_ABORT EXT4_MOUNT_ABORT +#define EXT2_MOUNT_DATA_FLAGS EXT4_MOUNT_DATA_FLAGS +#endif + +#define ext4_set_bit ext2_set_bit +#define ext4_set_bit_atomic ext2_set_bit_atomic +#define ext4_clear_bit ext2_clear_bit +#define ext4_clear_bit_atomic ext2_clear_bit_atomic +#define ext4_test_bit ext2_test_bit +#define ext4_find_first_zero_bit ext2_find_first_zero_bit +#define ext4_find_next_zero_bit ext2_find_next_zero_bit +#define ext4_find_next_bit ext2_find_next_bit + +/* + * Maximal mount counts between two filesystem checks + */ +#define EXT4_DFL_MAX_MNT_COUNT 20 /* Allow 20 mounts */ +#define EXT4_DFL_CHECKINTERVAL 0 /* Don't use interval check */ + +/* + * Behaviour when detecting errors + */ +#define EXT4_ERRORS_CONTINUE 1 /* Continue execution */ +#define EXT4_ERRORS_RO 2 /* Remount fs read-only */ +#define EXT4_ERRORS_PANIC 3 /* Panic */ +#define EXT4_ERRORS_DEFAULT EXT4_ERRORS_CONTINUE + +/* + * Structure of the super block + */ +struct ext4_super_block { +/*00*/ __le32 s_inodes_count; /* Inodes count */ + __le32 s_blocks_count_lo; /* Blocks count */ + __le32 s_r_blocks_count_lo; /* Reserved blocks count */ + __le32 s_free_blocks_count_lo; /* Free blocks count */ +/*10*/ __le32 s_free_inodes_count; /* Free inodes count */ + __le32 s_first_data_block; /* First Data Block */ + __le32 s_log_block_size; /* Block size */ + __le32 s_obso_log_frag_size; /* Obsoleted fragment size */ +/*20*/ __le32 s_blocks_per_group; /* # Blocks per group */ + __le32 s_obso_frags_per_group; /* Obsoleted fragments per group */ + __le32 s_inodes_per_group; /* # Inodes per group */ + __le32 s_mtime; /* Mount time */ +/*30*/ __le32 s_wtime; /* Write time */ + __le16 s_mnt_count; /* Mount count */ + __le16 s_max_mnt_count; /* Maximal mount count */ + __le16 s_magic; /* Magic signature */ + __le16 s_state; /* File system state */ + __le16 s_errors; /* Behaviour when detecting errors */ + __le16 s_minor_rev_level; /* minor revision level */ +/*40*/ __le32 s_lastcheck; /* time of last check */ + __le32 s_checkinterval; /* max. time between checks */ + __le32 s_creator_os; /* OS */ + __le32 s_rev_level; /* Revision level */ +/*50*/ __le16 s_def_resuid; /* Default uid for reserved blocks */ + __le16 s_def_resgid; /* Default gid for reserved blocks */ + /* + * These fields are for EXT4_DYNAMIC_REV superblocks only. + * + * Note: the difference between the compatible feature set and + * the incompatible feature set is that if there is a bit set + * in the incompatible feature set that the kernel doesn't + * know about, it should refuse to mount the filesystem. + * + * e2fsck's requirements are more strict; if it doesn't know + * about a feature in either the compatible or incompatible + * feature set, it must abort and not try to meddle with + * things it doesn't understand... + */ + __le32 s_first_ino; /* First non-reserved inode */ + __le16 s_inode_size; /* size of inode structure */ + __le16 s_block_group_nr; /* block group # of this superblock */ + __le32 s_feature_compat; /* compatible feature set */ +/*60*/ __le32 s_feature_incompat; /* incompatible feature set */ + __le32 s_feature_ro_compat; /* readonly-compatible feature set */ +/*68*/ __u8 s_uuid[16]; /* 128-bit uuid for volume */ +/*78*/ char s_volume_name[16]; /* volume name */ +/*88*/ char s_last_mounted[64]; /* directory where last mounted */ +/*C8*/ __le32 s_algorithm_usage_bitmap; /* For compression */ + /* + * Performance hints. Directory preallocation should only + * happen if the EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on. + */ + __u8 s_prealloc_blocks; /* Nr of blocks to try to preallocate*/ + __u8 s_prealloc_dir_blocks; /* Nr to preallocate for dirs */ + __le16 s_reserved_gdt_blocks; /* Per group desc for online growth */ + /* + * Journaling support valid if EXT4_FEATURE_COMPAT_HAS_JOURNAL set. + */ +/*D0*/ __u8 s_journal_uuid[16]; /* uuid of journal superblock */ +/*E0*/ __le32 s_journal_inum; /* inode number of journal file */ + __le32 s_journal_dev; /* device number of journal file */ + __le32 s_last_orphan; /* start of list of inodes to delete */ + __le32 s_hash_seed[4]; /* HTREE hash seed */ + __u8 s_def_hash_version; /* Default hash version to use */ + __u8 s_reserved_char_pad; + __le16 s_desc_size; /* size of group descriptor */ +/*100*/ __le32 s_default_mount_opts; + __le32 s_first_meta_bg; /* First metablock block group */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u32 s_reserved[163]; /* Padding to the end of the block */ +}; + +#ifdef __KERNEL__ +static inline struct ext4_sb_info * EXT4_SB(struct super_block *sb) +{ + return sb->s_fs_info; +} +static inline struct ext4_inode_info *EXT4_I(struct inode *inode) +{ + return container_of(inode, struct ext4_inode_info, vfs_inode); +} + +static inline struct timespec ext4_current_time(struct inode *inode) +{ + return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? + current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; +} + + +static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) +{ + return ino == EXT4_ROOT_INO || + ino == EXT4_JOURNAL_INO || + ino == EXT4_RESIZE_INO || + (ino >= EXT4_FIRST_INO(sb) && + ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); +} +#else +/* Assume that user mode programs are passing in an ext4fs superblock, not + * a kernel struct super_block. This will allow us to call the feature-test + * macros from user land. */ +#define EXT4_SB(sb) (sb) +#endif + +#define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime + +/* + * Codes for operating systems + */ +#define EXT4_OS_LINUX 0 +#define EXT4_OS_HURD 1 +#define EXT4_OS_MASIX 2 +#define EXT4_OS_FREEBSD 3 +#define EXT4_OS_LITES 4 + +/* + * Revision levels + */ +#define EXT4_GOOD_OLD_REV 0 /* The good old (original) format */ +#define EXT4_DYNAMIC_REV 1 /* V2 format w/ dynamic inode sizes */ + +#define EXT4_CURRENT_REV EXT4_GOOD_OLD_REV +#define EXT4_MAX_SUPP_REV EXT4_DYNAMIC_REV + +#define EXT4_GOOD_OLD_INODE_SIZE 128 + +/* + * Feature set definitions + */ + +#define EXT4_HAS_COMPAT_FEATURE(sb,mask) \ + ( EXT4_SB(sb)->s_es->s_feature_compat & cpu_to_le32(mask) ) +#define EXT4_HAS_RO_COMPAT_FEATURE(sb,mask) \ + ( EXT4_SB(sb)->s_es->s_feature_ro_compat & cpu_to_le32(mask) ) +#define EXT4_HAS_INCOMPAT_FEATURE(sb,mask) \ + ( EXT4_SB(sb)->s_es->s_feature_incompat & cpu_to_le32(mask) ) +#define EXT4_SET_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_compat |= cpu_to_le32(mask) +#define EXT4_SET_RO_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_ro_compat |= cpu_to_le32(mask) +#define EXT4_SET_INCOMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_incompat |= cpu_to_le32(mask) +#define EXT4_CLEAR_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_compat &= ~cpu_to_le32(mask) +#define EXT4_CLEAR_RO_COMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_ro_compat &= ~cpu_to_le32(mask) +#define EXT4_CLEAR_INCOMPAT_FEATURE(sb,mask) \ + EXT4_SB(sb)->s_es->s_feature_incompat &= ~cpu_to_le32(mask) + +#define EXT4_FEATURE_COMPAT_DIR_PREALLOC 0x0001 +#define EXT4_FEATURE_COMPAT_IMAGIC_INODES 0x0002 +#define EXT4_FEATURE_COMPAT_HAS_JOURNAL 0x0004 +#define EXT4_FEATURE_COMPAT_EXT_ATTR 0x0008 +#define EXT4_FEATURE_COMPAT_RESIZE_INODE 0x0010 +#define EXT4_FEATURE_COMPAT_DIR_INDEX 0x0020 + +#define EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER 0x0001 +#define EXT4_FEATURE_RO_COMPAT_LARGE_FILE 0x0002 +#define EXT4_FEATURE_RO_COMPAT_BTREE_DIR 0x0004 +#define EXT4_FEATURE_RO_COMPAT_HUGE_FILE 0x0008 +#define EXT4_FEATURE_RO_COMPAT_GDT_CSUM 0x0010 +#define EXT4_FEATURE_RO_COMPAT_DIR_NLINK 0x0020 +#define EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE 0x0040 + +#define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 +#define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 +#define EXT4_FEATURE_INCOMPAT_RECOVER 0x0004 /* Needs recovery */ +#define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV 0x0008 /* Journal device */ +#define EXT4_FEATURE_INCOMPAT_META_BG 0x0010 +#define EXT4_FEATURE_INCOMPAT_EXTENTS 0x0040 /* extents support */ +#define EXT4_FEATURE_INCOMPAT_64BIT 0x0080 +#define EXT4_FEATURE_INCOMPAT_MMP 0x0100 +#define EXT4_FEATURE_INCOMPAT_FLEX_BG 0x0200 + +#define EXT4_FEATURE_COMPAT_SUPP EXT2_FEATURE_COMPAT_EXT_ATTR +#define EXT4_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ + EXT4_FEATURE_INCOMPAT_RECOVER| \ + EXT4_FEATURE_INCOMPAT_META_BG| \ + EXT4_FEATURE_INCOMPAT_EXTENTS| \ + EXT4_FEATURE_INCOMPAT_64BIT| \ + EXT4_FEATURE_INCOMPAT_FLEX_BG) +#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ + EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ + EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_BTREE_DIR |\ + EXT4_FEATURE_RO_COMPAT_HUGE_FILE) + +/* + * Default values for user and/or group using reserved blocks + */ +#define EXT4_DEF_RESUID 0 +#define EXT4_DEF_RESGID 0 + +/* + * Default mount options + */ +#define EXT4_DEFM_DEBUG 0x0001 +#define EXT4_DEFM_BSDGROUPS 0x0002 +#define EXT4_DEFM_XATTR_USER 0x0004 +#define EXT4_DEFM_ACL 0x0008 +#define EXT4_DEFM_UID16 0x0010 +#define EXT4_DEFM_JMODE 0x0060 +#define EXT4_DEFM_JMODE_DATA 0x0020 +#define EXT4_DEFM_JMODE_ORDERED 0x0040 +#define EXT4_DEFM_JMODE_WBACK 0x0060 + +/* + * Structure of a directory entry + */ +#define EXT4_NAME_LEN 255 + +struct ext4_dir_entry { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __le16 name_len; /* Name length */ + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * The new version of the directory entry. Since EXT4 structures are + * stored in intel byte order, and the name_len field could never be + * bigger than 255 chars, it's safe to reclaim the extra byte for the + * file_type field. + */ +struct ext4_dir_entry_2 { + __le32 inode; /* Inode number */ + __le16 rec_len; /* Directory entry length */ + __u8 name_len; /* Name length */ + __u8 file_type; + char name[EXT4_NAME_LEN]; /* File name */ +}; + +/* + * Ext4 directory file types. Only the low 3 bits are used. The + * other bits are reserved for now. + */ +#define EXT4_FT_UNKNOWN 0 +#define EXT4_FT_REG_FILE 1 +#define EXT4_FT_DIR 2 +#define EXT4_FT_CHRDEV 3 +#define EXT4_FT_BLKDEV 4 +#define EXT4_FT_FIFO 5 +#define EXT4_FT_SOCK 6 +#define EXT4_FT_SYMLINK 7 + +#define EXT4_FT_MAX 8 + +/* + * EXT4_DIR_PAD defines the directory entries boundaries + * + * NOTE: It must be a multiple of 4 + */ +#define EXT4_DIR_PAD 4 +#define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) +#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ + ~EXT4_DIR_ROUND) +#define EXT4_MAX_REC_LEN ((1<<16)-1) + +static inline unsigned ext4_rec_len_from_disk(__le16 dlen) +{ + unsigned len = le16_to_cpu(dlen); + + if (len == EXT4_MAX_REC_LEN) + return 1 << 16; + return len; +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len) +{ + if (len == (1 << 16)) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else if (len > (1 << 16)) + BUG(); + return cpu_to_le16(len); +} + +/* + * Hash Tree Directory indexing + * (c) Daniel Phillips, 2001 + */ + +#define is_dx(dir) (EXT4_HAS_COMPAT_FEATURE(dir->i_sb, \ + EXT4_FEATURE_COMPAT_DIR_INDEX) && \ + (EXT4_I(dir)->i_flags & EXT4_INDEX_FL)) +#define EXT4_DIR_LINK_MAX(dir) (!is_dx(dir) && (dir)->i_nlink >= EXT4_LINK_MAX) +#define EXT4_DIR_LINK_EMPTY(dir) ((dir)->i_nlink == 2 || (dir)->i_nlink == 1) + +/* Legal values for the dx_root hash_version field: */ + +#define DX_HASH_LEGACY 0 +#define DX_HASH_HALF_MD4 1 +#define DX_HASH_TEA 2 + +#ifdef __KERNEL__ + +/* hash info structure used by the directory hash */ +struct dx_hash_info +{ + u32 hash; + u32 minor_hash; + int hash_version; + u32 *seed; +}; + +#define EXT4_HTREE_EOF 0x7fffffff + +/* + * Control parameters used by ext4_htree_next_block + */ +#define HASH_NB_ALWAYS 1 + + +/* + * Describe an inode's exact location on disk and in memory + */ +struct ext4_iloc +{ + struct buffer_head *bh; + unsigned long offset; + ext4_group_t block_group; +}; + +static inline struct ext4_inode *ext4_raw_inode(struct ext4_iloc *iloc) +{ + return (struct ext4_inode *) (iloc->bh->b_data + iloc->offset); +} + +/* + * This structure is stuffed into the struct file's private_data field + * for directories. It is where we put information so that we can do + * readdir operations in hash tree order. + */ +struct dir_private_info { + struct rb_root root; + struct rb_node *curr_node; + struct fname *extra_fname; + loff_t last_pos; + __u32 curr_hash; + __u32 curr_minor_hash; + __u32 next_hash; +}; + +/* calculate the first block number of the group */ +static inline ext4_fsblk_t +ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no) +{ + return group_no * (ext4_fsblk_t)EXT4_BLOCKS_PER_GROUP(sb) + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); +} + +/* + * Special error return code only used by dx_probe() and its callers. + */ +#define ERR_BAD_DX_DIR -75000 + +void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr, + unsigned long *blockgrpp, ext4_grpblk_t *offsetp); + +/* + * Function prototypes + */ + +/* + * Ok, these declarations are also in <linux/kernel.h> but none of the + * ext4 source programs needs to include it so they are duplicated here. + */ +# define NORET_TYPE /**/ +# define ATTRIB_NORET __attribute__((noreturn)) +# define NORET_AND noreturn, + +/* balloc.c */ +extern unsigned int ext4_block_group(struct super_block *sb, + ext4_fsblk_t blocknr); +extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb, + ext4_fsblk_t blocknr); +extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group); +extern unsigned long ext4_bg_num_gdb(struct super_block *sb, + ext4_group_t group); +extern ext4_fsblk_t ext4_new_block (handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, int *errp); +extern ext4_fsblk_t ext4_new_blocks (handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, unsigned long *count, int *errp); +extern ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, + ext4_fsblk_t goal, unsigned long *count, int *errp); +extern void ext4_free_blocks (handle_t *handle, struct inode *inode, + ext4_fsblk_t block, unsigned long count, int metadata); +extern void ext4_free_blocks_sb (handle_t *handle, struct super_block *sb, + ext4_fsblk_t block, unsigned long count, + unsigned long *pdquot_freed_blocks); +extern ext4_fsblk_t ext4_count_free_blocks (struct super_block *); +extern void ext4_check_blocks_bitmap (struct super_block *); +extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, + ext4_group_t block_group, + struct buffer_head ** bh); +extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); +extern void ext4_init_block_alloc_info(struct inode *); +extern void ext4_rsv_window_add(struct super_block *sb, struct ext4_reserve_window_node *rsv); + +/* dir.c */ +extern int ext4_check_dir_entry(const char *, struct inode *, + struct ext4_dir_entry_2 *, + struct buffer_head *, unsigned long); +extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, + __u32 minor_hash, + struct ext4_dir_entry_2 *dirent); +extern void ext4_htree_free_dir_info(struct dir_private_info *p); + +/* fsync.c */ +extern int ext4_sync_file (struct file *, struct dentry *, int); + +/* hash.c */ +extern int ext4fs_dirhash(const char *name, int len, struct + dx_hash_info *hinfo); + +/* ialloc.c */ +extern struct inode * ext4_new_inode (handle_t *, struct inode *, int); +extern void ext4_free_inode (handle_t *, struct inode *); +extern struct inode * ext4_orphan_get (struct super_block *, unsigned long); +extern unsigned long ext4_count_free_inodes (struct super_block *); +extern unsigned long ext4_count_dirs (struct super_block *); +extern void ext4_check_inodes_bitmap (struct super_block *); +extern unsigned long ext4_count_free (struct buffer_head *, unsigned); + +/* mballoc.c */ +extern long ext4_mb_stats; +extern long ext4_mb_max_to_scan; +extern int ext4_mb_init(struct super_block *, int); +extern int ext4_mb_release(struct super_block *); +extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, + struct ext4_allocation_request *, int *); +extern int ext4_mb_reserve_blocks(struct super_block *, int); +extern void ext4_mb_discard_inode_preallocations(struct inode *); +extern int __init init_ext4_mballoc(void); +extern void exit_ext4_mballoc(void); +extern void ext4_mb_free_blocks(handle_t *, struct inode *, + unsigned long, unsigned long, int, unsigned long *); + + +/* inode.c */ +int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr); +struct buffer_head *ext4_getblk(handle_t *, struct inode *, + ext4_lblk_t, int, int *); +struct buffer_head *ext4_bread(handle_t *, struct inode *, + ext4_lblk_t, int, int *); +int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, unsigned long maxblocks, + struct buffer_head *bh_result, + int create, int extend_disksize); + +extern struct inode *ext4_iget(struct super_block *, unsigned long); +extern int ext4_write_inode (struct inode *, int); +extern int ext4_setattr (struct dentry *, struct iattr *); +extern void ext4_delete_inode (struct inode *); +extern int ext4_sync_inode (handle_t *, struct inode *); +extern void ext4_discard_reservation (struct inode *); +extern void ext4_dirty_inode(struct inode *); +extern int ext4_change_inode_journal_flag(struct inode *, int); +extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); +extern void ext4_truncate (struct inode *); +extern void ext4_set_inode_flags(struct inode *); +extern void ext4_get_inode_flags(struct ext4_inode_info *); +extern void ext4_set_aops(struct inode *inode); +extern int ext4_writepage_trans_blocks(struct inode *); +extern int ext4_block_truncate_page(handle_t *handle, struct page *page, + struct address_space *mapping, loff_t from); + +/* ioctl.c */ +extern long ext4_ioctl(struct file *, unsigned int, unsigned long); +extern long ext4_compat_ioctl (struct file *, unsigned int, unsigned long); + +/* migrate.c */ +extern int ext4_ext_migrate(struct inode *, struct file *, unsigned int, + unsigned long); +/* namei.c */ +extern int ext4_orphan_add(handle_t *, struct inode *); +extern int ext4_orphan_del(handle_t *, struct inode *); +extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, + __u32 start_minor_hash, __u32 *next_hash); + +/* resize.c */ +extern int ext4_group_add(struct super_block *sb, + struct ext4_new_group_data *input); +extern int ext4_group_extend(struct super_block *sb, + struct ext4_super_block *es, + ext4_fsblk_t n_blocks_count); + +/* super.c */ +extern void ext4_error (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void __ext4_std_error (struct super_block *, const char *, int); +extern void ext4_abort (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void ext4_warning (struct super_block *, const char *, const char *, ...) + __attribute__ ((format (printf, 3, 4))); +extern void ext4_update_dynamic_rev (struct super_block *sb); +extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, + __u32 compat); +extern int ext4_update_rocompat_feature(handle_t *handle, + struct super_block *sb, __u32 rocompat); +extern int ext4_update_incompat_feature(handle_t *handle, + struct super_block *sb, __u32 incompat); +extern ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb, + struct ext4_group_desc *bg); +extern ext4_fsblk_t ext4_inode_table(struct super_block *sb, + struct ext4_group_desc *bg); +extern void ext4_block_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_bitmap_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); +extern void ext4_inode_table_set(struct super_block *sb, + struct ext4_group_desc *bg, ext4_fsblk_t blk); + +static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | + le32_to_cpu(es->s_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_r_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_r_blocks_count_hi) << 32) | + le32_to_cpu(es->s_r_blocks_count_lo); +} + +static inline ext4_fsblk_t ext4_free_blocks_count(struct ext4_super_block *es) +{ + return ((ext4_fsblk_t)le32_to_cpu(es->s_free_blocks_count_hi) << 32) | + le32_to_cpu(es->s_free_blocks_count_lo); +} + +static inline void ext4_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_free_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_free_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_free_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline void ext4_r_blocks_count_set(struct ext4_super_block *es, + ext4_fsblk_t blk) +{ + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); +} + +static inline loff_t ext4_isize(struct ext4_inode *raw_inode) +{ + return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | + le32_to_cpu(raw_inode->i_size_lo); +} + +static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) +{ + raw_inode->i_size_lo = cpu_to_le32(i_size); + raw_inode->i_size_high = cpu_to_le32(i_size >> 32); +} + +static inline +struct ext4_group_info *ext4_get_group_info(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info ***grp_info; + long indexv, indexh; + grp_info = EXT4_SB(sb)->s_group_info; + indexv = group >> (EXT4_DESC_PER_BLOCK_BITS(sb)); + indexh = group & ((EXT4_DESC_PER_BLOCK(sb)) - 1); + return grp_info[indexv][indexh]; +} + + +#define ext4_std_error(sb, errno) \ +do { \ + if ((errno)) \ + __ext4_std_error((sb), __FUNCTION__, (errno)); \ +} while (0) + +/* + * Inodes and files operations + */ + +/* dir.c */ +extern const struct file_operations ext4_dir_operations; + +/* file.c */ +extern const struct inode_operations ext4_file_inode_operations; +extern const struct file_operations ext4_file_operations; + +/* namei.c */ +extern const struct inode_operations ext4_dir_inode_operations; +extern const struct inode_operations ext4_special_inode_operations; + +/* symlink.c */ +extern const struct inode_operations ext4_symlink_inode_operations; +extern const struct inode_operations ext4_fast_symlink_inode_operations; + +/* extents.c */ +extern int ext4_ext_tree_init(handle_t *handle, struct inode *); +extern int ext4_ext_writepage_trans_blocks(struct inode *, int); +extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, + ext4_lblk_t iblock, + unsigned long max_blocks, struct buffer_head *bh_result, + int create, int extend_disksize); +extern void ext4_ext_truncate(struct inode *, struct page *); +extern void ext4_ext_init(struct super_block *); +extern void ext4_ext_release(struct super_block *); +extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, + loff_t len); +extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, + sector_t block, unsigned long max_blocks, + struct buffer_head *bh, int create, + int extend_disksize); +#endif /* __KERNEL__ */ + +#endif /* _EXT4_H */ diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h new file mode 100644 index 00000000000..75333b595fa --- /dev/null +++ b/fs/ext4/ext4_extents.h @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com + * Written by Alex Tomas <alex@clusterfs.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public Licens + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111- + */ + +#ifndef _EXT4_EXTENTS +#define _EXT4_EXTENTS + +#include "ext4.h" + +/* + * With AGGRESSIVE_TEST defined, the capacity of index/leaf blocks + * becomes very small, so index split, in-depth growing and + * other hard changes happen much more often. + * This is for debug purposes only. + */ +#define AGGRESSIVE_TEST_ + +/* + * With EXTENTS_STATS defined, the number of blocks and extents + * are collected in the truncate path. They'll be shown at + * umount time. + */ +#define EXTENTS_STATS__ + +/* + * If CHECK_BINSEARCH is defined, then the results of the binary search + * will also be checked by linear search. + */ +#define CHECK_BINSEARCH__ + +/* + * If EXT_DEBUG is defined you can use the 'extdebug' mount option + * to get lots of info about what's going on. + */ +#define EXT_DEBUG__ +#ifdef EXT_DEBUG +#define ext_debug(a...) printk(a) +#else +#define ext_debug(a...) +#endif + +/* + * If EXT_STATS is defined then stats numbers are collected. + * These number will be displayed at umount time. + */ +#define EXT_STATS_ + + +/* + * ext4_inode has i_block array (60 bytes total). + * The first 12 bytes store ext4_extent_header; + * the remainder stores an array of ext4_extent. + */ + +/* + * This is the extent on-disk structure. + * It's used at the bottom of the tree. + */ +struct ext4_extent { + __le32 ee_block; /* first logical block extent covers */ + __le16 ee_len; /* number of blocks covered by extent */ + __le16 ee_start_hi; /* high 16 bits of physical block */ + __le32 ee_start_lo; /* low 32 bits of physical block */ +}; + +/* + * This is index on-disk structure. + * It's used at all the levels except the bottom. + */ +struct ext4_extent_idx { + __le32 ei_block; /* index covers logical blocks from 'block' */ + __le32 ei_leaf_lo; /* pointer to the physical block of the next * + * level. leaf or next index could be there */ + __le16 ei_leaf_hi; /* high 16 bits of physical block */ + __u16 ei_unused; +}; + +/* + * Each block (leaves and indexes), even inode-stored has header. + */ +struct ext4_extent_header { + __le16 eh_magic; /* probably will support different formats */ + __le16 eh_entries; /* number of valid entries */ + __le16 eh_max; /* capacity of store in entries */ + __le16 eh_depth; /* has tree real underlying blocks? */ + __le32 eh_generation; /* generation of the tree */ +}; + +#define EXT4_EXT_MAGIC cpu_to_le16(0xf30a) + +/* + * Array of ext4_ext_path contains path to some extent. + * Creation/lookup routines use it for traversal/splitting/etc. + * Truncate uses it to simulate recursive walking. + */ +struct ext4_ext_path { + ext4_fsblk_t p_block; + __u16 p_depth; + struct ext4_extent *p_ext; + struct ext4_extent_idx *p_idx; + struct ext4_extent_header *p_hdr; + struct buffer_head *p_bh; +}; + +/* + * structure for external API + */ + +#define EXT4_EXT_CACHE_NO 0 +#define EXT4_EXT_CACHE_GAP 1 +#define EXT4_EXT_CACHE_EXTENT 2 + + +#define EXT_MAX_BLOCK 0xffffffff + +/* + * EXT_INIT_MAX_LEN is the maximum number of blocks we can have in an + * initialized extent. This is 2^15 and not (2^16 - 1), since we use the + * MSB of ee_len field in the extent datastructure to signify if this + * particular extent is an initialized extent or an uninitialized (i.e. + * preallocated). + * EXT_UNINIT_MAX_LEN is the maximum number of blocks we can have in an + * uninitialized extent. + * If ee_len is <= 0x8000, it is an initialized extent. Otherwise, it is an + * uninitialized one. In other words, if MSB of ee_len is set, it is an + * uninitialized extent with only one special scenario when ee_len = 0x8000. + * In this case we can not have an uninitialized extent of zero length and + * thus we make it as a special case of initialized extent with 0x8000 length. + * This way we get better extent-to-group alignment for initialized extents. + * Hence, the maximum number of blocks we can have in an *initialized* + * extent is 2^15 (32768) and in an *uninitialized* extent is 2^15-1 (32767). + */ +#define EXT_INIT_MAX_LEN (1UL << 15) +#define EXT_UNINIT_MAX_LEN (EXT_INIT_MAX_LEN - 1) + + +#define EXT_FIRST_EXTENT(__hdr__) \ + ((struct ext4_extent *) (((char *) (__hdr__)) + \ + sizeof(struct ext4_extent_header))) +#define EXT_FIRST_INDEX(__hdr__) \ + ((struct ext4_extent_idx *) (((char *) (__hdr__)) + \ + sizeof(struct ext4_extent_header))) +#define EXT_HAS_FREE_INDEX(__path__) \ + (le16_to_cpu((__path__)->p_hdr->eh_entries) \ + < le16_to_cpu((__path__)->p_hdr->eh_max)) +#define EXT_LAST_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_LAST_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) +#define EXT_MAX_EXTENT(__hdr__) \ + (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) +#define EXT_MAX_INDEX(__hdr__) \ + (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) + +static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) +{ + return (struct ext4_extent_header *) EXT4_I(inode)->i_data; +} + +static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh) +{ + return (struct ext4_extent_header *) bh->b_data; +} + +static inline unsigned short ext_depth(struct inode *inode) +{ + return le16_to_cpu(ext_inode_hdr(inode)->eh_depth); +} + +static inline void ext4_ext_tree_changed(struct inode *inode) +{ + EXT4_I(inode)->i_ext_generation++; +} + +static inline void +ext4_ext_invalidate_cache(struct inode *inode) +{ + EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO; +} + +static inline void ext4_ext_mark_uninitialized(struct ext4_extent *ext) +{ + /* We can not have an uninitialized extent of zero length! */ + BUG_ON((le16_to_cpu(ext->ee_len) & ~EXT_INIT_MAX_LEN) == 0); + ext->ee_len |= cpu_to_le16(EXT_INIT_MAX_LEN); +} + +static inline int ext4_ext_is_uninitialized(struct ext4_extent *ext) +{ + /* Extent with ee_len of 0x8000 is treated as an initialized extent */ + return (le16_to_cpu(ext->ee_len) > EXT_INIT_MAX_LEN); +} + +static inline int ext4_ext_get_actual_len(struct ext4_extent *ext) +{ + return (le16_to_cpu(ext->ee_len) <= EXT_INIT_MAX_LEN ? + le16_to_cpu(ext->ee_len) : + (le16_to_cpu(ext->ee_len) - EXT_INIT_MAX_LEN)); +} + +extern ext4_fsblk_t idx_pblock(struct ext4_extent_idx *); +extern void ext4_ext_store_pblock(struct ext4_extent *, ext4_fsblk_t); +extern int ext4_extent_tree_init(handle_t *, struct inode *); +extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *); +extern int ext4_ext_try_to_merge(struct inode *inode, + struct ext4_ext_path *path, + struct ext4_extent *); +extern unsigned int ext4_ext_check_overlap(struct inode *, struct ext4_extent *, struct ext4_ext_path *); +extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *); +extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path *); +extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *, + ext4_lblk_t *, ext4_fsblk_t *); +extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *, + ext4_lblk_t *, ext4_fsblk_t *); +extern void ext4_ext_drop_refs(struct ext4_ext_path *); +#endif /* _EXT4_EXTENTS */ + diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h new file mode 100644 index 00000000000..26a4ae255d7 --- /dev/null +++ b/fs/ext4/ext4_i.h @@ -0,0 +1,167 @@ +/* + * ext4_i.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs_i.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_I +#define _EXT4_I + +#include <linux/rwsem.h> +#include <linux/rbtree.h> +#include <linux/seqlock.h> +#include <linux/mutex.h> + +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned long ext4_group_t; + +struct ext4_reserve_window { + ext4_fsblk_t _rsv_start; /* First byte reserved */ + ext4_fsblk_t _rsv_end; /* Last byte reserved or 0 */ +}; + +struct ext4_reserve_window_node { + struct rb_node rsv_node; + __u32 rsv_goal_size; + __u32 rsv_alloc_hit; + struct ext4_reserve_window rsv_window; +}; + +struct ext4_block_alloc_info { + /* information about reservation window */ + struct ext4_reserve_window_node rsv_window_node; + /* + * was i_next_alloc_block in ext4_inode_info + * is the logical (file-relative) number of the + * most-recently-allocated block in this file. + * We use this for detecting linearly ascending allocation requests. + */ + ext4_lblk_t last_alloc_logical_block; + /* + * Was i_next_alloc_goal in ext4_inode_info + * is the *physical* companion to i_next_alloc_block. + * it the physical block number of the block which was most-recentl + * allocated to this file. This give us the goal (target) for the next + * allocation when we detect linearly ascending requests. + */ + ext4_fsblk_t last_alloc_physical_block; +}; + +#define rsv_start rsv_window._rsv_start +#define rsv_end rsv_window._rsv_end + +/* + * storage for cached extent + */ +struct ext4_ext_cache { + ext4_fsblk_t ec_start; + ext4_lblk_t ec_block; + __u32 ec_len; /* must be 32bit to return holes */ + __u32 ec_type; +}; + +/* + * third extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_flags; + ext4_fsblk_t i_file_acl; + __u32 i_dtime; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is ued for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + __u32 i_state; /* Dynamic state flags for ext4 */ + + /* block reservation info */ + struct ext4_block_alloc_info *i_block_alloc_info; + + ext4_lblk_t i_dir_start_lookup; +#ifdef CONFIG_EXT4DEV_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif +#ifdef CONFIG_EXT4DEV_FS_POSIX_ACL + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; +#endif + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). + */ + loff_t i_disksize; + + /* on-disk additional length */ + __u16 i_extra_isize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + struct inode vfs_inode; + + unsigned long i_ext_generation; + struct ext4_ext_cache i_cached_extent; + /* + * File creation time. Its function is same as that of + * struct timespec i_{a,c,m}time in the generic inode. + */ + struct timespec i_crtime; + + /* mballoc */ + struct list_head i_prealloc_list; + spinlock_t i_prealloc_lock; +}; + +#endif /* _EXT4_I */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index d6afe4e2734..c75384b34f2 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -2,14 +2,14 @@ * Interface between ext4 and JBD */ -#include <linux/ext4_jbd2.h> +#include "ext4_jbd2.h" int __ext4_journal_get_undo_access(const char *where, handle_t *handle, struct buffer_head *bh) { int err = jbd2_journal_get_undo_access(handle, bh); if (err) - ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext4_journal_abort_handle(where, __func__, bh, handle, err); return err; } @@ -18,7 +18,7 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle, { int err = jbd2_journal_get_write_access(handle, bh); if (err) - ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext4_journal_abort_handle(where, __func__, bh, handle, err); return err; } @@ -27,7 +27,7 @@ int __ext4_journal_forget(const char *where, handle_t *handle, { int err = jbd2_journal_forget(handle, bh); if (err) - ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext4_journal_abort_handle(where, __func__, bh, handle, err); return err; } @@ -36,7 +36,7 @@ int __ext4_journal_revoke(const char *where, handle_t *handle, { int err = jbd2_journal_revoke(handle, blocknr, bh); if (err) - ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext4_journal_abort_handle(where, __func__, bh, handle, err); return err; } @@ -45,7 +45,7 @@ int __ext4_journal_get_create_access(const char *where, { int err = jbd2_journal_get_create_access(handle, bh); if (err) - ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext4_journal_abort_handle(where, __func__, bh, handle, err); return err; } @@ -54,6 +54,6 @@ int __ext4_journal_dirty_metadata(const char *where, { int err = jbd2_journal_dirty_metadata(handle, bh); if (err) - ext4_journal_abort_handle(where, __FUNCTION__, bh, handle,err); + ext4_journal_abort_handle(where, __func__, bh, handle, err); return err; } diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h new file mode 100644 index 00000000000..9255a7d28b2 --- /dev/null +++ b/fs/ext4/ext4_jbd2.h @@ -0,0 +1,231 @@ +/* + * ext4_jbd2.h + * + * Written by Stephen C. Tweedie <sct@redhat.com>, 1999 + * + * Copyright 1998--1999 Red Hat corp --- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + * + * Ext4-specific journaling extensions. + */ + +#ifndef _EXT4_JBD2_H +#define _EXT4_JBD2_H + +#include <linux/fs.h> +#include <linux/jbd2.h> +#include "ext4.h" + +#define EXT4_JOURNAL(inode) (EXT4_SB((inode)->i_sb)->s_journal) + +/* Define the number of blocks we need to account to a transaction to + * modify one block of data. + * + * We may have to touch one inode, one bitmap buffer, up to three + * indirection blocks, the group and superblock summaries, and the data + * block to complete the transaction. + * + * For extents-enabled fs we may have to allocate and modify up to + * 5 levels of tree + root which are stored in the inode. */ + +#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb) \ + (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS) \ + || test_opt(sb, EXTENTS) ? 27U : 8U) + +/* Extended attribute operations touch at most two data buffers, + * two bitmap buffers, and two group summaries, in addition to the inode + * and the superblock, which are already accounted for. */ + +#define EXT4_XATTR_TRANS_BLOCKS 6U + +/* Define the minimum size for a transaction which modifies data. This + * needs to take into account the fact that we may end up modifying two + * quota files too (one for the group, one for the user quota). The + * superblock only gets updated once, of course, so don't bother + * counting that again for the quota updates. */ + +#define EXT4_DATA_TRANS_BLOCKS(sb) (EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \ + EXT4_XATTR_TRANS_BLOCKS - 2 + \ + 2*EXT4_QUOTA_TRANS_BLOCKS(sb)) + +/* Delete operations potentially hit one directory's namespace plus an + * entire inode, plus arbitrary amounts of bitmap/indirection data. Be + * generous. We can grow the delete transaction later if necessary. */ + +#define EXT4_DELETE_TRANS_BLOCKS(sb) (2 * EXT4_DATA_TRANS_BLOCKS(sb) + 64) + +/* Define an arbitrary limit for the amount of data we will anticipate + * writing to any given transaction. For unbounded transactions such as + * write(2) and truncate(2) we can write more than this, but we always + * start off at the maximum transaction size and grow the transaction + * optimistically as we go. */ + +#define EXT4_MAX_TRANS_DATA 64U + +/* We break up a large truncate or write transaction once the handle's + * buffer credits gets this low, we need either to extend the + * transaction or to start a new one. Reserve enough space here for + * inode, bitmap, superblock, group and indirection updates for at least + * one block, plus two quota updates. Quota allocations are not + * needed. */ + +#define EXT4_RESERVE_TRANS_BLOCKS 12U + +#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8 + +#ifdef CONFIG_QUOTA +/* Amount of blocks needed for quota update - we know that the structure was + * allocated so we need to update only inode+data */ +#define EXT4_QUOTA_TRANS_BLOCKS(sb) (test_opt(sb, QUOTA) ? 2 : 0) +/* Amount of blocks needed for quota insert/delete - we do some block writes + * but inode, sb and group updates are done only once */ +#define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\ + (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0) +#define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\ + (EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0) +#else +#define EXT4_QUOTA_TRANS_BLOCKS(sb) 0 +#define EXT4_QUOTA_INIT_BLOCKS(sb) 0 +#define EXT4_QUOTA_DEL_BLOCKS(sb) 0 +#endif + +int +ext4_mark_iloc_dirty(handle_t *handle, + struct inode *inode, + struct ext4_iloc *iloc); + +/* + * On success, We end up with an outstanding reference count against + * iloc->bh. This _must_ be cleaned up later. + */ + +int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, + struct ext4_iloc *iloc); + +int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); + +/* + * Wrapper functions with which ext4 calls into JBD. The intent here is + * to allow these to be turned into appropriate stubs so ext4 can control + * ext2 filesystems, so ext2+ext4 systems only nee one fs. This work hasn't + * been done yet. + */ + +static inline void ext4_journal_release_buffer(handle_t *handle, + struct buffer_head *bh) +{ + jbd2_journal_release_buffer(handle, bh); +} + +void ext4_journal_abort_handle(const char *caller, const char *err_fn, + struct buffer_head *bh, handle_t *handle, int err); + +int __ext4_journal_get_undo_access(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext4_journal_get_write_access(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext4_journal_forget(const char *where, handle_t *handle, + struct buffer_head *bh); + +int __ext4_journal_revoke(const char *where, handle_t *handle, + ext4_fsblk_t blocknr, struct buffer_head *bh); + +int __ext4_journal_get_create_access(const char *where, + handle_t *handle, struct buffer_head *bh); + +int __ext4_journal_dirty_metadata(const char *where, + handle_t *handle, struct buffer_head *bh); + +#define ext4_journal_get_undo_access(handle, bh) \ + __ext4_journal_get_undo_access(__FUNCTION__, (handle), (bh)) +#define ext4_journal_get_write_access(handle, bh) \ + __ext4_journal_get_write_access(__FUNCTION__, (handle), (bh)) +#define ext4_journal_revoke(handle, blocknr, bh) \ + __ext4_journal_revoke(__FUNCTION__, (handle), (blocknr), (bh)) +#define ext4_journal_get_create_access(handle, bh) \ + __ext4_journal_get_create_access(__FUNCTION__, (handle), (bh)) +#define ext4_journal_dirty_metadata(handle, bh) \ + __ext4_journal_dirty_metadata(__FUNCTION__, (handle), (bh)) +#define ext4_journal_forget(handle, bh) \ + __ext4_journal_forget(__FUNCTION__, (handle), (bh)) + +int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh); + +handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); +int __ext4_journal_stop(const char *where, handle_t *handle); + +static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) +{ + return ext4_journal_start_sb(inode->i_sb, nblocks); +} + +#define ext4_journal_stop(handle) \ + __ext4_journal_stop(__FUNCTION__, (handle)) + +static inline handle_t *ext4_journal_current_handle(void) +{ + return journal_current_handle(); +} + +static inline int ext4_journal_extend(handle_t *handle, int nblocks) +{ + return jbd2_journal_extend(handle, nblocks); +} + +static inline int ext4_journal_restart(handle_t *handle, int nblocks) +{ + return jbd2_journal_restart(handle, nblocks); +} + +static inline int ext4_journal_blocks_per_page(struct inode *inode) +{ + return jbd2_journal_blocks_per_page(inode); +} + +static inline int ext4_journal_force_commit(journal_t *journal) +{ + return jbd2_journal_force_commit(journal); +} + +/* super.c */ +int ext4_force_commit(struct super_block *sb); + +static inline int ext4_should_journal_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 1; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) + return 1; + if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) + return 1; + return 0; +} + +static inline int ext4_should_order_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) + return 0; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) + return 1; + return 0; +} + +static inline int ext4_should_writeback_data(struct inode *inode) +{ + if (!S_ISREG(inode->i_mode)) + return 0; + if (EXT4_I(inode)->i_flags & EXT4_JOURNAL_DATA_FL) + return 0; + if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) + return 1; + return 0; +} + +#endif /* _EXT4_JBD2_H */ diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h new file mode 100644 index 00000000000..5802e69f219 --- /dev/null +++ b/fs/ext4/ext4_sb.h @@ -0,0 +1,148 @@ +/* + * ext4_sb.h + * + * Copyright (C) 1992, 1993, 1994, 1995 + * Remy Card (card@masi.ibp.fr) + * Laboratoire MASI - Institut Blaise Pascal + * Universite Pierre et Marie Curie (Paris VI) + * + * from + * + * linux/include/linux/minix_fs_sb.h + * + * Copyright (C) 1991, 1992 Linus Torvalds + */ + +#ifndef _EXT4_SB +#define _EXT4_SB + +#ifdef __KERNEL__ +#include <linux/timer.h> +#include <linux/wait.h> +#include <linux/blockgroup_lock.h> +#include <linux/percpu_counter.h> +#endif +#include <linux/rbtree.h> + +/* + * third extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + unsigned long s_overhead_last; /* Last calculated overhead */ + unsigned long s_blocks_last; /* Last seen block count */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block * s_es; /* Pointer to the super block in the buffer */ + struct buffer_head ** s_group_desc; + unsigned long s_mount_opt; + ext4_fsblk_t s_sb_block; + uid_t s_resuid; + gid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct blockgroup_lock s_blockgroup_lock; + + /* root of the per fs reservation window tree */ + spinlock_t s_rsv_window_lock; + struct rb_root s_rsv_window_root; + struct ext4_reserve_window_node s_rsv_window_head; + + /* Journaling */ + struct inode * s_journal_inode; + struct journal_s * s_journal; + struct list_head s_orphan; + unsigned long s_commit_interval; + struct block_device *journal_bdev; +#ifdef CONFIG_JBD2_DEBUG + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ +#endif +#ifdef CONFIG_QUOTA + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ***s_group_info; + struct inode *s_buddy_cache; + long s_blocks_reserved; + spinlock_t s_reserve_lock; + struct list_head s_active_transaction; + struct list_head s_closed_transaction; + struct list_head s_committed_transaction; + spinlock_t s_md_lock; + tid_t s_last_transaction; + unsigned short *s_mb_offsets, *s_mb_maxs; + + /* tunables */ + unsigned long s_stripe; + unsigned long s_mb_stream_request; + unsigned long s_mb_max_to_scan; + unsigned long s_mb_min_to_scan; + unsigned long s_mb_stats; + unsigned long s_mb_order2_reqs; + unsigned long s_mb_group_prealloc; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + + /* history to debug policy */ + struct ext4_mb_history *s_mb_history; + int s_mb_history_cur; + int s_mb_history_max; + int s_mb_history_num; + struct proc_dir_entry *s_mb_proc; + spinlock_t s_mb_history_lock; + int s_mb_history_filter; + + /* stats for buddy allocator */ + spinlock_t s_mb_pa_lock; + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + + /* locality groups */ + struct ext4_locality_group *s_locality_groups; +}; + +#endif /* _EXT4_SB */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 9ae6e67090c..47929c4e3da 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -32,7 +32,6 @@ #include <linux/module.h> #include <linux/fs.h> #include <linux/time.h> -#include <linux/ext4_jbd2.h> #include <linux/jbd2.h> #include <linux/highuid.h> #include <linux/pagemap.h> @@ -40,8 +39,9 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/falloc.h> -#include <linux/ext4_fs_extents.h> #include <asm/uaccess.h> +#include "ext4_jbd2.h" +#include "ext4_extents.h" /* @@ -308,7 +308,7 @@ corrupted: } #define ext4_ext_check_header(inode, eh, depth) \ - __ext4_ext_check_header(__FUNCTION__, inode, eh, depth) + __ext4_ext_check_header(__func__, inode, eh, depth) #ifdef EXT_DEBUG static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) @@ -614,7 +614,7 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, ix->ei_block = cpu_to_le32(logical); ext4_idx_store_pblock(ix, ptr); - curp->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(curp->p_hdr->eh_entries)+1); + le16_add_cpu(&curp->p_hdr->eh_entries, 1); BUG_ON(le16_to_cpu(curp->p_hdr->eh_entries) > le16_to_cpu(curp->p_hdr->eh_max)); @@ -736,7 +736,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, } if (m) { memmove(ex, path[depth].p_ext-m, sizeof(struct ext4_extent)*m); - neh->eh_entries = cpu_to_le16(le16_to_cpu(neh->eh_entries)+m); + le16_add_cpu(&neh->eh_entries, m); } set_buffer_uptodate(bh); @@ -753,8 +753,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto cleanup; - path[depth].p_hdr->eh_entries = - cpu_to_le16(le16_to_cpu(path[depth].p_hdr->eh_entries)-m); + le16_add_cpu(&path[depth].p_hdr->eh_entries, -m); err = ext4_ext_dirty(handle, inode, path + depth); if (err) goto cleanup; @@ -817,8 +816,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, if (m) { memmove(++fidx, path[i].p_idx - m, sizeof(struct ext4_extent_idx) * m); - neh->eh_entries = - cpu_to_le16(le16_to_cpu(neh->eh_entries) + m); + le16_add_cpu(&neh->eh_entries, m); } set_buffer_uptodate(bh); unlock_buffer(bh); @@ -834,7 +832,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path + i); if (err) goto cleanup; - path[i].p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path[i].p_hdr->eh_entries)-m); + le16_add_cpu(&path[i].p_hdr->eh_entries, -m); err = ext4_ext_dirty(handle, inode, path + i); if (err) goto cleanup; @@ -1369,7 +1367,7 @@ int ext4_ext_try_to_merge(struct inode *inode, * sizeof(struct ext4_extent); memmove(ex + 1, ex + 2, len); } - eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries) - 1); + le16_add_cpu(&eh->eh_entries, -1); merge_done = 1; WARN_ON(eh->eh_entries == 0); if (!eh->eh_entries) @@ -1560,7 +1558,7 @@ has_space: path[depth].p_ext = nearex; } - eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)+1); + le16_add_cpu(&eh->eh_entries, 1); nearex = path[depth].p_ext; nearex->ee_block = newext->ee_block; ext4_ext_store_pblock(nearex, ext_pblock(newext)); @@ -1699,7 +1697,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, err = ext4_ext_get_access(handle, inode, path); if (err) return err; - path->p_hdr->eh_entries = cpu_to_le16(le16_to_cpu(path->p_hdr->eh_entries)-1); + le16_add_cpu(&path->p_hdr->eh_entries, -1); err = ext4_ext_dirty(handle, inode, path); if (err) return err; @@ -1902,7 +1900,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (num == 0) { /* this extent is removed; mark slot entirely unused */ ext4_ext_store_pblock(ex, 0); - eh->eh_entries = cpu_to_le16(le16_to_cpu(eh->eh_entries)-1); + le16_add_cpu(&eh->eh_entries, -1); } ex->ee_block = cpu_to_le32(block); @@ -1979,7 +1977,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start) * We start scanning from right side, freeing all the blocks * after i_size and walking into the tree depth-wise. */ - path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_KERNEL); + path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 1), GFP_NOFS); if (path == NULL) { ext4_journal_stop(handle); return -ENOMEM; @@ -2138,6 +2136,82 @@ void ext4_ext_release(struct super_block *sb) #endif } +static void bi_complete(struct bio *bio, int error) +{ + complete((struct completion *)bio->bi_private); +} + +/* FIXME!! we need to try to merge to left or right after zero-out */ +static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) +{ + int ret = -EIO; + struct bio *bio; + int blkbits, blocksize; + sector_t ee_pblock; + struct completion event; + unsigned int ee_len, len, done, offset; + + + blkbits = inode->i_blkbits; + blocksize = inode->i_sb->s_blocksize; + ee_len = ext4_ext_get_actual_len(ex); + ee_pblock = ext_pblock(ex); + + /* convert ee_pblock to 512 byte sectors */ + ee_pblock = ee_pblock << (blkbits - 9); + + while (ee_len > 0) { + + if (ee_len > BIO_MAX_PAGES) + len = BIO_MAX_PAGES; + else + len = ee_len; + + bio = bio_alloc(GFP_NOIO, len); + if (!bio) + return -ENOMEM; + bio->bi_sector = ee_pblock; + bio->bi_bdev = inode->i_sb->s_bdev; + + done = 0; + offset = 0; + while (done < len) { + ret = bio_add_page(bio, ZERO_PAGE(0), + blocksize, offset); + if (ret != blocksize) { + /* + * We can't add any more pages because of + * hardware limitations. Start a new bio. + */ + break; + } + done++; + offset += blocksize; + if (offset >= PAGE_CACHE_SIZE) + offset = 0; + } + + init_completion(&event); + bio->bi_private = &event; + bio->bi_end_io = bi_complete; + submit_bio(WRITE, bio); + wait_for_completion(&event); + + if (test_bit(BIO_UPTODATE, &bio->bi_flags)) + ret = 0; + else { + ret = -EIO; + break; + } + bio_put(bio); + ee_len -= done; + ee_pblock += done << (blkbits - 9); + } + return ret; +} + +#define EXT4_EXT_ZERO_LEN 7 + /* * This function is called by ext4_ext_get_blocks() if someone tries to write * to an uninitialized extent. It may result in splitting the uninitialized @@ -2154,7 +2228,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, ext4_lblk_t iblock, unsigned long max_blocks) { - struct ext4_extent *ex, newex; + struct ext4_extent *ex, newex, orig_ex; struct ext4_extent *ex1 = NULL; struct ext4_extent *ex2 = NULL; struct ext4_extent *ex3 = NULL; @@ -2173,10 +2247,26 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, allocated = ee_len - (iblock - ee_block); newblock = iblock - ee_block + ext_pblock(ex); ex2 = ex; + orig_ex.ee_block = ex->ee_block; + orig_ex.ee_len = cpu_to_le16(ee_len); + ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); err = ext4_ext_get_access(handle, inode, path + depth); if (err) goto out; + /* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */ + if (ee_len <= 2*EXT4_EXT_ZERO_LEN) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zeroed the full extent */ + return allocated; + } /* ex1: ee_block to iblock - 1 : uninitialized */ if (iblock > ee_block) { @@ -2195,19 +2285,103 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, /* ex3: to ee_block + ee_len : uninitialised */ if (allocated > max_blocks) { unsigned int newdepth; + /* If extent has less than EXT4_EXT_ZERO_LEN zerout directly */ + if (allocated <= EXT4_EXT_ZERO_LEN) { + /* Mark first half uninitialized. + * Mark second half initialized and zero out the + * initialized extent + */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = cpu_to_le16(ee_len - allocated); + ext4_ext_mark_uninitialized(ex); + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + + ex3 = &newex; + ex3->ee_block = cpu_to_le32(iblock); + ext4_ext_store_pblock(ex3, newblock); + ex3->ee_len = cpu_to_le16(allocated); + err = ext4_ext_insert_extent(handle, inode, path, ex3); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zeroed the full extent */ + return allocated; + + } else if (err) + goto fix_extent_len; + + /* + * We need to zero out the second half because + * an fallocate request can update file size and + * converting the second half to initialized extent + * implies that we can leak some junk data to user + * space. + */ + err = ext4_ext_zeroout(inode, ex3); + if (err) { + /* + * We should actually mark the + * second half as uninit and return error + * Insert would have changed the extent + */ + depth = ext_depth(inode); + ext4_ext_drop_refs(path); + path = ext4_ext_find_extent(inode, + iblock, path); + if (IS_ERR(path)) { + err = PTR_ERR(path); + return err; + } + ex = path[depth].p_ext; + err = ext4_ext_get_access(handle, inode, + path + depth); + if (err) + return err; + ext4_ext_mark_uninitialized(ex); + ext4_ext_dirty(handle, inode, path + depth); + return err; + } + + /* zeroed the second half */ + return allocated; + } ex3 = &newex; ex3->ee_block = cpu_to_le32(iblock + max_blocks); ext4_ext_store_pblock(ex3, newblock + max_blocks); ex3->ee_len = cpu_to_le16(allocated - max_blocks); ext4_ext_mark_uninitialized(ex3); err = ext4_ext_insert_extent(handle, inode, path, ex3); - if (err) - goto out; + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zeroed the full extent */ + return allocated; + + } else if (err) + goto fix_extent_len; /* * The depth, and hence eh & ex might change * as part of the insert above. */ newdepth = ext_depth(inode); + /* + * update the extent length after successfull insert of the + * split extent + */ + orig_ex.ee_len = cpu_to_le16(ee_len - + ext4_ext_get_actual_len(ex3)); if (newdepth != depth) { depth = newdepth; ext4_ext_drop_refs(path); @@ -2226,6 +2400,24 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, goto out; } allocated = max_blocks; + + /* If extent has less than EXT4_EXT_ZERO_LEN and we are trying + * to insert a extent in the middle zerout directly + * otherwise give the extent a chance to merge to left + */ + if (le16_to_cpu(orig_ex.ee_len) <= EXT4_EXT_ZERO_LEN && + iblock != ee_block) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zero out the first half */ + return allocated; + } } /* * If there was a change of depth as part of the @@ -2282,8 +2474,29 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, goto out; insert: err = ext4_ext_insert_extent(handle, inode, path, &newex); + if (err == -ENOSPC) { + err = ext4_ext_zeroout(inode, &orig_ex); + if (err) + goto fix_extent_len; + /* update the extent length and mark as initialized */ + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_dirty(handle, inode, path + depth); + /* zero out the first half */ + return allocated; + } else if (err) + goto fix_extent_len; out: return err ? err : allocated; + +fix_extent_len: + ex->ee_block = orig_ex.ee_block; + ex->ee_len = orig_ex.ee_len; + ext4_ext_store_pblock(ex, ext_pblock(&orig_ex)); + ext4_ext_mark_uninitialized(ex); + ext4_ext_dirty(handle, inode, path + depth); + return err; } /* @@ -2393,8 +2606,20 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, } if (create == EXT4_CREATE_UNINITIALIZED_EXT) goto out; - if (!create) + if (!create) { + /* + * We have blocks reserved already. We + * return allocated blocks so that delalloc + * won't do block reservation for us. But + * the buffer head will be unmapped so that + * a read from the block returns 0s. + */ + if (allocated > max_blocks) + allocated = max_blocks; + /* mark the buffer unwritten */ + __set_bit(BH_Unwritten, &bh_result->b_state); goto out2; + } ret = ext4_ext_convert_to_initialized(handle, inode, path, iblock, @@ -2584,6 +2809,8 @@ out_stop: ext4_orphan_del(handle, inode); up_write(&EXT4_I(inode)->i_data_sem); + inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); } @@ -2608,6 +2835,28 @@ int ext4_ext_writepage_trans_blocks(struct inode *inode, int num) return needed; } +static void ext4_falloc_update_inode(struct inode *inode, + int mode, loff_t new_size, int update_ctime) +{ + struct timespec now; + + if (update_ctime) { + now = current_fs_time(inode->i_sb); + if (!timespec_equal(&inode->i_ctime, &now)) + inode->i_ctime = now; + } + /* + * Update only when preallocation was requested beyond + * the file size. + */ + if (!(mode & FALLOC_FL_KEEP_SIZE) && + new_size > i_size_read(inode)) { + i_size_write(inode, new_size); + EXT4_I(inode)->i_disksize = new_size; + } + +} + /* * preallocate space for a file. This implements ext4's fallocate inode * operation, which gets called from sys_fallocate system call. @@ -2619,8 +2868,8 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) { handle_t *handle; ext4_lblk_t block; + loff_t new_size; unsigned long max_blocks; - ext4_fsblk_t nblocks = 0; int ret = 0; int ret2 = 0; int retries = 0; @@ -2639,9 +2888,12 @@ long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len) return -ENODEV; block = offset >> blkbits; + /* + * We can't just convert len to max_blocks because + * If blocksize = 4096 offset = 3072 and len = 2048 + */ max_blocks = (EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) - - block; - + - block; /* * credits to insert 1 extent into extent tree + buffers to be able to * modify 1 super block, 1 block bitmap and 1 group descriptor. @@ -2657,7 +2909,6 @@ retry: ret = PTR_ERR(handle); break; } - ret = ext4_get_blocks_wrap(handle, inode, block, max_blocks, &map_bh, EXT4_CREATE_UNINITIALIZED_EXT, 0); @@ -2673,61 +2924,24 @@ retry: ret2 = ext4_journal_stop(handle); break; } - if (ret > 0) { - /* check wrap through sign-bit/zero here */ - if ((block + ret) < 0 || (block + ret) < block) { - ret = -EIO; - ext4_mark_inode_dirty(handle, inode); - ret2 = ext4_journal_stop(handle); - break; - } - if (buffer_new(&map_bh) && ((block + ret) > - (EXT4_BLOCK_ALIGN(i_size_read(inode), blkbits) - >> blkbits))) - nblocks = nblocks + ret; - } - - /* Update ctime if new blocks get allocated */ - if (nblocks) { - struct timespec now; - - now = current_fs_time(inode->i_sb); - if (!timespec_equal(&inode->i_ctime, &now)) - inode->i_ctime = now; - } + if ((block + ret) >= (EXT4_BLOCK_ALIGN(offset + len, + blkbits) >> blkbits)) + new_size = offset + len; + else + new_size = (block + ret) << blkbits; + ext4_falloc_update_inode(inode, mode, new_size, + buffer_new(&map_bh)); ext4_mark_inode_dirty(handle, inode); ret2 = ext4_journal_stop(handle); if (ret2) break; } - - if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) { + ret = 0; goto retry; - - /* - * Time to update the file size. - * Update only when preallocation was requested beyond the file size. - */ - if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len) > i_size_read(inode)) { - if (ret > 0) { - /* - * if no error, we assume preallocation succeeded - * completely - */ - i_size_write(inode, offset + len); - EXT4_I(inode)->i_disksize = i_size_read(inode); - } else if (ret < 0 && nblocks) { - /* Handle partial allocation scenario */ - loff_t newsize; - - newsize = (nblocks << blkbits) + i_size_read(inode); - i_size_write(inode, EXT4_BLOCK_ALIGN(newsize, blkbits)); - EXT4_I(inode)->i_disksize = i_size_read(inode); - } } - mutex_unlock(&inode->i_mutex); return ret > 0 ? ret2 : ret; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index ac35ec58db5..4159be6366a 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -21,8 +21,8 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> +#include "ext4.h" +#include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" @@ -129,7 +129,7 @@ const struct file_operations ext4_file_operations = { .write = do_sync_write, .aio_read = generic_file_aio_read, .aio_write = ext4_file_write, - .ioctl = ext4_ioctl, + .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = ext4_compat_ioctl, #endif diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 8d50879d1c2..1c8ba48d4f8 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -27,8 +27,8 @@ #include <linux/sched.h> #include <linux/writeback.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> +#include "ext4.h" +#include "ext4_jbd2.h" /* * akpm: A new design for ext4_sync_file(). @@ -72,6 +72,9 @@ int ext4_sync_file(struct file * file, struct dentry *dentry, int datasync) goto out; } + if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) + goto out; + /* * The VFS has written the file data. If the inode is unaltered * then we need not start a commit. diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index 1555024e3b3..1d6329dbe39 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -11,8 +11,8 @@ #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> #include <linux/cryptohash.h> +#include "ext4.h" #define DELTA 0x9E3779B9 diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 486e46a3918..c6efbab0c80 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -15,8 +15,6 @@ #include <linux/time.h> #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/quotaops.h> @@ -25,7 +23,8 @@ #include <linux/bitops.h> #include <linux/blkdev.h> #include <asm/byteorder.h> - +#include "ext4.h" +#include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "group.h" @@ -75,7 +74,7 @@ unsigned ext4_init_inode_bitmap(struct super_block *sb, struct buffer_head *bh, /* If checksum is bad mark all blocks and inodes use to prevent * allocation, essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sbi, block_group, gdp)) { - ext4_error(sb, __FUNCTION__, "Checksum bad for group %lu\n", + ext4_error(sb, __func__, "Checksum bad for group %lu\n", block_group); gdp->bg_free_blocks_count = 0; gdp->bg_free_inodes_count = 0; @@ -223,11 +222,9 @@ void ext4_free_inode (handle_t *handle, struct inode * inode) if (gdp) { spin_lock(sb_bgl_lock(sbi, block_group)); - gdp->bg_free_inodes_count = cpu_to_le16( - le16_to_cpu(gdp->bg_free_inodes_count) + 1); + le16_add_cpu(&gdp->bg_free_inodes_count, 1); if (is_directory) - gdp->bg_used_dirs_count = cpu_to_le16( - le16_to_cpu(gdp->bg_used_dirs_count) - 1); + le16_add_cpu(&gdp->bg_used_dirs_count, -1); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); spin_unlock(sb_bgl_lock(sbi, block_group)); @@ -588,7 +585,7 @@ got: ino++; if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || ino > EXT4_INODES_PER_GROUP(sb)) { - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "reserved inode or inode > inodes count - " "block_group = %lu, inode=%lu", group, ino + group * EXT4_INODES_PER_GROUP(sb)); @@ -664,11 +661,9 @@ got: cpu_to_le16(EXT4_INODES_PER_GROUP(sb) - ino); } - gdp->bg_free_inodes_count = - cpu_to_le16(le16_to_cpu(gdp->bg_free_inodes_count) - 1); + le16_add_cpu(&gdp->bg_free_inodes_count, -1); if (S_ISDIR(mode)) { - gdp->bg_used_dirs_count = - cpu_to_le16(le16_to_cpu(gdp->bg_used_dirs_count) + 1); + le16_add_cpu(&gdp->bg_used_dirs_count, 1); } gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); spin_unlock(sb_bgl_lock(sbi, group)); @@ -744,23 +739,24 @@ got: if (err) goto fail_free_drop; - err = ext4_mark_inode_dirty(handle, inode); - if (err) { - ext4_std_error(sb, err); - goto fail_free_drop; - } if (test_opt(sb, EXTENTS)) { - /* set extent flag only for directory and file */ - if (S_ISDIR(mode) || S_ISREG(mode)) { + /* set extent flag only for diretory, file and normal symlink*/ + if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) { EXT4_I(inode)->i_flags |= EXT4_EXTENTS_FL; ext4_ext_tree_init(handle, inode); err = ext4_update_incompat_feature(handle, sb, EXT4_FEATURE_INCOMPAT_EXTENTS); if (err) - goto fail; + goto fail_free_drop; } } + err = ext4_mark_inode_dirty(handle, inode); + if (err) { + ext4_std_error(sb, err); + goto fail_free_drop; + } + ext4_debug("allocating inode %lu\n", inode->i_ino); goto really_out; fail: @@ -796,7 +792,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) /* Error cases - e2fsck has already cleaned up for us */ if (ino > max_ino) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "bad orphan ino %lu! e2fsck was run?", ino); goto error; } @@ -805,7 +801,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) bit = (ino - 1) % EXT4_INODES_PER_GROUP(sb); bitmap_bh = read_inode_bitmap(sb, block_group); if (!bitmap_bh) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "inode bitmap error for orphan %lu", ino); goto error; } @@ -830,7 +826,7 @@ iget_failed: err = PTR_ERR(inode); inode = NULL; bad_orphan: - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "bad orphan inode %lu! e2fsck was run?", ino); printk(KERN_NOTICE "ext4_test_bit(bit=%d, block=%llu) = %d\n", bit, (unsigned long long)bitmap_bh->b_blocknr, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8fab233cb05..8d970774641 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -25,7 +25,6 @@ #include <linux/module.h> #include <linux/fs.h> #include <linux/time.h> -#include <linux/ext4_jbd2.h> #include <linux/jbd2.h> #include <linux/highuid.h> #include <linux/pagemap.h> @@ -36,6 +35,7 @@ #include <linux/mpage.h> #include <linux/uio.h> #include <linux/bio.h> +#include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" @@ -93,7 +93,7 @@ int ext4_forget(handle_t *handle, int is_metadata, struct inode *inode, BUFFER_TRACE(bh, "call ext4_journal_revoke"); err = ext4_journal_revoke(handle, blocknr, bh); if (err) - ext4_abort(inode->i_sb, __FUNCTION__, + ext4_abort(inode->i_sb, __func__, "error %d when attempting revoke", err); BUFFER_TRACE(bh, "exit"); return err; @@ -985,6 +985,16 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, } else { retval = ext4_get_blocks_handle(handle, inode, block, max_blocks, bh, create, extend_disksize); + + if (retval > 0 && buffer_new(bh)) { + /* + * We allocated new blocks which will result in + * i_data's format changing. Force the migrate + * to fail by clearing migrate flags + */ + EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & + ~EXT4_EXT_MIGRATE; + } } up_write((&EXT4_I(inode)->i_data_sem)); return retval; @@ -1230,7 +1240,7 @@ int ext4_journal_dirty_data(handle_t *handle, struct buffer_head *bh) { int err = jbd2_journal_dirty_data(handle, bh); if (err) - ext4_journal_abort_handle(__FUNCTION__, __FUNCTION__, + ext4_journal_abort_handle(__func__, __func__, bh, handle, err); return err; } @@ -1301,10 +1311,11 @@ static int ext4_ordered_write_end(struct file *file, new_i_size = pos + copied; if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - copied = ext4_generic_write_end(file, mapping, pos, len, copied, + ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (copied < 0) - ret = copied; + copied = ret2; + if (ret2 < 0) + ret = ret2; } ret2 = ext4_journal_stop(handle); if (!ret) @@ -1329,10 +1340,11 @@ static int ext4_writeback_write_end(struct file *file, if (new_i_size > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = new_i_size; - copied = ext4_generic_write_end(file, mapping, pos, len, copied, + ret2 = ext4_generic_write_end(file, mapping, pos, len, copied, page, fsdata); - if (copied < 0) - ret = copied; + copied = ret2; + if (ret2 < 0) + ret = ret2; ret2 = ext4_journal_stop(handle); if (!ret) @@ -2501,12 +2513,10 @@ out_stop: static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, unsigned long ino, struct ext4_iloc *iloc) { - unsigned long desc, group_desc; ext4_group_t block_group; unsigned long offset; ext4_fsblk_t block; - struct buffer_head *bh; - struct ext4_group_desc * gdp; + struct ext4_group_desc *gdp; if (!ext4_valid_inum(sb, ino)) { /* @@ -2518,22 +2528,10 @@ static ext4_fsblk_t ext4_get_inode_block(struct super_block *sb, } block_group = (ino - 1) / EXT4_INODES_PER_GROUP(sb); - if (block_group >= EXT4_SB(sb)->s_groups_count) { - ext4_error(sb,"ext4_get_inode_block","group >= groups count"); + gdp = ext4_get_group_desc(sb, block_group, NULL); + if (!gdp) return 0; - } - smp_rmb(); - group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); - desc = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); - bh = EXT4_SB(sb)->s_group_desc[group_desc]; - if (!bh) { - ext4_error (sb, "ext4_get_inode_block", - "Descriptor not loaded"); - return 0; - } - gdp = (struct ext4_group_desc *)((__u8 *)bh->b_data + - desc * EXT4_DESC_SIZE(sb)); /* * Figure out the offset within the block group inode table */ @@ -2976,7 +2974,8 @@ static int ext4_do_update_inode(handle_t *handle, if (ext4_inode_blocks_set(handle, raw_inode, ei)) goto out_brelse; raw_inode->i_dtime = cpu_to_le32(ei->i_dtime); - raw_inode->i_flags = cpu_to_le32(ei->i_flags); + /* clear the migrate flag in the raw_inode */ + raw_inode->i_flags = cpu_to_le32(ei->i_flags & ~EXT4_EXT_MIGRATE); if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_HURD)) raw_inode->i_file_acl_high = @@ -3374,7 +3373,7 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) EXT4_I(inode)->i_state |= EXT4_STATE_NO_EXPAND; if (mnt_count != le16_to_cpu(sbi->s_es->s_mnt_count)) { - ext4_warning(inode->i_sb, __FUNCTION__, + ext4_warning(inode->i_sb, __func__, "Unable to expand inode %lu. Delete" " some EAs or run e2fsck.", inode->i_ino); @@ -3415,7 +3414,7 @@ void ext4_dirty_inode(struct inode *inode) current_handle->h_transaction != handle->h_transaction) { /* This task has a transaction open against a different fs */ printk(KERN_EMERG "%s: transactions do not match!\n", - __FUNCTION__); + __func__); } else { jbd_debug(5, "marking dirty. outer handle=%p\n", current_handle); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 25b13ede808..7a6c2f1faba 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -10,17 +10,17 @@ #include <linux/fs.h> #include <linux/jbd2.h> #include <linux/capability.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> #include <linux/time.h> #include <linux/compat.h> #include <linux/smp_lock.h> #include <linux/mount.h> #include <asm/uaccess.h> +#include "ext4_jbd2.h" +#include "ext4.h" -int ext4_ioctl (struct inode * inode, struct file * filp, unsigned int cmd, - unsigned long arg) +long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { + struct inode *inode = filp->f_dentry->d_inode; struct ext4_inode_info *ei = EXT4_I(inode); unsigned int flags; unsigned short rsv_window_size; @@ -277,9 +277,6 @@ setversion_out: #ifdef CONFIG_COMPAT long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct inode *inode = file->f_path.dentry->d_inode; - int ret; - /* These are just misnamed, they actually get/put from/to user an int */ switch (cmd) { case EXT4_IOC32_GETFLAGS: @@ -319,9 +316,6 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) default: return -ENOIOCTLCMD; } - lock_kernel(); - ret = ext4_ioctl(inode, file, cmd, (unsigned long) compat_ptr(arg)); - unlock_kernel(); - return ret; + return ext4_ioctl(file, cmd, (unsigned long) compat_ptr(arg)); } #endif diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index ef97f19c2f9..fbec2ef9379 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -21,21 +21,7 @@ * mballoc.c contains the multiblocks allocation routines */ -#include <linux/time.h> -#include <linux/fs.h> -#include <linux/namei.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs.h> -#include <linux/quotaops.h> -#include <linux/buffer_head.h> -#include <linux/module.h> -#include <linux/swap.h> -#include <linux/proc_fs.h> -#include <linux/pagemap.h> -#include <linux/seq_file.h> -#include <linux/version.h> -#include "group.h" - +#include "mballoc.h" /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() @@ -345,288 +331,6 @@ * */ -/* - * with AGGRESSIVE_CHECK allocator runs consistency checks over - * structures. these checks slow things down a lot - */ -#define AGGRESSIVE_CHECK__ - -/* - * with DOUBLE_CHECK defined mballoc creates persistent in-core - * bitmaps, maintains and uses them to check for double allocations - */ -#define DOUBLE_CHECK__ - -/* - */ -#define MB_DEBUG__ -#ifdef MB_DEBUG -#define mb_debug(fmt, a...) printk(fmt, ##a) -#else -#define mb_debug(fmt, a...) -#endif - -/* - * with EXT4_MB_HISTORY mballoc stores last N allocations in memory - * and you can monitor it in /proc/fs/ext4/<dev>/mb_history - */ -#define EXT4_MB_HISTORY -#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ -#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ -#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */ -#define EXT4_MB_HISTORY_FREE 8 /* free */ - -#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \ - EXT4_MB_HISTORY_PREALLOC) - -/* - * How long mballoc can look for a best extent (in found extents) - */ -#define MB_DEFAULT_MAX_TO_SCAN 200 - -/* - * How long mballoc must look for a best extent - */ -#define MB_DEFAULT_MIN_TO_SCAN 10 - -/* - * How many groups mballoc will scan looking for the best chunk - */ -#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 - -/* - * with 'ext4_mb_stats' allocator will collect stats that will be - * shown at umount. The collecting costs though! - */ -#define MB_DEFAULT_STATS 1 - -/* - * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served - * by the stream allocator, which purpose is to pack requests - * as close each to other as possible to produce smooth I/O traffic - * We use locality group prealloc space for stream request. - * We can tune the same via /proc/fs/ext4/<parition>/stream_req - */ -#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ - -/* - * for which requests use 2^N search using buddies - */ -#define MB_DEFAULT_ORDER2_REQS 2 - -/* - * default group prealloc size 512 blocks - */ -#define MB_DEFAULT_GROUP_PREALLOC 512 - -static struct kmem_cache *ext4_pspace_cachep; -static struct kmem_cache *ext4_ac_cachep; - -#ifdef EXT4_BB_MAX_BLOCKS -#undef EXT4_BB_MAX_BLOCKS -#endif -#define EXT4_BB_MAX_BLOCKS 30 - -struct ext4_free_metadata { - ext4_group_t group; - unsigned short num; - ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; - struct list_head list; -}; - -struct ext4_group_info { - unsigned long bb_state; - unsigned long bb_tid; - struct ext4_free_metadata *bb_md_cur; - unsigned short bb_first_free; - unsigned short bb_free; - unsigned short bb_fragments; - struct list_head bb_prealloc_list; -#ifdef DOUBLE_CHECK - void *bb_bitmap; -#endif - unsigned short bb_counters[]; -}; - -#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 -#define EXT4_GROUP_INFO_LOCKED_BIT 1 - -#define EXT4_MB_GRP_NEED_INIT(grp) \ - (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) - - -struct ext4_prealloc_space { - struct list_head pa_inode_list; - struct list_head pa_group_list; - union { - struct list_head pa_tmp_list; - struct rcu_head pa_rcu; - } u; - spinlock_t pa_lock; - atomic_t pa_count; - unsigned pa_deleted; - ext4_fsblk_t pa_pstart; /* phys. block */ - ext4_lblk_t pa_lstart; /* log. block */ - unsigned short pa_len; /* len of preallocated chunk */ - unsigned short pa_free; /* how many blocks are free */ - unsigned short pa_linear; /* consumed in one direction - * strictly, for grp prealloc */ - spinlock_t *pa_obj_lock; - struct inode *pa_inode; /* hack, for history only */ -}; - - -struct ext4_free_extent { - ext4_lblk_t fe_logical; - ext4_grpblk_t fe_start; - ext4_group_t fe_group; - int fe_len; -}; - -/* - * Locality group: - * we try to group all related changes together - * so that writeback can flush/allocate them together as well - */ -struct ext4_locality_group { - /* for allocator */ - struct mutex lg_mutex; /* to serialize allocates */ - struct list_head lg_prealloc_list;/* list of preallocations */ - spinlock_t lg_prealloc_lock; -}; - -struct ext4_allocation_context { - struct inode *ac_inode; - struct super_block *ac_sb; - - /* original request */ - struct ext4_free_extent ac_o_ex; - - /* goal request (after normalization) */ - struct ext4_free_extent ac_g_ex; - - /* the best found extent */ - struct ext4_free_extent ac_b_ex; - - /* copy of the bext found extent taken before preallocation efforts */ - struct ext4_free_extent ac_f_ex; - - /* number of iterations done. we have to track to limit searching */ - unsigned long ac_ex_scanned; - __u16 ac_groups_scanned; - __u16 ac_found; - __u16 ac_tail; - __u16 ac_buddy; - __u16 ac_flags; /* allocation hints */ - __u8 ac_status; - __u8 ac_criteria; - __u8 ac_repeats; - __u8 ac_2order; /* if request is to allocate 2^N blocks and - * N > 0, the field stores N, otherwise 0 */ - __u8 ac_op; /* operation, for history only */ - struct page *ac_bitmap_page; - struct page *ac_buddy_page; - struct ext4_prealloc_space *ac_pa; - struct ext4_locality_group *ac_lg; -}; - -#define AC_STATUS_CONTINUE 1 -#define AC_STATUS_FOUND 2 -#define AC_STATUS_BREAK 3 - -struct ext4_mb_history { - struct ext4_free_extent orig; /* orig allocation */ - struct ext4_free_extent goal; /* goal allocation */ - struct ext4_free_extent result; /* result allocation */ - unsigned pid; - unsigned ino; - __u16 found; /* how many extents have been found */ - __u16 groups; /* how many groups have been scanned */ - __u16 tail; /* what tail broke some buddy */ - __u16 buddy; /* buddy the tail ^^^ broke */ - __u16 flags; - __u8 cr:3; /* which phase the result extent was found at */ - __u8 op:4; - __u8 merged:1; -}; - -struct ext4_buddy { - struct page *bd_buddy_page; - void *bd_buddy; - struct page *bd_bitmap_page; - void *bd_bitmap; - struct ext4_group_info *bd_info; - struct super_block *bd_sb; - __u16 bd_blkbits; - ext4_group_t bd_group; -}; -#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) -#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) - -#ifndef EXT4_MB_HISTORY -static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) -{ - return; -} -#else -static void ext4_mb_store_history(struct ext4_allocation_context *ac); -#endif - -#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) - -static struct proc_dir_entry *proc_root_ext4; -struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); -ext4_fsblk_t ext4_new_blocks_old(handle_t *handle, struct inode *inode, - ext4_fsblk_t goal, unsigned long *count, int *errp); - -static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, - ext4_group_t group); -static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *); -static void ext4_mb_free_committed_blocks(struct super_block *); -static void ext4_mb_return_to_preallocation(struct inode *inode, - struct ext4_buddy *e4b, sector_t block, - int count); -static void ext4_mb_put_pa(struct ext4_allocation_context *, - struct super_block *, struct ext4_prealloc_space *pa); -static int ext4_mb_init_per_dev_proc(struct super_block *sb); -static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); - - -static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); -} - -static inline void ext4_unlock_group(struct super_block *sb, - ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); -} - -static inline int ext4_is_group_locked(struct super_block *sb, - ext4_group_t group) -{ - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, - &(grinfo->bb_state)); -} - -static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, - struct ext4_free_extent *fex) -{ - ext4_fsblk_t block; - - block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb) - + fex->fe_start - + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - return block; -} - static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { #if BITS_PER_LONG == 64 @@ -736,7 +440,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, blocknr += le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - ext4_error(sb, __FUNCTION__, "double-free of inode" + ext4_error(sb, __func__, "double-free of inode" " %lu's block %llu(bit %u in group %lu)\n", inode ? inode->i_ino : 0, blocknr, first + i, e4b->bd_group); @@ -898,17 +602,17 @@ static int __mb_check_buddy(struct ext4_buddy *e4b, char *file, list_for_each(cur, &grp->bb_prealloc_list) { ext4_group_t groupnr; struct ext4_prealloc_space *pa; - pa = list_entry(cur, struct ext4_prealloc_space, group_list); - ext4_get_group_no_and_offset(sb, pa->pstart, &groupnr, &k); + pa = list_entry(cur, struct ext4_prealloc_space, pa_group_list); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, &groupnr, &k); MB_CHECK_ASSERT(groupnr == e4b->bd_group); - for (i = 0; i < pa->len; i++) + for (i = 0; i < pa->pa_len; i++) MB_CHECK_ASSERT(mb_test_bit(k + i, buddy)); } return 0; } #undef MB_CHECK_ASSERT #define mb_check_buddy(e4b) __mb_check_buddy(e4b, \ - __FILE__, __FUNCTION__, __LINE__) + __FILE__, __func__, __LINE__) #else #define mb_check_buddy(e4b) #endif @@ -982,7 +686,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, grp->bb_fragments = fragments; if (free != grp->bb_free) { - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "EXT4-fs: group %lu: %u blocks in bitmap, %u in gd\n", group, free, grp->bb_free); /* @@ -1168,8 +872,9 @@ out: return err; } -static int ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, - struct ext4_buddy *e4b) +static noinline_for_stack int +ext4_mb_load_buddy(struct super_block *sb, ext4_group_t group, + struct ext4_buddy *e4b) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct inode *inode = sbi->s_buddy_cache; @@ -1367,7 +1072,7 @@ static int mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, blocknr += le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); - ext4_error(sb, __FUNCTION__, "double-free of inode" + ext4_error(sb, __func__, "double-free of inode" " %lu's block %llu(bit %u in group %lu)\n", inode ? inode->i_ino : 0, blocknr, block, e4b->bd_group); @@ -1848,7 +1553,7 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * free blocks even though group info says we * we have free blocks */ - ext4_error(sb, __FUNCTION__, "%d free blocks as per " + ext4_error(sb, __func__, "%d free blocks as per " "group info. But bitmap says 0\n", free); break; @@ -1857,7 +1562,7 @@ static void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { - ext4_error(sb, __FUNCTION__, "%d free blocks as per " + ext4_error(sb, __func__, "%d free blocks as per " "group info. But got %d blocks\n", free, ex.fe_len); /* @@ -1965,7 +1670,8 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, return 0; } -static int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) +static noinline_for_stack int +ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t group; ext4_group_t i; @@ -2449,17 +2155,10 @@ static void ext4_mb_history_init(struct super_block *sb) int i; if (sbi->s_mb_proc != NULL) { - struct proc_dir_entry *p; - p = create_proc_entry("mb_history", S_IRUGO, sbi->s_mb_proc); - if (p) { - p->proc_fops = &ext4_mb_seq_history_fops; - p->data = sb; - } - p = create_proc_entry("mb_groups", S_IRUGO, sbi->s_mb_proc); - if (p) { - p->proc_fops = &ext4_mb_seq_groups_fops; - p->data = sb; - } + proc_create_data("mb_history", S_IRUGO, sbi->s_mb_proc, + &ext4_mb_seq_history_fops, sb); + proc_create_data("mb_groups", S_IRUGO, sbi->s_mb_proc, + &ext4_mb_seq_groups_fops, sb); } sbi->s_mb_history_max = 1000; @@ -2472,7 +2171,8 @@ static void ext4_mb_history_init(struct super_block *sb) /* if we can't allocate history, then we simple won't use it */ } -static void ext4_mb_store_history(struct ext4_allocation_context *ac) +static noinline_for_stack void +ext4_mb_store_history(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_mb_history h; @@ -2572,13 +2272,13 @@ static int ext4_mb_init_backend(struct super_block *sb) meta_group_info[j] = kzalloc(len, GFP_KERNEL); if (meta_group_info[j] == NULL) { printk(KERN_ERR "EXT4-fs: can't allocate buddy mem\n"); - i--; goto err_freebuddy; } desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { printk(KERN_ERR "EXT4-fs: can't read descriptor %lu\n", i); + i++; goto err_freebuddy; } memset(meta_group_info[j], 0, len); @@ -2618,13 +2318,11 @@ static int ext4_mb_init_backend(struct super_block *sb) return 0; err_freebuddy: - while (i >= 0) { + while (i-- > 0) kfree(ext4_get_group_info(sb, i)); - i--; - } i = num_meta_group_infos; err_freemeta: - while (--i >= 0) + while (i-- > 0) kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); err_freesgi: @@ -2808,7 +2506,8 @@ int ext4_mb_release(struct super_block *sb) return 0; } -static void ext4_mb_free_committed_blocks(struct super_block *sb) +static noinline_for_stack void +ext4_mb_free_committed_blocks(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); int err; @@ -2867,7 +2566,6 @@ static void ext4_mb_free_committed_blocks(struct super_block *sb) mb_debug("freed %u blocks in %u structures\n", count, count2); } -#define EXT4_ROOT "ext4" #define EXT4_MB_STATS_NAME "stats" #define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan" #define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan" @@ -3007,9 +2705,9 @@ int __init init_ext4_mballoc(void) return -ENOMEM; } #ifdef CONFIG_PROC_FS - proc_root_ext4 = proc_mkdir(EXT4_ROOT, proc_root_fs); + proc_root_ext4 = proc_mkdir("fs/ext4", NULL); if (proc_root_ext4 == NULL) - printk(KERN_ERR "EXT4-fs: Unable to create %s\n", EXT4_ROOT); + printk(KERN_ERR "EXT4-fs: Unable to create fs/ext4\n"); #endif return 0; } @@ -3020,7 +2718,7 @@ void exit_ext4_mballoc(void) kmem_cache_destroy(ext4_pspace_cachep); kmem_cache_destroy(ext4_ac_cachep); #ifdef CONFIG_PROC_FS - remove_proc_entry(EXT4_ROOT, proc_root_fs); + remove_proc_entry("fs/ext4", NULL); #endif } @@ -3029,7 +2727,8 @@ void exit_ext4_mballoc(void) * Check quota and mark choosed space (ac->ac_b_ex) non-free in bitmaps * Returns 0 if success or error code */ -static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, +static noinline_for_stack int +ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle) { struct buffer_head *bitmap_bh = NULL; @@ -3078,7 +2777,7 @@ static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, in_range(block, ext4_inode_table(sb, gdp), EXT4_SB(sb)->s_itb_per_group)) { - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "Allocating block in system zone - block = %llu", block); } @@ -3102,9 +2801,7 @@ static int ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, ac->ac_b_ex.fe_group, gdp)); } - gdp->bg_free_blocks_count = - cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - - ac->ac_b_ex.fe_len); + le16_add_cpu(&gdp->bg_free_blocks_count, -ac->ac_b_ex.fe_len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); @@ -3138,7 +2835,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_stripe; else ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; - mb_debug("#%u: goal %lu blocks for locality group\n", + mb_debug("#%u: goal %u blocks for locality group\n", current->pid, ac->ac_g_ex.fe_len); } @@ -3146,15 +2843,16 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) * Normalization means making request better in terms of * size and alignment */ -static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, +static noinline_for_stack void +ext4_mb_normalize_request(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { int bsbits, max; ext4_lblk_t end; - struct list_head *cur; loff_t size, orig_size, start_off; ext4_lblk_t start, orig_start; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); + struct ext4_prealloc_space *pa; /* do normalize only data requests, metadata requests do not need preallocation */ @@ -3240,12 +2938,9 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* check we don't cross already preallocated blocks */ rcu_read_lock(); - list_for_each_rcu(cur, &ei->i_prealloc_list) { - struct ext4_prealloc_space *pa; + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { unsigned long pa_end; - pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); - if (pa->pa_deleted) continue; spin_lock(&pa->pa_lock); @@ -3287,10 +2982,8 @@ static void ext4_mb_normalize_request(struct ext4_allocation_context *ac, /* XXX: extra loop to check we really don't overlap preallocations */ rcu_read_lock(); - list_for_each_rcu(cur, &ei->i_prealloc_list) { - struct ext4_prealloc_space *pa; + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { unsigned long pa_end; - pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0) { pa_end = pa->pa_lstart + pa->pa_len; @@ -3382,7 +3075,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, BUG_ON(pa->pa_free < len); pa->pa_free -= len; - mb_debug("use %llu/%lu from inode pa %p\n", start, len, pa); + mb_debug("use %llu/%u from inode pa %p\n", start, len, pa); } /* @@ -3412,12 +3105,12 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, /* * search goal blocks in preallocated space */ -static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) +static noinline_for_stack int +ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_locality_group *lg; struct ext4_prealloc_space *pa; - struct list_head *cur; /* only data can be preallocated */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) @@ -3425,8 +3118,7 @@ static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* first, try per-file preallocation */ rcu_read_lock(); - list_for_each_rcu(cur, &ei->i_prealloc_list) { - pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); + list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) { /* all fields in this condition don't change, * so we can skip locking for them */ @@ -3458,8 +3150,7 @@ static int ext4_mb_use_preallocated(struct ext4_allocation_context *ac) return 0; rcu_read_lock(); - list_for_each_rcu(cur, &lg->lg_prealloc_list) { - pa = list_entry(cur, struct ext4_prealloc_space, pa_inode_list); + list_for_each_entry_rcu(pa, &lg->lg_prealloc_list, pa_inode_list) { spin_lock(&pa->pa_lock); if (pa->pa_deleted == 0 && pa->pa_free >= ac->ac_o_ex.fe_len) { atomic_inc(&pa->pa_count); @@ -3579,7 +3270,8 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, /* * creates new preallocated space for given inode */ -static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) +static noinline_for_stack int +ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_prealloc_space *pa; @@ -3666,7 +3358,8 @@ static int ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) /* * creates new preallocated space for locality group inodes belongs to */ -static int ext4_mb_new_group_pa(struct ext4_allocation_context *ac) +static noinline_for_stack int +ext4_mb_new_group_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; struct ext4_locality_group *lg; @@ -3739,11 +3432,11 @@ static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) * the caller MUST hold group/inode locks. * TODO: optimize the case when there are no in-core structures yet */ -static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, - struct buffer_head *bitmap_bh, - struct ext4_prealloc_space *pa) +static noinline_for_stack int +ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, + struct ext4_prealloc_space *pa, + struct ext4_allocation_context *ac) { - struct ext4_allocation_context *ac; struct super_block *sb = e4b->bd_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); unsigned long end; @@ -3759,8 +3452,6 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, BUG_ON(group != e4b->bd_group && pa->pa_len != 0); end = bit + pa->pa_len; - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) { ac->ac_sb = sb; ac->ac_inode = pa->pa_inode; @@ -3797,7 +3488,7 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, (unsigned long) pa->pa_len); - ext4_error(sb, __FUNCTION__, "free %u, pa_free %u\n", + ext4_error(sb, __func__, "free %u, pa_free %u\n", free, pa->pa_free); /* * pa is already deleted so we use the value obtained @@ -3805,22 +3496,19 @@ static int ext4_mb_release_inode_pa(struct ext4_buddy *e4b, */ } atomic_add(free, &sbi->s_mb_discarded); - if (ac) - kmem_cache_free(ext4_ac_cachep, ac); return err; } -static int ext4_mb_release_group_pa(struct ext4_buddy *e4b, - struct ext4_prealloc_space *pa) +static noinline_for_stack int +ext4_mb_release_group_pa(struct ext4_buddy *e4b, + struct ext4_prealloc_space *pa, + struct ext4_allocation_context *ac) { - struct ext4_allocation_context *ac; struct super_block *sb = e4b->bd_sb; ext4_group_t group; ext4_grpblk_t bit; - ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); - if (ac) ac->ac_op = EXT4_MB_HISTORY_DISCARD; @@ -3838,7 +3526,6 @@ static int ext4_mb_release_group_pa(struct ext4_buddy *e4b, ac->ac_b_ex.fe_len = pa->pa_len; ac->ac_b_ex.fe_logical = 0; ext4_mb_store_history(ac); - kmem_cache_free(ext4_ac_cachep, ac); } return 0; @@ -3853,12 +3540,14 @@ static int ext4_mb_release_group_pa(struct ext4_buddy *e4b, * - how many do we discard * 1) how many requested */ -static int ext4_mb_discard_group_preallocations(struct super_block *sb, +static noinline_for_stack int +ext4_mb_discard_group_preallocations(struct super_block *sb, ext4_group_t group, int needed) { struct ext4_group_info *grp = ext4_get_group_info(sb, group); struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; + struct ext4_allocation_context *ac; struct list_head list; struct ext4_buddy e4b; int err; @@ -3886,6 +3575,7 @@ static int ext4_mb_discard_group_preallocations(struct super_block *sb, grp = ext4_get_group_info(sb, group); INIT_LIST_HEAD(&list); + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); repeat: ext4_lock_group(sb, group); list_for_each_entry_safe(pa, tmp, @@ -3940,9 +3630,9 @@ repeat: spin_unlock(pa->pa_obj_lock); if (pa->pa_linear) - ext4_mb_release_group_pa(&e4b, pa); + ext4_mb_release_group_pa(&e4b, pa, ac); else - ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); @@ -3950,6 +3640,8 @@ repeat: out: ext4_unlock_group(sb, group); + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); ext4_mb_release_desc(&e4b); put_bh(bitmap_bh); return free; @@ -3970,6 +3662,7 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode) struct super_block *sb = inode->i_sb; struct buffer_head *bitmap_bh = NULL; struct ext4_prealloc_space *pa, *tmp; + struct ext4_allocation_context *ac; ext4_group_t group = 0; struct list_head list; struct ext4_buddy e4b; @@ -3984,6 +3677,7 @@ void ext4_mb_discard_inode_preallocations(struct inode *inode) INIT_LIST_HEAD(&list); + ac = kmem_cache_alloc(ext4_ac_cachep, GFP_NOFS); repeat: /* first, collect all pa's in the inode */ spin_lock(&ei->i_prealloc_lock); @@ -4048,7 +3742,7 @@ repeat: ext4_lock_group(sb, group); list_del(&pa->pa_group_list); - ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa); + ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac); ext4_unlock_group(sb, group); ext4_mb_release_desc(&e4b); @@ -4057,6 +3751,8 @@ repeat: list_del(&pa->u.pa_tmp_list); call_rcu(&(pa)->u.pa_rcu, ext4_mb_pa_callback); } + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); } /* @@ -4116,7 +3812,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) printk(KERN_ERR "PA:%lu:%d:%u \n", i, start, pa->pa_len); } - ext4_lock_group(sb, i); + ext4_unlock_group(sb, i); if (grp->bb_free == 0) continue; @@ -4175,7 +3871,8 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) mutex_lock(&ac->ac_lg->lg_mutex); } -static int ext4_mb_initialize_context(struct ext4_allocation_context *ac, +static noinline_for_stack int +ext4_mb_initialize_context(struct ext4_allocation_context *ac, struct ext4_allocation_request *ar) { struct super_block *sb = ar->inode->i_sb; @@ -4406,7 +4103,8 @@ static void ext4_mb_poll_new_transaction(struct super_block *sb, ext4_mb_free_committed_blocks(sb); } -static int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, +static noinline_for_stack int +ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, ext4_group_t group, ext4_grpblk_t block, int count) { struct ext4_group_info *db = e4b->bd_info; @@ -4497,7 +4195,7 @@ void ext4_mb_free_blocks(handle_t *handle, struct inode *inode, if (block < le32_to_cpu(es->s_first_data_block) || block + count < block || block + count > ext4_blocks_count(es)) { - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "Freeing blocks not in datazone - " "block = %lu, count = %lu", block, count); goto error_return; @@ -4538,7 +4236,7 @@ do_more: in_range(block + count - 1, ext4_inode_table(sb, gdp), EXT4_SB(sb)->s_itb_per_group)) { - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "Freeing blocks in system zone - " "Block = %lu, count = %lu", block, count); } @@ -4596,8 +4294,7 @@ do_more: } spin_lock(sb_bgl_lock(sbi, block_group)); - gdp->bg_free_blocks_count = - cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + count); + le16_add_cpu(&gdp->bg_free_blocks_count, count); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); spin_unlock(sb_bgl_lock(sbi, block_group)); percpu_counter_add(&sbi->s_freeblocks_counter, count); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h new file mode 100644 index 00000000000..bfe6add46bc --- /dev/null +++ b/fs/ext4/mballoc.h @@ -0,0 +1,304 @@ +/* + * fs/ext4/mballoc.h + * + * Written by: Alex Tomas <alex@clusterfs.com> + * + */ +#ifndef _EXT4_MBALLOC_H +#define _EXT4_MBALLOC_H + +#include <linux/time.h> +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/quotaops.h> +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/swap.h> +#include <linux/proc_fs.h> +#include <linux/pagemap.h> +#include <linux/seq_file.h> +#include <linux/version.h> +#include "ext4_jbd2.h" +#include "ext4.h" +#include "group.h" + +/* + * with AGGRESSIVE_CHECK allocator runs consistency checks over + * structures. these checks slow things down a lot + */ +#define AGGRESSIVE_CHECK__ + +/* + * with DOUBLE_CHECK defined mballoc creates persistent in-core + * bitmaps, maintains and uses them to check for double allocations + */ +#define DOUBLE_CHECK__ + +/* + */ +#define MB_DEBUG__ +#ifdef MB_DEBUG +#define mb_debug(fmt, a...) printk(fmt, ##a) +#else +#define mb_debug(fmt, a...) +#endif + +/* + * with EXT4_MB_HISTORY mballoc stores last N allocations in memory + * and you can monitor it in /proc/fs/ext4/<dev>/mb_history + */ +#define EXT4_MB_HISTORY +#define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ +#define EXT4_MB_HISTORY_PREALLOC 2 /* preallocated blocks used */ +#define EXT4_MB_HISTORY_DISCARD 4 /* preallocation discarded */ +#define EXT4_MB_HISTORY_FREE 8 /* free */ + +#define EXT4_MB_HISTORY_DEFAULT (EXT4_MB_HISTORY_ALLOC | \ + EXT4_MB_HISTORY_PREALLOC) + +/* + * How long mballoc can look for a best extent (in found extents) + */ +#define MB_DEFAULT_MAX_TO_SCAN 200 + +/* + * How long mballoc must look for a best extent + */ +#define MB_DEFAULT_MIN_TO_SCAN 10 + +/* + * How many groups mballoc will scan looking for the best chunk + */ +#define MB_DEFAULT_MAX_GROUPS_TO_SCAN 5 + +/* + * with 'ext4_mb_stats' allocator will collect stats that will be + * shown at umount. The collecting costs though! + */ +#define MB_DEFAULT_STATS 1 + +/* + * files smaller than MB_DEFAULT_STREAM_THRESHOLD are served + * by the stream allocator, which purpose is to pack requests + * as close each to other as possible to produce smooth I/O traffic + * We use locality group prealloc space for stream request. + * We can tune the same via /proc/fs/ext4/<parition>/stream_req + */ +#define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ + +/* + * for which requests use 2^N search using buddies + */ +#define MB_DEFAULT_ORDER2_REQS 2 + +/* + * default group prealloc size 512 blocks + */ +#define MB_DEFAULT_GROUP_PREALLOC 512 + +static struct kmem_cache *ext4_pspace_cachep; +static struct kmem_cache *ext4_ac_cachep; + +#ifdef EXT4_BB_MAX_BLOCKS +#undef EXT4_BB_MAX_BLOCKS +#endif +#define EXT4_BB_MAX_BLOCKS 30 + +struct ext4_free_metadata { + ext4_group_t group; + unsigned short num; + ext4_grpblk_t blocks[EXT4_BB_MAX_BLOCKS]; + struct list_head list; +}; + +struct ext4_group_info { + unsigned long bb_state; + unsigned long bb_tid; + struct ext4_free_metadata *bb_md_cur; + unsigned short bb_first_free; + unsigned short bb_free; + unsigned short bb_fragments; + struct list_head bb_prealloc_list; +#ifdef DOUBLE_CHECK + void *bb_bitmap; +#endif + unsigned short bb_counters[]; +}; + +#define EXT4_GROUP_INFO_NEED_INIT_BIT 0 +#define EXT4_GROUP_INFO_LOCKED_BIT 1 + +#define EXT4_MB_GRP_NEED_INIT(grp) \ + (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) + + +struct ext4_prealloc_space { + struct list_head pa_inode_list; + struct list_head pa_group_list; + union { + struct list_head pa_tmp_list; + struct rcu_head pa_rcu; + } u; + spinlock_t pa_lock; + atomic_t pa_count; + unsigned pa_deleted; + ext4_fsblk_t pa_pstart; /* phys. block */ + ext4_lblk_t pa_lstart; /* log. block */ + unsigned short pa_len; /* len of preallocated chunk */ + unsigned short pa_free; /* how many blocks are free */ + unsigned short pa_linear; /* consumed in one direction + * strictly, for grp prealloc */ + spinlock_t *pa_obj_lock; + struct inode *pa_inode; /* hack, for history only */ +}; + + +struct ext4_free_extent { + ext4_lblk_t fe_logical; + ext4_grpblk_t fe_start; + ext4_group_t fe_group; + int fe_len; +}; + +/* + * Locality group: + * we try to group all related changes together + * so that writeback can flush/allocate them together as well + */ +struct ext4_locality_group { + /* for allocator */ + struct mutex lg_mutex; /* to serialize allocates */ + struct list_head lg_prealloc_list;/* list of preallocations */ + spinlock_t lg_prealloc_lock; +}; + +struct ext4_allocation_context { + struct inode *ac_inode; + struct super_block *ac_sb; + + /* original request */ + struct ext4_free_extent ac_o_ex; + + /* goal request (after normalization) */ + struct ext4_free_extent ac_g_ex; + + /* the best found extent */ + struct ext4_free_extent ac_b_ex; + + /* copy of the bext found extent taken before preallocation efforts */ + struct ext4_free_extent ac_f_ex; + + /* number of iterations done. we have to track to limit searching */ + unsigned long ac_ex_scanned; + __u16 ac_groups_scanned; + __u16 ac_found; + __u16 ac_tail; + __u16 ac_buddy; + __u16 ac_flags; /* allocation hints */ + __u8 ac_status; + __u8 ac_criteria; + __u8 ac_repeats; + __u8 ac_2order; /* if request is to allocate 2^N blocks and + * N > 0, the field stores N, otherwise 0 */ + __u8 ac_op; /* operation, for history only */ + struct page *ac_bitmap_page; + struct page *ac_buddy_page; + struct ext4_prealloc_space *ac_pa; + struct ext4_locality_group *ac_lg; +}; + +#define AC_STATUS_CONTINUE 1 +#define AC_STATUS_FOUND 2 +#define AC_STATUS_BREAK 3 + +struct ext4_mb_history { + struct ext4_free_extent orig; /* orig allocation */ + struct ext4_free_extent goal; /* goal allocation */ + struct ext4_free_extent result; /* result allocation */ + unsigned pid; + unsigned ino; + __u16 found; /* how many extents have been found */ + __u16 groups; /* how many groups have been scanned */ + __u16 tail; /* what tail broke some buddy */ + __u16 buddy; /* buddy the tail ^^^ broke */ + __u16 flags; + __u8 cr:3; /* which phase the result extent was found at */ + __u8 op:4; + __u8 merged:1; +}; + +struct ext4_buddy { + struct page *bd_buddy_page; + void *bd_buddy; + struct page *bd_bitmap_page; + void *bd_bitmap; + struct ext4_group_info *bd_info; + struct super_block *bd_sb; + __u16 bd_blkbits; + ext4_group_t bd_group; +}; +#define EXT4_MB_BITMAP(e4b) ((e4b)->bd_bitmap) +#define EXT4_MB_BUDDY(e4b) ((e4b)->bd_buddy) + +#ifndef EXT4_MB_HISTORY +static inline void ext4_mb_store_history(struct ext4_allocation_context *ac) +{ + return; +} +#else +static void ext4_mb_store_history(struct ext4_allocation_context *ac); +#endif + +#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1) + +static struct proc_dir_entry *proc_root_ext4; +struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t); + +static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, + ext4_group_t group); +static void ext4_mb_poll_new_transaction(struct super_block *, handle_t *); +static void ext4_mb_free_committed_blocks(struct super_block *); +static void ext4_mb_return_to_preallocation(struct inode *inode, + struct ext4_buddy *e4b, sector_t block, + int count); +static void ext4_mb_put_pa(struct ext4_allocation_context *, + struct super_block *, struct ext4_prealloc_space *pa); +static int ext4_mb_init_per_dev_proc(struct super_block *sb); +static int ext4_mb_destroy_per_dev_proc(struct super_block *sb); + + +static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) +{ + struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + + bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); +} + +static inline void ext4_unlock_group(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + + bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); +} + +static inline int ext4_is_group_locked(struct super_block *sb, + ext4_group_t group) +{ + struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); + + return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, + &(grinfo->bb_state)); +} + +static ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb, + struct ext4_free_extent *fex) +{ + ext4_fsblk_t block; + + block = (ext4_fsblk_t) fex->fe_group * EXT4_BLOCKS_PER_GROUP(sb) + + fex->fe_start + + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block); + return block; +} +#endif diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 5c1e27de775..b9e077ba07e 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -13,8 +13,8 @@ */ #include <linux/module.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs_extents.h> +#include "ext4_jbd2.h" +#include "ext4_extents.h" /* * The contiguous blocks details which can be @@ -327,7 +327,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) } static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, - struct inode *tmp_inode) + struct inode *tmp_inode) { int retval; __le32 i_data[3]; @@ -339,7 +339,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * i_data field of the original inode */ retval = ext4_journal_extend(handle, 1); - if (retval != 0) { + if (retval) { retval = ext4_journal_restart(handle, 1); if (retval) goto err_out; @@ -351,6 +351,18 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, down_write(&EXT4_I(inode)->i_data_sem); /* + * if EXT4_EXT_MIGRATE is cleared a block allocation + * happened after we started the migrate. We need to + * fail the migrate + */ + if (!(EXT4_I(inode)->i_flags & EXT4_EXT_MIGRATE)) { + retval = -EAGAIN; + up_write(&EXT4_I(inode)->i_data_sem); + goto err_out; + } else + EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags & + ~EXT4_EXT_MIGRATE; + /* * We have the extent map build with the tmp inode. * Now copy the i_data across */ @@ -508,6 +520,17 @@ int ext4_ext_migrate(struct inode *inode, struct file *filp, * switch the inode format to prevent read. */ mutex_lock(&(inode->i_mutex)); + /* + * Even though we take i_mutex we can still cause block allocation + * via mmap write to holes. If we have allocated new blocks we fail + * migrate. New block allocation will clear EXT4_EXT_MIGRATE flag. + * The flag is updated with i_data_sem held to prevent racing with + * block allocation. + */ + down_read((&EXT4_I(inode)->i_data_sem)); + EXT4_I(inode)->i_flags = EXT4_I(inode)->i_flags | EXT4_EXT_MIGRATE; + up_read((&EXT4_I(inode)->i_data_sem)); + handle = ext4_journal_start(inode, 1); ei = EXT4_I(inode); @@ -559,9 +582,15 @@ err_out: * tmp_inode */ free_ext_block(handle, tmp_inode); - else - retval = ext4_ext_swap_inode_data(handle, inode, - tmp_inode); + else { + retval = ext4_ext_swap_inode_data(handle, inode, tmp_inode); + if (retval) + /* + * if we fail to swap inode data free the extent + * details of the tmp inode + */ + free_ext_block(handle, tmp_inode); + } /* We mark the tmp_inode dirty via ext4_ext_tree_init. */ if (ext4_journal_extend(handle, 1) != 0) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 28aa2ed4297..ab16beaa830 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -28,14 +28,14 @@ #include <linux/pagemap.h> #include <linux/jbd2.h> #include <linux/time.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/string.h> #include <linux/quotaops.h> #include <linux/buffer_head.h> #include <linux/bio.h> +#include "ext4.h" +#include "ext4_jbd2.h" #include "namei.h" #include "xattr.h" @@ -57,10 +57,15 @@ static struct buffer_head *ext4_append(handle_t *handle, *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - if ((bh = ext4_bread(handle, inode, *block, 1, err))) { + bh = ext4_bread(handle, inode, *block, 1, err); + if (bh) { inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; - ext4_journal_get_write_access(handle,bh); + *err = ext4_journal_get_write_access(handle, bh); + if (*err) { + brelse(bh); + bh = NULL; + } } return bh; } @@ -348,7 +353,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY) { - ext4_warning(dir->i_sb, __FUNCTION__, + ext4_warning(dir->i_sb, __func__, "Unrecognised inode hash code %d", root->info.hash_version); brelse(bh); @@ -362,7 +367,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, hash = hinfo->hash; if (root->info.unused_flags & 1) { - ext4_warning(dir->i_sb, __FUNCTION__, + ext4_warning(dir->i_sb, __func__, "Unimplemented inode hash flags: %#06x", root->info.unused_flags); brelse(bh); @@ -371,7 +376,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, } if ((indirect = root->info.indirect_levels) > 1) { - ext4_warning(dir->i_sb, __FUNCTION__, + ext4_warning(dir->i_sb, __func__, "Unimplemented inode hash depth: %#06x", root->info.indirect_levels); brelse(bh); @@ -384,7 +389,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { - ext4_warning(dir->i_sb, __FUNCTION__, + ext4_warning(dir->i_sb, __func__, "dx entry: limit != root limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -396,7 +401,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { - ext4_warning(dir->i_sb, __FUNCTION__, + ext4_warning(dir->i_sb, __func__, "dx entry: no count or count > limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -441,7 +446,7 @@ dx_probe(struct dentry *dentry, struct inode *dir, goto fail2; at = entries = ((struct dx_node *) bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit (dir)) { - ext4_warning(dir->i_sb, __FUNCTION__, + ext4_warning(dir->i_sb, __func__, "dx entry: limit != node limit"); brelse(bh); *err = ERR_BAD_DX_DIR; @@ -457,7 +462,7 @@ fail2: } fail: if (*err == ERR_BAD_DX_DIR) - ext4_warning(dir->i_sb, __FUNCTION__, + ext4_warning(dir->i_sb, __func__, "Corrupt dir inode %ld, running e2fsck is " "recommended.", dir->i_ino); return NULL; @@ -914,7 +919,7 @@ restart: wait_on_buffer(bh); if (!buffer_uptodate(bh)) { /* read error, skip block & hope for the best */ - ext4_error(sb, __FUNCTION__, "reading directory #%lu " + ext4_error(sb, __func__, "reading directory #%lu " "offset %lu", dir->i_ino, (unsigned long)block); brelse(bh); @@ -1007,7 +1012,7 @@ static struct buffer_head * ext4_dx_find_entry(struct dentry *dentry, retval = ext4_htree_next_block(dir, hash, frame, frames, NULL); if (retval < 0) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "error reading index page in directory #%lu", dir->i_ino); *err = retval; @@ -1532,7 +1537,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (levels && (dx_get_count(frames->entries) == dx_get_limit(frames->entries))) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Directory index full!"); err = -ENOSPC; goto cleanup; @@ -1860,11 +1865,11 @@ static int empty_dir (struct inode * inode) if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2) || !(bh = ext4_bread (NULL, inode, 0, 0, &err))) { if (err) - ext4_error(inode->i_sb, __FUNCTION__, + ext4_error(inode->i_sb, __func__, "error %d reading directory #%lu offset 0", err, inode->i_ino); else - ext4_warning(inode->i_sb, __FUNCTION__, + ext4_warning(inode->i_sb, __func__, "bad directory (dir #%lu) - no data block", inode->i_ino); return 1; @@ -1893,7 +1898,7 @@ static int empty_dir (struct inode * inode) offset >> EXT4_BLOCK_SIZE_BITS(sb), 0, &err); if (!bh) { if (err) - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "error %d reading directory" " #%lu offset %lu", err, inode->i_ino, offset); @@ -2217,6 +2222,8 @@ retry: goto out_stop; } } else { + /* clear the extent format for fast symlink */ + EXT4_I(inode)->i_flags &= ~EXT4_EXTENTS_FL; inode->i_op = &ext4_fast_symlink_inode_operations; memcpy((char*)&EXT4_I(inode)->i_data,symname,l); inode->i_size = l-1; @@ -2347,6 +2354,9 @@ static int ext4_rename (struct inode * old_dir, struct dentry *old_dentry, EXT4_FEATURE_INCOMPAT_FILETYPE)) new_de->file_type = old_de->file_type; new_dir->i_version++; + new_dir->i_ctime = new_dir->i_mtime = + ext4_current_time(new_dir); + ext4_mark_inode_dirty(handle, new_dir); BUFFER_TRACE(new_bh, "call ext4_journal_dirty_metadata"); ext4_journal_dirty_metadata(handle, new_bh); brelse(new_bh); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e29efa0f9d6..9f086a6a472 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -11,11 +11,10 @@ #define EXT4FS_DEBUG -#include <linux/ext4_jbd2.h> - #include <linux/errno.h> #include <linux/slab.h> +#include "ext4_jbd2.h" #include "group.h" #define outside(b, first, last) ((b) < (first) || (b) >= (last)) @@ -50,63 +49,63 @@ static int verify_group_input(struct super_block *sb, ext4_get_group_no_and_offset(sb, start, NULL, &offset); if (group != sbi->s_groups_count) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Cannot add at group %u (only %lu groups)", input->group, sbi->s_groups_count); else if (offset != 0) - ext4_warning(sb, __FUNCTION__, "Last group not full"); + ext4_warning(sb, __func__, "Last group not full"); else if (input->reserved_blocks > input->blocks_count / 5) - ext4_warning(sb, __FUNCTION__, "Reserved blocks too high (%u)", + ext4_warning(sb, __func__, "Reserved blocks too high (%u)", input->reserved_blocks); else if (free_blocks_count < 0) - ext4_warning(sb, __FUNCTION__, "Bad blocks count %u", + ext4_warning(sb, __func__, "Bad blocks count %u", input->blocks_count); else if (!(bh = sb_bread(sb, end - 1))) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Cannot read last block (%llu)", end - 1); else if (outside(input->block_bitmap, start, end)) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Block bitmap not in group (block %llu)", (unsigned long long)input->block_bitmap); else if (outside(input->inode_bitmap, start, end)) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Inode bitmap not in group (block %llu)", (unsigned long long)input->inode_bitmap); else if (outside(input->inode_table, start, end) || outside(itend - 1, start, end)) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Inode table not in group (blocks %llu-%llu)", (unsigned long long)input->inode_table, itend - 1); else if (input->inode_bitmap == input->block_bitmap) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Block bitmap same as inode bitmap (%llu)", (unsigned long long)input->block_bitmap); else if (inside(input->block_bitmap, input->inode_table, itend)) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Block bitmap (%llu) in inode table (%llu-%llu)", (unsigned long long)input->block_bitmap, (unsigned long long)input->inode_table, itend - 1); else if (inside(input->inode_bitmap, input->inode_table, itend)) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Inode bitmap (%llu) in inode table (%llu-%llu)", (unsigned long long)input->inode_bitmap, (unsigned long long)input->inode_table, itend - 1); else if (inside(input->block_bitmap, start, metaend)) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Block bitmap (%llu) in GDT table" " (%llu-%llu)", (unsigned long long)input->block_bitmap, start, metaend - 1); else if (inside(input->inode_bitmap, start, metaend)) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Inode bitmap (%llu) in GDT table" " (%llu-%llu)", (unsigned long long)input->inode_bitmap, start, metaend - 1); else if (inside(input->inode_table, start, metaend) || inside(itend - 1, start, metaend)) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Inode table (%llu-%llu) overlaps" "GDT table (%llu-%llu)", (unsigned long long)input->inode_table, @@ -368,7 +367,7 @@ static int verify_reserved_gdb(struct super_block *sb, while ((grp = ext4_list_backups(sb, &three, &five, &seven)) < end) { if (le32_to_cpu(*p++) != grp * EXT4_BLOCKS_PER_GROUP(sb) + blk){ - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "reserved GDT %llu" " missing grp %d (%llu)", blk, grp, @@ -424,7 +423,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, */ if (EXT4_SB(sb)->s_sbh->b_blocknr != le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "won't resize using backup superblock at %llu", (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); return -EPERM; @@ -448,7 +447,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, data = (__le32 *)dind->b_data; if (le32_to_cpu(data[gdb_num % EXT4_ADDR_PER_BLOCK(sb)]) != gdblock) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "new group %u GDT block %llu not reserved", input->group, gdblock); err = -EINVAL; @@ -469,10 +468,10 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, goto exit_dindj; n_group_desc = kmalloc((gdb_num + 1) * sizeof(struct buffer_head *), - GFP_KERNEL); + GFP_NOFS); if (!n_group_desc) { err = -ENOMEM; - ext4_warning (sb, __FUNCTION__, + ext4_warning(sb, __func__, "not enough memory for %lu groups", gdb_num + 1); goto exit_inode; } @@ -502,8 +501,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, EXT4_SB(sb)->s_gdb_count++; kfree(o_group_desc); - es->s_reserved_gdt_blocks = - cpu_to_le16(le16_to_cpu(es->s_reserved_gdt_blocks) - 1); + le16_add_cpu(&es->s_reserved_gdt_blocks, -1); ext4_journal_dirty_metadata(handle, EXT4_SB(sb)->s_sbh); return 0; @@ -553,7 +551,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, int res, i; int err; - primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_KERNEL); + primary = kmalloc(reserved_gdb * sizeof(*primary), GFP_NOFS); if (!primary) return -ENOMEM; @@ -571,7 +569,7 @@ static int reserve_backup_gdb(handle_t *handle, struct inode *inode, /* Get each reserved primary GDT block and verify it holds backups */ for (res = 0; res < reserved_gdb; res++, blk++) { if (le32_to_cpu(*data) != blk) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "reserved block %llu" " not at offset %ld", blk, @@ -715,7 +713,7 @@ static void update_backups(struct super_block *sb, */ exit_err: if (err) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "can't update backup for group %lu (err %d), " "forcing fsck on next reboot", group, err); sbi->s_mount_state &= ~EXT4_VALID_FS; @@ -755,33 +753,33 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) if (gdb_off == 0 && !EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER)) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Can't resize non-sparse filesystem further"); return -EPERM; } if (ext4_blocks_count(es) + input->blocks_count < ext4_blocks_count(es)) { - ext4_warning(sb, __FUNCTION__, "blocks_count overflow\n"); + ext4_warning(sb, __func__, "blocks_count overflow\n"); return -EINVAL; } if (le32_to_cpu(es->s_inodes_count) + EXT4_INODES_PER_GROUP(sb) < le32_to_cpu(es->s_inodes_count)) { - ext4_warning(sb, __FUNCTION__, "inodes_count overflow\n"); + ext4_warning(sb, __func__, "inodes_count overflow\n"); return -EINVAL; } if (reserved_gdb || gdb_off == 0) { if (!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_RESIZE_INODE)){ - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "No reserved GDT blocks, can't resize"); return -EPERM; } inode = ext4_iget(sb, EXT4_RESIZE_INO); if (IS_ERR(inode)) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "Error opening resize inode"); return PTR_ERR(inode); } @@ -810,7 +808,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) lock_super(sb); if (input->group != sbi->s_groups_count) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "multiple resizers run on filesystem!"); err = -EBUSY; goto exit_journal; @@ -877,8 +875,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) */ ext4_blocks_count_set(es, ext4_blocks_count(es) + input->blocks_count); - es->s_inodes_count = cpu_to_le32(le32_to_cpu(es->s_inodes_count) + - EXT4_INODES_PER_GROUP(sb)); + le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb)); /* * We need to protect s_groups_count against other CPUs seeing @@ -977,13 +974,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, " too large to resize to %llu blocks safely\n", sb->s_id, n_blocks_count); if (sizeof(sector_t) < 8) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "CONFIG_LBD not enabled\n"); return -EINVAL; } if (n_blocks_count < o_blocks_count) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "can't shrink FS - resize aborted"); return -EBUSY; } @@ -992,7 +989,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_get_group_no_and_offset(sb, o_blocks_count, NULL, &last); if (last == 0) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "need to use ext2online to resize further"); return -EPERM; } @@ -1000,7 +997,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, add = EXT4_BLOCKS_PER_GROUP(sb) - last; if (o_blocks_count + add < o_blocks_count) { - ext4_warning(sb, __FUNCTION__, "blocks_count overflow"); + ext4_warning(sb, __func__, "blocks_count overflow"); return -EINVAL; } @@ -1008,7 +1005,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, add = n_blocks_count - o_blocks_count; if (o_blocks_count + add < n_blocks_count) - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "will only finish group (%llu" " blocks, %u new)", o_blocks_count + add, add); @@ -1016,7 +1013,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, /* See if the device is actually as big as what was requested */ bh = sb_bread(sb, o_blocks_count + add -1); if (!bh) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "can't read last block, resize aborted"); return -ENOSPC; } @@ -1028,13 +1025,13 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, handle = ext4_journal_start_sb(sb, 3); if (IS_ERR(handle)) { err = PTR_ERR(handle); - ext4_warning(sb, __FUNCTION__, "error %d on journal start",err); + ext4_warning(sb, __func__, "error %d on journal start", err); goto exit_put; } lock_super(sb); if (o_blocks_count != ext4_blocks_count(es)) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "multiple resizers run on filesystem!"); unlock_super(sb); ext4_journal_stop(handle); @@ -1044,7 +1041,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, if ((err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh))) { - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "error %d on journal write access", err); unlock_super(sb); ext4_journal_stop(handle); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 13383ba18f1..52dd0679a4e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -21,8 +21,6 @@ #include <linux/fs.h> #include <linux/time.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> -#include <linux/ext4_jbd2.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> @@ -38,9 +36,10 @@ #include <linux/seq_file.h> #include <linux/log2.h> #include <linux/crc16.h> - #include <asm/uaccess.h> +#include "ext4.h" +#include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" #include "namei.h" @@ -135,7 +134,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) * take the FS itself readonly cleanly. */ journal = EXT4_SB(sb)->s_journal; if (is_journal_aborted(journal)) { - ext4_abort(sb, __FUNCTION__, + ext4_abort(sb, __func__, "Detected aborted journal"); return ERR_PTR(-EROFS); } @@ -355,7 +354,7 @@ void ext4_update_dynamic_rev(struct super_block *sb) if (le32_to_cpu(es->s_rev_level) > EXT4_GOOD_OLD_REV) return; - ext4_warning(sb, __FUNCTION__, + ext4_warning(sb, __func__, "updating to rev %d because of new feature flag, " "running e2fsck is recommended", EXT4_DYNAMIC_REV); @@ -813,7 +812,8 @@ static int ext4_acquire_dquot(struct dquot *dquot); static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); -static int ext4_quota_on(struct super_block *sb, int type, int format_id, char *path); +static int ext4_quota_on(struct super_block *sb, int type, int format_id, + char *path, int remount); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); @@ -944,8 +944,8 @@ static match_table_t tokens = { {Opt_mballoc, "mballoc"}, {Opt_nomballoc, "nomballoc"}, {Opt_stripe, "stripe=%u"}, - {Opt_err, NULL}, {Opt_resize, "resize"}, + {Opt_err, NULL}, }; static ext4_fsblk_t get_sb_block(void **data) @@ -1387,11 +1387,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, * a plain journaled filesystem we can keep it set as * valid forever! :) */ - es->s_state = cpu_to_le16(le16_to_cpu(es->s_state) & ~EXT4_VALID_FS); + es->s_state &= cpu_to_le16(~EXT4_VALID_FS); #endif if (!(__s16) le16_to_cpu(es->s_max_mnt_count)) es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); - es->s_mnt_count=cpu_to_le16(le16_to_cpu(es->s_mnt_count) + 1); + le16_add_cpu(&es->s_mnt_count, 1); es->s_mtime = cpu_to_le32(get_seconds()); ext4_update_dynamic_rev(sb); EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); @@ -1484,36 +1484,33 @@ static int ext4_check_descriptors(struct super_block *sb) block_bitmap = ext4_block_bitmap(sb, gdp); if (block_bitmap < first_block || block_bitmap > last_block) { - ext4_error (sb, "ext4_check_descriptors", - "Block bitmap for group %lu" - " not in group (block %llu)!", - i, block_bitmap); + printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " + "Block bitmap for group %lu not in group " + "(block %llu)!", i, block_bitmap); return 0; } inode_bitmap = ext4_inode_bitmap(sb, gdp); if (inode_bitmap < first_block || inode_bitmap > last_block) { - ext4_error (sb, "ext4_check_descriptors", - "Inode bitmap for group %lu" - " not in group (block %llu)!", - i, inode_bitmap); + printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " + "Inode bitmap for group %lu not in group " + "(block %llu)!", i, inode_bitmap); return 0; } inode_table = ext4_inode_table(sb, gdp); if (inode_table < first_block || inode_table + sbi->s_itb_per_group - 1 > last_block) { - ext4_error (sb, "ext4_check_descriptors", - "Inode table for group %lu" - " not in group (block %llu)!", - i, inode_table); + printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " + "Inode table for group %lu not in group " + "(block %llu)!", i, inode_table); return 0; } if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { - ext4_error(sb, __FUNCTION__, - "Checksum for group %lu failed (%u!=%u)\n", - i, le16_to_cpu(ext4_group_desc_csum(sbi, i, - gdp)), le16_to_cpu(gdp->bg_checksum)); + printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " + "Checksum for group %lu failed (%u!=%u)\n", + i, le16_to_cpu(ext4_group_desc_csum(sbi, i, + gdp)), le16_to_cpu(gdp->bg_checksum)); return 0; } if (!flexbg_flag) @@ -1593,8 +1590,8 @@ static void ext4_orphan_cleanup (struct super_block * sb, while (es->s_last_orphan) { struct inode *inode; - if (!(inode = - ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)))) { + inode = ext4_orphan_get(sb, le32_to_cpu(es->s_last_orphan)); + if (IS_ERR(inode)) { es->s_last_orphan = 0; break; } @@ -1604,7 +1601,7 @@ static void ext4_orphan_cleanup (struct super_block * sb, if (inode->i_nlink) { printk(KERN_DEBUG "%s: truncating inode %lu to %Ld bytes\n", - __FUNCTION__, inode->i_ino, inode->i_size); + __func__, inode->i_ino, inode->i_size); jbd_debug(2, "truncating inode %lu to %Ld bytes\n", inode->i_ino, inode->i_size); ext4_truncate(inode); @@ -1612,7 +1609,7 @@ static void ext4_orphan_cleanup (struct super_block * sb, } else { printk(KERN_DEBUG "%s: deleting unreferenced inode %lu\n", - __FUNCTION__, inode->i_ino); + __func__, inode->i_ino); jbd_debug(2, "deleting unreferenced inode %lu\n", inode->i_ino); nr_orphans++; @@ -1632,7 +1629,7 @@ static void ext4_orphan_cleanup (struct super_block * sb, /* Turn quotas off */ for (i = 0; i < MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) - vfs_quota_off(sb, i); + vfs_quota_off(sb, i, 0); } #endif sb->s_flags = s_flags; /* Restore MS_RDONLY status */ @@ -2698,9 +2695,9 @@ static void ext4_clear_journal_err(struct super_block * sb, char nbuf[16]; errstr = ext4_decode_error(sb, j_errno, nbuf); - ext4_warning(sb, __FUNCTION__, "Filesystem error recorded " + ext4_warning(sb, __func__, "Filesystem error recorded " "from previous mount: %s", errstr); - ext4_warning(sb, __FUNCTION__, "Marking fs in need of " + ext4_warning(sb, __func__, "Marking fs in need of " "filesystem check."); EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; @@ -2827,7 +2824,7 @@ static int ext4_remount (struct super_block * sb, int * flags, char * data) } if (sbi->s_mount_opt & EXT4_MOUNT_ABORT) - ext4_abort(sb, __FUNCTION__, "Abort forced by user"); + ext4_abort(sb, __func__, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | ((sbi->s_mount_opt & EXT4_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); @@ -3039,8 +3036,14 @@ static int ext4_dquot_drop(struct inode *inode) /* We may delete quota structure so we need to reserve enough blocks */ handle = ext4_journal_start(inode, 2*EXT4_QUOTA_DEL_BLOCKS(inode->i_sb)); - if (IS_ERR(handle)) + if (IS_ERR(handle)) { + /* + * We call dquot_drop() anyway to at least release references + * to quota structures so that umount does not hang. + */ + dquot_drop(inode); return PTR_ERR(handle); + } ret = dquot_drop(inode); err = ext4_journal_stop(handle); if (!ret) @@ -3143,7 +3146,7 @@ static int ext4_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, - char *path) + char *path, int remount) { int err; struct nameidata nd; @@ -3151,9 +3154,9 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, if (!test_opt(sb, QUOTA)) return -EINVAL; /* Not journalling quota? */ - if (!EXT4_SB(sb)->s_qf_names[USRQUOTA] && - !EXT4_SB(sb)->s_qf_names[GRPQUOTA]) - return vfs_quota_on(sb, type, format_id, path); + if ((!EXT4_SB(sb)->s_qf_names[USRQUOTA] && + !EXT4_SB(sb)->s_qf_names[GRPQUOTA]) || remount) + return vfs_quota_on(sb, type, format_id, path, remount); err = path_lookup(path, LOOKUP_FOLLOW, &nd); if (err) return err; @@ -3168,7 +3171,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, "EXT4-fs: Quota file not on filesystem root. " "Journalled quota will not work.\n"); path_put(&nd.path); - return vfs_quota_on(sb, type, format_id, path); + return vfs_quota_on(sb, type, format_id, path, remount); } /* Read data from quotafile - avoid pagecache and such because we cannot afford diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index e6f9da4287c..e9178643dc0 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -19,8 +19,8 @@ #include <linux/fs.h> #include <linux/jbd2.h> -#include <linux/ext4_fs.h> #include <linux/namei.h> +#include "ext4.h" #include "xattr.h" static void * ext4_follow_link(struct dentry *dentry, struct nameidata *nd) diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index e9054c1c7d9..3fbc2c6c3d0 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -53,11 +53,11 @@ #include <linux/init.h> #include <linux/fs.h> #include <linux/slab.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs.h> #include <linux/mbcache.h> #include <linux/quotaops.h> #include <linux/rwsem.h> +#include "ext4_jbd2.h" +#include "ext4.h" #include "xattr.h" #include "acl.h" @@ -92,6 +92,8 @@ static struct buffer_head *ext4_xattr_cache_find(struct inode *, struct mb_cache_entry **); static void ext4_xattr_rehash(struct ext4_xattr_header *, struct ext4_xattr_entry *); +static int ext4_xattr_list(struct inode *inode, char *buffer, + size_t buffer_size); static struct mb_cache *ext4_xattr_cache; @@ -225,7 +227,7 @@ ext4_xattr_block_get(struct inode *inode, int name_index, const char *name, ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext4_xattr_check_block(bh)) { -bad_block: ext4_error(inode->i_sb, __FUNCTION__, +bad_block: ext4_error(inode->i_sb, __func__, "inode %lu: bad block %llu", inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; @@ -367,7 +369,7 @@ ext4_xattr_block_list(struct inode *inode, char *buffer, size_t buffer_size) ea_bdebug(bh, "b_count=%d, refcount=%d", atomic_read(&(bh->b_count)), le32_to_cpu(BHDR(bh)->h_refcount)); if (ext4_xattr_check_block(bh)) { - ext4_error(inode->i_sb, __FUNCTION__, + ext4_error(inode->i_sb, __func__, "inode %lu: bad block %llu", inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; @@ -420,7 +422,7 @@ cleanup: * Returns a negative error number on failure, or the number of bytes * used / required on success. */ -int +static int ext4_xattr_list(struct inode *inode, char *buffer, size_t buffer_size) { int i_error, b_error; @@ -484,8 +486,7 @@ ext4_xattr_release_block(handle_t *handle, struct inode *inode, get_bh(bh); ext4_forget(handle, 1, inode, bh, bh->b_blocknr); } else { - BHDR(bh)->h_refcount = cpu_to_le32( - le32_to_cpu(BHDR(bh)->h_refcount) - 1); + le32_add_cpu(&BHDR(bh)->h_refcount, -1); error = ext4_journal_dirty_metadata(handle, bh); if (IS_SYNC(inode)) handle->h_sync = 1; @@ -660,7 +661,7 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); if (ext4_xattr_check_block(bs->bh)) { - ext4_error(sb, __FUNCTION__, + ext4_error(sb, __func__, "inode %lu: bad block %llu", inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; @@ -738,7 +739,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, ce = NULL; } ea_bdebug(bs->bh, "cloning"); - s->base = kmalloc(bs->bh->b_size, GFP_KERNEL); + s->base = kmalloc(bs->bh->b_size, GFP_NOFS); error = -ENOMEM; if (s->base == NULL) goto cleanup; @@ -750,7 +751,7 @@ ext4_xattr_block_set(handle_t *handle, struct inode *inode, } } else { /* Allocate a buffer where we construct the new block. */ - s->base = kzalloc(sb->s_blocksize, GFP_KERNEL); + s->base = kzalloc(sb->s_blocksize, GFP_NOFS); /* assert(header == s->base) */ error = -ENOMEM; if (s->base == NULL) @@ -789,8 +790,7 @@ inserted: if (error) goto cleanup_dquot; lock_buffer(new_bh); - BHDR(new_bh)->h_refcount = cpu_to_le32(1 + - le32_to_cpu(BHDR(new_bh)->h_refcount)); + le32_add_cpu(&BHDR(new_bh)->h_refcount, 1); ea_bdebug(new_bh, "reusing; refcount now=%d", le32_to_cpu(BHDR(new_bh)->h_refcount)); unlock_buffer(new_bh); @@ -808,10 +808,8 @@ inserted: get_bh(new_bh); } else { /* We need to allocate a new block */ - ext4_fsblk_t goal = le32_to_cpu( - EXT4_SB(sb)->s_es->s_first_data_block) + - (ext4_fsblk_t)EXT4_I(inode)->i_block_group * - EXT4_BLOCKS_PER_GROUP(sb); + ext4_fsblk_t goal = ext4_group_first_block_no(sb, + EXT4_I(inode)->i_block_group); ext4_fsblk_t block = ext4_new_block(handle, inode, goal, &error); if (error) @@ -863,7 +861,7 @@ cleanup_dquot: goto cleanup; bad_block: - ext4_error(inode->i_sb, __FUNCTION__, + ext4_error(inode->i_sb, __func__, "inode %lu: bad block %llu", inode->i_ino, EXT4_I(inode)->i_file_acl); goto cleanup; @@ -1166,7 +1164,7 @@ retry: if (!bh) goto cleanup; if (ext4_xattr_check_block(bh)) { - ext4_error(inode->i_sb, __FUNCTION__, + ext4_error(inode->i_sb, __func__, "inode %lu: bad block %llu", inode->i_ino, EXT4_I(inode)->i_file_acl); error = -EIO; @@ -1341,14 +1339,14 @@ ext4_xattr_delete_inode(handle_t *handle, struct inode *inode) goto cleanup; bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl); if (!bh) { - ext4_error(inode->i_sb, __FUNCTION__, + ext4_error(inode->i_sb, __func__, "inode %lu: block %llu read error", inode->i_ino, EXT4_I(inode)->i_file_acl); goto cleanup; } if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) || BHDR(bh)->h_blocks != cpu_to_le32(1)) { - ext4_error(inode->i_sb, __FUNCTION__, + ext4_error(inode->i_sb, __func__, "inode %lu: bad block %llu", inode->i_ino, EXT4_I(inode)->i_file_acl); goto cleanup; @@ -1475,7 +1473,7 @@ again: } bh = sb_bread(inode->i_sb, ce->e_block); if (!bh) { - ext4_error(inode->i_sb, __FUNCTION__, + ext4_error(inode->i_sb, __func__, "inode %lu: block %lu read error", inode->i_ino, (unsigned long) ce->e_block); } else if (le32_to_cpu(BHDR(bh)->h_refcount) >= diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index d7f5d6a1265..5992fe979bb 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -74,7 +74,6 @@ extern struct xattr_handler ext4_xattr_security_handler; extern ssize_t ext4_listxattr(struct dentry *, char *, size_t); extern int ext4_xattr_get(struct inode *, int, const char *, void *, size_t); -extern int ext4_xattr_list(struct inode *, char *, size_t); extern int ext4_xattr_set(struct inode *, int, const char *, const void *, size_t, int); extern int ext4_xattr_set_handle(handle_t *, struct inode *, int, const char *, const void *, size_t, int); @@ -99,12 +98,6 @@ ext4_xattr_get(struct inode *inode, int name_index, const char *name, } static inline int -ext4_xattr_list(struct inode *inode, void *buffer, size_t size) -{ - return -EOPNOTSUPP; -} - -static inline int ext4_xattr_set(struct inode *inode, int name_index, const char *name, const void *value, size_t size, int flags) { diff --git a/fs/ext4/xattr_security.c b/fs/ext4/xattr_security.c index f17eaf2321b..ca5f89fc6ca 100644 --- a/fs/ext4/xattr_security.c +++ b/fs/ext4/xattr_security.c @@ -6,9 +6,9 @@ #include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs.h> #include <linux/security.h> +#include "ext4_jbd2.h" +#include "ext4.h" #include "xattr.h" static size_t diff --git a/fs/ext4/xattr_trusted.c b/fs/ext4/xattr_trusted.c index e0f05acdafe..fff33382cad 100644 --- a/fs/ext4/xattr_trusted.c +++ b/fs/ext4/xattr_trusted.c @@ -9,8 +9,8 @@ #include <linux/string.h> #include <linux/capability.h> #include <linux/fs.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs.h> +#include "ext4_jbd2.h" +#include "ext4.h" #include "xattr.h" #define XATTR_TRUSTED_PREFIX "trusted." diff --git a/fs/ext4/xattr_user.c b/fs/ext4/xattr_user.c index 7ed3d8ebf09..67be723fcc4 100644 --- a/fs/ext4/xattr_user.c +++ b/fs/ext4/xattr_user.c @@ -8,8 +8,8 @@ #include <linux/module.h> #include <linux/string.h> #include <linux/fs.h> -#include <linux/ext4_jbd2.h> -#include <linux/ext4_fs.h> +#include "ext4_jbd2.h" +#include "ext4.h" #include "xattr.h" #define XATTR_USER_PREFIX "user." diff --git a/fs/fat/cache.c b/fs/fat/cache.c index 639b3b4f86d..fda25479af2 100644 --- a/fs/fat/cache.c +++ b/fs/fat/cache.c @@ -242,7 +242,7 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) /* prevent the infinite loop of cluster chain */ if (*fclus > limit) { fat_fs_panic(sb, "%s: detected the cluster chain loop" - " (i_pos %lld)", __FUNCTION__, + " (i_pos %lld)", __func__, MSDOS_I(inode)->i_pos); nr = -EIO; goto out; @@ -253,7 +253,7 @@ int fat_get_cluster(struct inode *inode, int cluster, int *fclus, int *dclus) goto out; else if (nr == FAT_ENT_FREE) { fat_fs_panic(sb, "%s: invalid cluster chain" - " (i_pos %lld)", __FUNCTION__, + " (i_pos %lld)", __func__, MSDOS_I(inode)->i_pos); nr = -EIO; goto out; @@ -286,7 +286,7 @@ static int fat_bmap_cluster(struct inode *inode, int cluster) return ret; else if (ret == FAT_ENT_EOF) { fat_fs_panic(sb, "%s: request beyond EOF (i_pos %lld)", - __FUNCTION__, MSDOS_I(inode)->i_pos); + __func__, MSDOS_I(inode)->i_pos); return -EIO; } return dclus; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 72cbcd61bd9..486725ee99a 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -124,8 +124,8 @@ static inline int fat_get_entry(struct inode *dir, loff_t *pos, * but ignore that right now. * Ahem... Stack smashing in ring 0 isn't fun. Fixed. */ -static int uni16_to_x8(unsigned char *ascii, wchar_t *uni, int uni_xlate, - struct nls_table *nls) +static int uni16_to_x8(unsigned char *ascii, wchar_t *uni, int len, + int uni_xlate, struct nls_table *nls) { wchar_t *ip, ec; unsigned char *op, nc; @@ -135,10 +135,11 @@ static int uni16_to_x8(unsigned char *ascii, wchar_t *uni, int uni_xlate, ip = uni; op = ascii; - while (*ip) { + while (*ip && ((len - NLS_MAX_CHARSET_SIZE) > 0)) { ec = *ip++; if ( (charlen = nls->uni2char(ec, op, NLS_MAX_CHARSET_SIZE)) > 0) { op += charlen; + len -= charlen; } else { if (uni_xlate == 1) { *op = ':'; @@ -149,16 +150,19 @@ static int uni16_to_x8(unsigned char *ascii, wchar_t *uni, int uni_xlate, ec >>= 4; } op += 5; + len -= 5; } else { *op++ = '?'; + len--; } } - /* We have some slack there, so it's OK */ - if (op>ascii+256) { - op = ascii + 256; - break; - } } + + if (unlikely(*ip)) { + printk(KERN_WARNING "FAT: filename was truncated while " + "converting."); + } + *op = 0; return (op - ascii); } @@ -243,7 +247,7 @@ static int fat_parse_long(struct inode *dir, loff_t *pos, unsigned char id, slot, slots, alias_checksum; if (!*unicode) { - *unicode = (wchar_t *)__get_free_page(GFP_KERNEL); + *unicode = __getname(); if (!*unicode) { brelse(*bh); return -ENOMEM; @@ -311,9 +315,11 @@ int fat_search_long(struct inode *inode, const unsigned char *name, struct nls_table *nls_io = sbi->nls_io; struct nls_table *nls_disk = sbi->nls_disk; wchar_t bufuname[14]; - unsigned char xlate_len, nr_slots; + unsigned char nr_slots; + int xlate_len; wchar_t *unicode = NULL; - unsigned char work[MSDOS_NAME], bufname[260]; /* 256 + 4 */ + unsigned char work[MSDOS_NAME]; + unsigned char *bufname = NULL; int uni_xlate = sbi->options.unicode_xlate; int utf8 = sbi->options.utf8; int anycase = (sbi->options.name_check != 's'); @@ -321,6 +327,10 @@ int fat_search_long(struct inode *inode, const unsigned char *name, loff_t cpos = 0; int chl, i, j, last_u, err; + bufname = __getname(); + if (!bufname) + return -ENOMEM; + err = -ENOENT; while(1) { if (fat_get_entry(inode, &cpos, &bh, &de) == -1) @@ -386,8 +396,8 @@ parse_record: bufuname[last_u] = 0x0000; xlate_len = utf8 - ?utf8_wcstombs(bufname, bufuname, sizeof(bufname)) - :uni16_to_x8(bufname, bufuname, uni_xlate, nls_io); + ?utf8_wcstombs(bufname, bufuname, PATH_MAX) + :uni16_to_x8(bufname, bufuname, PATH_MAX, uni_xlate, nls_io); if (xlate_len == name_len) if ((!anycase && !memcmp(name, bufname, xlate_len)) || (anycase && !nls_strnicmp(nls_io, name, bufname, @@ -396,8 +406,8 @@ parse_record: if (nr_slots) { xlate_len = utf8 - ?utf8_wcstombs(bufname, unicode, sizeof(bufname)) - :uni16_to_x8(bufname, unicode, uni_xlate, nls_io); + ?utf8_wcstombs(bufname, unicode, PATH_MAX) + :uni16_to_x8(bufname, unicode, PATH_MAX, uni_xlate, nls_io); if (xlate_len != name_len) continue; if ((!anycase && !memcmp(name, bufname, xlate_len)) || @@ -416,8 +426,10 @@ Found: sinfo->i_pos = fat_make_i_pos(sb, sinfo->bh, sinfo->de); err = 0; EODir: + if (bufname) + __putname(bufname); if (unicode) - free_page((unsigned long)unicode); + __putname(unicode); return err; } @@ -598,7 +610,7 @@ parse_record: if (isvfat) { bufuname[j] = 0x0000; i = utf8 ? utf8_wcstombs(bufname, bufuname, sizeof(bufname)) - : uni16_to_x8(bufname, bufuname, uni_xlate, nls_io); + : uni16_to_x8(bufname, bufuname, sizeof(bufname), uni_xlate, nls_io); } fill_name = bufname; @@ -607,10 +619,10 @@ parse_record: /* convert the unicode long name. 261 is maximum size * of unicode buffer. (13 * slots + nul) */ void *longname = unicode + 261; - int buf_size = PAGE_SIZE - (261 * sizeof(unicode[0])); + int buf_size = PATH_MAX - (261 * sizeof(unicode[0])); int long_len = utf8 ? utf8_wcstombs(longname, unicode, buf_size) - : uni16_to_x8(longname, unicode, uni_xlate, nls_io); + : uni16_to_x8(longname, unicode, buf_size, uni_xlate, nls_io); if (!both) { fill_name = longname; @@ -640,7 +652,7 @@ EODir: FillFailed: brelse(bh); if (unicode) - free_page((unsigned long)unicode); + __putname(unicode); out: unlock_kernel(); return ret; diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 5fb366992b7..302e95c4af7 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -450,7 +450,8 @@ int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster) BUG_ON(nr_cluster > (MAX_BUF_PER_PAGE / 2)); /* fixed limit */ lock_fat(sbi); - if (sbi->free_clusters != -1 && sbi->free_clusters < nr_cluster) { + if (sbi->free_clusters != -1 && sbi->free_clus_valid && + sbi->free_clusters < nr_cluster) { unlock_fat(sbi); return -ENOSPC; } @@ -504,6 +505,7 @@ int fat_alloc_clusters(struct inode *inode, int *cluster, int nr_cluster) /* Couldn't allocate the free entries */ sbi->free_clusters = 0; + sbi->free_clus_valid = 1; sb->s_dirt = 1; err = -ENOSPC; @@ -544,7 +546,7 @@ int fat_free_clusters(struct inode *inode, int cluster) goto error; } else if (cluster == FAT_ENT_FREE) { fat_fs_panic(sb, "%s: deleting FAT entry beyond EOF", - __FUNCTION__); + __func__); err = -EIO; goto error; } @@ -583,8 +585,6 @@ error: brelse(bhs[i]); unlock_fat(sbi); - fat_clusters_flush(sb); - return err; } @@ -615,7 +615,7 @@ int fat_count_free_clusters(struct super_block *sb) int err = 0, free; lock_fat(sbi); - if (sbi->free_clusters != -1) + if (sbi->free_clusters != -1 && sbi->free_clus_valid) goto out; reada_blocks = FAT_READA_SIZE >> sb->s_blocksize_bits; @@ -643,6 +643,7 @@ int fat_count_free_clusters(struct super_block *sb) } while (fat_ent_next(sbi, &fatent)); } sbi->free_clusters = free; + sbi->free_clus_valid = 1; sb->s_dirt = 1; fatent_brelse(&fatent); out: diff --git a/fs/fat/file.c b/fs/fat/file.c index 2a3bed96704..27cc1164ec3 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -157,104 +157,6 @@ out: return err; } -static int check_mode(const struct msdos_sb_info *sbi, mode_t mode) -{ - mode_t req = mode & ~S_IFMT; - - /* - * Of the r and x bits, all (subject to umask) must be present. Of the - * w bits, either all (subject to umask) or none must be present. - */ - - if (S_ISREG(mode)) { - req &= ~sbi->options.fs_fmask; - - if ((req & (S_IRUGO | S_IXUGO)) != - ((S_IRUGO | S_IXUGO) & ~sbi->options.fs_fmask)) - return -EPERM; - - if ((req & S_IWUGO) != 0 && - (req & S_IWUGO) != (S_IWUGO & ~sbi->options.fs_fmask)) - return -EPERM; - } else if (S_ISDIR(mode)) { - req &= ~sbi->options.fs_dmask; - - if ((req & (S_IRUGO | S_IXUGO)) != - ((S_IRUGO | S_IXUGO) & ~sbi->options.fs_dmask)) - return -EPERM; - - if ((req & S_IWUGO) != 0 && - (req & S_IWUGO) != (S_IWUGO & ~sbi->options.fs_dmask)) - return -EPERM; - } else { - return -EPERM; - } - - return 0; -} - -int fat_notify_change(struct dentry *dentry, struct iattr *attr) -{ - struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); - struct inode *inode = dentry->d_inode; - int mask, error = 0; - - lock_kernel(); - - /* - * Expand the file. Since inode_setattr() updates ->i_size - * before calling the ->truncate(), but FAT needs to fill the - * hole before it. - */ - if (attr->ia_valid & ATTR_SIZE) { - if (attr->ia_size > inode->i_size) { - error = fat_cont_expand(inode, attr->ia_size); - if (error || attr->ia_valid == ATTR_SIZE) - goto out; - attr->ia_valid &= ~ATTR_SIZE; - } - } - - error = inode_change_ok(inode, attr); - if (error) { - if (sbi->options.quiet) - error = 0; - goto out; - } - if (((attr->ia_valid & ATTR_UID) && - (attr->ia_uid != sbi->options.fs_uid)) || - ((attr->ia_valid & ATTR_GID) && - (attr->ia_gid != sbi->options.fs_gid))) - error = -EPERM; - - if (error) { - if (sbi->options.quiet) - error = 0; - goto out; - } - - if (attr->ia_valid & ATTR_MODE) { - error = check_mode(sbi, attr->ia_mode); - if (error != 0 && !sbi->options.quiet) - goto out; - } - - error = inode_setattr(inode, attr); - if (error) - goto out; - - if (S_ISDIR(inode->i_mode)) - mask = sbi->options.fs_dmask; - else - mask = sbi->options.fs_fmask; - inode->i_mode &= S_IFMT | (S_IRWXUGO & ~mask); -out: - unlock_kernel(); - return error; -} - -EXPORT_SYMBOL_GPL(fat_notify_change); - /* Free all clusters after the skip'th cluster. */ static int fat_free(struct inode *inode, int skip) { @@ -306,7 +208,7 @@ static int fat_free(struct inode *inode, int skip) } else if (ret == FAT_ENT_FREE) { fat_fs_panic(sb, "%s: invalid cluster chain (i_pos %lld)", - __FUNCTION__, MSDOS_I(inode)->i_pos); + __func__, MSDOS_I(inode)->i_pos); ret = -EIO; } else if (ret > 0) { err = fat_ent_write(inode, &fatent, FAT_ENT_EOF, wait); @@ -355,8 +257,112 @@ int fat_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) } EXPORT_SYMBOL_GPL(fat_getattr); +static int fat_check_mode(const struct msdos_sb_info *sbi, struct inode *inode, + mode_t mode) +{ + mode_t mask, req = mode & ~S_IFMT; + + if (S_ISREG(mode)) + mask = sbi->options.fs_fmask; + else + mask = sbi->options.fs_dmask; + + /* + * Of the r and x bits, all (subject to umask) must be present. Of the + * w bits, either all (subject to umask) or none must be present. + */ + req &= ~mask; + if ((req & (S_IRUGO | S_IXUGO)) != (inode->i_mode & (S_IRUGO|S_IXUGO))) + return -EPERM; + if ((req & S_IWUGO) && ((req & S_IWUGO) != (S_IWUGO & ~mask))) + return -EPERM; + + return 0; +} + +static int fat_allow_set_time(struct msdos_sb_info *sbi, struct inode *inode) +{ + mode_t allow_utime = sbi->options.allow_utime; + + if (current->fsuid != inode->i_uid) { + if (in_group_p(inode->i_gid)) + allow_utime >>= 3; + if (allow_utime & MAY_WRITE) + return 1; + } + + /* use a default check */ + return 0; +} + +int fat_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); + struct inode *inode = dentry->d_inode; + int mask, error = 0; + unsigned int ia_valid; + + lock_kernel(); + + /* + * Expand the file. Since inode_setattr() updates ->i_size + * before calling the ->truncate(), but FAT needs to fill the + * hole before it. + */ + if (attr->ia_valid & ATTR_SIZE) { + if (attr->ia_size > inode->i_size) { + error = fat_cont_expand(inode, attr->ia_size); + if (error || attr->ia_valid == ATTR_SIZE) + goto out; + attr->ia_valid &= ~ATTR_SIZE; + } + } + + /* Check for setting the inode time. */ + ia_valid = attr->ia_valid; + if (ia_valid & (ATTR_MTIME_SET | ATTR_ATIME_SET)) { + if (fat_allow_set_time(sbi, inode)) + attr->ia_valid &= ~(ATTR_MTIME_SET | ATTR_ATIME_SET); + } + + error = inode_change_ok(inode, attr); + attr->ia_valid = ia_valid; + if (error) { + if (sbi->options.quiet) + error = 0; + goto out; + } + if (((attr->ia_valid & ATTR_UID) && + (attr->ia_uid != sbi->options.fs_uid)) || + ((attr->ia_valid & ATTR_GID) && + (attr->ia_gid != sbi->options.fs_gid)) || + ((attr->ia_valid & ATTR_MODE) && + fat_check_mode(sbi, inode, attr->ia_mode) < 0)) + error = -EPERM; + + if (error) { + if (sbi->options.quiet) + error = 0; + goto out; + } + + error = inode_setattr(inode, attr); + if (error) + goto out; + + if (S_ISDIR(inode->i_mode)) + mask = sbi->options.fs_dmask; + else + mask = sbi->options.fs_fmask; + inode->i_mode &= S_IFMT | (S_IRWXUGO & ~mask); +out: + unlock_kernel(); + return error; +} +EXPORT_SYMBOL_GPL(fat_setattr); + const struct inode_operations fat_file_inode_operations = { .truncate = fat_truncate, - .setattr = fat_notify_change, + .setattr = fat_setattr, .getattr = fat_getattr, }; diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 53f3cf62b7c..4e0a3dd9d67 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -433,11 +433,8 @@ EXPORT_SYMBOL_GPL(fat_build_inode); static void fat_delete_inode(struct inode *inode) { truncate_inode_pages(&inode->i_data, 0); - - if (!is_bad_inode(inode)) { - inode->i_size = 0; - fat_truncate(inode); - } + inode->i_size = 0; + fat_truncate(inode); clear_inode(inode); } @@ -445,8 +442,6 @@ static void fat_clear_inode(struct inode *inode) { struct msdos_sb_info *sbi = MSDOS_SB(inode->i_sb); - if (is_bad_inode(inode)) - return; lock_kernel(); spin_lock(&sbi->inode_hash_lock); fat_cache_inval_inode(inode); @@ -542,7 +537,7 @@ static int fat_statfs(struct dentry *dentry, struct kstatfs *buf) struct msdos_sb_info *sbi = MSDOS_SB(dentry->d_sb); /* If the count of free cluster is still unknown, counts it here. */ - if (sbi->free_clusters == -1) { + if (sbi->free_clusters == -1 || !sbi->free_clus_valid) { int err = fat_count_free_clusters(dentry->d_sb); if (err) return err; @@ -790,6 +785,8 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, ",gid=%u", opts->fs_gid); seq_printf(m, ",fmask=%04o", opts->fs_fmask); seq_printf(m, ",dmask=%04o", opts->fs_dmask); + if (opts->allow_utime) + seq_printf(m, ",allow_utime=%04o", opts->allow_utime); if (sbi->nls_disk) seq_printf(m, ",codepage=%s", sbi->nls_disk->charset); if (isvfat) { @@ -845,9 +842,9 @@ static int fat_show_options(struct seq_file *m, struct vfsmount *mnt) enum { Opt_check_n, Opt_check_r, Opt_check_s, Opt_uid, Opt_gid, - Opt_umask, Opt_dmask, Opt_fmask, Opt_codepage, Opt_usefree, Opt_nocase, - Opt_quiet, Opt_showexec, Opt_debug, Opt_immutable, - Opt_dots, Opt_nodots, + Opt_umask, Opt_dmask, Opt_fmask, Opt_allow_utime, Opt_codepage, + Opt_usefree, Opt_nocase, Opt_quiet, Opt_showexec, Opt_debug, + Opt_immutable, Opt_dots, Opt_nodots, Opt_charset, Opt_shortname_lower, Opt_shortname_win95, Opt_shortname_winnt, Opt_shortname_mixed, Opt_utf8_no, Opt_utf8_yes, Opt_uni_xl_no, Opt_uni_xl_yes, Opt_nonumtail_no, Opt_nonumtail_yes, @@ -866,6 +863,7 @@ static match_table_t fat_tokens = { {Opt_umask, "umask=%o"}, {Opt_dmask, "dmask=%o"}, {Opt_fmask, "fmask=%o"}, + {Opt_allow_utime, "allow_utime=%o"}, {Opt_codepage, "codepage=%u"}, {Opt_usefree, "usefree"}, {Opt_nocase, "nocase"}, @@ -937,6 +935,7 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, opts->fs_uid = current->uid; opts->fs_gid = current->gid; opts->fs_fmask = opts->fs_dmask = current->fs->umask; + opts->allow_utime = -1; opts->codepage = fat_default_codepage; opts->iocharset = fat_default_iocharset; if (is_vfat) @@ -1024,6 +1023,11 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, return 0; opts->fs_fmask = option; break; + case Opt_allow_utime: + if (match_octal(&args[0], &option)) + return 0; + opts->allow_utime = option & (S_IWGRP | S_IWOTH); + break; case Opt_codepage: if (match_int(&args[0], &option)) return 0; @@ -1106,6 +1110,9 @@ static int parse_options(char *options, int is_vfat, int silent, int *debug, " for FAT filesystems, filesystem will be case sensitive!\n"); } + /* If user doesn't specify allow_utime, it's initialized from dmask. */ + if (opts->allow_utime == (unsigned short)-1) + opts->allow_utime = ~opts->fs_dmask & (S_IWGRP | S_IWOTH); if (opts->unicode_xlate) opts->utf8 = 0; @@ -1208,18 +1215,17 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, */ media = b->media; - if (!FAT_VALID_MEDIA(media)) { + if (!fat_valid_media(media)) { if (!silent) printk(KERN_ERR "FAT: invalid media value (0x%02x)\n", media); brelse(bh); goto out_invalid; } - logical_sector_size = - le16_to_cpu(get_unaligned((__le16 *)&b->sector_size)); + logical_sector_size = get_unaligned_le16(&b->sector_size); if (!is_power_of_2(logical_sector_size) || (logical_sector_size < 512) - || (PAGE_CACHE_SIZE < logical_sector_size)) { + || (logical_sector_size > 4096)) { if (!silent) printk(KERN_ERR "FAT: bogus logical sector size %u\n", logical_sector_size); @@ -1267,6 +1273,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, sbi->fat_length = le16_to_cpu(b->fat_length); sbi->root_cluster = 0; sbi->free_clusters = -1; /* Don't know yet */ + sbi->free_clus_valid = 0; sbi->prev_free = FAT_START_ENT; if (!sbi->fat_length && b->fat32_length) { @@ -1302,8 +1309,8 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, sbi->fsinfo_sector); } else { if (sbi->options.usefree) - sbi->free_clusters = - le32_to_cpu(fsinfo->free_clusters); + sbi->free_clus_valid = 1; + sbi->free_clusters = le32_to_cpu(fsinfo->free_clusters); sbi->prev_free = le32_to_cpu(fsinfo->next_cluster); } @@ -1314,8 +1321,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, sbi->dir_per_block_bits = ffs(sbi->dir_per_block) - 1; sbi->dir_start = sbi->fat_start + sbi->fats * sbi->fat_length; - sbi->dir_entries = - le16_to_cpu(get_unaligned((__le16 *)&b->dir_entries)); + sbi->dir_entries = get_unaligned_le16(&b->dir_entries); if (sbi->dir_entries & (sbi->dir_per_block - 1)) { if (!silent) printk(KERN_ERR "FAT: bogus directroy-entries per block" @@ -1327,7 +1333,7 @@ int fat_fill_super(struct super_block *sb, void *data, int silent, rootdir_sectors = sbi->dir_entries * sizeof(struct msdos_dir_entry) / sb->s_blocksize; sbi->data_start = sbi->dir_start + rootdir_sectors; - total_sectors = le16_to_cpu(get_unaligned((__le16 *)&b->sectors)); + total_sectors = get_unaligned_le16(&b->sectors); if (total_sectors == 0) total_sectors = le32_to_cpu(b->total_sect); diff --git a/fs/fcntl.c b/fs/fcntl.c index e632da761fc..bfd776509a7 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -9,6 +9,7 @@ #include <linux/mm.h> #include <linux/fs.h> #include <linux/file.h> +#include <linux/fdtable.h> #include <linux/capability.h> #include <linux/dnotify.h> #include <linux/smp_lock.h> @@ -55,14 +56,16 @@ static int get_close_on_exec(unsigned int fd) * file_lock held for write. */ -static int locate_fd(struct files_struct *files, - struct file *file, unsigned int orig_start) +static int locate_fd(unsigned int orig_start, int cloexec) { + struct files_struct *files = current->files; unsigned int newfd; unsigned int start; int error; struct fdtable *fdt; + spin_lock(&files->file_lock); + error = -EINVAL; if (orig_start >= current->signal->rlim[RLIMIT_NOFILE].rlim_cur) goto out; @@ -97,42 +100,28 @@ repeat: if (error) goto repeat; - /* - * We reacquired files_lock, so we are safe as long as - * we reacquire the fdtable pointer and use it while holding - * the lock, no one can free it during that time. - */ if (start <= files->next_fd) files->next_fd = newfd + 1; + FD_SET(newfd, fdt->open_fds); + if (cloexec) + FD_SET(newfd, fdt->close_on_exec); + else + FD_CLR(newfd, fdt->close_on_exec); error = newfd; - + out: + spin_unlock(&files->file_lock); return error; } static int dupfd(struct file *file, unsigned int start, int cloexec) { - struct files_struct * files = current->files; - struct fdtable *fdt; - int fd; - - spin_lock(&files->file_lock); - fd = locate_fd(files, file, start); - if (fd >= 0) { - /* locate_fd() may have expanded fdtable, load the ptr */ - fdt = files_fdtable(files); - FD_SET(fd, fdt->open_fds); - if (cloexec) - FD_SET(fd, fdt->close_on_exec); - else - FD_CLR(fd, fdt->close_on_exec); - spin_unlock(&files->file_lock); + int fd = locate_fd(start, cloexec); + if (fd >= 0) fd_install(fd, file); - } else { - spin_unlock(&files->file_lock); + else fput(file); - } return fd; } diff --git a/fs/file.c b/fs/file.c index 5110acb1c9e..4c6f0ea12c4 100644 --- a/fs/file.c +++ b/fs/file.c @@ -12,6 +12,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/file.h> +#include <linux/fdtable.h> #include <linux/bitops.h> #include <linux/interrupt.h> #include <linux/spinlock.h> @@ -149,8 +150,16 @@ static struct fdtable * alloc_fdtable(unsigned int nr) nr /= (1024 / sizeof(struct file *)); nr = roundup_pow_of_two(nr + 1); nr *= (1024 / sizeof(struct file *)); - if (nr > sysctl_nr_open) - nr = sysctl_nr_open; + /* + * Note that this can drive nr *below* what we had passed if sysctl_nr_open + * had been set lower between the check in expand_files() and here. Deal + * with that in caller, it's cheaper that way. + * + * We make sure that nr remains a multiple of BITS_PER_LONG - otherwise + * bitmaps handling below becomes unpleasant, to put it mildly... + */ + if (unlikely(nr > sysctl_nr_open)) + nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1; fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL); if (!fdt) @@ -199,6 +208,16 @@ static int expand_fdtable(struct files_struct *files, int nr) if (!new_fdt) return -ENOMEM; /* + * extremely unlikely race - sysctl_nr_open decreased between the check in + * caller and alloc_fdtable(). Cheaper to catch it here... + */ + if (unlikely(new_fdt->max_fds <= nr)) { + free_fdarr(new_fdt); + free_fdset(new_fdt); + kfree(new_fdt); + return -EMFILE; + } + /* * Check again since another task may have expanded the fd table while * we dropped the lock */ diff --git a/fs/file_table.c b/fs/file_table.c index 7a0a9b87225..83084225b4c 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -8,6 +8,7 @@ #include <linux/string.h> #include <linux/slab.h> #include <linux/file.h> +#include <linux/fdtable.h> #include <linux/init.h> #include <linux/module.h> #include <linux/fs.h> diff --git a/fs/freevxfs/vxfs_extern.h b/fs/freevxfs/vxfs_extern.h index 2b46064f66b..50ab5eecb99 100644 --- a/fs/freevxfs/vxfs_extern.h +++ b/fs/freevxfs/vxfs_extern.h @@ -50,7 +50,11 @@ extern daddr_t vxfs_bmap1(struct inode *, long); /* vxfs_fshead.c */ extern int vxfs_read_fshead(struct super_block *); +/* vxfs_immed.c */ +extern const struct inode_operations vxfs_immed_symlink_iops; + /* vxfs_inode.c */ +extern const struct address_space_operations vxfs_immed_aops; extern struct kmem_cache *vxfs_inode_cachep; extern void vxfs_dumpi(struct vxfs_inode_info *, ino_t); extern struct inode * vxfs_get_fake_inode(struct super_block *, @@ -69,6 +73,7 @@ extern const struct file_operations vxfs_dir_operations; extern int vxfs_read_olt(struct super_block *, u_long); /* vxfs_subr.c */ +extern const struct address_space_operations vxfs_aops; extern struct page * vxfs_get_page(struct address_space *, u_long); extern void vxfs_put_page(struct page *); extern struct buffer_head * vxfs_bread(struct inode *, int); diff --git a/fs/freevxfs/vxfs_immed.c b/fs/freevxfs/vxfs_immed.c index 8a5959a61ba..c36aeaf92e4 100644 --- a/fs/freevxfs/vxfs_immed.c +++ b/fs/freevxfs/vxfs_immed.c @@ -35,6 +35,7 @@ #include <linux/namei.h> #include "vxfs.h" +#include "vxfs_extern.h" #include "vxfs_inode.h" diff --git a/fs/freevxfs/vxfs_inode.c b/fs/freevxfs/vxfs_inode.c index ad88d2364bc..9f3f2ceb73f 100644 --- a/fs/freevxfs/vxfs_inode.c +++ b/fs/freevxfs/vxfs_inode.c @@ -41,11 +41,6 @@ #include "vxfs_extern.h" -extern const struct address_space_operations vxfs_aops; -extern const struct address_space_operations vxfs_immed_aops; - -extern const struct inode_operations vxfs_immed_symlink_iops; - struct kmem_cache *vxfs_inode_cachep; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 06557679ca4..ae45f77765c 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -25,6 +25,45 @@ #include <linux/buffer_head.h> #include "internal.h" + +/** + * writeback_acquire - attempt to get exclusive writeback access to a device + * @bdi: the device's backing_dev_info structure + * + * It is a waste of resources to have more than one pdflush thread blocked on + * a single request queue. Exclusion at the request_queue level is obtained + * via a flag in the request_queue's backing_dev_info.state. + * + * Non-request_queue-backed address_spaces will share default_backing_dev_info, + * unless they implement their own. Which is somewhat inefficient, as this + * may prevent concurrent writeback against multiple devices. + */ +static int writeback_acquire(struct backing_dev_info *bdi) +{ + return !test_and_set_bit(BDI_pdflush, &bdi->state); +} + +/** + * writeback_in_progress - determine whether there is writeback in progress + * @bdi: the device's backing_dev_info structure. + * + * Determine whether there is writeback in progress against a backing device. + */ +int writeback_in_progress(struct backing_dev_info *bdi) +{ + return test_bit(BDI_pdflush, &bdi->state); +} + +/** + * writeback_release - relinquish exclusive writeback access against a device. + * @bdi: the device's backing_dev_info structure + */ +static void writeback_release(struct backing_dev_info *bdi) +{ + BUG_ON(!writeback_in_progress(bdi)); + clear_bit(BDI_pdflush, &bdi->state); +} + /** * __mark_inode_dirty - internal function * @inode: inode to mark @@ -747,43 +786,4 @@ int generic_osync_inode(struct inode *inode, struct address_space *mapping, int return err; } - EXPORT_SYMBOL(generic_osync_inode); - -/** - * writeback_acquire - attempt to get exclusive writeback access to a device - * @bdi: the device's backing_dev_info structure - * - * It is a waste of resources to have more than one pdflush thread blocked on - * a single request queue. Exclusion at the request_queue level is obtained - * via a flag in the request_queue's backing_dev_info.state. - * - * Non-request_queue-backed address_spaces will share default_backing_dev_info, - * unless they implement their own. Which is somewhat inefficient, as this - * may prevent concurrent writeback against multiple devices. - */ -int writeback_acquire(struct backing_dev_info *bdi) -{ - return !test_and_set_bit(BDI_pdflush, &bdi->state); -} - -/** - * writeback_in_progress - determine whether there is writeback in progress - * @bdi: the device's backing_dev_info structure. - * - * Determine whether there is writeback in progress against a backing device. - */ -int writeback_in_progress(struct backing_dev_info *bdi) -{ - return test_bit(BDI_pdflush, &bdi->state); -} - -/** - * writeback_release - relinquish exclusive writeback access against a device. - * @bdi: the device's backing_dev_info structure - */ -void writeback_release(struct backing_dev_info *bdi) -{ - BUG_ON(!writeback_in_progress(bdi)); - clear_bit(BDI_pdflush, &bdi->state); -} diff --git a/fs/fuse/control.c b/fs/fuse/control.c index 105d4a271e0..4f3cab32141 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -117,7 +117,7 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) parent = fuse_control_sb->s_root; inc_nlink(parent->d_inode); - sprintf(name, "%llu", (unsigned long long) fc->id); + sprintf(name, "%u", fc->dev); parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2, &simple_dir_inode_operations, &simple_dir_operations); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index af639807524..87250b6a868 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -47,6 +47,14 @@ struct fuse_req *fuse_request_alloc(void) return req; } +struct fuse_req *fuse_request_alloc_nofs(void) +{ + struct fuse_req *req = kmem_cache_alloc(fuse_req_cachep, GFP_NOFS); + if (req) + fuse_request_init(req); + return req; +} + void fuse_request_free(struct fuse_req *req) { kmem_cache_free(fuse_req_cachep, req); @@ -291,6 +299,7 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req) static void wait_answer_interruptible(struct fuse_conn *fc, struct fuse_req *req) + __releases(fc->lock) __acquires(fc->lock) { if (signal_pending(current)) return; @@ -307,8 +316,8 @@ static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req) kill_fasync(&fc->fasync, SIGIO, POLL_IN); } -/* Called with fc->lock held. Releases, and then reacquires it. */ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) + __releases(fc->lock) __acquires(fc->lock) { if (!fc->no_interrupt) { /* Any signal may interrupt this */ @@ -430,6 +439,17 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req) } /* + * Called under fc->lock + * + * fc->connected must have been checked previously + */ +void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req) +{ + req->isreply = 1; + request_send_nowait_locked(fc, req); +} + +/* * Lock the request. Up to the next unlock_request() there mustn't be * anything that could cause a page-fault. If the request was already * aborted bail out. @@ -968,6 +988,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) * locked). */ static void end_io_requests(struct fuse_conn *fc) + __releases(fc->lock) __acquires(fc->lock) { while (!list_empty(&fc->io)) { struct fuse_req *req = diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c4807b3fc8a..2060bf06b90 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -132,7 +132,7 @@ static void fuse_lookup_init(struct fuse_req *req, struct inode *dir, req->out.args[0].value = outarg; } -static u64 fuse_get_attr_version(struct fuse_conn *fc) +u64 fuse_get_attr_version(struct fuse_conn *fc) { u64 curr_version; @@ -1107,6 +1107,50 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg) } /* + * Prevent concurrent writepages on inode + * + * This is done by adding a negative bias to the inode write counter + * and waiting for all pending writes to finish. + */ +void fuse_set_nowrite(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + BUG_ON(!mutex_is_locked(&inode->i_mutex)); + + spin_lock(&fc->lock); + BUG_ON(fi->writectr < 0); + fi->writectr += FUSE_NOWRITE; + spin_unlock(&fc->lock); + wait_event(fi->page_waitq, fi->writectr == FUSE_NOWRITE); +} + +/* + * Allow writepages on inode + * + * Remove the bias from the writecounter and send any queued + * writepages. + */ +static void __fuse_release_nowrite(struct inode *inode) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + BUG_ON(fi->writectr != FUSE_NOWRITE); + fi->writectr = 0; + fuse_flush_writepages(inode); +} + +void fuse_release_nowrite(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + + spin_lock(&fc->lock); + __fuse_release_nowrite(inode); + spin_unlock(&fc->lock); +} + +/* * Set attributes, and at the same time refresh them. * * Truncation is slightly complicated, because the 'truncate' request @@ -1122,6 +1166,8 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, struct fuse_req *req; struct fuse_setattr_in inarg; struct fuse_attr_out outarg; + bool is_truncate = false; + loff_t oldsize; int err; if (!fuse_allow_task(fc, current)) @@ -1145,12 +1191,16 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, send_sig(SIGXFSZ, current, 0); return -EFBIG; } + is_truncate = true; } req = fuse_get_req(fc); if (IS_ERR(req)) return PTR_ERR(req); + if (is_truncate) + fuse_set_nowrite(inode); + memset(&inarg, 0, sizeof(inarg)); memset(&outarg, 0, sizeof(outarg)); iattr_to_fattr(attr, &inarg); @@ -1181,16 +1231,44 @@ static int fuse_do_setattr(struct dentry *entry, struct iattr *attr, if (err) { if (err == -EINTR) fuse_invalidate_attr(inode); - return err; + goto error; } if ((inode->i_mode ^ outarg.attr.mode) & S_IFMT) { make_bad_inode(inode); - return -EIO; + err = -EIO; + goto error; + } + + spin_lock(&fc->lock); + fuse_change_attributes_common(inode, &outarg.attr, + attr_timeout(&outarg)); + oldsize = inode->i_size; + i_size_write(inode, outarg.attr.size); + + if (is_truncate) { + /* NOTE: this may release/reacquire fc->lock */ + __fuse_release_nowrite(inode); + } + spin_unlock(&fc->lock); + + /* + * Only call invalidate_inode_pages2() after removing + * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock. + */ + if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) { + if (outarg.attr.size < oldsize) + fuse_truncate(inode->i_mapping, outarg.attr.size); + invalidate_inode_pages2(inode->i_mapping); } - fuse_change_attributes(inode, &outarg.attr, attr_timeout(&outarg), 0); return 0; + +error: + if (is_truncate) + fuse_release_nowrite(inode); + + return err; } static int fuse_setattr(struct dentry *entry, struct iattr *attr) diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 676b0bc8a86..f28cf8b46f8 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -210,6 +210,49 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) return (u64) v0 + ((u64) v1 << 32); } +/* + * Check if page is under writeback + * + * This is currently done by walking the list of writepage requests + * for the inode, which can be pretty inefficient. + */ +static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_req *req; + bool found = false; + + spin_lock(&fc->lock); + list_for_each_entry(req, &fi->writepages, writepages_entry) { + pgoff_t curr_index; + + BUG_ON(req->inode != inode); + curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT; + if (curr_index == index) { + found = true; + break; + } + } + spin_unlock(&fc->lock); + + return found; +} + +/* + * Wait for page writeback to be completed. + * + * Since fuse doesn't rely on the VM writeback tracking, this has to + * use some other means. + */ +static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) +{ + struct fuse_inode *fi = get_fuse_inode(inode); + + wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); + return 0; +} + static int fuse_flush(struct file *file, fl_owner_t id) { struct inode *inode = file->f_path.dentry->d_inode; @@ -245,6 +288,21 @@ static int fuse_flush(struct file *file, fl_owner_t id) return err; } +/* + * Wait for all pending writepages on the inode to finish. + * + * This is currently done by blocking further writes with FUSE_NOWRITE + * and waiting for all sent writes to complete. + * + * This must be called under i_mutex, otherwise the FUSE_NOWRITE usage + * could conflict with truncation. + */ +static void fuse_sync_writes(struct inode *inode) +{ + fuse_set_nowrite(inode); + fuse_release_nowrite(inode); +} + int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, int isdir) { @@ -261,6 +319,17 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) return 0; + /* + * Start writeback against all dirty pages of the inode, then + * wait for all outstanding writes, before sending the FSYNC + * request. + */ + err = write_inode_now(inode, 0); + if (err) + return err; + + fuse_sync_writes(inode); + req = fuse_get_req(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -294,7 +363,7 @@ static int fuse_fsync(struct file *file, struct dentry *de, int datasync) void fuse_read_fill(struct fuse_req *req, struct file *file, struct inode *inode, loff_t pos, size_t count, int opcode) { - struct fuse_read_in *inarg = &req->misc.read_in; + struct fuse_read_in *inarg = &req->misc.read.in; struct fuse_file *ff = file->private_data; inarg->fh = ff->fh; @@ -320,7 +389,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file, fuse_read_fill(req, file, inode, pos, count, FUSE_READ); if (owner != NULL) { - struct fuse_read_in *inarg = &req->misc.read_in; + struct fuse_read_in *inarg = &req->misc.read.in; inarg->read_flags |= FUSE_READ_LOCKOWNER; inarg->lock_owner = fuse_lock_owner_id(fc, owner); @@ -329,31 +398,66 @@ static size_t fuse_send_read(struct fuse_req *req, struct file *file, return req->out.args[0].size; } +static void fuse_read_update_size(struct inode *inode, loff_t size, + u64 attr_ver) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + if (attr_ver == fi->attr_version && size < inode->i_size) { + fi->attr_version = ++fc->attr_version; + i_size_write(inode, size); + } + spin_unlock(&fc->lock); +} + static int fuse_readpage(struct file *file, struct page *page) { struct inode *inode = page->mapping->host; struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_req *req; + size_t num_read; + loff_t pos = page_offset(page); + size_t count = PAGE_CACHE_SIZE; + u64 attr_ver; int err; err = -EIO; if (is_bad_inode(inode)) goto out; + /* + * Page writeback can extend beyond the liftime of the + * page-cache page, so make sure we read a properly synced + * page. + */ + fuse_wait_on_page_writeback(inode, page->index); + req = fuse_get_req(fc); err = PTR_ERR(req); if (IS_ERR(req)) goto out; + attr_ver = fuse_get_attr_version(fc); + req->out.page_zeroing = 1; req->num_pages = 1; req->pages[0] = page; - fuse_send_read(req, file, inode, page_offset(page), PAGE_CACHE_SIZE, - NULL); + num_read = fuse_send_read(req, file, inode, pos, count, NULL); err = req->out.h.error; fuse_put_request(fc, req); - if (!err) + + if (!err) { + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (num_read < count) + fuse_read_update_size(inode, pos + num_read, attr_ver); + SetPageUptodate(page); + } + fuse_invalidate_attr(inode); /* atime changed */ out: unlock_page(page); @@ -363,8 +467,19 @@ static int fuse_readpage(struct file *file, struct page *page) static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req) { int i; + size_t count = req->misc.read.in.size; + size_t num_read = req->out.args[0].size; + struct inode *inode = req->pages[0]->mapping->host; + + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (!req->out.h.error && num_read < count) { + loff_t pos = page_offset(req->pages[0]) + num_read; + fuse_read_update_size(inode, pos, req->misc.read.attr_ver); + } - fuse_invalidate_attr(req->pages[0]->mapping->host); /* atime changed */ + fuse_invalidate_attr(inode); /* atime changed */ for (i = 0; i < req->num_pages; i++) { struct page *page = req->pages[i]; @@ -387,6 +502,7 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file, size_t count = req->num_pages << PAGE_CACHE_SHIFT; req->out.page_zeroing = 1; fuse_read_fill(req, file, inode, pos, count, FUSE_READ); + req->misc.read.attr_ver = fuse_get_attr_version(fc); if (fc->async_read) { struct fuse_file *ff = file->private_data; req->ff = fuse_file_get(ff); @@ -411,6 +527,8 @@ static int fuse_readpages_fill(void *_data, struct page *page) struct inode *inode = data->inode; struct fuse_conn *fc = get_fuse_conn(inode); + fuse_wait_on_page_writeback(inode, page->index); + if (req->num_pages && (req->num_pages == FUSE_MAX_PAGES_PER_REQ || (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || @@ -477,11 +595,10 @@ static ssize_t fuse_file_aio_read(struct kiocb *iocb, const struct iovec *iov, } static void fuse_write_fill(struct fuse_req *req, struct file *file, - struct inode *inode, loff_t pos, size_t count, - int writepage) + struct fuse_file *ff, struct inode *inode, + loff_t pos, size_t count, int writepage) { struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_file *ff = file->private_data; struct fuse_write_in *inarg = &req->misc.write.in; struct fuse_write_out *outarg = &req->misc.write.out; @@ -490,7 +607,7 @@ static void fuse_write_fill(struct fuse_req *req, struct file *file, inarg->offset = pos; inarg->size = count; inarg->write_flags = writepage ? FUSE_WRITE_CACHE : 0; - inarg->flags = file->f_flags; + inarg->flags = file ? file->f_flags : 0; req->in.h.opcode = FUSE_WRITE; req->in.h.nodeid = get_node_id(inode); req->in.argpages = 1; @@ -511,7 +628,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct file *file, fl_owner_t owner) { struct fuse_conn *fc = get_fuse_conn(inode); - fuse_write_fill(req, file, inode, pos, count, 0); + fuse_write_fill(req, file, file->private_data, inode, pos, count, 0); if (owner != NULL) { struct fuse_write_in *inarg = &req->misc.write.in; inarg->write_flags |= FUSE_WRITE_LOCKOWNER; @@ -533,19 +650,36 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, return 0; } +static void fuse_write_update_size(struct inode *inode, loff_t pos) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + + spin_lock(&fc->lock); + fi->attr_version = ++fc->attr_version; + if (pos > inode->i_size) + i_size_write(inode, pos); + spin_unlock(&fc->lock); +} + static int fuse_buffered_write(struct file *file, struct inode *inode, loff_t pos, unsigned count, struct page *page) { int err; size_t nres; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_inode *fi = get_fuse_inode(inode); unsigned offset = pos & (PAGE_CACHE_SIZE - 1); struct fuse_req *req; if (is_bad_inode(inode)) return -EIO; + /* + * Make sure writepages on the same page are not mixed up with + * plain writes. + */ + fuse_wait_on_page_writeback(inode, page->index); + req = fuse_get_req(fc); if (IS_ERR(req)) return PTR_ERR(req); @@ -560,12 +694,7 @@ static int fuse_buffered_write(struct file *file, struct inode *inode, err = -EIO; if (!err) { pos += nres; - spin_lock(&fc->lock); - fi->attr_version = ++fc->attr_version; - if (pos > inode->i_size) - i_size_write(inode, pos); - spin_unlock(&fc->lock); - + fuse_write_update_size(inode, pos); if (count == PAGE_CACHE_SIZE) SetPageUptodate(page); } @@ -588,6 +717,198 @@ static int fuse_write_end(struct file *file, struct address_space *mapping, return res; } +static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file, + struct inode *inode, loff_t pos, + size_t count) +{ + size_t res; + unsigned offset; + unsigned i; + + for (i = 0; i < req->num_pages; i++) + fuse_wait_on_page_writeback(inode, req->pages[i]->index); + + res = fuse_send_write(req, file, inode, pos, count, NULL); + + offset = req->page_offset; + count = res; + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + + if (!req->out.h.error && !offset && count >= PAGE_CACHE_SIZE) + SetPageUptodate(page); + + if (count > PAGE_CACHE_SIZE - offset) + count -= PAGE_CACHE_SIZE - offset; + else + count = 0; + offset = 0; + + unlock_page(page); + page_cache_release(page); + } + + return res; +} + +static ssize_t fuse_fill_write_pages(struct fuse_req *req, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct fuse_conn *fc = get_fuse_conn(mapping->host); + unsigned offset = pos & (PAGE_CACHE_SIZE - 1); + size_t count = 0; + int err; + + req->page_offset = offset; + + do { + size_t tmp; + struct page *page; + pgoff_t index = pos >> PAGE_CACHE_SHIFT; + size_t bytes = min_t(size_t, PAGE_CACHE_SIZE - offset, + iov_iter_count(ii)); + + bytes = min_t(size_t, bytes, fc->max_write - count); + + again: + err = -EFAULT; + if (iov_iter_fault_in_readable(ii, bytes)) + break; + + err = -ENOMEM; + page = __grab_cache_page(mapping, index); + if (!page) + break; + + pagefault_disable(); + tmp = iov_iter_copy_from_user_atomic(page, ii, offset, bytes); + pagefault_enable(); + flush_dcache_page(page); + + if (!tmp) { + unlock_page(page); + page_cache_release(page); + bytes = min(bytes, iov_iter_single_seg_count(ii)); + goto again; + } + + err = 0; + req->pages[req->num_pages] = page; + req->num_pages++; + + iov_iter_advance(ii, tmp); + count += tmp; + pos += tmp; + offset += tmp; + if (offset == PAGE_CACHE_SIZE) + offset = 0; + + } while (iov_iter_count(ii) && count < fc->max_write && + req->num_pages < FUSE_MAX_PAGES_PER_REQ && offset == 0); + + return count > 0 ? count : err; +} + +static ssize_t fuse_perform_write(struct file *file, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos) +{ + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + int err = 0; + ssize_t res = 0; + + if (is_bad_inode(inode)) + return -EIO; + + do { + struct fuse_req *req; + ssize_t count; + + req = fuse_get_req(fc); + if (IS_ERR(req)) { + err = PTR_ERR(req); + break; + } + + count = fuse_fill_write_pages(req, mapping, ii, pos); + if (count <= 0) { + err = count; + } else { + size_t num_written; + + num_written = fuse_send_write_pages(req, file, inode, + pos, count); + err = req->out.h.error; + if (!err) { + res += num_written; + pos += num_written; + + /* break out of the loop on short write */ + if (num_written != count) + err = -EIO; + } + } + fuse_put_request(fc, req); + } while (!err && iov_iter_count(ii)); + + if (res > 0) + fuse_write_update_size(inode, pos); + + fuse_invalidate_attr(inode); + + return res > 0 ? res : err; +} + +static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = iocb->ki_filp; + struct address_space *mapping = file->f_mapping; + size_t count = 0; + ssize_t written = 0; + struct inode *inode = mapping->host; + ssize_t err; + struct iov_iter i; + + WARN_ON(iocb->ki_pos != pos); + + err = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); + if (err) + return err; + + mutex_lock(&inode->i_mutex); + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + + /* We can write back this queue in page reclaim */ + current->backing_dev_info = mapping->backing_dev_info; + + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out; + + if (count == 0) + goto out; + + err = remove_suid(file->f_path.dentry); + if (err) + goto out; + + file_update_time(file); + + iov_iter_init(&i, iov, nr_segs, count, 0); + written = fuse_perform_write(file, mapping, &i, pos); + if (written >= 0) + iocb->ki_pos = pos + written; + +out: + current->backing_dev_info = NULL; + mutex_unlock(&inode->i_mutex); + + return written ? written : err; +} + static void fuse_release_user_pages(struct fuse_req *req, int write) { unsigned i; @@ -613,7 +934,7 @@ static int fuse_get_user_pages(struct fuse_req *req, const char __user *buf, nbytes = min(nbytes, (unsigned) FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT); npages = (nbytes + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; - npages = min(max(npages, 1), FUSE_MAX_PAGES_PER_REQ); + npages = clamp(npages, 1, FUSE_MAX_PAGES_PER_REQ); down_read(¤t->mm->mmap_sem); npages = get_user_pages(current, current->mm, user_addr, npages, write, 0, req->pages, NULL); @@ -645,14 +966,15 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, while (count) { size_t nres; - size_t nbytes = min(count, nmax); - int err = fuse_get_user_pages(req, buf, nbytes, !write); + size_t nbytes_limit = min(count, nmax); + size_t nbytes; + int err = fuse_get_user_pages(req, buf, nbytes_limit, !write); if (err) { res = err; break; } nbytes = (req->num_pages << PAGE_SHIFT) - req->page_offset; - nbytes = min(count, nbytes); + nbytes = min(nbytes_limit, nbytes); if (write) nres = fuse_send_write(req, file, inode, pos, nbytes, current->files); @@ -683,12 +1005,8 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, } fuse_put_request(fc, req); if (res > 0) { - if (write) { - spin_lock(&fc->lock); - if (pos > inode->i_size) - i_size_write(inode, pos); - spin_unlock(&fc->lock); - } + if (write) + fuse_write_update_size(inode, pos); *ppos = pos; } fuse_invalidate_attr(inode); @@ -716,21 +1034,225 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf, return res; } -static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) +static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req) { - if ((vma->vm_flags & VM_SHARED)) { - if ((vma->vm_flags & VM_WRITE)) - return -ENODEV; - else - vma->vm_flags &= ~VM_MAYWRITE; + __free_page(req->pages[0]); + fuse_file_put(req->ff); + fuse_put_request(fc, req); +} + +static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req) +{ + struct inode *inode = req->inode; + struct fuse_inode *fi = get_fuse_inode(inode); + struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info; + + list_del(&req->writepages_entry); + dec_bdi_stat(bdi, BDI_WRITEBACK); + dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP); + bdi_writeout_inc(bdi); + wake_up(&fi->page_waitq); +} + +/* Called under fc->lock, may release and reacquire it */ +static void fuse_send_writepage(struct fuse_conn *fc, struct fuse_req *req) +{ + struct fuse_inode *fi = get_fuse_inode(req->inode); + loff_t size = i_size_read(req->inode); + struct fuse_write_in *inarg = &req->misc.write.in; + + if (!fc->connected) + goto out_free; + + if (inarg->offset + PAGE_CACHE_SIZE <= size) { + inarg->size = PAGE_CACHE_SIZE; + } else if (inarg->offset < size) { + inarg->size = size & (PAGE_CACHE_SIZE - 1); + } else { + /* Got truncated off completely */ + goto out_free; + } + + req->in.args[1].size = inarg->size; + fi->writectr++; + request_send_background_locked(fc, req); + return; + + out_free: + fuse_writepage_finish(fc, req); + spin_unlock(&fc->lock); + fuse_writepage_free(fc, req); + spin_lock(&fc->lock); +} + +/* + * If fi->writectr is positive (no truncate or fsync going on) send + * all queued writepage requests. + * + * Called with fc->lock + */ +void fuse_flush_writepages(struct inode *inode) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_req *req; + + while (fi->writectr >= 0 && !list_empty(&fi->queued_writes)) { + req = list_entry(fi->queued_writes.next, struct fuse_req, list); + list_del_init(&req->list); + fuse_send_writepage(fc, req); + } +} + +static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req) +{ + struct inode *inode = req->inode; + struct fuse_inode *fi = get_fuse_inode(inode); + + mapping_set_error(inode->i_mapping, req->out.h.error); + spin_lock(&fc->lock); + fi->writectr--; + fuse_writepage_finish(fc, req); + spin_unlock(&fc->lock); + fuse_writepage_free(fc, req); +} + +static int fuse_writepage_locked(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode = mapping->host; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_req *req; + struct fuse_file *ff; + struct page *tmp_page; + + set_page_writeback(page); + + req = fuse_request_alloc_nofs(); + if (!req) + goto err; + + tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + if (!tmp_page) + goto err_free; + + spin_lock(&fc->lock); + BUG_ON(list_empty(&fi->write_files)); + ff = list_entry(fi->write_files.next, struct fuse_file, write_entry); + req->ff = fuse_file_get(ff); + spin_unlock(&fc->lock); + + fuse_write_fill(req, NULL, ff, inode, page_offset(page), 0, 1); + + copy_highpage(tmp_page, page); + req->num_pages = 1; + req->pages[0] = tmp_page; + req->page_offset = 0; + req->end = fuse_writepage_end; + req->inode = inode; + + inc_bdi_stat(mapping->backing_dev_info, BDI_WRITEBACK); + inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP); + end_page_writeback(page); + + spin_lock(&fc->lock); + list_add(&req->writepages_entry, &fi->writepages); + list_add_tail(&req->list, &fi->queued_writes); + fuse_flush_writepages(inode); + spin_unlock(&fc->lock); + + return 0; + +err_free: + fuse_request_free(req); +err: + end_page_writeback(page); + return -ENOMEM; +} + +static int fuse_writepage(struct page *page, struct writeback_control *wbc) +{ + int err; + + err = fuse_writepage_locked(page); + unlock_page(page); + + return err; +} + +static int fuse_launder_page(struct page *page) +{ + int err = 0; + if (clear_page_dirty_for_io(page)) { + struct inode *inode = page->mapping->host; + err = fuse_writepage_locked(page); + if (!err) + fuse_wait_on_page_writeback(inode, page->index); } - return generic_file_mmap(file, vma); + return err; } -static int fuse_set_page_dirty(struct page *page) +/* + * Write back dirty pages now, because there may not be any suitable + * open files later + */ +static void fuse_vma_close(struct vm_area_struct *vma) { - printk("fuse_set_page_dirty: should not happen\n"); - dump_stack(); + filemap_write_and_wait(vma->vm_file->f_mapping); +} + +/* + * Wait for writeback against this page to complete before allowing it + * to be marked dirty again, and hence written back again, possibly + * before the previous writepage completed. + * + * Block here, instead of in ->writepage(), so that the userspace fs + * can only block processes actually operating on the filesystem. + * + * Otherwise unprivileged userspace fs would be able to block + * unrelated: + * + * - page migration + * - sync(2) + * - try_to_free_pages() with order > PAGE_ALLOC_COSTLY_ORDER + */ +static int fuse_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + /* + * Don't use page->mapping as it may become NULL from a + * concurrent truncate. + */ + struct inode *inode = vma->vm_file->f_mapping->host; + + fuse_wait_on_page_writeback(inode, page->index); + return 0; +} + +static struct vm_operations_struct fuse_file_vm_ops = { + .close = fuse_vma_close, + .fault = filemap_fault, + .page_mkwrite = fuse_page_mkwrite, +}; + +static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma) +{ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) { + struct inode *inode = file->f_dentry->d_inode; + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_file *ff = file->private_data; + /* + * file may be written through mmap, so chain it onto the + * inodes's write_file list + */ + spin_lock(&fc->lock); + if (list_empty(&ff->write_entry)) + list_add(&ff->write_entry, &fi->write_files); + spin_unlock(&fc->lock); + } + file_accessed(file); + vma->vm_ops = &fuse_file_vm_ops; return 0; } @@ -909,12 +1431,37 @@ static sector_t fuse_bmap(struct address_space *mapping, sector_t block) return err ? 0 : outarg.block; } +static loff_t fuse_file_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t retval; + struct inode *inode = file->f_path.dentry->d_inode; + + mutex_lock(&inode->i_mutex); + switch (origin) { + case SEEK_END: + offset += i_size_read(inode); + break; + case SEEK_CUR: + offset += file->f_pos; + } + retval = -EINVAL; + if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) { + if (offset != file->f_pos) { + file->f_pos = offset; + file->f_version = 0; + } + retval = offset; + } + mutex_unlock(&inode->i_mutex); + return retval; +} + static const struct file_operations fuse_file_operations = { - .llseek = generic_file_llseek, + .llseek = fuse_file_llseek, .read = do_sync_read, .aio_read = fuse_file_aio_read, .write = do_sync_write, - .aio_write = generic_file_aio_write, + .aio_write = fuse_file_aio_write, .mmap = fuse_file_mmap, .open = fuse_open, .flush = fuse_flush, @@ -926,7 +1473,7 @@ static const struct file_operations fuse_file_operations = { }; static const struct file_operations fuse_direct_io_file_operations = { - .llseek = generic_file_llseek, + .llseek = fuse_file_llseek, .read = fuse_direct_read, .write = fuse_direct_write, .open = fuse_open, @@ -940,10 +1487,12 @@ static const struct file_operations fuse_direct_io_file_operations = { static const struct address_space_operations fuse_file_aops = { .readpage = fuse_readpage, + .writepage = fuse_writepage, + .launder_page = fuse_launder_page, .write_begin = fuse_write_begin, .write_end = fuse_write_end, .readpages = fuse_readpages, - .set_page_dirty = fuse_set_page_dirty, + .set_page_dirty = __set_page_dirty_nobuffers, .bmap = fuse_bmap, }; diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 67aaf6ee38e..dadffa21a20 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -15,6 +15,7 @@ #include <linux/mm.h> #include <linux/backing-dev.h> #include <linux/mutex.h> +#include <linux/rwsem.h> /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 @@ -25,6 +26,9 @@ /** Congestion starts at 75% of maximum */ #define FUSE_CONGESTION_THRESHOLD (FUSE_MAX_BACKGROUND * 75 / 100) +/** Bias for fi->writectr, meaning new writepages must not be sent */ +#define FUSE_NOWRITE INT_MIN + /** It could be as large as PATH_MAX, but would that have any uses? */ #define FUSE_NAME_MAX 1024 @@ -73,6 +77,19 @@ struct fuse_inode { /** Files usable in writepage. Protected by fc->lock */ struct list_head write_files; + + /** Writepages pending on truncate or fsync */ + struct list_head queued_writes; + + /** Number of sent writes, a negative bias (FUSE_NOWRITE) + * means more writes are blocked */ + int writectr; + + /** Waitq for writepage completion */ + wait_queue_head_t page_waitq; + + /** List of writepage requestst (pending or sent) */ + struct list_head writepages; }; /** FUSE specific file data */ @@ -222,7 +239,10 @@ struct fuse_req { } release; struct fuse_init_in init_in; struct fuse_init_out init_out; - struct fuse_read_in read_in; + struct { + struct fuse_read_in in; + u64 attr_ver; + } read; struct { struct fuse_write_in in; struct fuse_write_out out; @@ -242,6 +262,12 @@ struct fuse_req { /** File used in the request (or NULL) */ struct fuse_file *ff; + /** Inode used in the request or NULL */ + struct inode *inode; + + /** Link on fi->writepages */ + struct list_head writepages_entry; + /** Request completion callback */ void (*end)(struct fuse_conn *, struct fuse_req *); @@ -390,8 +416,8 @@ struct fuse_conn { /** Entry on the fuse_conn_list */ struct list_head entry; - /** Unique ID */ - u64 id; + /** Device ID from super block */ + dev_t dev; /** Dentries in the control filesystem */ struct dentry *ctl_dentry[FUSE_CTL_NUM_DENTRIES]; @@ -438,7 +464,7 @@ extern const struct file_operations fuse_dev_operations; /** * Get a filled in inode */ -struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid, +struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, u64 attr_valid, u64 attr_version); @@ -446,7 +472,7 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid, * Send FORGET command */ void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, - unsigned long nodeid, u64 nlookup); + u64 nodeid, u64 nlookup); /** * Initialize READ or READDIR request @@ -504,6 +530,11 @@ void fuse_init_symlink(struct inode *inode); void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, u64 attr_valid, u64 attr_version); +void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid); + +void fuse_truncate(struct address_space *mapping, loff_t offset); + /** * Initialize the client device */ @@ -522,6 +553,8 @@ void fuse_ctl_cleanup(void); */ struct fuse_req *fuse_request_alloc(void); +struct fuse_req *fuse_request_alloc_nofs(void); + /** * Free a request */ @@ -558,6 +591,8 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); */ void request_send_background(struct fuse_conn *fc, struct fuse_req *req); +void request_send_background_locked(struct fuse_conn *fc, struct fuse_req *req); + /* Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); @@ -600,3 +635,10 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id); int fuse_update_attributes(struct inode *inode, struct kstat *stat, struct file *file, bool *refreshed); + +void fuse_flush_writepages(struct inode *inode); + +void fuse_set_nowrite(struct inode *inode); +void fuse_release_nowrite(struct inode *inode); + +u64 fuse_get_attr_version(struct fuse_conn *fc); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 033f7bdd47e..79b61587383 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -59,7 +59,11 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) fi->nodeid = 0; fi->nlookup = 0; fi->attr_version = 0; + fi->writectr = 0; INIT_LIST_HEAD(&fi->write_files); + INIT_LIST_HEAD(&fi->queued_writes); + INIT_LIST_HEAD(&fi->writepages); + init_waitqueue_head(&fi->page_waitq); fi->forget_req = fuse_request_alloc(); if (!fi->forget_req) { kmem_cache_free(fuse_inode_cachep, inode); @@ -73,13 +77,14 @@ static void fuse_destroy_inode(struct inode *inode) { struct fuse_inode *fi = get_fuse_inode(inode); BUG_ON(!list_empty(&fi->write_files)); + BUG_ON(!list_empty(&fi->queued_writes)); if (fi->forget_req) fuse_request_free(fi->forget_req); kmem_cache_free(fuse_inode_cachep, inode); } void fuse_send_forget(struct fuse_conn *fc, struct fuse_req *req, - unsigned long nodeid, u64 nlookup) + u64 nodeid, u64 nlookup) { struct fuse_forget_in *inarg = &req->misc.forget_in; inarg->nlookup = nlookup; @@ -109,7 +114,7 @@ static int fuse_remount_fs(struct super_block *sb, int *flags, char *data) return 0; } -static void fuse_truncate(struct address_space *mapping, loff_t offset) +void fuse_truncate(struct address_space *mapping, loff_t offset) { /* See vmtruncate() */ unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); @@ -117,19 +122,12 @@ static void fuse_truncate(struct address_space *mapping, loff_t offset) unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1); } - -void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, - u64 attr_valid, u64 attr_version) +void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - loff_t oldsize; - spin_lock(&fc->lock); - if (attr_version != 0 && fi->attr_version > attr_version) { - spin_unlock(&fc->lock); - return; - } fi->attr_version = ++fc->attr_version; fi->i_time = attr_valid; @@ -159,6 +157,22 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, fi->orig_i_mode = inode->i_mode; if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS)) inode->i_mode &= ~S_ISVTX; +} + +void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr, + u64 attr_valid, u64 attr_version) +{ + struct fuse_conn *fc = get_fuse_conn(inode); + struct fuse_inode *fi = get_fuse_inode(inode); + loff_t oldsize; + + spin_lock(&fc->lock); + if (attr_version != 0 && fi->attr_version > attr_version) { + spin_unlock(&fc->lock); + return; + } + + fuse_change_attributes_common(inode, attr, attr_valid); oldsize = inode->i_size; i_size_write(inode, attr->size); @@ -193,7 +207,7 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr) static int fuse_inode_eq(struct inode *inode, void *_nodeidp) { - unsigned long nodeid = *(unsigned long *) _nodeidp; + u64 nodeid = *(u64 *) _nodeidp; if (get_node_id(inode) == nodeid) return 1; else @@ -202,12 +216,12 @@ static int fuse_inode_eq(struct inode *inode, void *_nodeidp) static int fuse_inode_set(struct inode *inode, void *_nodeidp) { - unsigned long nodeid = *(unsigned long *) _nodeidp; + u64 nodeid = *(u64 *) _nodeidp; get_fuse_inode(inode)->nodeid = nodeid; return 0; } -struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid, +struct inode *fuse_iget(struct super_block *sb, u64 nodeid, int generation, struct fuse_attr *attr, u64 attr_valid, u64 attr_version) { @@ -242,10 +256,9 @@ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid, return inode; } -static void fuse_umount_begin(struct vfsmount *vfsmnt, int flags) +static void fuse_umount_begin(struct super_block *sb) { - if (flags & MNT_FORCE) - fuse_abort_conn(get_fuse_conn_super(vfsmnt->mnt_sb)); + fuse_abort_conn(get_fuse_conn_super(sb)); } static void fuse_send_destroy(struct fuse_conn *fc) @@ -448,7 +461,7 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) return 0; } -static struct fuse_conn *new_conn(void) +static struct fuse_conn *new_conn(struct super_block *sb) { struct fuse_conn *fc; int err; @@ -469,19 +482,41 @@ static struct fuse_conn *new_conn(void) atomic_set(&fc->num_waiting, 0); fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; + /* fuse does it's own writeback accounting */ + fc->bdi.capabilities = BDI_CAP_NO_ACCT_WB; + fc->dev = sb->s_dev; err = bdi_init(&fc->bdi); - if (err) { - kfree(fc); - fc = NULL; - goto out; - } + if (err) + goto error_kfree; + err = bdi_register_dev(&fc->bdi, fc->dev); + if (err) + goto error_bdi_destroy; + /* + * For a single fuse filesystem use max 1% of dirty + + * writeback threshold. + * + * This gives about 1M of write buffer for memory maps on a + * machine with 1G and 10% dirty_ratio, which should be more + * than enough. + * + * Privileged users can raise it by writing to + * + * /sys/class/bdi/<bdi>/max_ratio + */ + bdi_set_max_ratio(&fc->bdi, 1); fc->reqctr = 0; fc->blocked = 1; fc->attr_version = 1; get_random_bytes(&fc->scramble_key, sizeof(fc->scramble_key)); } -out: return fc; + +error_bdi_destroy: + bdi_destroy(&fc->bdi); +error_kfree: + mutex_destroy(&fc->inst_mutex); + kfree(fc); + return NULL; } void fuse_conn_put(struct fuse_conn *fc) @@ -549,6 +584,7 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->bdi.ra_pages = min(fc->bdi.ra_pages, ra_pages); fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; + fc->max_write = min_t(unsigned, 4096, fc->max_write); fc->conn_init = 1; } fuse_put_request(fc, req); @@ -579,12 +615,6 @@ static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) request_send_background(fc, req); } -static u64 conn_id(void) -{ - static u64 ctr = 1; - return ctr++; -} - static int fuse_fill_super(struct super_block *sb, void *data, int silent) { struct fuse_conn *fc; @@ -622,14 +652,14 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (file->f_op != &fuse_dev_operations) return -EINVAL; - fc = new_conn(); + fc = new_conn(sb); if (!fc) return -ENOMEM; fc->flags = d.flags; fc->user_id = d.user_id; fc->group_id = d.group_id; - fc->max_read = d.max_read; + fc->max_read = min_t(unsigned, 4096, d.max_read); /* Used by get_root_inode() */ sb->s_fs_info = fc; @@ -660,7 +690,6 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (file->private_data) goto err_unlock; - fc->id = conn_id(); err = fuse_ctl_add_conn(fc); if (err) goto err_unlock; diff --git a/fs/gfs2/locking/dlm/sysfs.c b/fs/gfs2/locking/dlm/sysfs.c index 8479da47049..a4ff271df9e 100644 --- a/fs/gfs2/locking/dlm/sysfs.c +++ b/fs/gfs2/locking/dlm/sysfs.c @@ -212,7 +212,7 @@ int gdlm_sysfs_init(void) { gdlm_kset = kset_create_and_add("lock_dlm", NULL, kernel_kobj); if (!gdlm_kset) { - printk(KERN_WARNING "%s: can not create kset\n", __FUNCTION__); + printk(KERN_WARNING "%s: can not create kset\n", __func__); return -ENOMEM; } return 0; diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index 90a04a6e378..f55394e57cb 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -438,7 +438,7 @@ static int stuffed_readpage(struct gfs2_inode *ip, struct page *page) int error; /* - * Due to the order of unstuffing files and ->nopage(), we can be + * Due to the order of unstuffing files and ->fault(), we can be * asked for a zero page in the case of a stuffed file being extended, * so we need to supply one here. It doesn't happen often. */ diff --git a/fs/gfs2/util.h b/fs/gfs2/util.h index 509c5d60bd8..7f48576289c 100644 --- a/fs/gfs2/util.h +++ b/fs/gfs2/util.h @@ -41,7 +41,7 @@ int gfs2_assert_withdraw_i(struct gfs2_sbd *sdp, char *assertion, #define gfs2_assert_withdraw(sdp, assertion) \ ((likely(assertion)) ? 0 : gfs2_assert_withdraw_i((sdp), #assertion, \ - __FUNCTION__, __FILE__, __LINE__)) + __func__, __FILE__, __LINE__)) int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, @@ -49,28 +49,28 @@ int gfs2_assert_warn_i(struct gfs2_sbd *sdp, char *assertion, #define gfs2_assert_warn(sdp, assertion) \ ((likely(assertion)) ? 0 : gfs2_assert_warn_i((sdp), #assertion, \ - __FUNCTION__, __FILE__, __LINE__)) + __func__, __FILE__, __LINE__)) int gfs2_consist_i(struct gfs2_sbd *sdp, int cluster_wide, const char *function, char *file, unsigned int line); #define gfs2_consist(sdp) \ -gfs2_consist_i((sdp), 0, __FUNCTION__, __FILE__, __LINE__) +gfs2_consist_i((sdp), 0, __func__, __FILE__, __LINE__) int gfs2_consist_inode_i(struct gfs2_inode *ip, int cluster_wide, const char *function, char *file, unsigned int line); #define gfs2_consist_inode(ip) \ -gfs2_consist_inode_i((ip), 0, __FUNCTION__, __FILE__, __LINE__) +gfs2_consist_inode_i((ip), 0, __func__, __FILE__, __LINE__) int gfs2_consist_rgrpd_i(struct gfs2_rgrpd *rgd, int cluster_wide, const char *function, char *file, unsigned int line); #define gfs2_consist_rgrpd(rgd) \ -gfs2_consist_rgrpd_i((rgd), 0, __FUNCTION__, __FILE__, __LINE__) +gfs2_consist_rgrpd_i((rgd), 0, __func__, __FILE__, __LINE__) int gfs2_meta_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, @@ -91,7 +91,7 @@ static inline int gfs2_meta_check_i(struct gfs2_sbd *sdp, } #define gfs2_meta_check(sdp, bh) \ -gfs2_meta_check_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__) +gfs2_meta_check_i((sdp), (bh), __func__, __FILE__, __LINE__) int gfs2_metatype_check_ii(struct gfs2_sbd *sdp, struct buffer_head *bh, @@ -118,7 +118,7 @@ static inline int gfs2_metatype_check_i(struct gfs2_sbd *sdp, } #define gfs2_metatype_check(sdp, bh, type) \ -gfs2_metatype_check_i((sdp), (bh), (type), __FUNCTION__, __FILE__, __LINE__) +gfs2_metatype_check_i((sdp), (bh), (type), __func__, __FILE__, __LINE__) static inline void gfs2_metatype_set(struct buffer_head *bh, u16 type, u16 format) @@ -134,14 +134,14 @@ int gfs2_io_error_i(struct gfs2_sbd *sdp, const char *function, char *file, unsigned int line); #define gfs2_io_error(sdp) \ -gfs2_io_error_i((sdp), __FUNCTION__, __FILE__, __LINE__); +gfs2_io_error_i((sdp), __func__, __FILE__, __LINE__); int gfs2_io_error_bh_i(struct gfs2_sbd *sdp, struct buffer_head *bh, const char *function, char *file, unsigned int line); #define gfs2_io_error_bh(sdp, bh) \ -gfs2_io_error_bh_i((sdp), (bh), __FUNCTION__, __FILE__, __LINE__); +gfs2_io_error_bh_i((sdp), (bh), __func__, __FILE__, __LINE__); extern struct kmem_cache *gfs2_glock_cachep; diff --git a/fs/hfs/btree.c b/fs/hfs/btree.c index 24cf6fc4302..f6621a78520 100644 --- a/fs/hfs/btree.c +++ b/fs/hfs/btree.c @@ -208,7 +208,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) struct hfs_bnode *node, *next_node; struct page **pagep; u32 nidx, idx; - u16 off, len; + unsigned off; + u16 off16; + u16 len; u8 *data, byte, m; int i; @@ -235,7 +237,8 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) node = hfs_bnode_find(tree, nidx); if (IS_ERR(node)) return node; - len = hfs_brec_lenoff(node, 2, &off); + len = hfs_brec_lenoff(node, 2, &off16); + off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_CACHE_SHIFT); @@ -280,7 +283,8 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) return next_node; node = next_node; - len = hfs_brec_lenoff(node, 0, &off); + len = hfs_brec_lenoff(node, 0, &off16); + off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_CACHE_SHIFT); data = kmap(*pagep); diff --git a/fs/hfs/mdb.c b/fs/hfs/mdb.c index b4651e128d7..36ca2e1a4fa 100644 --- a/fs/hfs/mdb.c +++ b/fs/hfs/mdb.c @@ -215,7 +215,7 @@ int hfs_mdb_get(struct super_block *sb) attrib &= cpu_to_be16(~HFS_SB_ATTRIB_UNMNT); attrib |= cpu_to_be16(HFS_SB_ATTRIB_INCNSTNT); mdb->drAtrb = attrib; - mdb->drWrCnt = cpu_to_be32(be32_to_cpu(mdb->drWrCnt) + 1); + be32_add_cpu(&mdb->drWrCnt, 1); mdb->drLsMod = hfs_mtime(); mark_buffer_dirty(HFS_SB(sb)->mdb_bh); diff --git a/fs/hfs/super.c b/fs/hfs/super.c index 32de44ed002..8cf67974adf 100644 --- a/fs/hfs/super.c +++ b/fs/hfs/super.c @@ -297,7 +297,8 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) return 0; } p = match_strdup(&args[0]); - hsb->nls_disk = load_nls(p); + if (p) + hsb->nls_disk = load_nls(p); if (!hsb->nls_disk) { printk(KERN_ERR "hfs: unable to load codepage \"%s\"\n", p); kfree(p); @@ -311,7 +312,8 @@ static int parse_options(char *options, struct hfs_sb_info *hsb) return 0; } p = match_strdup(&args[0]); - hsb->nls_io = load_nls(p); + if (p) + hsb->nls_io = load_nls(p); if (!hsb->nls_io) { printk(KERN_ERR "hfs: unable to load iocharset \"%s\"\n", p); kfree(p); diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index bb5433608a4..e49fcee1e29 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -184,7 +184,9 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) struct hfs_bnode *node, *next_node; struct page **pagep; u32 nidx, idx; - u16 off, len; + unsigned off; + u16 off16; + u16 len; u8 *data, byte, m; int i; @@ -211,7 +213,8 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) node = hfs_bnode_find(tree, nidx); if (IS_ERR(node)) return node; - len = hfs_brec_lenoff(node, 2, &off); + len = hfs_brec_lenoff(node, 2, &off16); + off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_CACHE_SHIFT); @@ -256,7 +259,8 @@ struct hfs_bnode *hfs_bmap_alloc(struct hfs_btree *tree) return next_node; node = next_node; - len = hfs_brec_lenoff(node, 0, &off); + len = hfs_brec_lenoff(node, 0, &off16); + off = off16; off += node->page_offset; pagep = node->page + (off >> PAGE_CACHE_SHIFT); data = kmap(*pagep); diff --git a/fs/hfsplus/hfsplus_fs.h b/fs/hfsplus/hfsplus_fs.h index d72d0a8b25a..9e59537b43d 100644 --- a/fs/hfsplus/hfsplus_fs.h +++ b/fs/hfsplus/hfsplus_fs.h @@ -311,6 +311,10 @@ int hfsplus_delete_cat(u32, struct inode *, struct qstr *); int hfsplus_rename_cat(u32, struct inode *, struct qstr *, struct inode *, struct qstr *); +/* dir.c */ +extern const struct inode_operations hfsplus_dir_inode_operations; +extern const struct file_operations hfsplus_dir_operations; + /* extents.c */ int hfsplus_ext_cmp_key(const hfsplus_btree_key *, const hfsplus_btree_key *); void hfsplus_ext_write_extent(struct inode *); diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 37744cf3706..d53b2af91c2 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -278,9 +278,6 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) return 0; } -extern const struct inode_operations hfsplus_dir_inode_operations; -extern struct file_operations hfsplus_dir_operations; - static const struct inode_operations hfsplus_file_inode_operations = { .lookup = hfsplus_file_lookup, .truncate = hfsplus_file_truncate, diff --git a/fs/hfsplus/options.c b/fs/hfsplus/options.c index dc64fac0083..9997cbf8beb 100644 --- a/fs/hfsplus/options.c +++ b/fs/hfsplus/options.c @@ -132,7 +132,8 @@ int hfsplus_parse_options(char *input, struct hfsplus_sb_info *sbi) return 0; } p = match_strdup(&args[0]); - sbi->nls = load_nls(p); + if (p) + sbi->nls = load_nls(p); if (!sbi->nls) { printk(KERN_ERR "hfs: unable to load nls mapping \"%s\"\n", p); kfree(p); diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index b0f9ad362d1..ce97a54518d 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -357,7 +357,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) printk(KERN_WARNING "hfs: Filesystem is marked locked, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } else if (vhdr->attributes & cpu_to_be32(HFSPLUS_VOL_JOURNALED)) { - printk(KERN_WARNING "hfs: write access to a jounaled filesystem is not supported, " + printk(KERN_WARNING "hfs: write access to a journaled filesystem is not supported, " "use the force option at your own risk, mounting read-only.\n"); sb->s_flags |= MS_RDONLY; } @@ -423,7 +423,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent) */ vhdr->last_mount_vers = cpu_to_be32(HFSP_MOUNT_VERSION); vhdr->modify_date = hfsp_now2mt(); - vhdr->write_count = cpu_to_be32(be32_to_cpu(vhdr->write_count) + 1); + be32_add_cpu(&vhdr->write_count, 1); vhdr->attributes &= cpu_to_be32(~HFSPLUS_VOL_UNMNT); vhdr->attributes |= cpu_to_be32(HFSPLUS_VOL_INCNSTNT); mark_buffer_dirty(HFSPLUS_SB(sb).s_vhbh); diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 72cab78f050..175d08eacc8 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -47,7 +47,7 @@ static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) return 0; wd->ablk_start = be16_to_cpu(*(__be16 *)(bufptr + HFSP_WRAPOFF_ABLKSTART)); - extent = be32_to_cpu(get_unaligned((__be32 *)(bufptr + HFSP_WRAPOFF_EMBEDEXT))); + extent = get_unaligned_be32(bufptr + HFSP_WRAPOFF_EMBEDEXT); wd->embed_start = (extent >> 16) & 0xFFFF; wd->embed_count = extent & 0xFFFF; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6846785fe90..aeabf80f81a 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -45,7 +45,7 @@ static const struct inode_operations hugetlbfs_inode_operations; static struct backing_dev_info hugetlbfs_backing_dev_info = { .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; int sysctl_hugetlb_shm_group; @@ -504,7 +504,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, uid_t uid, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; INIT_LIST_HEAD(&inode->i_mapping->private_list); info = HUGETLBFS_I(inode); - mpol_shared_policy_init(&info->policy, MPOL_DEFAULT, NULL); + mpol_shared_policy_init(&info->policy, NULL); switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); diff --git a/fs/inode.c b/fs/inode.c index 27ee1af50d0..bf647813042 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -495,8 +495,7 @@ static struct inode * find_inode(struct super_block * sb, struct hlist_head *hea struct inode * inode = NULL; repeat: - hlist_for_each (node, head) { - inode = hlist_entry(node, struct inode, i_hash); + hlist_for_each_entry(inode, node, head, i_hash) { if (inode->i_sb != sb) continue; if (!test(inode, data)) @@ -520,8 +519,7 @@ static struct inode * find_inode_fast(struct super_block * sb, struct hlist_head struct inode * inode = NULL; repeat: - hlist_for_each (node, head) { - inode = hlist_entry(node, struct inode, i_hash); + hlist_for_each_entry(inode, node, head, i_hash) { if (inode->i_ino != ino) continue; if (inode->i_sb != sb) diff --git a/fs/inotify_user.c b/fs/inotify_user.c index 7b94a1e3c01..6676c06bb7c 100644 --- a/fs/inotify_user.c +++ b/fs/inotify_user.c @@ -598,7 +598,7 @@ asmlinkage long sys_inotify_init(void) } ih = inotify_init(&inotify_user_ops); - if (unlikely(IS_ERR(ih))) { + if (IS_ERR(ih)) { ret = PTR_ERR(ih); goto out_free_dev; } diff --git a/fs/ioctl.c b/fs/ioctl.c index f32fbde2175..7db32b3382d 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -28,8 +28,8 @@ * * Returns 0 on success, -errno on error. */ -long vfs_ioctl(struct file *filp, unsigned int cmd, - unsigned long arg) +static long vfs_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) { int error = -ENOTTY; diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index 1ba407c64df..2f0dc5a1463 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -145,6 +145,14 @@ static int do_isofs_readdir(struct inode *inode, struct file *filp, } de = tmpde; } + /* Basic sanity check, whether name doesn't exceed dir entry */ + if (de_len < de->name_len[0] + + sizeof(struct iso_directory_record)) { + printk(KERN_NOTICE "iso9660: Corrupted directory entry" + " in block %lu of inode %lu\n", block, + inode->i_ino); + return -EIO; + } if (first_de) { isofs_normalize_block_and_offset(de, diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index d1bdf8adb35..ccbf72faf27 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -78,29 +78,29 @@ static inline int isonum_712(char *p) } static inline unsigned int isonum_721(char *p) { - return le16_to_cpu(get_unaligned((__le16 *)p)); + return get_unaligned_le16(p); } static inline unsigned int isonum_722(char *p) { - return be16_to_cpu(get_unaligned((__le16 *)p)); + return get_unaligned_be16(p); } static inline unsigned int isonum_723(char *p) { /* Ignore bigendian datum due to broken mastering programs */ - return le16_to_cpu(get_unaligned((__le16 *)p)); + return get_unaligned_le16(p); } static inline unsigned int isonum_731(char *p) { - return le32_to_cpu(get_unaligned((__le32 *)p)); + return get_unaligned_le32(p); } static inline unsigned int isonum_732(char *p) { - return be32_to_cpu(get_unaligned((__le32 *)p)); + return get_unaligned_be32(p); } static inline unsigned int isonum_733(char *p) { /* Ignore bigendian datum due to broken mastering programs */ - return le32_to_cpu(get_unaligned((__le32 *)p)); + return get_unaligned_le32(p); } extern int iso_date(char *, int); diff --git a/fs/isofs/namei.c b/fs/isofs/namei.c index 344b247bc29..8299889a835 100644 --- a/fs/isofs/namei.c +++ b/fs/isofs/namei.c @@ -111,6 +111,13 @@ isofs_find_entry(struct inode *dir, struct dentry *dentry, dlen = de->name_len[0]; dpnt = de->name; + /* Basic sanity check, whether name doesn't exceed dir entry */ + if (de_len < dlen + sizeof(struct iso_directory_record)) { + printk(KERN_NOTICE "iso9660: Corrupted directory entry" + " in block %lu of inode %lu\n", block, + dir->i_ino); + return 0; + } if (sbi->s_rock && ((i = get_rock_ridge_filename(de, tmpname, dir)))) { diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index a38c7186c57..cd931ef1f00 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -407,22 +407,6 @@ void journal_commit_transaction(journal_t *journal) jbd_debug (3, "JBD: commit phase 2\n"); /* - * First, drop modified flag: all accesses to the buffers - * will be tracked for a new trasaction only -bzzz - */ - spin_lock(&journal->j_list_lock); - if (commit_transaction->t_buffers) { - new_jh = jh = commit_transaction->t_buffers->b_tnext; - do { - J_ASSERT_JH(new_jh, new_jh->b_modified == 1 || - new_jh->b_modified == 0); - new_jh->b_modified = 0; - new_jh = new_jh->b_tnext; - } while (new_jh != jh); - } - spin_unlock(&journal->j_list_lock); - - /* * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ @@ -488,6 +472,9 @@ void journal_commit_transaction(journal_t *journal) */ commit_transaction->t_state = T_COMMIT; + J_ASSERT(commit_transaction->t_nr_buffers <= + commit_transaction->t_outstanding_credits); + descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 0e081d5f32e..b99c3b3654c 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -534,7 +534,7 @@ int log_wait_commit(journal_t *journal, tid_t tid) if (!tid_geq(journal->j_commit_request, tid)) { printk(KERN_EMERG "%s: error: j_commit_request=%d, tid=%d\n", - __FUNCTION__, journal->j_commit_request, tid); + __func__, journal->j_commit_request, tid); } spin_unlock(&journal->j_state_lock); #endif @@ -599,7 +599,7 @@ int journal_bmap(journal_t *journal, unsigned long blocknr, printk(KERN_ALERT "%s: journal block not found " "at offset %lu on %s\n", - __FUNCTION__, + __func__, blocknr, bdevname(journal->j_dev, b)); err = -EIO; @@ -728,7 +728,7 @@ journal_t * journal_init_dev(struct block_device *bdev, journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); if (!journal->j_wbuf) { printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", - __FUNCTION__); + __func__); kfree(journal); journal = NULL; goto out; @@ -782,7 +782,7 @@ journal_t * journal_init_inode (struct inode *inode) journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); if (!journal->j_wbuf) { printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", - __FUNCTION__); + __func__); kfree(journal); return NULL; } @@ -791,7 +791,7 @@ journal_t * journal_init_inode (struct inode *inode) /* If that failed, give up */ if (err) { printk(KERN_ERR "%s: Cannnot locate journal superblock\n", - __FUNCTION__); + __func__); kfree(journal); return NULL; } @@ -877,7 +877,7 @@ int journal_create(journal_t *journal) */ printk(KERN_EMERG "%s: creation of journal on external device!\n", - __FUNCTION__); + __func__); BUG(); } @@ -1657,7 +1657,7 @@ static struct journal_head *journal_alloc_journal_head(void) jbd_debug(1, "out of memory for journal_head\n"); if (time_after(jiffies, last_warning + 5*HZ)) { printk(KERN_NOTICE "ENOMEM in %s, retrying.\n", - __FUNCTION__); + __func__); last_warning = jiffies; } while (ret == NULL) { @@ -1794,13 +1794,13 @@ static void __journal_remove_journal_head(struct buffer_head *bh) if (jh->b_frozen_data) { printk(KERN_WARNING "%s: freeing " "b_frozen_data\n", - __FUNCTION__); + __func__); jbd_free(jh->b_frozen_data, bh->b_size); } if (jh->b_committed_data) { printk(KERN_WARNING "%s: freeing " "b_committed_data\n", - __FUNCTION__); + __func__); jbd_free(jh->b_committed_data, bh->b_size); } bh->b_private = NULL; diff --git a/fs/jbd/revoke.c b/fs/jbd/revoke.c index d5f8eee7c88..1bb43e987f4 100644 --- a/fs/jbd/revoke.c +++ b/fs/jbd/revoke.c @@ -138,7 +138,7 @@ repeat: oom: if (!journal_oom_retry) return -ENOMEM; - jbd_debug(1, "ENOMEM in %s, retrying\n", __FUNCTION__); + jbd_debug(1, "ENOMEM in %s, retrying\n", __func__); yield(); goto repeat; } diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 2c9e8f5d13a..67ff2024c23 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -609,6 +609,12 @@ repeat: goto done; /* + * this is the first time this transaction is touching this buffer, + * reset the modified flag + */ + jh->b_modified = 0; + + /* * If there is already a copy-out version of this buffer, then we don't * need to make another one */ @@ -681,7 +687,7 @@ repeat: if (!frozen_buffer) { printk(KERN_EMERG "%s: OOM for frozen_buffer\n", - __FUNCTION__); + __func__); JBUFFER_TRACE(jh, "oom!"); error = -ENOMEM; jbd_lock_bh_state(bh); @@ -820,9 +826,16 @@ int journal_get_create_access(handle_t *handle, struct buffer_head *bh) if (jh->b_transaction == NULL) { jh->b_transaction = transaction; + + /* first access by this transaction */ + jh->b_modified = 0; + JBUFFER_TRACE(jh, "file as BJ_Reserved"); __journal_file_buffer(jh, transaction, BJ_Reserved); } else if (jh->b_transaction == journal->j_committing_transaction) { + /* first access by this transaction */ + jh->b_modified = 0; + JBUFFER_TRACE(jh, "set next transaction"); jh->b_next_transaction = transaction; } @@ -891,7 +904,7 @@ repeat: committed_data = jbd_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!committed_data) { printk(KERN_EMERG "%s: No memory for committed data\n", - __FUNCTION__); + __func__); err = -ENOMEM; goto out; } @@ -1222,6 +1235,7 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) struct journal_head *jh; int drop_reserve = 0; int err = 0; + int was_modified = 0; BUFFER_TRACE(bh, "entry"); @@ -1240,6 +1254,9 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) goto not_jbd; } + /* keep track of wether or not this transaction modified us */ + was_modified = jh->b_modified; + /* * The buffer's going from the transaction, we must drop * all references -bzzz @@ -1257,7 +1274,12 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); - drop_reserve = 1; + /* + * we only want to drop a reference if this transaction + * modified the buffer + */ + if (was_modified) + drop_reserve = 1; /* * We are no longer going to journal this buffer. @@ -1297,7 +1319,13 @@ int journal_forget (handle_t *handle, struct buffer_head *bh) if (jh->b_next_transaction) { J_ASSERT(jh->b_next_transaction == transaction); jh->b_next_transaction = NULL; - drop_reserve = 1; + + /* + * only drop a reference if this transaction modified + * the buffer + */ + if (was_modified) + drop_reserve = 1; } } @@ -2069,7 +2097,7 @@ void __journal_refile_buffer(struct journal_head *jh) jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; __journal_file_buffer(jh, jh->b_transaction, - was_dirty ? BJ_Metadata : BJ_Reserved); + jh->b_modified ? BJ_Metadata : BJ_Reserved); J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); if (was_dirty) diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index a8173081f83..e0139786f71 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -520,22 +520,6 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug (3, "JBD: commit phase 2\n"); /* - * First, drop modified flag: all accesses to the buffers - * will be tracked for a new trasaction only -bzzz - */ - spin_lock(&journal->j_list_lock); - if (commit_transaction->t_buffers) { - new_jh = jh = commit_transaction->t_buffers->b_tnext; - do { - J_ASSERT_JH(new_jh, new_jh->b_modified == 1 || - new_jh->b_modified == 0); - new_jh->b_modified = 0; - new_jh = new_jh->b_tnext; - } while (new_jh != jh); - } - spin_unlock(&journal->j_list_lock); - - /* * Now start flushing things to disk, in the order they appear * on the transaction lists. Data blocks go first. */ @@ -584,6 +568,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) stats.u.run.rs_blocks = commit_transaction->t_outstanding_credits; stats.u.run.rs_blocks_logged = 0; + J_ASSERT(commit_transaction->t_nr_buffers <= + commit_transaction->t_outstanding_credits); + descriptor = NULL; bufs = 0; while (commit_transaction->t_buffers) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 954cff001df..53632e3e845 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -534,7 +534,7 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) if (!tid_geq(journal->j_commit_request, tid)) { printk(KERN_EMERG "%s: error: j_commit_request=%d, tid=%d\n", - __FUNCTION__, journal->j_commit_request, tid); + __func__, journal->j_commit_request, tid); } spin_unlock(&journal->j_state_lock); #endif @@ -599,7 +599,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, printk(KERN_ALERT "%s: journal block not found " "at offset %lu on %s\n", - __FUNCTION__, + __func__, blocknr, bdevname(journal->j_dev, b)); err = -EIO; @@ -904,19 +904,10 @@ static void jbd2_stats_proc_init(journal_t *journal) snprintf(name, sizeof(name) - 1, "%s", bdevname(journal->j_dev, name)); journal->j_proc_entry = proc_mkdir(name, proc_jbd2_stats); if (journal->j_proc_entry) { - struct proc_dir_entry *p; - p = create_proc_entry("history", S_IRUGO, - journal->j_proc_entry); - if (p) { - p->proc_fops = &jbd2_seq_history_fops; - p->data = journal; - p = create_proc_entry("info", S_IRUGO, - journal->j_proc_entry); - if (p) { - p->proc_fops = &jbd2_seq_info_fops; - p->data = journal; - } - } + proc_create_data("history", S_IRUGO, journal->j_proc_entry, + &jbd2_seq_history_fops, journal); + proc_create_data("info", S_IRUGO, journal->j_proc_entry, + &jbd2_seq_info_fops, journal); } } @@ -1006,13 +997,14 @@ fail: */ /** - * journal_t * jbd2_journal_init_dev() - creates an initialises a journal structure + * journal_t * jbd2_journal_init_dev() - creates and initialises a journal structure * @bdev: Block device on which to create the journal * @fs_dev: Device which hold journalled filesystem for this journal. * @start: Block nr Start of journal. * @len: Length of the journal in blocks. * @blocksize: blocksize of journalling device - * @returns: a newly created journal_t * + * + * Returns: a newly created journal_t * * * jbd2_journal_init_dev creates a journal which maps a fixed contiguous * range of blocks on an arbitrary block device. @@ -1036,7 +1028,7 @@ journal_t * jbd2_journal_init_dev(struct block_device *bdev, journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); if (!journal->j_wbuf) { printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", - __FUNCTION__); + __func__); kfree(journal); journal = NULL; goto out; @@ -1092,7 +1084,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) journal->j_wbuf = kmalloc(n * sizeof(struct buffer_head*), GFP_KERNEL); if (!journal->j_wbuf) { printk(KERN_ERR "%s: Cant allocate bhs for commit thread\n", - __FUNCTION__); + __func__); kfree(journal); return NULL; } @@ -1101,7 +1093,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) /* If that failed, give up */ if (err) { printk(KERN_ERR "%s: Cannnot locate journal superblock\n", - __FUNCTION__); + __func__); kfree(journal); return NULL; } @@ -1187,7 +1179,7 @@ int jbd2_journal_create(journal_t *journal) */ printk(KERN_EMERG "%s: creation of journal on external device!\n", - __FUNCTION__); + __func__); BUG(); } @@ -1985,9 +1977,10 @@ static int journal_init_jbd2_journal_head_cache(void) static void jbd2_journal_destroy_jbd2_journal_head_cache(void) { - J_ASSERT(jbd2_journal_head_cache != NULL); - kmem_cache_destroy(jbd2_journal_head_cache); - jbd2_journal_head_cache = NULL; + if (jbd2_journal_head_cache) { + kmem_cache_destroy(jbd2_journal_head_cache); + jbd2_journal_head_cache = NULL; + } } /* @@ -2006,7 +1999,7 @@ static struct journal_head *journal_alloc_journal_head(void) jbd_debug(1, "out of memory for journal_head\n"); if (time_after(jiffies, last_warning + 5*HZ)) { printk(KERN_NOTICE "ENOMEM in %s, retrying.\n", - __FUNCTION__); + __func__); last_warning = jiffies; } while (!ret) { @@ -2143,13 +2136,13 @@ static void __journal_remove_journal_head(struct buffer_head *bh) if (jh->b_frozen_data) { printk(KERN_WARNING "%s: freeing " "b_frozen_data\n", - __FUNCTION__); + __func__); jbd2_free(jh->b_frozen_data, bh->b_size); } if (jh->b_committed_data) { printk(KERN_WARNING "%s: freeing " "b_committed_data\n", - __FUNCTION__); + __func__); jbd2_free(jh->b_committed_data, bh->b_size); } bh->b_private = NULL; @@ -2314,10 +2307,12 @@ static int __init journal_init(void) BUILD_BUG_ON(sizeof(struct journal_superblock_s) != 1024); ret = journal_init_caches(); - if (ret != 0) + if (ret == 0) { + jbd2_create_debugfs_entry(); + jbd2_create_jbd_stats_proc_entry(); + } else { jbd2_journal_destroy_caches(); - jbd2_create_debugfs_entry(); - jbd2_create_jbd_stats_proc_entry(); + } return ret; } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 2e1453a5e99..257ff262576 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -139,7 +139,7 @@ repeat: oom: if (!journal_oom_retry) return -ENOMEM; - jbd_debug(1, "ENOMEM in %s, retrying\n", __FUNCTION__); + jbd_debug(1, "ENOMEM in %s, retrying\n", __func__); yield(); goto repeat; } @@ -167,138 +167,121 @@ static struct jbd2_revoke_record_s *find_revoke_record(journal_t *journal, return NULL; } +void jbd2_journal_destroy_revoke_caches(void) +{ + if (jbd2_revoke_record_cache) { + kmem_cache_destroy(jbd2_revoke_record_cache); + jbd2_revoke_record_cache = NULL; + } + if (jbd2_revoke_table_cache) { + kmem_cache_destroy(jbd2_revoke_table_cache); + jbd2_revoke_table_cache = NULL; + } +} + int __init jbd2_journal_init_revoke_caches(void) { + J_ASSERT(!jbd2_revoke_record_cache); + J_ASSERT(!jbd2_revoke_table_cache); + jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record", sizeof(struct jbd2_revoke_record_s), 0, SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY, NULL); if (!jbd2_revoke_record_cache) - return -ENOMEM; + goto record_cache_failure; jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table", sizeof(struct jbd2_revoke_table_s), 0, SLAB_TEMPORARY, NULL); - if (!jbd2_revoke_table_cache) { - kmem_cache_destroy(jbd2_revoke_record_cache); - jbd2_revoke_record_cache = NULL; - return -ENOMEM; - } + if (!jbd2_revoke_table_cache) + goto table_cache_failure; return 0; +table_cache_failure: + jbd2_journal_destroy_revoke_caches(); +record_cache_failure: + return -ENOMEM; } -void jbd2_journal_destroy_revoke_caches(void) +static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) { - kmem_cache_destroy(jbd2_revoke_record_cache); - jbd2_revoke_record_cache = NULL; - kmem_cache_destroy(jbd2_revoke_table_cache); - jbd2_revoke_table_cache = NULL; -} - -/* Initialise the revoke table for a given journal to a given size. */ - -int jbd2_journal_init_revoke(journal_t *journal, int hash_size) -{ - int shift, tmp; + int shift = 0; + int tmp = hash_size; + struct jbd2_revoke_table_s *table; - J_ASSERT (journal->j_revoke_table[0] == NULL); + table = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL); + if (!table) + goto out; - shift = 0; - tmp = hash_size; while((tmp >>= 1UL) != 0UL) shift++; - journal->j_revoke_table[0] = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL); - if (!journal->j_revoke_table[0]) - return -ENOMEM; - journal->j_revoke = journal->j_revoke_table[0]; - - /* Check that the hash_size is a power of two */ - J_ASSERT(is_power_of_2(hash_size)); - - journal->j_revoke->hash_size = hash_size; - - journal->j_revoke->hash_shift = shift; - - journal->j_revoke->hash_table = + table->hash_size = hash_size; + table->hash_shift = shift; + table->hash_table = kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); - if (!journal->j_revoke->hash_table) { - kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]); - journal->j_revoke = NULL; - return -ENOMEM; + if (!table->hash_table) { + kmem_cache_free(jbd2_revoke_table_cache, table); + table = NULL; + goto out; } for (tmp = 0; tmp < hash_size; tmp++) - INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); + INIT_LIST_HEAD(&table->hash_table[tmp]); - journal->j_revoke_table[1] = kmem_cache_alloc(jbd2_revoke_table_cache, GFP_KERNEL); - if (!journal->j_revoke_table[1]) { - kfree(journal->j_revoke_table[0]->hash_table); - kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]); - return -ENOMEM; +out: + return table; +} + +static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) +{ + int i; + struct list_head *hash_list; + + for (i = 0; i < table->hash_size; i++) { + hash_list = &table->hash_table[i]; + J_ASSERT(list_empty(hash_list)); } - journal->j_revoke = journal->j_revoke_table[1]; + kfree(table->hash_table); + kmem_cache_free(jbd2_revoke_table_cache, table); +} - /* Check that the hash_size is a power of two */ +/* Initialise the revoke table for a given journal to a given size. */ +int jbd2_journal_init_revoke(journal_t *journal, int hash_size) +{ + J_ASSERT(journal->j_revoke_table[0] == NULL); J_ASSERT(is_power_of_2(hash_size)); - journal->j_revoke->hash_size = hash_size; - - journal->j_revoke->hash_shift = shift; + journal->j_revoke_table[0] = jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[0]) + goto fail0; - journal->j_revoke->hash_table = - kmalloc(hash_size * sizeof(struct list_head), GFP_KERNEL); - if (!journal->j_revoke->hash_table) { - kfree(journal->j_revoke_table[0]->hash_table); - kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[0]); - kmem_cache_free(jbd2_revoke_table_cache, journal->j_revoke_table[1]); - journal->j_revoke = NULL; - return -ENOMEM; - } + journal->j_revoke_table[1] = jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke_table[1]) + goto fail1; - for (tmp = 0; tmp < hash_size; tmp++) - INIT_LIST_HEAD(&journal->j_revoke->hash_table[tmp]); + journal->j_revoke = journal->j_revoke_table[1]; spin_lock_init(&journal->j_revoke_lock); return 0; -} -/* Destoy a journal's revoke table. The table must already be empty! */ +fail1: + jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); +fail0: + return -ENOMEM; +} +/* Destroy a journal's revoke table. The table must already be empty! */ void jbd2_journal_destroy_revoke(journal_t *journal) { - struct jbd2_revoke_table_s *table; - struct list_head *hash_list; - int i; - - table = journal->j_revoke_table[0]; - if (!table) - return; - - for (i=0; i<table->hash_size; i++) { - hash_list = &table->hash_table[i]; - J_ASSERT (list_empty(hash_list)); - } - - kfree(table->hash_table); - kmem_cache_free(jbd2_revoke_table_cache, table); - journal->j_revoke = NULL; - - table = journal->j_revoke_table[1]; - if (!table) - return; - - for (i=0; i<table->hash_size; i++) { - hash_list = &table->hash_table[i]; - J_ASSERT (list_empty(hash_list)); - } - - kfree(table->hash_table); - kmem_cache_free(jbd2_revoke_table_cache, table); journal->j_revoke = NULL; + if (journal->j_revoke_table[0]) + jbd2_journal_destroy_revoke_table(journal->j_revoke_table[0]); + if (journal->j_revoke_table[1]) + jbd2_journal_destroy_revoke_table(journal->j_revoke_table[1]); } diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index b9b0b6f899b..d6e006e6780 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -618,6 +618,12 @@ repeat: goto done; /* + * this is the first time this transaction is touching this buffer, + * reset the modified flag + */ + jh->b_modified = 0; + + /* * If there is already a copy-out version of this buffer, then we don't * need to make another one */ @@ -690,7 +696,7 @@ repeat: if (!frozen_buffer) { printk(KERN_EMERG "%s: OOM for frozen_buffer\n", - __FUNCTION__); + __func__); JBUFFER_TRACE(jh, "oom!"); error = -ENOMEM; jbd_lock_bh_state(bh); @@ -829,9 +835,16 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh) if (jh->b_transaction == NULL) { jh->b_transaction = transaction; + + /* first access by this transaction */ + jh->b_modified = 0; + JBUFFER_TRACE(jh, "file as BJ_Reserved"); __jbd2_journal_file_buffer(jh, transaction, BJ_Reserved); } else if (jh->b_transaction == journal->j_committing_transaction) { + /* first access by this transaction */ + jh->b_modified = 0; + JBUFFER_TRACE(jh, "set next transaction"); jh->b_next_transaction = transaction; } @@ -901,7 +914,7 @@ repeat: committed_data = jbd2_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!committed_data) { printk(KERN_EMERG "%s: No memory for committed data\n", - __FUNCTION__); + __func__); err = -ENOMEM; goto out; } @@ -1230,6 +1243,7 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) struct journal_head *jh; int drop_reserve = 0; int err = 0; + int was_modified = 0; BUFFER_TRACE(bh, "entry"); @@ -1248,6 +1262,9 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) goto not_jbd; } + /* keep track of wether or not this transaction modified us */ + was_modified = jh->b_modified; + /* * The buffer's going from the transaction, we must drop * all references -bzzz @@ -1265,7 +1282,12 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) JBUFFER_TRACE(jh, "belongs to current transaction: unfile"); - drop_reserve = 1; + /* + * we only want to drop a reference if this transaction + * modified the buffer + */ + if (was_modified) + drop_reserve = 1; /* * We are no longer going to journal this buffer. @@ -1305,7 +1327,13 @@ int jbd2_journal_forget (handle_t *handle, struct buffer_head *bh) if (jh->b_next_transaction) { J_ASSERT(jh->b_next_transaction == transaction); jh->b_next_transaction = NULL; - drop_reserve = 1; + + /* + * only drop a reference if this transaction modified + * the buffer + */ + if (was_modified) + drop_reserve = 1; } } @@ -1434,7 +1462,8 @@ int jbd2_journal_stop(handle_t *handle) return err; } -/**int jbd2_journal_force_commit() - force any uncommitted transactions +/** + * int jbd2_journal_force_commit() - force any uncommitted transactions * @journal: journal to force * * For synchronous operations: force any uncommitted transactions @@ -2077,7 +2106,7 @@ void __jbd2_journal_refile_buffer(struct journal_head *jh) jh->b_transaction = jh->b_next_transaction; jh->b_next_transaction = NULL; __jbd2_journal_file_buffer(jh, jh->b_transaction, - was_dirty ? BJ_Metadata : BJ_Reserved); + jh->b_modified ? BJ_Metadata : BJ_Reserved); J_ASSERT_JH(jh, jh->b_transaction->t_state == T_RUNNING); if (was_dirty) diff --git a/fs/jffs2/README.Locking b/fs/jffs2/README.Locking index d14d5a4dc5a..3ea36554107 100644 --- a/fs/jffs2/README.Locking +++ b/fs/jffs2/README.Locking @@ -14,7 +14,7 @@ be fairly close. alloc_sem --------- -The alloc_sem is a per-filesystem semaphore, used primarily to ensure +The alloc_sem is a per-filesystem mutex, used primarily to ensure contiguous allocation of space on the medium. It is automatically obtained during space allocations (jffs2_reserve_space()) and freed upon write completion (jffs2_complete_reservation()). Note that @@ -41,10 +41,10 @@ if the wbuf is currently holding any data is permitted, though. Ordering constraints: See f->sem. - File Semaphore f->sem + File Mutex f->sem --------------------- -This is the JFFS2-internal equivalent of the inode semaphore i->i_sem. +This is the JFFS2-internal equivalent of the inode mutex i->i_sem. It protects the contents of the jffs2_inode_info private inode data, including the linked list of node fragments (but see the notes below on erase_completion_lock), etc. @@ -60,14 +60,14 @@ lead to deadlock, unless we played games with unlocking the i_sem before calling the space allocation functions. Instead of playing such games, we just have an extra internal -semaphore, which is obtained by the garbage collection code and also +mutex, which is obtained by the garbage collection code and also by the normal file system code _after_ allocation of space. Ordering constraints: 1. Never attempt to allocate space or lock alloc_sem with any f->sem held. - 2. Never attempt to lock two file semaphores in one thread. + 2. Never attempt to lock two file mutexes in one thread. No ordering rules have been made for doing so. @@ -86,8 +86,8 @@ a simple spin_lock() rather than spin_lock_bh(). Note that the per-inode list of physical nodes (f->nodes) is a special case. Any changes to _valid_ nodes (i.e. ->flash_offset & 1 == 0) in -the list are protected by the file semaphore f->sem. But the erase -code may remove _obsolete_ nodes from the list while holding only the +the list are protected by the file mutex f->sem. But the erase code +may remove _obsolete_ nodes from the list while holding only the erase_completion_lock. So you can walk the list only while holding the erase_completion_lock, and can drop the lock temporarily mid-walk as long as the pointer you're holding is to a _valid_ node, not an @@ -124,10 +124,10 @@ Ordering constraints: erase_free_sem -------------- -This semaphore is only used by the erase code which frees obsolete -node references and the jffs2_garbage_collect_deletion_dirent() -function. The latter function on NAND flash must read _obsolete_ nodes -to determine whether the 'deletion dirent' under consideration can be +This mutex is only used by the erase code which frees obsolete node +references and the jffs2_garbage_collect_deletion_dirent() function. +The latter function on NAND flash must read _obsolete_ nodes to +determine whether the 'deletion dirent' under consideration can be discarded or whether it is still required to show that an inode has been unlinked. Because reading from the flash may sleep, the erase_completion_lock cannot be held, so an alternative, more diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index 722a6b68295..c5e1450d79f 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -46,7 +46,7 @@ next_inode(int *i, struct jffs2_inode_cache *ic, struct jffs2_sb_info *c) static void jffs2_build_inode_pass1(struct jffs2_sb_info *c, - struct jffs2_inode_cache *ic) + struct jffs2_inode_cache *ic) { struct jffs2_full_dirent *fd; @@ -68,11 +68,17 @@ static void jffs2_build_inode_pass1(struct jffs2_sb_info *c, continue; } - if (child_ic->nlink++ && fd->type == DT_DIR) { - JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u appears to be a hard link\n", - fd->name, fd->ino, ic->ino); - /* TODO: What do we do about it? */ - } + if (fd->type == DT_DIR) { + if (child_ic->pino_nlink) { + JFFS2_ERROR("child dir \"%s\" (ino #%u) of dir ino #%u appears to be a hard link\n", + fd->name, fd->ino, ic->ino); + /* TODO: What do we do about it? */ + } else { + child_ic->pino_nlink = ic->ino; + } + } else + child_ic->pino_nlink++; + dbg_fsbuild("increased nlink for child \"%s\" (ino #%u)\n", fd->name, fd->ino); /* Can't free scan_dents so far. We might need them in pass 2 */ } @@ -125,7 +131,7 @@ static int jffs2_build_filesystem(struct jffs2_sb_info *c) dbg_fsbuild("pass 2 starting\n"); for_each_inode(i, c, ic) { - if (ic->nlink) + if (ic->pino_nlink) continue; jffs2_build_remove_unlinked_inode(c, ic, &dead_fds); @@ -232,16 +238,19 @@ static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *c, /* Reduce nlink of the child. If it's now zero, stick it on the dead_fds list to be cleaned up later. Else just free the fd */ - child_ic->nlink--; + if (fd->type == DT_DIR) + child_ic->pino_nlink = 0; + else + child_ic->pino_nlink--; - if (!child_ic->nlink) { - dbg_fsbuild("inode #%u (\"%s\") has now got zero nlink, adding to dead_fds list.\n", + if (!child_ic->pino_nlink) { + dbg_fsbuild("inode #%u (\"%s\") now has no links; adding to dead_fds list.\n", fd->ino, fd->name); fd->next = *dead_fds; *dead_fds = fd; } else { dbg_fsbuild("inode #%u (\"%s\") has now got nlink %d. Ignoring.\n", - fd->ino, fd->name, child_ic->nlink); + fd->ino, fd->name, child_ic->pino_nlink); jffs2_free_full_dirent(fd); } } @@ -345,6 +354,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c) INIT_LIST_HEAD(&c->dirty_list); INIT_LIST_HEAD(&c->erasable_list); INIT_LIST_HEAD(&c->erasing_list); + INIT_LIST_HEAD(&c->erase_checking_list); INIT_LIST_HEAD(&c->erase_pending_list); INIT_LIST_HEAD(&c->erasable_pending_wbuf_list); INIT_LIST_HEAD(&c->erase_complete_list); diff --git a/fs/jffs2/debug.c b/fs/jffs2/debug.c index 3a32c64ed49..5544d31c066 100644 --- a/fs/jffs2/debug.c +++ b/fs/jffs2/debug.c @@ -62,9 +62,9 @@ __jffs2_dbg_acct_sanity_check(struct jffs2_sb_info *c, void __jffs2_dbg_fragtree_paranoia_check(struct jffs2_inode_info *f) { - down(&f->sem); + mutex_lock(&f->sem); __jffs2_dbg_fragtree_paranoia_check_nolock(f); - up(&f->sem); + mutex_unlock(&f->sem); } void @@ -153,6 +153,139 @@ __jffs2_dbg_prewrite_paranoia_check(struct jffs2_sb_info *c, kfree(buf); } +void __jffs2_dbg_superblock_counts(struct jffs2_sb_info *c) +{ + struct jffs2_eraseblock *jeb; + uint32_t free = 0, dirty = 0, used = 0, wasted = 0, + erasing = 0, bad = 0, unchecked = 0; + int nr_counted = 0; + int dump = 0; + + if (c->gcblock) { + nr_counted++; + free += c->gcblock->free_size; + dirty += c->gcblock->dirty_size; + used += c->gcblock->used_size; + wasted += c->gcblock->wasted_size; + unchecked += c->gcblock->unchecked_size; + } + if (c->nextblock) { + nr_counted++; + free += c->nextblock->free_size; + dirty += c->nextblock->dirty_size; + used += c->nextblock->used_size; + wasted += c->nextblock->wasted_size; + unchecked += c->nextblock->unchecked_size; + } + list_for_each_entry(jeb, &c->clean_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->very_dirty_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->dirty_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->erasable_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->erasable_pending_wbuf_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->erase_pending_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->free_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + list_for_each_entry(jeb, &c->bad_used_list, list) { + nr_counted++; + free += jeb->free_size; + dirty += jeb->dirty_size; + used += jeb->used_size; + wasted += jeb->wasted_size; + unchecked += jeb->unchecked_size; + } + + list_for_each_entry(jeb, &c->erasing_list, list) { + nr_counted++; + erasing += c->sector_size; + } + list_for_each_entry(jeb, &c->erase_checking_list, list) { + nr_counted++; + erasing += c->sector_size; + } + list_for_each_entry(jeb, &c->erase_complete_list, list) { + nr_counted++; + erasing += c->sector_size; + } + list_for_each_entry(jeb, &c->bad_list, list) { + nr_counted++; + bad += c->sector_size; + } + +#define check(sz) \ + if (sz != c->sz##_size) { \ + printk(KERN_WARNING #sz "_size mismatch counted 0x%x, c->" #sz "_size 0x%x\n", \ + sz, c->sz##_size); \ + dump = 1; \ + } + check(free); + check(dirty); + check(used); + check(wasted); + check(unchecked); + check(bad); + check(erasing); +#undef check + + if (nr_counted != c->nr_blocks) { + printk(KERN_WARNING "%s counted only 0x%x blocks of 0x%x. Where are the others?\n", + __func__, nr_counted, c->nr_blocks); + dump = 1; + } + + if (dump) { + __jffs2_dbg_dump_block_lists_nolock(c); + BUG(); + } +} + /* * Check the space accounting and node_ref list correctness for the JFFS2 erasable block 'jeb'. */ @@ -229,6 +362,9 @@ __jffs2_dbg_acct_paranoia_check_nolock(struct jffs2_sb_info *c, } #endif + if (!(c->flags & (JFFS2_SB_FLAG_BUILDING|JFFS2_SB_FLAG_SCANNING))) + __jffs2_dbg_superblock_counts(c); + return; error: @@ -268,7 +404,10 @@ __jffs2_dbg_dump_node_refs_nolock(struct jffs2_sb_info *c, printk(JFFS2_DBG); for (ref = jeb->first_node; ; ref = ref_next(ref)) { - printk("%#08x(%#x)", ref_offset(ref), ref->__totlen); + printk("%#08x", ref_offset(ref)); +#ifdef TEST_TOTLEN + printk("(%x)", ref->__totlen); +#endif if (ref_next(ref)) printk("->"); else @@ -447,6 +586,21 @@ __jffs2_dbg_dump_block_lists_nolock(struct jffs2_sb_info *c) } } } + if (list_empty(&c->erase_checking_list)) { + printk(JFFS2_DBG "erase_checking_list: empty\n"); + } else { + struct list_head *this; + + list_for_each(this, &c->erase_checking_list) { + struct jffs2_eraseblock *jeb = list_entry(this, struct jffs2_eraseblock, list); + + if (!(jeb->used_size == 0 && jeb->dirty_size == 0 && jeb->wasted_size == 0)) { + printk(JFFS2_DBG "erase_checking_list: %#08x (used %#08x, dirty %#08x, wasted %#08x, unchecked %#08x, free %#08x)\n", + jeb->offset, jeb->used_size, jeb->dirty_size, jeb->wasted_size, + jeb->unchecked_size, jeb->free_size); + } + } + } if (list_empty(&c->erase_pending_list)) { printk(JFFS2_DBG "erase_pending_list: empty\n"); @@ -532,9 +686,9 @@ __jffs2_dbg_dump_block_lists_nolock(struct jffs2_sb_info *c) void __jffs2_dbg_dump_fragtree(struct jffs2_inode_info *f) { - down(&f->sem); + mutex_lock(&f->sem); jffs2_dbg_dump_fragtree_nolock(f); - up(&f->sem); + mutex_unlock(&f->sem); } void diff --git a/fs/jffs2/debug.h b/fs/jffs2/debug.h index 4130adabd76..a113ecc3baf 100644 --- a/fs/jffs2/debug.h +++ b/fs/jffs2/debug.h @@ -38,6 +38,7 @@ #if CONFIG_JFFS2_FS_DEBUG > 1 #define JFFS2_DBG_FRAGTREE2_MESSAGES +#define JFFS2_DBG_READINODE2_MESSAGES #define JFFS2_DBG_MEMALLOC_MESSAGES #endif @@ -81,28 +82,28 @@ do { \ printk(JFFS2_ERR_MSG_PREFIX \ " (%d) %s: " fmt, task_pid_nr(current), \ - __FUNCTION__ , ##__VA_ARGS__); \ + __func__ , ##__VA_ARGS__); \ } while(0) #define JFFS2_WARNING(fmt, ...) \ do { \ printk(JFFS2_WARN_MSG_PREFIX \ " (%d) %s: " fmt, task_pid_nr(current), \ - __FUNCTION__ , ##__VA_ARGS__); \ + __func__ , ##__VA_ARGS__); \ } while(0) #define JFFS2_NOTICE(fmt, ...) \ do { \ printk(JFFS2_NOTICE_MSG_PREFIX \ " (%d) %s: " fmt, task_pid_nr(current), \ - __FUNCTION__ , ##__VA_ARGS__); \ + __func__ , ##__VA_ARGS__); \ } while(0) #define JFFS2_DEBUG(fmt, ...) \ do { \ printk(JFFS2_DBG_MSG_PREFIX \ " (%d) %s: " fmt, task_pid_nr(current), \ - __FUNCTION__ , ##__VA_ARGS__); \ + __func__ , ##__VA_ARGS__); \ } while(0) /* @@ -115,6 +116,11 @@ #else #define dbg_readinode(fmt, ...) #endif +#ifdef JFFS2_DBG_READINODE2_MESSAGES +#define dbg_readinode2(fmt, ...) JFFS2_DEBUG(fmt, ##__VA_ARGS__) +#else +#define dbg_readinode2(fmt, ...) +#endif /* Fragtree build debugging messages */ #ifdef JFFS2_DBG_FRAGTREE_MESSAGES diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index f948f7e6ec8..c0c141f6fde 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -86,7 +86,7 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, dir_f = JFFS2_INODE_INFO(dir_i); c = JFFS2_SB_INFO(dir_i->i_sb); - down(&dir_f->sem); + mutex_lock(&dir_f->sem); /* NB: The 2.2 backport will need to explicitly check for '.' and '..' here */ for (fd_list = dir_f->dents; fd_list && fd_list->nhash <= target->d_name.hash; fd_list = fd_list->next) { @@ -99,7 +99,7 @@ static struct dentry *jffs2_lookup(struct inode *dir_i, struct dentry *target, } if (fd) ino = fd->ino; - up(&dir_f->sem); + mutex_unlock(&dir_f->sem); if (ino) { inode = jffs2_iget(dir_i->i_sb, ino); if (IS_ERR(inode)) { @@ -146,7 +146,7 @@ static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) } curofs=1; - down(&f->sem); + mutex_lock(&f->sem); for (fd = f->dents; fd; fd = fd->next) { curofs++; @@ -166,7 +166,7 @@ static int jffs2_readdir(struct file *filp, void *dirent, filldir_t filldir) break; offset++; } - up(&f->sem); + mutex_unlock(&f->sem); out: filp->f_pos = offset; return 0; @@ -208,6 +208,13 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode, f = JFFS2_INODE_INFO(inode); dir_f = JFFS2_INODE_INFO(dir_i); + /* jffs2_do_create() will want to lock it, _after_ reserving + space and taking c-alloc_sem. If we keep it locked here, + lockdep gets unhappy (although it's a false positive; + nothing else will be looking at this inode yet so there's + no chance of AB-BA deadlock involving its f->sem). */ + mutex_unlock(&f->sem); + ret = jffs2_do_create(c, dir_f, f, ri, dentry->d_name.name, dentry->d_name.len); if (ret) @@ -219,7 +226,8 @@ static int jffs2_create(struct inode *dir_i, struct dentry *dentry, int mode, d_instantiate(dentry, inode); D1(printk(KERN_DEBUG "jffs2_create: Created ino #%lu with mode %o, nlink %d(%d). nrpages %ld\n", - inode->i_ino, inode->i_mode, inode->i_nlink, f->inocache->nlink, inode->i_mapping->nrpages)); + inode->i_ino, inode->i_mode, inode->i_nlink, + f->inocache->pino_nlink, inode->i_mapping->nrpages)); return 0; fail: @@ -243,7 +251,7 @@ static int jffs2_unlink(struct inode *dir_i, struct dentry *dentry) ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, dentry->d_name.len, dead_f, now); if (dead_f->inocache) - dentry->d_inode->i_nlink = dead_f->inocache->nlink; + dentry->d_inode->i_nlink = dead_f->inocache->pino_nlink; if (!ret) dir_i->i_mtime = dir_i->i_ctime = ITIME(now); return ret; @@ -275,9 +283,9 @@ static int jffs2_link (struct dentry *old_dentry, struct inode *dir_i, struct de ret = jffs2_do_link(c, dir_f, f->inocache->ino, type, dentry->d_name.name, dentry->d_name.len, now); if (!ret) { - down(&f->sem); - old_dentry->d_inode->i_nlink = ++f->inocache->nlink; - up(&f->sem); + mutex_lock(&f->sem); + old_dentry->d_inode->i_nlink = ++f->inocache->pino_nlink; + mutex_unlock(&f->sem); d_instantiate(dentry, old_dentry->d_inode); dir_i->i_mtime = dir_i->i_ctime = ITIME(now); atomic_inc(&old_dentry->d_inode->i_count); @@ -351,7 +359,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char if (IS_ERR(fn)) { /* Eeek. Wave bye bye */ - up(&f->sem); + mutex_unlock(&f->sem); jffs2_complete_reservation(c); jffs2_clear_inode(inode); return PTR_ERR(fn); @@ -361,7 +369,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char f->target = kmalloc(targetlen + 1, GFP_KERNEL); if (!f->target) { printk(KERN_WARNING "Can't allocate %d bytes of memory\n", targetlen + 1); - up(&f->sem); + mutex_unlock(&f->sem); jffs2_complete_reservation(c); jffs2_clear_inode(inode); return -ENOMEM; @@ -374,7 +382,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char obsoleted by the first data write */ f->metadata = fn; - up(&f->sem); + mutex_unlock(&f->sem); jffs2_complete_reservation(c); @@ -406,7 +414,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char } dir_f = JFFS2_INODE_INFO(dir_i); - down(&dir_f->sem); + mutex_lock(&dir_f->sem); rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); @@ -429,7 +437,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char as if it were the final unlink() */ jffs2_complete_reservation(c); jffs2_free_raw_dirent(rd); - up(&dir_f->sem); + mutex_unlock(&dir_f->sem); jffs2_clear_inode(inode); return PTR_ERR(fd); } @@ -442,7 +450,7 @@ static int jffs2_symlink (struct inode *dir_i, struct dentry *dentry, const char one if necessary. */ jffs2_add_fd_to_list(c, fd, &dir_f->dents); - up(&dir_f->sem); + mutex_unlock(&dir_f->sem); jffs2_complete_reservation(c); d_instantiate(dentry, inode); @@ -493,11 +501,14 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) inode->i_op = &jffs2_dir_inode_operations; inode->i_fop = &jffs2_dir_operations; - /* Directories get nlink 2 at start */ - inode->i_nlink = 2; f = JFFS2_INODE_INFO(inode); + /* Directories get nlink 2 at start */ + inode->i_nlink = 2; + /* but ic->pino_nlink is the parent ino# */ + f->inocache->pino_nlink = dir_i->i_ino; + ri->data_crc = cpu_to_je32(0); ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); @@ -507,7 +518,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) if (IS_ERR(fn)) { /* Eeek. Wave bye bye */ - up(&f->sem); + mutex_unlock(&f->sem); jffs2_complete_reservation(c); jffs2_clear_inode(inode); return PTR_ERR(fn); @@ -516,7 +527,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) obsoleted by the first data write */ f->metadata = fn; - up(&f->sem); + mutex_unlock(&f->sem); jffs2_complete_reservation(c); @@ -548,7 +559,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) } dir_f = JFFS2_INODE_INFO(dir_i); - down(&dir_f->sem); + mutex_lock(&dir_f->sem); rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); @@ -571,7 +582,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) as if it were the final unlink() */ jffs2_complete_reservation(c); jffs2_free_raw_dirent(rd); - up(&dir_f->sem); + mutex_unlock(&dir_f->sem); jffs2_clear_inode(inode); return PTR_ERR(fd); } @@ -585,7 +596,7 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) one if necessary. */ jffs2_add_fd_to_list(c, fd, &dir_f->dents); - up(&dir_f->sem); + mutex_unlock(&dir_f->sem); jffs2_complete_reservation(c); d_instantiate(dentry, inode); @@ -594,17 +605,25 @@ static int jffs2_mkdir (struct inode *dir_i, struct dentry *dentry, int mode) static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) { + struct jffs2_sb_info *c = JFFS2_SB_INFO(dir_i->i_sb); + struct jffs2_inode_info *dir_f = JFFS2_INODE_INFO(dir_i); struct jffs2_inode_info *f = JFFS2_INODE_INFO(dentry->d_inode); struct jffs2_full_dirent *fd; int ret; + uint32_t now = get_seconds(); for (fd = f->dents ; fd; fd = fd->next) { if (fd->ino) return -ENOTEMPTY; } - ret = jffs2_unlink(dir_i, dentry); - if (!ret) + + ret = jffs2_do_unlink(c, dir_f, dentry->d_name.name, + dentry->d_name.len, f, now); + if (!ret) { + dir_i->i_mtime = dir_i->i_ctime = ITIME(now); + clear_nlink(dentry->d_inode); drop_nlink(dir_i); + } return ret; } @@ -673,7 +692,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de if (IS_ERR(fn)) { /* Eeek. Wave bye bye */ - up(&f->sem); + mutex_unlock(&f->sem); jffs2_complete_reservation(c); jffs2_clear_inode(inode); return PTR_ERR(fn); @@ -682,7 +701,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de obsoleted by the first data write */ f->metadata = fn; - up(&f->sem); + mutex_unlock(&f->sem); jffs2_complete_reservation(c); @@ -714,7 +733,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de } dir_f = JFFS2_INODE_INFO(dir_i); - down(&dir_f->sem); + mutex_lock(&dir_f->sem); rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); @@ -740,7 +759,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de as if it were the final unlink() */ jffs2_complete_reservation(c); jffs2_free_raw_dirent(rd); - up(&dir_f->sem); + mutex_unlock(&dir_f->sem); jffs2_clear_inode(inode); return PTR_ERR(fd); } @@ -753,7 +772,7 @@ static int jffs2_mknod (struct inode *dir_i, struct dentry *dentry, int mode, de one if necessary. */ jffs2_add_fd_to_list(c, fd, &dir_f->dents); - up(&dir_f->sem); + mutex_unlock(&dir_f->sem); jffs2_complete_reservation(c); d_instantiate(dentry, inode); @@ -780,14 +799,14 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, if (S_ISDIR(new_dentry->d_inode->i_mode)) { struct jffs2_full_dirent *fd; - down(&victim_f->sem); + mutex_lock(&victim_f->sem); for (fd = victim_f->dents; fd; fd = fd->next) { if (fd->ino) { - up(&victim_f->sem); + mutex_unlock(&victim_f->sem); return -ENOTEMPTY; } } - up(&victim_f->sem); + mutex_unlock(&victim_f->sem); } } @@ -816,9 +835,12 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, /* Don't oops if the victim was a dirent pointing to an inode which didn't exist. */ if (victim_f->inocache) { - down(&victim_f->sem); - victim_f->inocache->nlink--; - up(&victim_f->sem); + mutex_lock(&victim_f->sem); + if (S_ISDIR(new_dentry->d_inode->i_mode)) + victim_f->inocache->pino_nlink = 0; + else + victim_f->inocache->pino_nlink--; + mutex_unlock(&victim_f->sem); } } @@ -836,11 +858,11 @@ static int jffs2_rename (struct inode *old_dir_i, struct dentry *old_dentry, if (ret) { /* Oh shit. We really ought to make a single node which can do both atomically */ struct jffs2_inode_info *f = JFFS2_INODE_INFO(old_dentry->d_inode); - down(&f->sem); + mutex_lock(&f->sem); inc_nlink(old_dentry->d_inode); - if (f->inocache) - f->inocache->nlink++; - up(&f->sem); + if (f->inocache && !S_ISDIR(old_dentry->d_inode->i_mode)) + f->inocache->pino_nlink++; + mutex_unlock(&f->sem); printk(KERN_NOTICE "jffs2_rename(): Link succeeded, unlink failed (err %d). You now have a hard link\n", ret); /* Might as well let the VFS know */ diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index a1db9180633..dddb2a6c9e2 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -50,14 +50,14 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, instr = kmalloc(sizeof(struct erase_info) + sizeof(struct erase_priv_struct), GFP_KERNEL); if (!instr) { printk(KERN_WARNING "kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n"); - down(&c->erase_free_sem); + mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); list_move(&jeb->list, &c->erase_pending_list); c->erasing_size -= c->sector_size; c->dirty_size += c->sector_size; jeb->dirty_size = c->sector_size; spin_unlock(&c->erase_completion_lock); - up(&c->erase_free_sem); + mutex_unlock(&c->erase_free_sem); return; } @@ -84,14 +84,14 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, if (ret == -ENOMEM || ret == -EAGAIN) { /* Erase failed immediately. Refile it on the list */ D1(printk(KERN_DEBUG "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", jeb->offset, ret)); - down(&c->erase_free_sem); + mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); list_move(&jeb->list, &c->erase_pending_list); c->erasing_size -= c->sector_size; c->dirty_size += c->sector_size; jeb->dirty_size = c->sector_size; spin_unlock(&c->erase_completion_lock); - up(&c->erase_free_sem); + mutex_unlock(&c->erase_free_sem); return; } @@ -107,7 +107,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) { struct jffs2_eraseblock *jeb; - down(&c->erase_free_sem); + mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); @@ -116,9 +116,9 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) if (!list_empty(&c->erase_complete_list)) { jeb = list_entry(c->erase_complete_list.next, struct jffs2_eraseblock, list); - list_del(&jeb->list); + list_move(&jeb->list, &c->erase_checking_list); spin_unlock(&c->erase_completion_lock); - up(&c->erase_free_sem); + mutex_unlock(&c->erase_free_sem); jffs2_mark_erased_block(c, jeb); if (!--count) { @@ -139,7 +139,7 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) jffs2_free_jeb_node_refs(c, jeb); list_add(&jeb->list, &c->erasing_list); spin_unlock(&c->erase_completion_lock); - up(&c->erase_free_sem); + mutex_unlock(&c->erase_free_sem); jffs2_erase_block(c, jeb); @@ -149,12 +149,12 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) /* Be nice */ yield(); - down(&c->erase_free_sem); + mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); } spin_unlock(&c->erase_completion_lock); - up(&c->erase_free_sem); + mutex_unlock(&c->erase_free_sem); done: D1(printk(KERN_DEBUG "jffs2_erase_pending_blocks completed\n")); } @@ -162,11 +162,11 @@ void jffs2_erase_pending_blocks(struct jffs2_sb_info *c, int count) static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) { D1(printk(KERN_DEBUG "Erase completed successfully at 0x%08x\n", jeb->offset)); - down(&c->erase_free_sem); + mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); list_move_tail(&jeb->list, &c->erase_complete_list); spin_unlock(&c->erase_completion_lock); - up(&c->erase_free_sem); + mutex_unlock(&c->erase_free_sem); /* Ensure that kupdated calls us again to mark them clean */ jffs2_erase_pending_trigger(c); } @@ -180,26 +180,26 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock failed too many times. */ if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { /* We'd like to give this block another try. */ - down(&c->erase_free_sem); + mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); list_move(&jeb->list, &c->erase_pending_list); c->erasing_size -= c->sector_size; c->dirty_size += c->sector_size; jeb->dirty_size = c->sector_size; spin_unlock(&c->erase_completion_lock); - up(&c->erase_free_sem); + mutex_unlock(&c->erase_free_sem); return; } } - down(&c->erase_free_sem); + mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); c->erasing_size -= c->sector_size; c->bad_size += c->sector_size; list_move(&jeb->list, &c->bad_list); c->nr_erasing_blocks--; spin_unlock(&c->erase_completion_lock); - up(&c->erase_free_sem); + mutex_unlock(&c->erase_free_sem); wake_up(&c->erase_wait); } @@ -294,7 +294,7 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c, break; #endif default: - if (ic->nodes == (void *)ic && ic->nlink == 0) + if (ic->nodes == (void *)ic && ic->pino_nlink == 0) jffs2_del_ino_cache(c, ic); } } @@ -332,7 +332,8 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl if (c->mtd->point) { unsigned long *wordebuf; - ret = c->mtd->point(c->mtd, jeb->offset, c->sector_size, &retlen, (unsigned char **)&ebuf); + ret = c->mtd->point(c->mtd, jeb->offset, c->sector_size, + &retlen, &ebuf, NULL); if (ret) { D1(printk(KERN_DEBUG "MTD point failed %d\n", ret)); goto do_flash_read; @@ -340,7 +341,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl if (retlen < c->sector_size) { /* Don't muck about if it won't let us point to the whole erase sector */ D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", retlen)); - c->mtd->unpoint(c->mtd, ebuf, jeb->offset, retlen); + c->mtd->unpoint(c->mtd, jeb->offset, retlen); goto do_flash_read; } wordebuf = ebuf-sizeof(*wordebuf); @@ -349,10 +350,12 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl if (*++wordebuf != ~0) break; } while(--retlen); - c->mtd->unpoint(c->mtd, ebuf, jeb->offset, c->sector_size); - if (retlen) + c->mtd->unpoint(c->mtd, jeb->offset, c->sector_size); + if (retlen) { printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08tx\n", *wordebuf, jeb->offset + c->sector_size-retlen*sizeof(*wordebuf)); + return -EIO; + } return 0; } do_flash_read: @@ -373,10 +376,12 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl ret = c->mtd->read(c->mtd, ofs, readlen, &retlen, ebuf); if (ret) { printk(KERN_WARNING "Read of newly-erased block at 0x%08x failed: %d. Putting on bad_list\n", ofs, ret); + ret = -EIO; goto fail; } if (retlen != readlen) { printk(KERN_WARNING "Short read from newly-erased block at 0x%08x. Wanted %d, got %zd\n", ofs, readlen, retlen); + ret = -EIO; goto fail; } for (i=0; i<readlen; i += sizeof(unsigned long)) { @@ -385,6 +390,7 @@ static int jffs2_block_check_erase(struct jffs2_sb_info *c, struct jffs2_erasebl if (*datum + 1) { *bad_offset += i; printk(KERN_WARNING "Newly-erased block contained word 0x%lx at offset 0x%08x\n", *datum, *bad_offset); + ret = -EIO; goto fail; } } @@ -419,9 +425,6 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb if (jffs2_write_nand_cleanmarker(c, jeb)) goto filebad; } - - /* Everything else got zeroed before the erase */ - jeb->free_size = c->sector_size; } else { struct kvec vecs[1]; @@ -449,48 +452,50 @@ static void jffs2_mark_erased_block(struct jffs2_sb_info *c, struct jffs2_eraseb goto filebad; } - - /* Everything else got zeroed before the erase */ - jeb->free_size = c->sector_size; - /* FIXME Special case for cleanmarker in empty block */ - jffs2_link_node_ref(c, jeb, jeb->offset | REF_NORMAL, c->cleanmarker_size, NULL); } + /* Everything else got zeroed before the erase */ + jeb->free_size = c->sector_size; - down(&c->erase_free_sem); + mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); + c->erasing_size -= c->sector_size; - c->free_size += jeb->free_size; - c->used_size += jeb->used_size; + c->free_size += c->sector_size; - jffs2_dbg_acct_sanity_check_nolock(c,jeb); - jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + /* Account for cleanmarker now, if it's in-band */ + if (c->cleanmarker_size && !jffs2_cleanmarker_oob(c)) + jffs2_link_node_ref(c, jeb, jeb->offset | REF_NORMAL, c->cleanmarker_size, NULL); - list_add_tail(&jeb->list, &c->free_list); + list_move_tail(&jeb->list, &c->free_list); c->nr_erasing_blocks--; c->nr_free_blocks++; + + jffs2_dbg_acct_sanity_check_nolock(c, jeb); + jffs2_dbg_acct_paranoia_check_nolock(c, jeb); + spin_unlock(&c->erase_completion_lock); - up(&c->erase_free_sem); + mutex_unlock(&c->erase_free_sem); wake_up(&c->erase_wait); return; filebad: - down(&c->erase_free_sem); + mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); /* Stick it on a list (any list) so erase_failed can take it right off again. Silly, but shouldn't happen often. */ - list_add(&jeb->list, &c->erasing_list); + list_move(&jeb->list, &c->erasing_list); spin_unlock(&c->erase_completion_lock); - up(&c->erase_free_sem); + mutex_unlock(&c->erase_free_sem); jffs2_erase_failed(c, jeb, bad_offset); return; refile: /* Stick it back on the list from whence it came and come back later */ jffs2_erase_pending_trigger(c); - down(&c->erase_free_sem); + mutex_lock(&c->erase_free_sem); spin_lock(&c->erase_completion_lock); - list_add(&jeb->list, &c->erase_complete_list); + list_move(&jeb->list, &c->erase_complete_list); spin_unlock(&c->erase_completion_lock); - up(&c->erase_free_sem); + mutex_unlock(&c->erase_free_sem); return; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index dcc2734e0b5..5e920343b2c 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -115,9 +115,9 @@ static int jffs2_readpage (struct file *filp, struct page *pg) struct jffs2_inode_info *f = JFFS2_INODE_INFO(pg->mapping->host); int ret; - down(&f->sem); + mutex_lock(&f->sem); ret = jffs2_do_readpage_unlock(pg->mapping->host, pg); - up(&f->sem); + mutex_unlock(&f->sem); return ret; } @@ -154,7 +154,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, if (ret) goto out_page; - down(&f->sem); + mutex_lock(&f->sem); memset(&ri, 0, sizeof(ri)); ri.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); @@ -181,7 +181,7 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, if (IS_ERR(fn)) { ret = PTR_ERR(fn); jffs2_complete_reservation(c); - up(&f->sem); + mutex_unlock(&f->sem); goto out_page; } ret = jffs2_add_full_dnode_to_inode(c, f, fn); @@ -195,12 +195,12 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, jffs2_mark_node_obsolete(c, fn->raw); jffs2_free_full_dnode(fn); jffs2_complete_reservation(c); - up(&f->sem); + mutex_unlock(&f->sem); goto out_page; } jffs2_complete_reservation(c); inode->i_size = pageofs; - up(&f->sem); + mutex_unlock(&f->sem); } /* @@ -209,9 +209,9 @@ static int jffs2_write_begin(struct file *filp, struct address_space *mapping, * case of a short-copy. */ if (!PageUptodate(pg)) { - down(&f->sem); + mutex_lock(&f->sem); ret = jffs2_do_readpage_nolock(inode, pg); - up(&f->sem); + mutex_unlock(&f->sem); if (ret) goto out_page; } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index e26ea78c789..086c4383022 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -36,6 +36,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) unsigned int ivalid; uint32_t alloclen; int ret; + int alloc_type = ALLOC_NORMAL; D1(printk(KERN_DEBUG "jffs2_setattr(): ino #%lu\n", inode->i_ino)); @@ -50,20 +51,20 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) mdata = (char *)&dev; D1(printk(KERN_DEBUG "jffs2_setattr(): Writing %d bytes of kdev_t\n", mdatalen)); } else if (S_ISLNK(inode->i_mode)) { - down(&f->sem); + mutex_lock(&f->sem); mdatalen = f->metadata->size; mdata = kmalloc(f->metadata->size, GFP_USER); if (!mdata) { - up(&f->sem); + mutex_unlock(&f->sem); return -ENOMEM; } ret = jffs2_read_dnode(c, f, f->metadata, mdata, 0, mdatalen); if (ret) { - up(&f->sem); + mutex_unlock(&f->sem); kfree(mdata); return ret; } - up(&f->sem); + mutex_unlock(&f->sem); D1(printk(KERN_DEBUG "jffs2_setattr(): Writing %d bytes of symlink target\n", mdatalen)); } @@ -82,7 +83,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) kfree(mdata); return ret; } - down(&f->sem); + mutex_lock(&f->sem); ivalid = iattr->ia_valid; ri->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); @@ -115,6 +116,10 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) ri->compr = JFFS2_COMPR_ZERO; ri->dsize = cpu_to_je32(iattr->ia_size - inode->i_size); ri->offset = cpu_to_je32(inode->i_size); + } else if (ivalid & ATTR_SIZE && !iattr->ia_size) { + /* For truncate-to-zero, treat it as deletion because + it'll always be obsoleting all previous nodes */ + alloc_type = ALLOC_DELETION; } ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); if (mdatalen) @@ -122,14 +127,14 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) else ri->data_crc = cpu_to_je32(0); - new_metadata = jffs2_write_dnode(c, f, ri, mdata, mdatalen, ALLOC_NORMAL); + new_metadata = jffs2_write_dnode(c, f, ri, mdata, mdatalen, alloc_type); if (S_ISLNK(inode->i_mode)) kfree(mdata); if (IS_ERR(new_metadata)) { jffs2_complete_reservation(c); jffs2_free_raw_inode(ri); - up(&f->sem); + mutex_unlock(&f->sem); return PTR_ERR(new_metadata); } /* It worked. Update the inode */ @@ -149,6 +154,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) if (ivalid & ATTR_SIZE && inode->i_size < iattr->ia_size) { jffs2_add_full_dnode_to_inode(c, f, new_metadata); inode->i_size = iattr->ia_size; + inode->i_blocks = (inode->i_size + 511) >> 9; f->metadata = NULL; } else { f->metadata = new_metadata; @@ -159,7 +165,7 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) } jffs2_free_raw_inode(ri); - up(&f->sem); + mutex_unlock(&f->sem); jffs2_complete_reservation(c); /* We have to do the vmtruncate() without f->sem held, since @@ -167,8 +173,10 @@ int jffs2_do_setattr (struct inode *inode, struct iattr *iattr) We are protected from a simultaneous write() extending i_size back past iattr->ia_size, because do_truncate() holds the generic inode semaphore. */ - if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) - vmtruncate(inode, iattr->ia_size); + if (ivalid & ATTR_SIZE && inode->i_size > iattr->ia_size) { + vmtruncate(inode, iattr->ia_size); + inode->i_blocks = (inode->i_size + 511) >> 9; + } return 0; } @@ -248,12 +256,12 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) c = JFFS2_SB_INFO(inode->i_sb); jffs2_init_inode_info(f); - down(&f->sem); + mutex_lock(&f->sem); ret = jffs2_do_read_inode(c, f, inode->i_ino, &latest_node); if (ret) { - up(&f->sem); + mutex_unlock(&f->sem); iget_failed(inode); return ERR_PTR(ret); } @@ -265,7 +273,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) inode->i_mtime = ITIME(je32_to_cpu(latest_node.mtime)); inode->i_ctime = ITIME(je32_to_cpu(latest_node.ctime)); - inode->i_nlink = f->inocache->nlink; + inode->i_nlink = f->inocache->pino_nlink; inode->i_blocks = (inode->i_size + 511) >> 9; @@ -278,13 +286,12 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) case S_IFDIR: { struct jffs2_full_dirent *fd; + inode->i_nlink = 2; /* parent and '.' */ for (fd=f->dents; fd; fd = fd->next) { if (fd->type == DT_DIR && fd->ino) inc_nlink(inode); } - /* and '..' */ - inc_nlink(inode); /* Root dir gets i_nlink 3 for some reason */ if (inode->i_ino == 1) inc_nlink(inode); @@ -330,7 +337,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) printk(KERN_WARNING "jffs2_read_inode(): Bogus imode %o for ino %lu\n", inode->i_mode, (unsigned long)inode->i_ino); } - up(&f->sem); + mutex_unlock(&f->sem); D1(printk(KERN_DEBUG "jffs2_read_inode() returning\n")); unlock_new_inode(inode); @@ -339,7 +346,7 @@ struct inode *jffs2_iget(struct super_block *sb, unsigned long ino) error_io: ret = -EIO; error: - up(&f->sem); + mutex_unlock(&f->sem); jffs2_do_clear_inode(c, f); iget_failed(inode); return ERR_PTR(ret); @@ -380,9 +387,9 @@ int jffs2_remount_fs (struct super_block *sb, int *flags, char *data) Flush the writebuffer, if neccecary, else we loose it */ if (!(sb->s_flags & MS_RDONLY)) { jffs2_stop_garbage_collect_thread(c); - down(&c->alloc_sem); + mutex_lock(&c->alloc_sem); jffs2_flush_wbuf_pad(c); - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); } if (!(*flags & MS_RDONLY)) @@ -429,7 +436,7 @@ struct inode *jffs2_new_inode (struct inode *dir_i, int mode, struct jffs2_raw_i f = JFFS2_INODE_INFO(inode); jffs2_init_inode_info(f); - down(&f->sem); + mutex_lock(&f->sem); memset(ri, 0, sizeof(*ri)); /* Set OS-specific defaults for new inodes */ @@ -578,11 +585,12 @@ void jffs2_gc_release_inode(struct jffs2_sb_info *c, } struct jffs2_inode_info *jffs2_gc_fetch_inode(struct jffs2_sb_info *c, - int inum, int nlink) + int inum, int unlinked) { struct inode *inode; struct jffs2_inode_cache *ic; - if (!nlink) { + + if (unlinked) { /* The inode has zero nlink but its nodes weren't yet marked obsolete. This has to be because we're still waiting for the final (close() and) iput() to happen. @@ -630,8 +638,8 @@ struct jffs2_inode_info *jffs2_gc_fetch_inode(struct jffs2_sb_info *c, return ERR_CAST(inode); } if (is_bad_inode(inode)) { - printk(KERN_NOTICE "Eep. read_inode() failed for ino #%u. nlink %d\n", - inum, nlink); + printk(KERN_NOTICE "Eep. read_inode() failed for ino #%u. unlinked %d\n", + inum, unlinked); /* NB. This will happen again. We need to do something appropriate here. */ iput(inode); return ERR_PTR(-EIO); diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 32ff0373aa0..090c556ffed 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -126,7 +126,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) int ret = 0, inum, nlink; int xattr = 0; - if (down_interruptible(&c->alloc_sem)) + if (mutex_lock_interruptible(&c->alloc_sem)) return -EINTR; for (;;) { @@ -143,7 +143,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) c->unchecked_size); jffs2_dbg_dump_block_lists_nolock(c); spin_unlock(&c->erase_completion_lock); - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); return -ENOSPC; } @@ -161,8 +161,8 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) continue; } - if (!ic->nlink) { - D1(printk(KERN_DEBUG "Skipping check of ino #%d with nlink zero\n", + if (!ic->pino_nlink) { + D1(printk(KERN_DEBUG "Skipping check of ino #%d with nlink/pino zero\n", ic->ino)); spin_unlock(&c->inocache_lock); jffs2_xattr_delete_inode(c, ic); @@ -190,7 +190,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) made no progress in this case, but that should be OK */ c->checked_ino--; - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); return 0; @@ -210,7 +210,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) printk(KERN_WARNING "Returned error for crccheck of ino #%u. Expect badness...\n", ic->ino); jffs2_set_inocache_state(c, ic, INO_STATE_CHECKEDABSENT); - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); return ret; } @@ -221,9 +221,15 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) jeb = jffs2_find_gc_block(c); if (!jeb) { - D1 (printk(KERN_NOTICE "jffs2: Couldn't find erase block to garbage collect!\n")); + /* Couldn't find a free block. But maybe we can just erase one and make 'progress'? */ + if (!list_empty(&c->erase_pending_list)) { + spin_unlock(&c->erase_completion_lock); + mutex_unlock(&c->alloc_sem); + return -EAGAIN; + } + D1(printk(KERN_NOTICE "jffs2: Couldn't find erase block to garbage collect!\n")); spin_unlock(&c->erase_completion_lock); - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); return -EIO; } @@ -232,7 +238,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) printk(KERN_DEBUG "Nextblock at %08x, used_size %08x, dirty_size %08x, wasted_size %08x, free_size %08x\n", c->nextblock->offset, c->nextblock->used_size, c->nextblock->dirty_size, c->nextblock->wasted_size, c->nextblock->free_size)); if (!jeb->used_size) { - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); goto eraseit; } @@ -248,7 +254,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) jeb->offset, jeb->free_size, jeb->dirty_size, jeb->used_size); jeb->gc_node = raw; spin_unlock(&c->erase_completion_lock); - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); BUG(); } } @@ -266,7 +272,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) /* Just mark it obsolete */ jffs2_mark_node_obsolete(c, raw); } - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); goto eraseit_lock; } @@ -334,7 +340,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) */ printk(KERN_CRIT "Inode #%u already in state %d in jffs2_garbage_collect_pass()!\n", ic->ino, ic->state); - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); spin_unlock(&c->inocache_lock); BUG(); @@ -345,7 +351,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) the alloc_sem() (for marking nodes invalid) so we must drop the alloc_sem before sleeping. */ - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); D1(printk(KERN_DEBUG "jffs2_garbage_collect_pass() waiting for ino #%u in state %d\n", ic->ino, ic->state)); sleep_on_spinunlock(&c->inocache_wq, &c->inocache_lock); @@ -392,10 +398,10 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) it's vaguely possible. */ inum = ic->ino; - nlink = ic->nlink; + nlink = ic->pino_nlink; spin_unlock(&c->inocache_lock); - f = jffs2_gc_fetch_inode(c, inum, nlink); + f = jffs2_gc_fetch_inode(c, inum, !nlink); if (IS_ERR(f)) { ret = PTR_ERR(f); goto release_sem; @@ -416,7 +422,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) ret = -ENOSPC; } release_sem: - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); eraseit_lock: /* If we've finished this block, start it erasing */ @@ -445,7 +451,7 @@ static int jffs2_garbage_collect_live(struct jffs2_sb_info *c, struct jffs2_era uint32_t start = 0, end = 0, nrfrags = 0; int ret = 0; - down(&f->sem); + mutex_lock(&f->sem); /* Now we have the lock for this inode. Check that it's still the one at the head of the list. */ @@ -525,7 +531,7 @@ static int jffs2_garbage_collect_live(struct jffs2_sb_info *c, struct jffs2_era } } upnout: - up(&f->sem); + mutex_unlock(&f->sem); return ret; } @@ -846,7 +852,7 @@ static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct /* Prevent the erase code from nicking the obsolete node refs while we're looking at them. I really don't like this extra lock but can't see any alternative. Suggestions on a postcard to... */ - down(&c->erase_free_sem); + mutex_lock(&c->erase_free_sem); for (raw = f->inocache->nodes; raw != (void *)f->inocache; raw = raw->next_in_ino) { @@ -899,7 +905,7 @@ static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct /* OK. The name really does match. There really is still an older node on the flash which our deletion dirent obsoletes. So we have to write out a new deletion dirent to replace it */ - up(&c->erase_free_sem); + mutex_unlock(&c->erase_free_sem); D1(printk(KERN_DEBUG "Deletion dirent at %08x still obsoletes real dirent \"%s\" at %08x for ino #%u\n", ref_offset(fd->raw), fd->name, ref_offset(raw), je32_to_cpu(rd->ino))); @@ -908,7 +914,7 @@ static int jffs2_garbage_collect_deletion_dirent(struct jffs2_sb_info *c, struct return jffs2_garbage_collect_dirent(c, jeb, f, fd); } - up(&c->erase_free_sem); + mutex_unlock(&c->erase_free_sem); kfree(rd); } @@ -1081,7 +1087,7 @@ static int jffs2_garbage_collect_hole(struct jffs2_sb_info *c, struct jffs2_eras return 0; } -static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb, +static int jffs2_garbage_collect_dnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *orig_jeb, struct jffs2_inode_info *f, struct jffs2_full_dnode *fn, uint32_t start, uint32_t end) { diff --git a/fs/jffs2/ioctl.c b/fs/jffs2/ioctl.c index f4d525b0ea5..e2177210f62 100644 --- a/fs/jffs2/ioctl.c +++ b/fs/jffs2/ioctl.c @@ -10,6 +10,7 @@ */ #include <linux/fs.h> +#include "nodelist.h" int jffs2_ioctl(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) diff --git a/fs/jffs2/jffs2_fs_i.h b/fs/jffs2/jffs2_fs_i.h index a841f4973a7..31559f45fdd 100644 --- a/fs/jffs2/jffs2_fs_i.h +++ b/fs/jffs2/jffs2_fs_i.h @@ -15,7 +15,7 @@ #include <linux/version.h> #include <linux/rbtree.h> #include <linux/posix_acl.h> -#include <linux/semaphore.h> +#include <linux/mutex.h> struct jffs2_inode_info { /* We need an internal mutex similar to inode->i_mutex. @@ -24,7 +24,7 @@ struct jffs2_inode_info { before letting GC proceed. Or we'd have to put ugliness into the GC code so it didn't attempt to obtain the i_mutex for the inode(s) which are already locked */ - struct semaphore sem; + struct mutex sem; /* The highest (datanode) version number used for this ino */ uint32_t highest_version; diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index 18fca2b9e53..85ef6dbb1be 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -16,7 +16,7 @@ #include <linux/spinlock.h> #include <linux/workqueue.h> #include <linux/completion.h> -#include <linux/semaphore.h> +#include <linux/mutex.h> #include <linux/timer.h> #include <linux/wait.h> #include <linux/list.h> @@ -44,7 +44,7 @@ struct jffs2_sb_info { struct completion gc_thread_start; /* GC thread start completion */ struct completion gc_thread_exit; /* GC thread exit completion port */ - struct semaphore alloc_sem; /* Used to protect all the following + struct mutex alloc_sem; /* Used to protect all the following fields, and also to protect against out-of-order writing of nodes. And GC. */ uint32_t cleanmarker_size; /* Size of an _inline_ CLEANMARKER @@ -87,6 +87,7 @@ struct jffs2_sb_info { struct list_head erasable_list; /* Blocks which are completely dirty, and need erasing */ struct list_head erasable_pending_wbuf_list; /* Blocks which need erasing but only after the current wbuf is flushed */ struct list_head erasing_list; /* Blocks which are currently erasing */ + struct list_head erase_checking_list; /* Blocks which are being checked and marked */ struct list_head erase_pending_list; /* Blocks which need erasing now */ struct list_head erase_complete_list; /* Blocks which are erased and need the clean marker written to them */ struct list_head free_list; /* Blocks which are free and ready to be used */ @@ -104,7 +105,7 @@ struct jffs2_sb_info { /* Sem to allow jffs2_garbage_collect_deletion_dirent to drop the erase_completion_lock while it's holding a pointer to an obsoleted node. I don't like this. Alternatives welcomed. */ - struct semaphore erase_free_sem; + struct mutex erase_free_sem; uint32_t wbuf_pagesize; /* 0 for NOR and other flashes with no wbuf */ diff --git a/fs/jffs2/nodelist.h b/fs/jffs2/nodelist.h index ec1aae9e695..1750445556c 100644 --- a/fs/jffs2/nodelist.h +++ b/fs/jffs2/nodelist.h @@ -87,7 +87,7 @@ struct jffs2_raw_node_ref xattr_ref or xattr_datum instead. The common part of those structures has NULL in the first word. See jffs2_raw_ref_to_ic() below */ uint32_t flash_offset; -#define TEST_TOTLEN +#undef TEST_TOTLEN #ifdef TEST_TOTLEN uint32_t __totlen; /* This may die; use ref_totlen(c, jeb, ) below */ #endif @@ -177,7 +177,10 @@ struct jffs2_inode_cache { #ifdef CONFIG_JFFS2_FS_XATTR struct jffs2_xattr_ref *xref; #endif - int nlink; + uint32_t pino_nlink; /* Directories store parent inode + here; other inodes store nlink. + Zero always means that it's + completely unlinked. */ }; /* Inode states for 'state' above. We need the 'GC' state to prevent diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index a0313fa8748..a9bf9603c1b 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -48,7 +48,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, minsize = PAD(minsize); D1(printk(KERN_DEBUG "jffs2_reserve_space(): Requested 0x%x bytes\n", minsize)); - down(&c->alloc_sem); + mutex_lock(&c->alloc_sem); D1(printk(KERN_DEBUG "jffs2_reserve_space(): alloc sem got\n")); @@ -57,7 +57,6 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, /* this needs a little more thought (true <tglx> :)) */ while(ret == -EAGAIN) { while(c->nr_free_blocks + c->nr_erasing_blocks < blocksneeded) { - int ret; uint32_t dirty, avail; /* calculate real dirty size @@ -82,7 +81,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, dirty, c->unchecked_size, c->sector_size)); spin_unlock(&c->erase_completion_lock); - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); return -ENOSPC; } @@ -105,11 +104,11 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, D1(printk(KERN_DEBUG "max. available size 0x%08x < blocksneeded * sector_size 0x%08x, returning -ENOSPC\n", avail, blocksneeded * c->sector_size)); spin_unlock(&c->erase_completion_lock); - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); return -ENOSPC; } - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); D1(printk(KERN_DEBUG "Triggering GC pass. nr_free_blocks %d, nr_erasing_blocks %d, free_size 0x%08x, dirty_size 0x%08x, wasted_size 0x%08x, used_size 0x%08x, erasing_size 0x%08x, bad_size 0x%08x (total 0x%08x of 0x%08x)\n", c->nr_free_blocks, c->nr_erasing_blocks, c->free_size, c->dirty_size, c->wasted_size, c->used_size, c->erasing_size, c->bad_size, @@ -117,7 +116,10 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, spin_unlock(&c->erase_completion_lock); ret = jffs2_garbage_collect_pass(c); - if (ret) + + if (ret == -EAGAIN) + jffs2_erase_pending_blocks(c, 1); + else if (ret) return ret; cond_resched(); @@ -125,7 +127,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, if (signal_pending(current)) return -EINTR; - down(&c->alloc_sem); + mutex_lock(&c->alloc_sem); spin_lock(&c->erase_completion_lock); } @@ -138,7 +140,7 @@ int jffs2_reserve_space(struct jffs2_sb_info *c, uint32_t minsize, if (!ret) ret = jffs2_prealloc_raw_node_refs(c, c->nextblock, 1); if (ret) - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); return ret; } @@ -463,7 +465,7 @@ void jffs2_complete_reservation(struct jffs2_sb_info *c) { D1(printk(KERN_DEBUG "jffs2_complete_reservation()\n")); jffs2_garbage_collect_trigger(c); - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); } static inline int on_list(struct list_head *obj, struct list_head *head) @@ -512,7 +514,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref any jffs2_raw_node_refs. So we don't need to stop erases from happening, or protect against people holding an obsolete jffs2_raw_node_ref without the erase_completion_lock. */ - down(&c->erase_free_sem); + mutex_lock(&c->erase_free_sem); } spin_lock(&c->erase_completion_lock); @@ -707,7 +709,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref break; #endif default: - if (ic->nodes == (void *)ic && ic->nlink == 0) + if (ic->nodes == (void *)ic && ic->pino_nlink == 0) jffs2_del_ino_cache(c, ic); break; } @@ -715,7 +717,7 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref } out_erase_sem: - up(&c->erase_free_sem); + mutex_unlock(&c->erase_free_sem); } int jffs2_thread_should_wake(struct jffs2_sb_info *c) diff --git a/fs/jffs2/os-linux.h b/fs/jffs2/os-linux.h index 1b10d259409..2cc866cf134 100644 --- a/fs/jffs2/os-linux.h +++ b/fs/jffs2/os-linux.h @@ -187,7 +187,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent); void jffs2_gc_release_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f); struct jffs2_inode_info *jffs2_gc_fetch_inode(struct jffs2_sb_info *c, - int inum, int nlink); + int inum, int unlinked); unsigned char *jffs2_gc_fetch_page(struct jffs2_sb_info *c, struct jffs2_inode_info *f, diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index e512a93d624..6ca08ad887c 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -63,10 +63,11 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info /* TODO: instead, incapsulate point() stuff to jffs2_flash_read(), * adding and jffs2_flash_read_end() interface. */ if (c->mtd->point) { - err = c->mtd->point(c->mtd, ofs, len, &retlen, &buffer); + err = c->mtd->point(c->mtd, ofs, len, &retlen, + (void **)&buffer, NULL); if (!err && retlen < len) { JFFS2_WARNING("MTD point returned len too short: %zu instead of %u.\n", retlen, tn->csize); - c->mtd->unpoint(c->mtd, buffer, ofs, retlen); + c->mtd->unpoint(c->mtd, ofs, retlen); } else if (err) JFFS2_WARNING("MTD point failed: error code %d.\n", err); else @@ -100,7 +101,7 @@ static int check_node_data(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info kfree(buffer); #ifndef __ECOS else - c->mtd->unpoint(c->mtd, buffer, ofs, len); + c->mtd->unpoint(c->mtd, ofs, len); #endif if (crc != tn->data_crc) { @@ -136,7 +137,7 @@ free_out: kfree(buffer); #ifndef __ECOS else - c->mtd->unpoint(c->mtd, buffer, ofs, len); + c->mtd->unpoint(c->mtd, ofs, len); #endif return err; } @@ -825,8 +826,9 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref else // normal case... tn->fn->size = je32_to_cpu(rd->dsize); - dbg_readinode("dnode @%08x: ver %u, offset %#04x, dsize %#04x, csize %#04x\n", - ref_offset(ref), je32_to_cpu(rd->version), je32_to_cpu(rd->offset), je32_to_cpu(rd->dsize), csize); + dbg_readinode2("dnode @%08x: ver %u, offset %#04x, dsize %#04x, csize %#04x\n", + ref_offset(ref), je32_to_cpu(rd->version), + je32_to_cpu(rd->offset), je32_to_cpu(rd->dsize), csize); ret = jffs2_add_tn_to_tree(c, rii, tn); @@ -836,13 +838,13 @@ static inline int read_dnode(struct jffs2_sb_info *c, struct jffs2_raw_node_ref jffs2_free_tmp_dnode_info(tn); return ret; } -#ifdef JFFS2_DBG_READINODE_MESSAGES - dbg_readinode("After adding ver %d:\n", je32_to_cpu(rd->version)); +#ifdef JFFS2_DBG_READINODE2_MESSAGES + dbg_readinode2("After adding ver %d:\n", je32_to_cpu(rd->version)); tn = tn_first(&rii->tn_root); while (tn) { - dbg_readinode("%p: v %d r 0x%x-0x%x ov %d\n", - tn, tn->version, tn->fn->ofs, - tn->fn->ofs+tn->fn->size, tn->overlapped); + dbg_readinode2("%p: v %d r 0x%x-0x%x ov %d\n", + tn, tn->version, tn->fn->ofs, + tn->fn->ofs+tn->fn->size, tn->overlapped); tn = tn_next(tn); } #endif @@ -1122,7 +1124,8 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, size_t retlen; int ret; - dbg_readinode("ino #%u nlink is %d\n", f->inocache->ino, f->inocache->nlink); + dbg_readinode("ino #%u pino/nlink is %d\n", f->inocache->ino, + f->inocache->pino_nlink); memset(&rii, 0, sizeof(rii)); @@ -1193,7 +1196,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, JFFS2_ERROR("failed to read from flash: error %d, %zd of %zd bytes read\n", ret, retlen, sizeof(*latest_node)); /* FIXME: If this fails, there seems to be a memory leak. Find it. */ - up(&f->sem); + mutex_unlock(&f->sem); jffs2_do_clear_inode(c, f); return ret?ret:-EIO; } @@ -1202,7 +1205,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, if (crc != je32_to_cpu(latest_node->node_crc)) { JFFS2_ERROR("CRC failed for read_inode of inode %u at physical location 0x%x\n", f->inocache->ino, ref_offset(rii.latest_ref)); - up(&f->sem); + mutex_unlock(&f->sem); jffs2_do_clear_inode(c, f); return -EIO; } @@ -1242,7 +1245,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, f->target = kmalloc(je32_to_cpu(latest_node->csize) + 1, GFP_KERNEL); if (!f->target) { JFFS2_ERROR("can't allocate %d bytes of memory for the symlink target path cache\n", je32_to_cpu(latest_node->csize)); - up(&f->sem); + mutex_unlock(&f->sem); jffs2_do_clear_inode(c, f); return -ENOMEM; } @@ -1255,7 +1258,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, ret = -EIO; kfree(f->target); f->target = NULL; - up(&f->sem); + mutex_unlock(&f->sem); jffs2_do_clear_inode(c, f); return -ret; } @@ -1273,14 +1276,14 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, if (f->metadata) { JFFS2_ERROR("Argh. Special inode #%u with mode 0%o had metadata node\n", f->inocache->ino, jemode_to_cpu(latest_node->mode)); - up(&f->sem); + mutex_unlock(&f->sem); jffs2_do_clear_inode(c, f); return -EIO; } if (!frag_first(&f->fragtree)) { JFFS2_ERROR("Argh. Special inode #%u with mode 0%o has no fragments\n", f->inocache->ino, jemode_to_cpu(latest_node->mode)); - up(&f->sem); + mutex_unlock(&f->sem); jffs2_do_clear_inode(c, f); return -EIO; } @@ -1289,7 +1292,7 @@ static int jffs2_do_read_inode_internal(struct jffs2_sb_info *c, JFFS2_ERROR("Argh. Special inode #%u with mode 0x%x had more than one node\n", f->inocache->ino, jemode_to_cpu(latest_node->mode)); /* FIXME: Deal with it - check crc32, check for duplicate node, check times and discard the older one */ - up(&f->sem); + mutex_unlock(&f->sem); jffs2_do_clear_inode(c, f); return -EIO; } @@ -1357,7 +1360,7 @@ int jffs2_do_read_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, } dbg_readinode("creating inocache for root inode\n"); memset(f->inocache, 0, sizeof(struct jffs2_inode_cache)); - f->inocache->ino = f->inocache->nlink = 1; + f->inocache->ino = f->inocache->pino_nlink = 1; f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache; f->inocache->state = INO_STATE_READING; jffs2_add_ino_cache(c, f->inocache); @@ -1379,12 +1382,13 @@ int jffs2_do_crccheck_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i if (!f) return -ENOMEM; - init_MUTEX_LOCKED(&f->sem); + mutex_init(&f->sem); + mutex_lock(&f->sem); f->inocache = ic; ret = jffs2_do_read_inode_internal(c, f, &n); if (!ret) { - up(&f->sem); + mutex_unlock(&f->sem); jffs2_do_clear_inode(c, f); } kfree (f); @@ -1398,8 +1402,8 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f) jffs2_clear_acl(f); jffs2_xattr_delete_inode(c, f->inocache); - down(&f->sem); - deleted = f->inocache && !f->inocache->nlink; + mutex_lock(&f->sem); + deleted = f->inocache && !f->inocache->pino_nlink; if (f->inocache && f->inocache->state != INO_STATE_CHECKING) jffs2_set_inocache_state(c, f->inocache, INO_STATE_CLEARING); @@ -1430,5 +1434,5 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f) jffs2_del_ino_cache(c, f->inocache); } - up(&f->sem); + mutex_unlock(&f->sem); } diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 272872d27fd..1d437de1e9a 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -97,11 +97,12 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) size_t pointlen; if (c->mtd->point) { - ret = c->mtd->point (c->mtd, 0, c->mtd->size, &pointlen, &flashbuf); + ret = c->mtd->point(c->mtd, 0, c->mtd->size, &pointlen, + (void **)&flashbuf, NULL); if (!ret && pointlen < c->mtd->size) { /* Don't muck about if it won't let us point to the whole flash */ D1(printk(KERN_DEBUG "MTD point returned len too short: 0x%zx\n", pointlen)); - c->mtd->unpoint(c->mtd, flashbuf, 0, pointlen); + c->mtd->unpoint(c->mtd, 0, pointlen); flashbuf = NULL; } if (ret) @@ -267,7 +268,7 @@ int jffs2_scan_medium(struct jffs2_sb_info *c) kfree(flashbuf); #ifndef __ECOS else - c->mtd->unpoint(c->mtd, flashbuf, 0, c->mtd->size); + c->mtd->unpoint(c->mtd, 0, c->mtd->size); #endif if (s) kfree(s); @@ -940,7 +941,7 @@ struct jffs2_inode_cache *jffs2_scan_make_ino_cache(struct jffs2_sb_info *c, uin ic->nodes = (void *)ic; jffs2_add_ino_cache(c, ic); if (ino == 1) - ic->nlink = 1; + ic->pino_nlink = 1; return ic; } diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index 4677355996c..7da69eae49e 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -31,11 +31,12 @@ static struct kmem_cache *jffs2_inode_cachep; static struct inode *jffs2_alloc_inode(struct super_block *sb) { - struct jffs2_inode_info *ei; - ei = (struct jffs2_inode_info *)kmem_cache_alloc(jffs2_inode_cachep, GFP_KERNEL); - if (!ei) + struct jffs2_inode_info *f; + + f = kmem_cache_alloc(jffs2_inode_cachep, GFP_KERNEL); + if (!f) return NULL; - return &ei->vfs_inode; + return &f->vfs_inode; } static void jffs2_destroy_inode(struct inode *inode) @@ -45,19 +46,19 @@ static void jffs2_destroy_inode(struct inode *inode) static void jffs2_i_init_once(struct kmem_cache *cachep, void *foo) { - struct jffs2_inode_info *ei = (struct jffs2_inode_info *) foo; + struct jffs2_inode_info *f = foo; - init_MUTEX(&ei->sem); - inode_init_once(&ei->vfs_inode); + mutex_init(&f->sem); + inode_init_once(&f->vfs_inode); } static int jffs2_sync_fs(struct super_block *sb, int wait) { struct jffs2_sb_info *c = JFFS2_SB_INFO(sb); - down(&c->alloc_sem); + mutex_lock(&c->alloc_sem); jffs2_flush_wbuf_pad(c); - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); return 0; } @@ -95,8 +96,8 @@ static int jffs2_fill_super(struct super_block *sb, void *data, int silent) /* Initialize JFFS2 superblock locks, the further initialization will * be done later */ - init_MUTEX(&c->alloc_sem); - init_MUTEX(&c->erase_free_sem); + mutex_init(&c->alloc_sem); + mutex_init(&c->erase_free_sem); init_waitqueue_head(&c->erase_wait); init_waitqueue_head(&c->inocache_wq); spin_lock_init(&c->erase_completion_lock); @@ -125,9 +126,9 @@ static void jffs2_put_super (struct super_block *sb) D2(printk(KERN_DEBUG "jffs2: jffs2_put_super()\n")); - down(&c->alloc_sem); + mutex_lock(&c->alloc_sem); jffs2_flush_wbuf_pad(c); - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); jffs2_sum_exit(c); diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index d1d4f27464b..0e78b00035e 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -494,7 +494,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) /* If it's an in-core inode, then we have to adjust any full_dirent or full_dnode structure to point to the new version instead of the old */ - f = jffs2_gc_fetch_inode(c, ic->ino, ic->nlink); + f = jffs2_gc_fetch_inode(c, ic->ino, !ic->pino_nlink); if (IS_ERR(f)) { /* Should never happen; it _must_ be present */ JFFS2_ERROR("Failed to iget() ino #%u, err %ld\n", @@ -578,8 +578,8 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) if (!jffs2_is_writebuffered(c)) return 0; - if (!down_trylock(&c->alloc_sem)) { - up(&c->alloc_sem); + if (mutex_trylock(&c->alloc_sem)) { + mutex_unlock(&c->alloc_sem); printk(KERN_CRIT "jffs2_flush_wbuf() called with alloc_sem not locked!\n"); BUG(); } @@ -702,10 +702,10 @@ int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino) if (!c->wbuf) return 0; - down(&c->alloc_sem); + mutex_lock(&c->alloc_sem); if (!jffs2_wbuf_pending_for_ino(c, ino)) { D1(printk(KERN_DEBUG "Ino #%d not pending in wbuf. Returning\n", ino)); - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); return 0; } @@ -725,14 +725,14 @@ int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino) } else while (old_wbuf_len && old_wbuf_ofs == c->wbuf_ofs) { - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() calls gc pass\n")); ret = jffs2_garbage_collect_pass(c); if (ret) { /* GC failed. Flush it with padding instead */ - down(&c->alloc_sem); + mutex_lock(&c->alloc_sem); down_write(&c->wbuf_sem); ret = __jffs2_flush_wbuf(c, PAD_ACCOUNTING); /* retry flushing wbuf in case jffs2_wbuf_recover @@ -742,12 +742,12 @@ int jffs2_flush_wbuf_gc(struct jffs2_sb_info *c, uint32_t ino) up_write(&c->wbuf_sem); break; } - down(&c->alloc_sem); + mutex_lock(&c->alloc_sem); } D1(printk(KERN_DEBUG "jffs2_flush_wbuf_gc() ends...\n")); - up(&c->alloc_sem); + mutex_unlock(&c->alloc_sem); return ret; } @@ -1236,12 +1236,24 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) { if (!c->wbuf) return -ENOMEM; +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + c->wbuf_verify = kmalloc(c->wbuf_pagesize, GFP_KERNEL); + if (!c->wbuf_verify) { + kfree(c->oobbuf); + kfree(c->wbuf); + return -ENOMEM; + } +#endif + printk(KERN_INFO "JFFS2 write-buffering enabled buffer (%d) erasesize (%d)\n", c->wbuf_pagesize, c->sector_size); return 0; } void jffs2_dataflash_cleanup(struct jffs2_sb_info *c) { +#ifdef CONFIG_JFFS2_FS_WBUF_VERIFY + kfree(c->wbuf_verify); +#endif kfree(c->wbuf); } diff --git a/fs/jffs2/write.c b/fs/jffs2/write.c index 776f13cbf2b..ca29440e943 100644 --- a/fs/jffs2/write.c +++ b/fs/jffs2/write.c @@ -19,7 +19,8 @@ #include "compr.h" -int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint32_t mode, struct jffs2_raw_inode *ri) +int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, + uint32_t mode, struct jffs2_raw_inode *ri) { struct jffs2_inode_cache *ic; @@ -31,7 +32,7 @@ int jffs2_do_new_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f, uint memset(ic, 0, sizeof(*ic)); f->inocache = ic; - f->inocache->nlink = 1; + f->inocache->pino_nlink = 1; /* Will be overwritten shortly for directories */ f->inocache->nodes = (struct jffs2_raw_node_ref *)f->inocache; f->inocache->state = INO_STATE_PRESENT; @@ -137,12 +138,12 @@ struct jffs2_full_dnode *jffs2_write_dnode(struct jffs2_sb_info *c, struct jffs2 JFFS2_SUMMARY_INODE_SIZE); } else { /* Locking pain */ - up(&f->sem); + mutex_unlock(&f->sem); jffs2_complete_reservation(c); ret = jffs2_reserve_space(c, sizeof(*ri) + datalen, &dummy, alloc_mode, JFFS2_SUMMARY_INODE_SIZE); - down(&f->sem); + mutex_lock(&f->sem); } if (!ret) { @@ -285,12 +286,12 @@ struct jffs2_full_dirent *jffs2_write_dirent(struct jffs2_sb_info *c, struct jff JFFS2_SUMMARY_DIRENT_SIZE(namelen)); } else { /* Locking pain */ - up(&f->sem); + mutex_unlock(&f->sem); jffs2_complete_reservation(c); ret = jffs2_reserve_space(c, sizeof(*rd) + namelen, &dummy, alloc_mode, JFFS2_SUMMARY_DIRENT_SIZE(namelen)); - down(&f->sem); + mutex_lock(&f->sem); } if (!ret) { @@ -353,7 +354,7 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, D1(printk(KERN_DEBUG "jffs2_reserve_space returned %d\n", ret)); break; } - down(&f->sem); + mutex_lock(&f->sem); datalen = min_t(uint32_t, writelen, PAGE_CACHE_SIZE - (offset & (PAGE_CACHE_SIZE-1))); cdatalen = min_t(uint32_t, alloclen - sizeof(*ri), datalen); @@ -381,7 +382,7 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, if (IS_ERR(fn)) { ret = PTR_ERR(fn); - up(&f->sem); + mutex_unlock(&f->sem); jffs2_complete_reservation(c); if (!retried) { /* Write error to be retried */ @@ -403,11 +404,11 @@ int jffs2_write_inode_range(struct jffs2_sb_info *c, struct jffs2_inode_info *f, jffs2_mark_node_obsolete(c, fn->raw); jffs2_free_full_dnode(fn); - up(&f->sem); + mutex_unlock(&f->sem); jffs2_complete_reservation(c); break; } - up(&f->sem); + mutex_unlock(&f->sem); jffs2_complete_reservation(c); if (!datalen) { printk(KERN_WARNING "Eep. We didn't actually write any data in jffs2_write_inode_range()\n"); @@ -438,10 +439,10 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str ret = jffs2_reserve_space(c, sizeof(*ri), &alloclen, ALLOC_NORMAL, JFFS2_SUMMARY_INODE_SIZE); D1(printk(KERN_DEBUG "jffs2_do_create(): reserved 0x%x bytes\n", alloclen)); - if (ret) { - up(&f->sem); + if (ret) return ret; - } + + mutex_lock(&f->sem); ri->data_crc = cpu_to_je32(0); ri->node_crc = cpu_to_je32(crc32(0, ri, sizeof(*ri)-8)); @@ -454,7 +455,7 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str if (IS_ERR(fn)) { D1(printk(KERN_DEBUG "jffs2_write_dnode() failed\n")); /* Eeek. Wave bye bye */ - up(&f->sem); + mutex_unlock(&f->sem); jffs2_complete_reservation(c); return PTR_ERR(fn); } @@ -463,7 +464,7 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str */ f->metadata = fn; - up(&f->sem); + mutex_unlock(&f->sem); jffs2_complete_reservation(c); ret = jffs2_init_security(&f->vfs_inode, &dir_f->vfs_inode); @@ -489,7 +490,7 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str return -ENOMEM; } - down(&dir_f->sem); + mutex_lock(&dir_f->sem); rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); rd->nodetype = cpu_to_je16(JFFS2_NODETYPE_DIRENT); @@ -513,7 +514,7 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str /* dirent failed to write. Delete the inode normally as if it were the final unlink() */ jffs2_complete_reservation(c); - up(&dir_f->sem); + mutex_unlock(&dir_f->sem); return PTR_ERR(fd); } @@ -522,7 +523,7 @@ int jffs2_do_create(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, str jffs2_add_fd_to_list(c, fd, &dir_f->dents); jffs2_complete_reservation(c); - up(&dir_f->sem); + mutex_unlock(&dir_f->sem); return 0; } @@ -551,7 +552,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, return ret; } - down(&dir_f->sem); + mutex_lock(&dir_f->sem); /* Build a deletion node */ rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); @@ -574,21 +575,21 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, if (IS_ERR(fd)) { jffs2_complete_reservation(c); - up(&dir_f->sem); + mutex_unlock(&dir_f->sem); return PTR_ERR(fd); } /* File it. This will mark the old one obsolete. */ jffs2_add_fd_to_list(c, fd, &dir_f->dents); - up(&dir_f->sem); + mutex_unlock(&dir_f->sem); } else { - struct jffs2_full_dirent *fd = dir_f->dents; uint32_t nhash = full_name_hash(name, namelen); + fd = dir_f->dents; /* We don't actually want to reserve any space, but we do want to be holding the alloc_sem when we write to flash */ - down(&c->alloc_sem); - down(&dir_f->sem); + mutex_lock(&c->alloc_sem); + mutex_lock(&dir_f->sem); for (fd = dir_f->dents; fd; fd = fd->next) { if (fd->nhash == nhash && @@ -607,7 +608,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, break; } } - up(&dir_f->sem); + mutex_unlock(&dir_f->sem); } /* dead_f is NULL if this was a rename not a real unlink */ @@ -615,7 +616,7 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, pointing to an inode which didn't exist. */ if (dead_f && dead_f->inocache) { - down(&dead_f->sem); + mutex_lock(&dead_f->sem); if (S_ISDIR(OFNI_EDONI_2SFFJ(dead_f)->i_mode)) { while (dead_f->dents) { @@ -635,11 +636,11 @@ int jffs2_do_unlink(struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, jffs2_mark_node_obsolete(c, fd->raw); jffs2_free_full_dirent(fd); } - } - - dead_f->inocache->nlink--; + dead_f->inocache->pino_nlink = 0; + } else + dead_f->inocache->pino_nlink--; /* NB: Caller must set inode nlink if appropriate */ - up(&dead_f->sem); + mutex_unlock(&dead_f->sem); } jffs2_complete_reservation(c); @@ -666,7 +667,7 @@ int jffs2_do_link (struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint return ret; } - down(&dir_f->sem); + mutex_lock(&dir_f->sem); /* Build a deletion node */ rd->magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); @@ -691,7 +692,7 @@ int jffs2_do_link (struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint if (IS_ERR(fd)) { jffs2_complete_reservation(c); - up(&dir_f->sem); + mutex_unlock(&dir_f->sem); return PTR_ERR(fd); } @@ -699,7 +700,7 @@ int jffs2_do_link (struct jffs2_sb_info *c, struct jffs2_inode_info *dir_f, uint jffs2_add_fd_to_list(c, fd, &dir_f->dents); jffs2_complete_reservation(c); - up(&dir_f->sem); + mutex_unlock(&dir_f->sem); return 0; } diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index e48665984cb..082e844ab2d 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -82,7 +82,7 @@ static int is_xattr_datum_unchecked(struct jffs2_sb_info *c, struct jffs2_xattr_ static void unload_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) { /* must be called under down_write(xattr_sem) */ - D1(dbg_xattr("%s: xid=%u, version=%u\n", __FUNCTION__, xd->xid, xd->version)); + D1(dbg_xattr("%s: xid=%u, version=%u\n", __func__, xd->xid, xd->version)); if (xd->xname) { c->xdatum_mem_usage -= (xd->name_len + 1 + xd->value_len); kfree(xd->xname); @@ -592,7 +592,7 @@ void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache When an inode with XATTR is removed, those XATTRs must be removed. */ struct jffs2_xattr_ref *ref, *_ref; - if (!ic || ic->nlink > 0) + if (!ic || ic->pino_nlink > 0) return; down_write(&c->xattr_sem); @@ -829,7 +829,7 @@ void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c) ref->xd and ref->ic are not valid yet. */ xd = jffs2_find_xattr_datum(c, ref->xid); ic = jffs2_get_ino_cache(c, ref->ino); - if (!xd || !ic || !ic->nlink) { + if (!xd || !ic || !ic->pino_nlink) { dbg_xattr("xref(ino=%u, xid=%u, xseqno=%u) is orphan.\n", ref->ino, ref->xid, ref->xseqno); ref->xseqno |= XREF_DELETE_MARKER; @@ -1252,7 +1252,7 @@ int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XREF_SIZE); if (rc) { JFFS2_WARNING("%s: jffs2_reserve_space_gc() = %d, request = %u\n", - __FUNCTION__, rc, totlen); + __func__, rc, totlen); rc = rc ? rc : -EBADFD; goto out; } diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index 887f5759e53..bf6ab19b86e 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -89,7 +89,7 @@ void jfs_proc_init(void) { int i; - if (!(base = proc_mkdir("jfs", proc_root_fs))) + if (!(base = proc_mkdir("fs/jfs", NULL))) return; base->owner = THIS_MODULE; @@ -109,7 +109,7 @@ void jfs_proc_clean(void) if (base) { for (i = 0; i < NPROCENT; i++) remove_proc_entry(Entries[i].name, base); - remove_proc_entry("jfs", proc_root_fs); + remove_proc_entry("fs/jfs", NULL); } } diff --git a/fs/lockd/clntproc.c b/fs/lockd/clntproc.c index b6b74a60e1e..5df517b81f3 100644 --- a/fs/lockd/clntproc.c +++ b/fs/lockd/clntproc.c @@ -155,8 +155,6 @@ static void nlmclnt_release_lockargs(struct nlm_rqst *req) int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) { struct nlm_rqst *call; - sigset_t oldset; - unsigned long flags; int status; nlm_get_host(host); @@ -168,22 +166,6 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) /* Set up the argument struct */ nlmclnt_setlockargs(call, fl); - /* Keep the old signal mask */ - spin_lock_irqsave(¤t->sighand->siglock, flags); - oldset = current->blocked; - - /* If we're cleaning up locks because the process is exiting, - * perform the RPC call asynchronously. */ - if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) - && fl->fl_type == F_UNLCK - && (current->flags & PF_EXITING)) { - sigfillset(¤t->blocked); /* Mask all signals */ - recalc_sigpending(); - - call->a_flags = RPC_TASK_ASYNC; - } - spin_unlock_irqrestore(¤t->sighand->siglock, flags); - if (IS_SETLK(cmd) || IS_SETLKW(cmd)) { if (fl->fl_type != F_UNLCK) { call->a_args.block = IS_SETLKW(cmd) ? 1 : 0; @@ -198,11 +180,6 @@ int nlmclnt_proc(struct nlm_host *host, int cmd, struct file_lock *fl) fl->fl_ops->fl_release_private(fl); fl->fl_ops = NULL; - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->blocked = oldset; - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); - dprintk("lockd: clnt proc returns %d\n", status); return status; } @@ -221,6 +198,7 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) for(;;) { call = kzalloc(sizeof(*call), GFP_KERNEL); if (call != NULL) { + atomic_set(&call->a_count, 1); locks_init_lock(&call->a_args.lock.fl); locks_init_lock(&call->a_res.lock.fl); call->a_host = host; @@ -237,6 +215,8 @@ struct nlm_rqst *nlm_alloc_call(struct nlm_host *host) void nlm_release_call(struct nlm_rqst *call) { + if (!atomic_dec_and_test(&call->a_count)) + return; nlm_release_host(call->a_host); nlmclnt_release_lockargs(call); kfree(call); @@ -267,7 +247,7 @@ static int nlm_wait_on_grace(wait_queue_head_t *queue) * Generic NLM call */ static int -nlmclnt_call(struct nlm_rqst *req, u32 proc) +nlmclnt_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc) { struct nlm_host *host = req->a_host; struct rpc_clnt *clnt; @@ -276,6 +256,7 @@ nlmclnt_call(struct nlm_rqst *req, u32 proc) struct rpc_message msg = { .rpc_argp = argp, .rpc_resp = resp, + .rpc_cred = cred, }; int status; @@ -343,10 +324,16 @@ in_grace_period: /* * Generic NLM call, async version. */ -static int __nlm_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops) +static struct rpc_task *__nlm_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops) { struct nlm_host *host = req->a_host; struct rpc_clnt *clnt; + struct rpc_task_setup task_setup_data = { + .rpc_message = msg, + .callback_ops = tk_ops, + .callback_data = req, + .flags = RPC_TASK_ASYNC, + }; dprintk("lockd: call procedure %d on %s (async)\n", (int)proc, host->h_name); @@ -356,21 +343,36 @@ static int __nlm_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message * if (clnt == NULL) goto out_err; msg->rpc_proc = &clnt->cl_procinfo[proc]; + task_setup_data.rpc_client = clnt; /* bootstrap and kick off the async RPC call */ - return rpc_call_async(clnt, msg, RPC_TASK_ASYNC, tk_ops, req); + return rpc_run_task(&task_setup_data); out_err: tk_ops->rpc_release(req); - return -ENOLCK; + return ERR_PTR(-ENOLCK); } +static int nlm_do_async_call(struct nlm_rqst *req, u32 proc, struct rpc_message *msg, const struct rpc_call_ops *tk_ops) +{ + struct rpc_task *task; + + task = __nlm_async_call(req, proc, msg, tk_ops); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} + +/* + * NLM asynchronous call. + */ int nlm_async_call(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) { struct rpc_message msg = { .rpc_argp = &req->a_args, .rpc_resp = &req->a_res, }; - return __nlm_async_call(req, proc, &msg, tk_ops); + return nlm_do_async_call(req, proc, &msg, tk_ops); } int nlm_async_reply(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) @@ -378,7 +380,33 @@ int nlm_async_reply(struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *t struct rpc_message msg = { .rpc_argp = &req->a_res, }; - return __nlm_async_call(req, proc, &msg, tk_ops); + return nlm_do_async_call(req, proc, &msg, tk_ops); +} + +/* + * NLM client asynchronous call. + * + * Note that although the calls are asynchronous, and are therefore + * guaranteed to complete, we still always attempt to wait for + * completion in order to be able to correctly track the lock + * state. + */ +static int nlmclnt_async_call(struct rpc_cred *cred, struct nlm_rqst *req, u32 proc, const struct rpc_call_ops *tk_ops) +{ + struct rpc_message msg = { + .rpc_argp = &req->a_args, + .rpc_resp = &req->a_res, + .rpc_cred = cred, + }; + struct rpc_task *task; + int err; + + task = __nlm_async_call(req, proc, &msg, tk_ops); + if (IS_ERR(task)) + return PTR_ERR(task); + err = rpc_wait_for_completion_task(task); + rpc_put_task(task); + return err; } /* @@ -389,7 +417,7 @@ nlmclnt_test(struct nlm_rqst *req, struct file_lock *fl) { int status; - status = nlmclnt_call(req, NLMPROC_TEST); + status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_TEST); if (status < 0) goto out; @@ -480,10 +508,12 @@ static int do_vfs_lock(struct file_lock *fl) static int nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) { + struct rpc_cred *cred = nfs_file_cred(fl->fl_file); struct nlm_host *host = req->a_host; struct nlm_res *resp = &req->a_res; struct nlm_wait *block = NULL; unsigned char fl_flags = fl->fl_flags; + unsigned char fl_type; int status = -ENOLCK; if (nsm_monitor(host) < 0) { @@ -493,18 +523,22 @@ nlmclnt_lock(struct nlm_rqst *req, struct file_lock *fl) } fl->fl_flags |= FL_ACCESS; status = do_vfs_lock(fl); + fl->fl_flags = fl_flags; if (status < 0) goto out; block = nlmclnt_prepare_block(host, fl); again: + /* + * Initialise resp->status to a valid non-zero value, + * since 0 == nlm_lck_granted + */ + resp->status = nlm_lck_blocked; for(;;) { /* Reboot protection */ fl->fl_u.nfs_fl.state = host->h_state; - status = nlmclnt_call(req, NLMPROC_LOCK); + status = nlmclnt_call(cred, req, NLMPROC_LOCK); if (status < 0) - goto out_unblock; - if (!req->a_args.block) break; /* Did a reclaimer thread notify us of a server reboot? */ if (resp->status == nlm_lck_denied_grace_period) @@ -513,15 +547,22 @@ again: break; /* Wait on an NLM blocking lock */ status = nlmclnt_block(block, req, NLMCLNT_POLL_TIMEOUT); - /* if we were interrupted. Send a CANCEL request to the server - * and exit - */ if (status < 0) - goto out_unblock; + break; if (resp->status != nlm_lck_blocked) break; } + /* if we were interrupted while blocking, then cancel the lock request + * and exit + */ + if (resp->status == nlm_lck_blocked) { + if (!req->a_args.block) + goto out_unlock; + if (nlmclnt_cancel(host, req->a_args.block, fl) == 0) + goto out_unblock; + } + if (resp->status == nlm_granted) { down_read(&host->h_rwsem); /* Check whether or not the server has rebooted */ @@ -530,20 +571,34 @@ again: goto again; } /* Ensure the resulting lock will get added to granted list */ - fl->fl_flags = fl_flags | FL_SLEEP; + fl->fl_flags |= FL_SLEEP; if (do_vfs_lock(fl) < 0) - printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __FUNCTION__); + printk(KERN_WARNING "%s: VFS is out of sync with lock manager!\n", __func__); up_read(&host->h_rwsem); + fl->fl_flags = fl_flags; + status = 0; } + if (status < 0) + goto out_unlock; status = nlm_stat_to_errno(resp->status); out_unblock: nlmclnt_finish_block(block); - /* Cancel the blocked request if it is still pending */ - if (resp->status == nlm_lck_blocked) - nlmclnt_cancel(host, req->a_args.block, fl); out: nlm_release_call(req); + return status; +out_unlock: + /* Fatal error: ensure that we remove the lock altogether */ + dprintk("lockd: lock attempt ended in fatal error.\n" + " Attempting to unlock.\n"); + nlmclnt_finish_block(block); + fl_type = fl->fl_type; + fl->fl_type = F_UNLCK; + down_read(&host->h_rwsem); + do_vfs_lock(fl); + up_read(&host->h_rwsem); + fl->fl_type = fl_type; fl->fl_flags = fl_flags; + nlmclnt_async_call(cred, req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); return status; } @@ -567,8 +622,8 @@ nlmclnt_reclaim(struct nlm_host *host, struct file_lock *fl) nlmclnt_setlockargs(req, fl); req->a_args.reclaim = 1; - if ((status = nlmclnt_call(req, NLMPROC_LOCK)) >= 0 - && req->a_res.status == nlm_granted) + status = nlmclnt_call(nfs_file_cred(fl->fl_file), req, NLMPROC_LOCK); + if (status >= 0 && req->a_res.status == nlm_granted) return 0; printk(KERN_WARNING "lockd: failed to reclaim lock for pid %d " @@ -598,7 +653,8 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) { struct nlm_host *host = req->a_host; struct nlm_res *resp = &req->a_res; - int status = 0; + int status; + unsigned char fl_flags = fl->fl_flags; /* * Note: the server is supposed to either grant us the unlock @@ -607,16 +663,17 @@ nlmclnt_unlock(struct nlm_rqst *req, struct file_lock *fl) */ fl->fl_flags |= FL_EXISTS; down_read(&host->h_rwsem); - if (do_vfs_lock(fl) == -ENOENT) { - up_read(&host->h_rwsem); + status = do_vfs_lock(fl); + up_read(&host->h_rwsem); + fl->fl_flags = fl_flags; + if (status == -ENOENT) { + status = 0; goto out; } - up_read(&host->h_rwsem); - - if (req->a_flags & RPC_TASK_ASYNC) - return nlm_async_call(req, NLMPROC_UNLOCK, &nlmclnt_unlock_ops); - status = nlmclnt_call(req, NLMPROC_UNLOCK); + atomic_inc(&req->a_count); + status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, + NLMPROC_UNLOCK, &nlmclnt_unlock_ops); if (status < 0) goto out; @@ -671,16 +728,10 @@ static const struct rpc_call_ops nlmclnt_unlock_ops = { static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl) { struct nlm_rqst *req; - unsigned long flags; - sigset_t oldset; - int status; + int status; - /* Block all signals while setting up call */ - spin_lock_irqsave(¤t->sighand->siglock, flags); - oldset = current->blocked; - sigfillset(¤t->blocked); - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); + dprintk("lockd: blocking lock attempt was interrupted by a signal.\n" + " Attempting to cancel lock.\n"); req = nlm_alloc_call(nlm_get_host(host)); if (!req) @@ -690,13 +741,12 @@ static int nlmclnt_cancel(struct nlm_host *host, int block, struct file_lock *fl nlmclnt_setlockargs(req, fl); req->a_args.block = block; - status = nlm_async_call(req, NLMPROC_CANCEL, &nlmclnt_cancel_ops); - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->blocked = oldset; - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); - + atomic_inc(&req->a_count); + status = nlmclnt_async_call(nfs_file_cred(fl->fl_file), req, + NLMPROC_CANCEL, &nlmclnt_cancel_ops); + if (status == 0 && req->a_res.status == nlm_lck_denied) + status = -ENOLCK; + nlm_release_call(req); return status; } diff --git a/fs/lockd/host.c b/fs/lockd/host.c index f1ef49fff11..a17664c7eac 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -19,12 +19,11 @@ #define NLMDBG_FACILITY NLMDBG_HOSTCACHE -#define NLM_HOST_MAX 64 #define NLM_HOST_NRHASH 32 #define NLM_ADDRHASH(addr) (ntohl(addr) & (NLM_HOST_NRHASH-1)) #define NLM_HOST_REBIND (60 * HZ) -#define NLM_HOST_EXPIRE ((nrhosts > NLM_HOST_MAX)? 300 * HZ : 120 * HZ) -#define NLM_HOST_COLLECT ((nrhosts > NLM_HOST_MAX)? 120 * HZ : 60 * HZ) +#define NLM_HOST_EXPIRE (300 * HZ) +#define NLM_HOST_COLLECT (120 * HZ) static struct hlist_head nlm_hosts[NLM_HOST_NRHASH]; static unsigned long next_gc; @@ -42,11 +41,12 @@ static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, /* * Common host lookup routine for server & client */ -static struct nlm_host * -nlm_lookup_host(int server, const struct sockaddr_in *sin, - int proto, int version, const char *hostname, - unsigned int hostname_len, - const struct sockaddr_in *ssin) +static struct nlm_host *nlm_lookup_host(int server, + const struct sockaddr_in *sin, + int proto, u32 version, + const char *hostname, + unsigned int hostname_len, + const struct sockaddr_in *ssin) { struct hlist_head *chain; struct hlist_node *pos; @@ -55,7 +55,7 @@ nlm_lookup_host(int server, const struct sockaddr_in *sin, int hash; dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT - ", p=%d, v=%d, my role=%s, name=%.*s)\n", + ", p=%d, v=%u, my role=%s, name=%.*s)\n", NIPQUAD(ssin->sin_addr.s_addr), NIPQUAD(sin->sin_addr.s_addr), proto, version, server? "server" : "client", @@ -142,9 +142,7 @@ nlm_lookup_host(int server, const struct sockaddr_in *sin, INIT_LIST_HEAD(&host->h_granted); INIT_LIST_HEAD(&host->h_reclaim); - if (++nrhosts > NLM_HOST_MAX) - next_gc = 0; - + nrhosts++; out: mutex_unlock(&nlm_host_mutex); return host; @@ -175,9 +173,10 @@ nlm_destroy_host(struct nlm_host *host) /* * Find an NLM server handle in the cache. If there is none, create it. */ -struct nlm_host * -nlmclnt_lookup_host(const struct sockaddr_in *sin, int proto, int version, - const char *hostname, unsigned int hostname_len) +struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin, + int proto, u32 version, + const char *hostname, + unsigned int hostname_len) { struct sockaddr_in ssin = {0}; @@ -460,7 +459,7 @@ nlm_gc_hosts(void) * Manage NSM handles */ static LIST_HEAD(nsm_handles); -static DEFINE_MUTEX(nsm_mutex); +static DEFINE_SPINLOCK(nsm_lock); static struct nsm_handle * __nsm_find(const struct sockaddr_in *sin, @@ -468,7 +467,7 @@ __nsm_find(const struct sockaddr_in *sin, int create) { struct nsm_handle *nsm = NULL; - struct list_head *pos; + struct nsm_handle *pos; if (!sin) return NULL; @@ -482,38 +481,43 @@ __nsm_find(const struct sockaddr_in *sin, return NULL; } - mutex_lock(&nsm_mutex); - list_for_each(pos, &nsm_handles) { - nsm = list_entry(pos, struct nsm_handle, sm_link); +retry: + spin_lock(&nsm_lock); + list_for_each_entry(pos, &nsm_handles, sm_link) { if (hostname && nsm_use_hostnames) { - if (strlen(nsm->sm_name) != hostname_len - || memcmp(nsm->sm_name, hostname, hostname_len)) + if (strlen(pos->sm_name) != hostname_len + || memcmp(pos->sm_name, hostname, hostname_len)) continue; - } else if (!nlm_cmp_addr(&nsm->sm_addr, sin)) + } else if (!nlm_cmp_addr(&pos->sm_addr, sin)) continue; - atomic_inc(&nsm->sm_count); - goto out; + atomic_inc(&pos->sm_count); + kfree(nsm); + nsm = pos; + goto found; } - - if (!create) { - nsm = NULL; - goto out; + if (nsm) { + list_add(&nsm->sm_link, &nsm_handles); + goto found; } + spin_unlock(&nsm_lock); + + if (!create) + return NULL; nsm = kzalloc(sizeof(*nsm) + hostname_len + 1, GFP_KERNEL); - if (nsm != NULL) { - nsm->sm_addr = *sin; - nsm->sm_name = (char *) (nsm + 1); - memcpy(nsm->sm_name, hostname, hostname_len); - nsm->sm_name[hostname_len] = '\0'; - atomic_set(&nsm->sm_count, 1); + if (nsm == NULL) + return NULL; - list_add(&nsm->sm_link, &nsm_handles); - } + nsm->sm_addr = *sin; + nsm->sm_name = (char *) (nsm + 1); + memcpy(nsm->sm_name, hostname, hostname_len); + nsm->sm_name[hostname_len] = '\0'; + atomic_set(&nsm->sm_count, 1); + goto retry; -out: - mutex_unlock(&nsm_mutex); +found: + spin_unlock(&nsm_lock); return nsm; } @@ -532,12 +536,9 @@ nsm_release(struct nsm_handle *nsm) { if (!nsm) return; - if (atomic_dec_and_test(&nsm->sm_count)) { - mutex_lock(&nsm_mutex); - if (atomic_read(&nsm->sm_count) == 0) { - list_del(&nsm->sm_link); - kfree(nsm); - } - mutex_unlock(&nsm_mutex); + if (atomic_dec_and_lock(&nsm->sm_count, &nsm_lock)) { + list_del(&nsm->sm_link); + spin_unlock(&nsm_lock); + kfree(nsm); } } diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index 908b23fadd0..e4d563543b1 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -18,6 +18,8 @@ #define NLMDBG_FACILITY NLMDBG_MONITOR +#define XDR_ADDRBUF_LEN (20) + static struct rpc_clnt * nsm_create(void); static struct rpc_program nsm_program; @@ -147,28 +149,55 @@ nsm_create(void) /* * XDR functions for NSM. + * + * See http://www.opengroup.org/ for details on the Network + * Status Monitor wire protocol. */ -static __be32 * -xdr_encode_common(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) +static __be32 *xdr_encode_nsm_string(__be32 *p, char *string) { - char buffer[20], *name; - - /* - * Use the dotted-quad IP address of the remote host as - * identifier. Linux statd always looks up the canonical - * hostname first for whatever remote hostname it receives, - * so this works alright. - */ - if (nsm_use_hostnames) { - name = argp->mon_name; - } else { - sprintf(buffer, "%u.%u.%u.%u", NIPQUAD(argp->addr)); + size_t len = strlen(string); + + if (len > SM_MAXSTRLEN) + len = SM_MAXSTRLEN; + return xdr_encode_opaque(p, string, len); +} + +/* + * "mon_name" specifies the host to be monitored. + * + * Linux uses a text version of the IP address of the remote + * host as the host identifier (the "mon_name" argument). + * + * Linux statd always looks up the canonical hostname first for + * whatever remote hostname it receives, so this works alright. + */ +static __be32 *xdr_encode_mon_name(__be32 *p, struct nsm_args *argp) +{ + char buffer[XDR_ADDRBUF_LEN + 1]; + char *name = argp->mon_name; + + if (!nsm_use_hostnames) { + snprintf(buffer, XDR_ADDRBUF_LEN, + NIPQUAD_FMT, NIPQUAD(argp->addr)); name = buffer; } - if (!(p = xdr_encode_string(p, name)) - || !(p = xdr_encode_string(p, utsname()->nodename))) + + return xdr_encode_nsm_string(p, name); +} + +/* + * The "my_id" argument specifies the hostname and RPC procedure + * to be called when the status manager receives notification + * (via the SM_NOTIFY call) that the state of host "mon_name" + * has changed. + */ +static __be32 *xdr_encode_my_id(__be32 *p, struct nsm_args *argp) +{ + p = xdr_encode_nsm_string(p, utsname()->nodename); + if (!p) return ERR_PTR(-EIO); + *p++ = htonl(argp->prog); *p++ = htonl(argp->vers); *p++ = htonl(argp->proc); @@ -176,18 +205,48 @@ xdr_encode_common(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) return p; } -static int -xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) +/* + * The "mon_id" argument specifies the non-private arguments + * of an SM_MON or SM_UNMON call. + */ +static __be32 *xdr_encode_mon_id(__be32 *p, struct nsm_args *argp) { - p = xdr_encode_common(rqstp, p, argp); - if (IS_ERR(p)) - return PTR_ERR(p); + p = xdr_encode_mon_name(p, argp); + if (!p) + return ERR_PTR(-EIO); - /* Surprise - there may even be room for an IPv6 address now */ + return xdr_encode_my_id(p, argp); +} + +/* + * The "priv" argument may contain private information required + * by the SM_MON call. This information will be supplied in the + * SM_NOTIFY call. + * + * Linux provides the raw IP address of the monitored host, + * left in network byte order. + */ +static __be32 *xdr_encode_priv(__be32 *p, struct nsm_args *argp) +{ *p++ = argp->addr; *p++ = 0; *p++ = 0; *p++ = 0; + + return p; +} + +static int +xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) +{ + p = xdr_encode_mon_id(p, argp); + if (IS_ERR(p)) + return PTR_ERR(p); + + p = xdr_encode_priv(p, argp); + if (IS_ERR(p)) + return PTR_ERR(p); + rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); return 0; } @@ -195,7 +254,7 @@ xdr_encode_mon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) static int xdr_encode_unmon(struct rpc_rqst *rqstp, __be32 *p, struct nsm_args *argp) { - p = xdr_encode_common(rqstp, p, argp); + p = xdr_encode_mon_id(p, argp); if (IS_ERR(p)) return PTR_ERR(p); rqstp->rq_slen = xdr_adjust_iovec(rqstp->rq_svec, p); @@ -220,9 +279,11 @@ xdr_decode_stat(struct rpc_rqst *rqstp, __be32 *p, struct nsm_res *resp) } #define SM_my_name_sz (1+XDR_QUADLEN(SM_MAXSTRLEN)) -#define SM_my_id_sz (3+1+SM_my_name_sz) -#define SM_mon_id_sz (1+XDR_QUADLEN(20)+SM_my_id_sz) -#define SM_mon_sz (SM_mon_id_sz+4) +#define SM_my_id_sz (SM_my_name_sz+3) +#define SM_mon_name_sz (1+XDR_QUADLEN(SM_MAXSTRLEN)) +#define SM_mon_id_sz (SM_mon_name_sz+SM_my_id_sz) +#define SM_priv_sz (XDR_QUADLEN(SM_PRIV_SIZE)) +#define SM_mon_sz (SM_mon_id_sz+SM_priv_sz) #define SM_monres_sz 2 #define SM_unmonres_sz 1 diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 1ed8bd4de94..2169af4d545 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -25,6 +25,7 @@ #include <linux/smp.h> #include <linux/smp_lock.h> #include <linux/mutex.h> +#include <linux/kthread.h> #include <linux/freezer.h> #include <linux/sunrpc/types.h> @@ -48,14 +49,11 @@ EXPORT_SYMBOL(nlmsvc_ops); static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; -static pid_t nlmsvc_pid; +static struct task_struct *nlmsvc_task; static struct svc_serv *nlmsvc_serv; int nlmsvc_grace_period; unsigned long nlmsvc_timeout; -static DECLARE_COMPLETION(lockd_start_done); -static DECLARE_WAIT_QUEUE_HEAD(lockd_exit); - /* * These can be set at insmod time (useful for NFS as root filesystem), * and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003 @@ -74,7 +72,9 @@ static const unsigned long nlm_timeout_min = 3; static const unsigned long nlm_timeout_max = 20; static const int nlm_port_min = 0, nlm_port_max = 65535; +#ifdef CONFIG_SYSCTL static struct ctl_table_header * nlm_sysctl_table; +#endif static unsigned long get_lockd_grace_period(void) { @@ -111,35 +111,30 @@ static inline void clear_grace_period(void) /* * This is the lockd kernel thread */ -static void -lockd(struct svc_rqst *rqstp) +static int +lockd(void *vrqstp) { - int err = 0; + int err = 0, preverr = 0; + struct svc_rqst *rqstp = vrqstp; unsigned long grace_period_expire; - /* Lock module and set up kernel thread */ - /* lockd_up is waiting for us to startup, so will - * be holding a reference to this module, so it - * is safe to just claim another reference - */ - __module_get(THIS_MODULE); - lock_kernel(); - - /* - * Let our maker know we're running. - */ - nlmsvc_pid = current->pid; - nlmsvc_serv = rqstp->rq_server; - complete(&lockd_start_done); - - daemonize("lockd"); + /* try_to_freeze() is called from svc_recv() */ set_freezable(); - /* Process request with signals blocked, but allow SIGKILL. */ + /* Allow SIGKILL to tell lockd to drop all of its locks */ allow_signal(SIGKILL); dprintk("NFS locking service started (ver " LOCKD_VERSION ").\n"); + /* + * FIXME: it would be nice if lockd didn't spend its entire life + * running under the BKL. At the very least, it would be good to + * have someone clarify what it's intended to protect here. I've + * seen some handwavy posts about posix locking needing to be + * done under the BKL, but it's far from clear. + */ + lock_kernel(); + if (!nlm_timeout) nlm_timeout = LOCKD_DFLT_TIMEO; nlmsvc_timeout = nlm_timeout * HZ; @@ -148,10 +143,9 @@ lockd(struct svc_rqst *rqstp) /* * The main request loop. We don't terminate until the last - * NFS mount or NFS daemon has gone away, and we've been sent a - * signal, or else another process has taken over our job. + * NFS mount or NFS daemon has gone away. */ - while ((nlmsvc_users || !signalled()) && nlmsvc_pid == current->pid) { + while (!kthread_should_stop()) { long timeout = MAX_SCHEDULE_TIMEOUT; RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]); @@ -161,6 +155,7 @@ lockd(struct svc_rqst *rqstp) nlmsvc_invalidate_all(); grace_period_expire = set_grace_period(); } + continue; } /* @@ -179,14 +174,20 @@ lockd(struct svc_rqst *rqstp) * recvfrom routine. */ err = svc_recv(rqstp, timeout); - if (err == -EAGAIN || err == -EINTR) + if (err == -EAGAIN || err == -EINTR) { + preverr = err; continue; + } if (err < 0) { - printk(KERN_WARNING - "lockd: terminating on error %d\n", - -err); - break; + if (err != preverr) { + printk(KERN_WARNING "%s: unexpected error " + "from svc_recv (%d)\n", __func__, err); + preverr = err; + } + schedule_timeout_interruptible(HZ); + continue; } + preverr = err; dprintk("lockd: request from %s\n", svc_print_addr(rqstp, buf, sizeof(buf))); @@ -195,28 +196,19 @@ lockd(struct svc_rqst *rqstp) } flush_signals(current); + if (nlmsvc_ops) + nlmsvc_invalidate_all(); + nlm_shutdown_hosts(); - /* - * Check whether there's a new lockd process before - * shutting down the hosts and clearing the slot. - */ - if (!nlmsvc_pid || current->pid == nlmsvc_pid) { - if (nlmsvc_ops) - nlmsvc_invalidate_all(); - nlm_shutdown_hosts(); - nlmsvc_pid = 0; - nlmsvc_serv = NULL; - } else - printk(KERN_DEBUG - "lockd: new process, skipping host shutdown\n"); - wake_up(&lockd_exit); + unlock_kernel(); + + nlmsvc_task = NULL; + nlmsvc_serv = NULL; /* Exit the RPC thread */ svc_exit_thread(rqstp); - /* Release module */ - unlock_kernel(); - module_put_and_exit(0); + return 0; } /* @@ -261,14 +253,15 @@ static int make_socks(struct svc_serv *serv, int proto) int lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ { - struct svc_serv * serv; - int error = 0; + struct svc_serv *serv; + struct svc_rqst *rqstp; + int error = 0; mutex_lock(&nlmsvc_mutex); /* * Check whether we're already up and running. */ - if (nlmsvc_pid) { + if (nlmsvc_serv) { if (proto) error = make_socks(nlmsvc_serv, proto); goto out; @@ -295,13 +288,28 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ /* * Create the kernel thread and wait for it to start. */ - error = svc_create_thread(lockd, serv); - if (error) { + rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); + if (IS_ERR(rqstp)) { + error = PTR_ERR(rqstp); + printk(KERN_WARNING + "lockd_up: svc_rqst allocation failed, error=%d\n", + error); + goto destroy_and_out; + } + + svc_sock_update_bufs(serv); + nlmsvc_serv = rqstp->rq_server; + + nlmsvc_task = kthread_run(lockd, rqstp, serv->sv_name); + if (IS_ERR(nlmsvc_task)) { + error = PTR_ERR(nlmsvc_task); + nlmsvc_task = NULL; + nlmsvc_serv = NULL; printk(KERN_WARNING - "lockd_up: create thread failed, error=%d\n", error); + "lockd_up: kthread_run failed, error=%d\n", error); + svc_exit_thread(rqstp); goto destroy_and_out; } - wait_for_completion(&lockd_start_done); /* * Note: svc_serv structures have an initial use count of 1, @@ -323,42 +331,28 @@ EXPORT_SYMBOL(lockd_up); void lockd_down(void) { - static int warned; - mutex_lock(&nlmsvc_mutex); if (nlmsvc_users) { if (--nlmsvc_users) goto out; - } else - printk(KERN_WARNING "lockd_down: no users! pid=%d\n", nlmsvc_pid); - - if (!nlmsvc_pid) { - if (warned++ == 0) - printk(KERN_WARNING "lockd_down: no lockd running.\n"); - goto out; + } else { + printk(KERN_ERR "lockd_down: no users! task=%p\n", + nlmsvc_task); + BUG(); } - warned = 0; - kill_proc(nlmsvc_pid, SIGKILL, 1); - /* - * Wait for the lockd process to exit, but since we're holding - * the lockd semaphore, we can't wait around forever ... - */ - clear_thread_flag(TIF_SIGPENDING); - interruptible_sleep_on_timeout(&lockd_exit, HZ); - if (nlmsvc_pid) { - printk(KERN_WARNING - "lockd_down: lockd failed to exit, clearing pid\n"); - nlmsvc_pid = 0; + if (!nlmsvc_task) { + printk(KERN_ERR "lockd_down: no lockd running.\n"); + BUG(); } - spin_lock_irq(¤t->sighand->siglock); - recalc_sigpending(); - spin_unlock_irq(¤t->sighand->siglock); + kthread_stop(nlmsvc_task); out: mutex_unlock(&nlmsvc_mutex); } EXPORT_SYMBOL(lockd_down); +#ifdef CONFIG_SYSCTL + /* * Sysctl parameters (same as module parameters, different interface). */ @@ -443,6 +437,8 @@ static ctl_table nlm_sysctl_root[] = { { .ctl_name = 0 } }; +#endif /* CONFIG_SYSCTL */ + /* * Module (and sysfs) parameters. */ @@ -516,15 +512,21 @@ module_param(nsm_use_hostnames, bool, 0644); static int __init init_nlm(void) { +#ifdef CONFIG_SYSCTL nlm_sysctl_table = register_sysctl_table(nlm_sysctl_root); return nlm_sysctl_table ? 0 : -ENOMEM; +#else + return 0; +#endif } static void __exit exit_nlm(void) { /* FIXME: delete all NLM clients */ nlm_shutdown_hosts(); +#ifdef CONFIG_SYSCTL unregister_sysctl_table(nlm_sysctl_table); +#endif } module_init(init_nlm); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index fe9bdb4a220..81aca859bfd 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -29,6 +29,7 @@ #include <linux/sunrpc/svc.h> #include <linux/lockd/nlm.h> #include <linux/lockd/lockd.h> +#include <linux/kthread.h> #define NLMDBG_FACILITY NLMDBG_SVCLOCK @@ -226,8 +227,7 @@ failed: } /* - * Delete a block. If the lock was cancelled or the grant callback - * failed, unlock is set to 1. + * Delete a block. * It is the caller's responsibility to check whether the file * can be closed hereafter. */ @@ -632,7 +632,7 @@ nlmsvc_update_deferred_block(struct nlm_block *block, struct file_lock *conf, block->b_flags |= B_TIMED_OUT; if (conf) { if (block->b_fl) - locks_copy_lock(block->b_fl, conf); + __locks_copy_lock(block->b_fl, conf); } } @@ -752,7 +752,7 @@ nlmsvc_grant_blocked(struct nlm_block *block) return; default: printk(KERN_WARNING "lockd: unexpected error %d in %s!\n", - -error, __FUNCTION__); + -error, __func__); nlmsvc_insert_block(block, 10 * HZ); nlmsvc_release_block(block); return; @@ -887,7 +887,7 @@ nlmsvc_retry_blocked(void) unsigned long timeout = MAX_SCHEDULE_TIMEOUT; struct nlm_block *block; - while (!list_empty(&nlm_blocked)) { + while (!list_empty(&nlm_blocked) && !kthread_should_stop()) { block = list_entry(nlm_blocked.next, struct nlm_block, b_list); if (block->b_when == NLM_NEVER) diff --git a/fs/lockd/svcshare.c b/fs/lockd/svcshare.c index 068886de4dd..b0ae0700870 100644 --- a/fs/lockd/svcshare.c +++ b/fs/lockd/svcshare.c @@ -71,7 +71,8 @@ nlmsvc_unshare_file(struct nlm_host *host, struct nlm_file *file, struct nlm_share *share, **shpp; struct xdr_netobj *oh = &argp->lock.oh; - for (shpp = &file->f_shares; (share = *shpp) != 0; shpp = &share->s_next) { + for (shpp = &file->f_shares; (share = *shpp) != NULL; + shpp = &share->s_next) { if (share->s_host == host && nlm_cmp_owner(share, oh)) { *shpp = share->s_next; kfree(share); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index dbbefbcd671..d1c48b539df 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -18,6 +18,8 @@ #include <linux/lockd/lockd.h> #include <linux/lockd/share.h> #include <linux/lockd/sm_inter.h> +#include <linux/module.h> +#include <linux/mount.h> #define NLMDBG_FACILITY NLMDBG_SVCSUBS @@ -194,6 +196,12 @@ again: return 0; } +static int +nlmsvc_always_match(void *dummy1, struct nlm_host *dummy2) +{ + return 1; +} + /* * Inspect a single file */ @@ -230,7 +238,8 @@ nlm_file_inuse(struct nlm_file *file) * Loop over all files in the file table. */ static int -nlm_traverse_files(struct nlm_host *host, nlm_host_match_fn_t match) +nlm_traverse_files(void *data, nlm_host_match_fn_t match, + int (*is_failover_file)(void *data, struct nlm_file *file)) { struct hlist_node *pos, *next; struct nlm_file *file; @@ -239,12 +248,14 @@ nlm_traverse_files(struct nlm_host *host, nlm_host_match_fn_t match) mutex_lock(&nlm_file_mutex); for (i = 0; i < FILE_NRHASH; i++) { hlist_for_each_entry_safe(file, pos, next, &nlm_files[i], f_list) { + if (is_failover_file && !is_failover_file(data, file)) + continue; file->f_count++; mutex_unlock(&nlm_file_mutex); /* Traverse locks, blocks and shares of this file * and update file->f_locks count */ - if (nlm_inspect_file(host, file, match)) + if (nlm_inspect_file(data, file, match)) ret = 1; mutex_lock(&nlm_file_mutex); @@ -303,21 +314,27 @@ nlm_release_file(struct nlm_file *file) * Used by nlmsvc_invalidate_all */ static int -nlmsvc_mark_host(struct nlm_host *host, struct nlm_host *dummy) +nlmsvc_mark_host(void *data, struct nlm_host *dummy) { + struct nlm_host *host = data; + host->h_inuse = 1; return 0; } static int -nlmsvc_same_host(struct nlm_host *host, struct nlm_host *other) +nlmsvc_same_host(void *data, struct nlm_host *other) { + struct nlm_host *host = data; + return host == other; } static int -nlmsvc_is_client(struct nlm_host *host, struct nlm_host *dummy) +nlmsvc_is_client(void *data, struct nlm_host *dummy) { + struct nlm_host *host = data; + if (host->h_server) { /* we are destroying locks even though the client * hasn't asked us too, so don't unmonitor the @@ -337,7 +354,7 @@ void nlmsvc_mark_resources(void) { dprintk("lockd: nlmsvc_mark_resources\n"); - nlm_traverse_files(NULL, nlmsvc_mark_host); + nlm_traverse_files(NULL, nlmsvc_mark_host, NULL); } /* @@ -348,7 +365,7 @@ nlmsvc_free_host_resources(struct nlm_host *host) { dprintk("lockd: nlmsvc_free_host_resources\n"); - if (nlm_traverse_files(host, nlmsvc_same_host)) { + if (nlm_traverse_files(host, nlmsvc_same_host, NULL)) { printk(KERN_WARNING "lockd: couldn't remove all locks held by %s\n", host->h_name); @@ -368,5 +385,41 @@ nlmsvc_invalidate_all(void) * turn, which is about as inefficient as it gets. * Now we just do it once in nlm_traverse_files. */ - nlm_traverse_files(NULL, nlmsvc_is_client); + nlm_traverse_files(NULL, nlmsvc_is_client, NULL); +} + +static int +nlmsvc_match_sb(void *datap, struct nlm_file *file) +{ + struct super_block *sb = datap; + + return sb == file->f_file->f_path.mnt->mnt_sb; +} + +int +nlmsvc_unlock_all_by_sb(struct super_block *sb) +{ + int ret; + + ret = nlm_traverse_files(sb, nlmsvc_always_match, nlmsvc_match_sb); + return ret ? -EIO : 0; +} +EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb); + +static int +nlmsvc_match_ip(void *datap, struct nlm_host *host) +{ + __be32 *server_addr = datap; + + return host->h_saddr.sin_addr.s_addr == *server_addr; +} + +int +nlmsvc_unlock_all_by_ip(__be32 server_addr) +{ + int ret; + ret = nlm_traverse_files(&server_addr, nlmsvc_match_ip, NULL); + return ret ? -EIO : 0; + } +EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_ip); diff --git a/fs/locks.c b/fs/locks.c index 592faadbcec..663c069b59b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -116,6 +116,7 @@ #include <linux/capability.h> #include <linux/file.h> +#include <linux/fdtable.h> #include <linux/fs.h> #include <linux/init.h> #include <linux/module.h> @@ -224,7 +225,7 @@ static void locks_copy_private(struct file_lock *new, struct file_lock *fl) /* * Initialize a new lock from an existing file_lock structure. */ -static void __locks_copy_lock(struct file_lock *new, const struct file_lock *fl) +void __locks_copy_lock(struct file_lock *new, const struct file_lock *fl) { new->fl_owner = fl->fl_owner; new->fl_pid = fl->fl_pid; @@ -236,6 +237,7 @@ static void __locks_copy_lock(struct file_lock *new, const struct file_lock *fl) new->fl_ops = NULL; new->fl_lmops = NULL; } +EXPORT_SYMBOL(__locks_copy_lock); void locks_copy_lock(struct file_lock *new, struct file_lock *fl) { @@ -833,7 +835,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str if (!posix_locks_conflict(request, fl)) continue; if (conflock) - locks_copy_lock(conflock, fl); + __locks_copy_lock(conflock, fl); error = -EAGAIN; if (!(request->fl_flags & FL_SLEEP)) goto out; @@ -1367,18 +1369,20 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) lease = *flp; - error = -EAGAIN; - if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) - goto out; - if ((arg == F_WRLCK) - && ((atomic_read(&dentry->d_count) > 1) - || (atomic_read(&inode->i_count) > 1))) - goto out; + if (arg != F_UNLCK) { + error = -ENOMEM; + new_fl = locks_alloc_lock(); + if (new_fl == NULL) + goto out; - error = -ENOMEM; - new_fl = locks_alloc_lock(); - if (new_fl == NULL) - goto out; + error = -EAGAIN; + if ((arg == F_RDLCK) && (atomic_read(&inode->i_writecount) > 0)) + goto out; + if ((arg == F_WRLCK) + && ((atomic_read(&dentry->d_count) > 1) + || (atomic_read(&inode->i_count) > 1))) + goto out; + } /* * At this point, we know that if there is an exclusive @@ -1404,6 +1408,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) rdlease_count++; } + error = -EAGAIN; if ((arg == F_RDLCK && (wrlease_count > 0)) || (arg == F_WRLCK && ((rdlease_count + wrlease_count) > 0))) goto out; @@ -1490,8 +1495,7 @@ EXPORT_SYMBOL_GPL(vfs_setlease); int fcntl_setlease(unsigned int fd, struct file *filp, long arg) { struct file_lock fl, *flp = &fl; - struct dentry *dentry = filp->f_path.dentry; - struct inode *inode = dentry->d_inode; + struct inode *inode = filp->f_path.dentry->d_inode; int error; locks_init_lock(&fl); diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c index 30f7d0ae221..05ff4f1d702 100644 --- a/fs/msdos/namei.c +++ b/fs/msdos/namei.c @@ -609,7 +609,7 @@ error_inode: if (corrupt < 0) { fat_fs_panic(new_dir->i_sb, "%s: Filesystem corrupted (i_pos %lld)", - __FUNCTION__, sinfo.i_pos); + __func__, sinfo.i_pos); } goto out; } @@ -653,7 +653,7 @@ static const struct inode_operations msdos_dir_inode_operations = { .mkdir = msdos_mkdir, .rmdir = msdos_rmdir, .rename = msdos_rename, - .setattr = fat_notify_change, + .setattr = fat_setattr, .getattr = fat_getattr, }; diff --git a/fs/namei.c b/fs/namei.c index e179f71bfcb..32fd9655485 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -30,6 +30,7 @@ #include <linux/capability.h> #include <linux/file.h> #include <linux/fcntl.h> +#include <linux/device_cgroup.h> #include <asm/namei.h> #include <asm/uaccess.h> @@ -281,6 +282,10 @@ int permission(struct inode *inode, int mask, struct nameidata *nd) if (retval) return retval; + retval = devcgroup_inode_permission(inode, mask); + if (retval) + return retval; + return security_inode_permission(inode, mask, nd); } @@ -2028,6 +2033,10 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t dev) if (!dir->i_op || !dir->i_op->mknod) return -EPERM; + error = devcgroup_inode_mknod(mode, dev); + if (error) + return error; + error = security_inode_mknod(dir, dentry, mode, dev); if (error) return error; diff --git a/fs/namespace.c b/fs/namespace.c index 1bf302d0478..4fc302c2a0e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -14,7 +14,6 @@ #include <linux/smp_lock.h> #include <linux/init.h> #include <linux/kernel.h> -#include <linux/quotaops.h> #include <linux/acct.h> #include <linux/capability.h> #include <linux/cpumask.h> @@ -27,6 +26,7 @@ #include <linux/mount.h> #include <linux/ramfs.h> #include <linux/log2.h> +#include <linux/idr.h> #include <asm/uaccess.h> #include <asm/unistd.h> #include "pnode.h" @@ -39,6 +39,8 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(vfsmount_lock); static int event; +static DEFINE_IDA(mnt_id_ida); +static DEFINE_IDA(mnt_group_ida); static struct list_head *mount_hashtable __read_mostly; static struct kmem_cache *mnt_cache __read_mostly; @@ -58,10 +60,63 @@ static inline unsigned long hash(struct vfsmount *mnt, struct dentry *dentry) #define MNT_WRITER_UNDERFLOW_LIMIT -(1<<16) +/* allocation is serialized by namespace_sem */ +static int mnt_alloc_id(struct vfsmount *mnt) +{ + int res; + +retry: + ida_pre_get(&mnt_id_ida, GFP_KERNEL); + spin_lock(&vfsmount_lock); + res = ida_get_new(&mnt_id_ida, &mnt->mnt_id); + spin_unlock(&vfsmount_lock); + if (res == -EAGAIN) + goto retry; + + return res; +} + +static void mnt_free_id(struct vfsmount *mnt) +{ + spin_lock(&vfsmount_lock); + ida_remove(&mnt_id_ida, mnt->mnt_id); + spin_unlock(&vfsmount_lock); +} + +/* + * Allocate a new peer group ID + * + * mnt_group_ida is protected by namespace_sem + */ +static int mnt_alloc_group_id(struct vfsmount *mnt) +{ + if (!ida_pre_get(&mnt_group_ida, GFP_KERNEL)) + return -ENOMEM; + + return ida_get_new_above(&mnt_group_ida, 1, &mnt->mnt_group_id); +} + +/* + * Release a peer group ID + */ +void mnt_release_group_id(struct vfsmount *mnt) +{ + ida_remove(&mnt_group_ida, mnt->mnt_group_id); + mnt->mnt_group_id = 0; +} + struct vfsmount *alloc_vfsmnt(const char *name) { struct vfsmount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL); if (mnt) { + int err; + + err = mnt_alloc_id(mnt); + if (err) { + kmem_cache_free(mnt_cache, mnt); + return NULL; + } + atomic_set(&mnt->mnt_count, 1); INIT_LIST_HEAD(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); @@ -353,6 +408,7 @@ EXPORT_SYMBOL(simple_set_mnt); void free_vfsmnt(struct vfsmount *mnt) { kfree(mnt->mnt_devname); + mnt_free_id(mnt); kmem_cache_free(mnt_cache, mnt); } @@ -499,6 +555,17 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, struct vfsmount *mnt = alloc_vfsmnt(old->mnt_devname); if (mnt) { + if (flag & (CL_SLAVE | CL_PRIVATE)) + mnt->mnt_group_id = 0; /* not a peer of original */ + else + mnt->mnt_group_id = old->mnt_group_id; + + if ((flag & CL_MAKE_SHARED) && !mnt->mnt_group_id) { + int err = mnt_alloc_group_id(mnt); + if (err) + goto out_free; + } + mnt->mnt_flags = old->mnt_flags; atomic_inc(&sb->s_active); mnt->mnt_sb = sb; @@ -528,6 +595,10 @@ static struct vfsmount *clone_mnt(struct vfsmount *old, struct dentry *root, } } return mnt; + + out_free: + free_vfsmnt(mnt); + return NULL; } static inline void __mntput(struct vfsmount *mnt) @@ -652,20 +723,21 @@ void save_mount_options(struct super_block *sb, char *options) } EXPORT_SYMBOL(save_mount_options); +#ifdef CONFIG_PROC_FS /* iterator */ static void *m_start(struct seq_file *m, loff_t *pos) { - struct mnt_namespace *n = m->private; + struct proc_mounts *p = m->private; down_read(&namespace_sem); - return seq_list_start(&n->list, *pos); + return seq_list_start(&p->ns->list, *pos); } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct mnt_namespace *n = m->private; + struct proc_mounts *p = m->private; - return seq_list_next(v, &n->list, pos); + return seq_list_next(v, &p->ns->list, pos); } static void m_stop(struct seq_file *m, void *v) @@ -673,20 +745,30 @@ static void m_stop(struct seq_file *m, void *v) up_read(&namespace_sem); } -static int show_vfsmnt(struct seq_file *m, void *v) +struct proc_fs_info { + int flag; + const char *str; +}; + +static void show_sb_opts(struct seq_file *m, struct super_block *sb) { - struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); - int err = 0; - static struct proc_fs_info { - int flag; - char *str; - } fs_info[] = { + static const struct proc_fs_info fs_info[] = { { MS_SYNCHRONOUS, ",sync" }, { MS_DIRSYNC, ",dirsync" }, { MS_MANDLOCK, ",mand" }, { 0, NULL } }; - static struct proc_fs_info mnt_info[] = { + const struct proc_fs_info *fs_infop; + + for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { + if (sb->s_flags & fs_infop->flag) + seq_puts(m, fs_infop->str); + } +} + +static void show_mnt_opts(struct seq_file *m, struct vfsmount *mnt) +{ + static const struct proc_fs_info mnt_info[] = { { MNT_NOSUID, ",nosuid" }, { MNT_NODEV, ",nodev" }, { MNT_NOEXEC, ",noexec" }, @@ -695,40 +777,108 @@ static int show_vfsmnt(struct seq_file *m, void *v) { MNT_RELATIME, ",relatime" }, { 0, NULL } }; - struct proc_fs_info *fs_infop; + const struct proc_fs_info *fs_infop; + + for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) { + if (mnt->mnt_flags & fs_infop->flag) + seq_puts(m, fs_infop->str); + } +} + +static void show_type(struct seq_file *m, struct super_block *sb) +{ + mangle(m, sb->s_type->name); + if (sb->s_subtype && sb->s_subtype[0]) { + seq_putc(m, '.'); + mangle(m, sb->s_subtype); + } +} + +static int show_vfsmnt(struct seq_file *m, void *v) +{ + struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); + int err = 0; struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); seq_putc(m, ' '); seq_path(m, &mnt_path, " \t\n\\"); seq_putc(m, ' '); - mangle(m, mnt->mnt_sb->s_type->name); - if (mnt->mnt_sb->s_subtype && mnt->mnt_sb->s_subtype[0]) { - seq_putc(m, '.'); - mangle(m, mnt->mnt_sb->s_subtype); - } + show_type(m, mnt->mnt_sb); seq_puts(m, __mnt_is_readonly(mnt) ? " ro" : " rw"); - for (fs_infop = fs_info; fs_infop->flag; fs_infop++) { - if (mnt->mnt_sb->s_flags & fs_infop->flag) - seq_puts(m, fs_infop->str); - } - for (fs_infop = mnt_info; fs_infop->flag; fs_infop++) { - if (mnt->mnt_flags & fs_infop->flag) - seq_puts(m, fs_infop->str); - } + show_sb_opts(m, mnt->mnt_sb); + show_mnt_opts(m, mnt); if (mnt->mnt_sb->s_op->show_options) err = mnt->mnt_sb->s_op->show_options(m, mnt); seq_puts(m, " 0 0\n"); return err; } -struct seq_operations mounts_op = { +const struct seq_operations mounts_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_vfsmnt }; +static int show_mountinfo(struct seq_file *m, void *v) +{ + struct proc_mounts *p = m->private; + struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); + struct super_block *sb = mnt->mnt_sb; + struct path mnt_path = { .dentry = mnt->mnt_root, .mnt = mnt }; + struct path root = p->root; + int err = 0; + + seq_printf(m, "%i %i %u:%u ", mnt->mnt_id, mnt->mnt_parent->mnt_id, + MAJOR(sb->s_dev), MINOR(sb->s_dev)); + seq_dentry(m, mnt->mnt_root, " \t\n\\"); + seq_putc(m, ' '); + seq_path_root(m, &mnt_path, &root, " \t\n\\"); + if (root.mnt != p->root.mnt || root.dentry != p->root.dentry) { + /* + * Mountpoint is outside root, discard that one. Ugly, + * but less so than trying to do that in iterator in a + * race-free way (due to renames). + */ + return SEQ_SKIP; + } + seq_puts(m, mnt->mnt_flags & MNT_READONLY ? " ro" : " rw"); + show_mnt_opts(m, mnt); + + /* Tagged fields ("foo:X" or "bar") */ + if (IS_MNT_SHARED(mnt)) + seq_printf(m, " shared:%i", mnt->mnt_group_id); + if (IS_MNT_SLAVE(mnt)) { + int master = mnt->mnt_master->mnt_group_id; + int dom = get_dominating_id(mnt, &p->root); + seq_printf(m, " master:%i", master); + if (dom && dom != master) + seq_printf(m, " propagate_from:%i", dom); + } + if (IS_MNT_UNBINDABLE(mnt)) + seq_puts(m, " unbindable"); + + /* Filesystem specific data */ + seq_puts(m, " - "); + show_type(m, sb); + seq_putc(m, ' '); + mangle(m, mnt->mnt_devname ? mnt->mnt_devname : "none"); + seq_puts(m, sb->s_flags & MS_RDONLY ? " ro" : " rw"); + show_sb_opts(m, sb); + if (sb->s_op->show_options) + err = sb->s_op->show_options(m, mnt); + seq_putc(m, '\n'); + return err; +} + +const struct seq_operations mountinfo_op = { + .start = m_start, + .next = m_next, + .stop = m_stop, + .show = show_mountinfo, +}; + static int show_vfsstat(struct seq_file *m, void *v) { struct vfsmount *mnt = list_entry(v, struct vfsmount, mnt_list); @@ -749,7 +899,7 @@ static int show_vfsstat(struct seq_file *m, void *v) /* file system type */ seq_puts(m, "with fstype "); - mangle(m, mnt->mnt_sb->s_type->name); + show_type(m, mnt->mnt_sb); /* optional statistics */ if (mnt->mnt_sb->s_op->show_stats) { @@ -761,12 +911,13 @@ static int show_vfsstat(struct seq_file *m, void *v) return err; } -struct seq_operations mountstats_op = { +const struct seq_operations mountstats_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_vfsstat, }; +#endif /* CONFIG_PROC_FS */ /** * may_umount_tree - check if a mount tree is busy @@ -909,10 +1060,11 @@ static int do_umount(struct vfsmount *mnt, int flags) * about for the moment. */ - lock_kernel(); - if (sb->s_op->umount_begin) - sb->s_op->umount_begin(mnt, flags); - unlock_kernel(); + if (flags & MNT_FORCE && sb->s_op->umount_begin) { + lock_kernel(); + sb->s_op->umount_begin(sb); + unlock_kernel(); + } /* * No sense to grab the lock for this test, but test itself looks @@ -931,7 +1083,6 @@ static int do_umount(struct vfsmount *mnt, int flags) down_write(&sb->s_umount); if (!(sb->s_flags & MS_RDONLY)) { lock_kernel(); - DQUOT_OFF(sb); retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); unlock_kernel(); } @@ -1025,17 +1176,6 @@ static int mount_is_safe(struct nameidata *nd) #endif } -static int lives_below_in_same_fs(struct dentry *d, struct dentry *dentry) -{ - while (1) { - if (d == dentry) - return 1; - if (d == NULL || d == d->d_parent) - return 0; - d = d->d_parent; - } -} - struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, int flag) { @@ -1052,7 +1192,7 @@ struct vfsmount *copy_tree(struct vfsmount *mnt, struct dentry *dentry, p = mnt; list_for_each_entry(r, &mnt->mnt_mounts, mnt_child) { - if (!lives_below_in_same_fs(r->mnt_mountpoint, dentry)) + if (!is_subdir(r->mnt_mountpoint, dentry)) continue; for (s = r; s; s = next_mnt(s, r)) { @@ -1108,6 +1248,33 @@ void drop_collected_mounts(struct vfsmount *mnt) release_mounts(&umount_list); } +static void cleanup_group_ids(struct vfsmount *mnt, struct vfsmount *end) +{ + struct vfsmount *p; + + for (p = mnt; p != end; p = next_mnt(p, mnt)) { + if (p->mnt_group_id && !IS_MNT_SHARED(p)) + mnt_release_group_id(p); + } +} + +static int invent_group_ids(struct vfsmount *mnt, bool recurse) +{ + struct vfsmount *p; + + for (p = mnt; p; p = recurse ? next_mnt(p, mnt) : NULL) { + if (!p->mnt_group_id && !IS_MNT_SHARED(p)) { + int err = mnt_alloc_group_id(p); + if (err) { + cleanup_group_ids(mnt, p); + return err; + } + } + } + + return 0; +} + /* * @source_mnt : mount tree to be attached * @nd : place the mount tree @source_mnt is attached @@ -1178,9 +1345,16 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, struct vfsmount *dest_mnt = path->mnt; struct dentry *dest_dentry = path->dentry; struct vfsmount *child, *p; + int err; - if (propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list)) - return -EINVAL; + if (IS_MNT_SHARED(dest_mnt)) { + err = invent_group_ids(source_mnt, true); + if (err) + goto out; + } + err = propagate_mnt(dest_mnt, dest_dentry, source_mnt, &tree_list); + if (err) + goto out_cleanup_ids; if (IS_MNT_SHARED(dest_mnt)) { for (p = source_mnt; p; p = next_mnt(p, source_mnt)) @@ -1203,6 +1377,12 @@ static int attach_recursive_mnt(struct vfsmount *source_mnt, } spin_unlock(&vfsmount_lock); return 0; + + out_cleanup_ids: + if (IS_MNT_SHARED(dest_mnt)) + cleanup_group_ids(source_mnt, NULL); + out: + return err; } static int graft_tree(struct vfsmount *mnt, struct path *path) @@ -1243,6 +1423,7 @@ static noinline int do_change_type(struct nameidata *nd, int flag) struct vfsmount *m, *mnt = nd->path.mnt; int recurse = flag & MS_REC; int type = flag & ~MS_REC; + int err = 0; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1251,12 +1432,20 @@ static noinline int do_change_type(struct nameidata *nd, int flag) return -EINVAL; down_write(&namespace_sem); + if (type == MS_SHARED) { + err = invent_group_ids(mnt, recurse); + if (err) + goto out_unlock; + } + spin_lock(&vfsmount_lock); for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); spin_unlock(&vfsmount_lock); + + out_unlock: up_write(&namespace_sem); - return 0; + return err; } /* @@ -2140,10 +2329,10 @@ void __init mnt_init(void) err = sysfs_init(); if (err) printk(KERN_WARNING "%s: sysfs_init error: %d\n", - __FUNCTION__, err); + __func__, err); fs_kobj = kobject_create_and_add("fs", NULL); if (!fs_kobj) - printk(KERN_WARNING "%s: kobj create error\n", __FUNCTION__); + printk(KERN_WARNING "%s: kobj create error\n", __func__); init_rootfs(); init_mount_tree(); } diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index fbbb9f7afa1..2e5ab1204de 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -107,12 +107,6 @@ static const struct super_operations ncp_sops = .show_options = ncp_show_options, }; -extern struct dentry_operations ncp_root_dentry_operations; -#if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) -extern const struct address_space_operations ncp_symlink_aops; -extern int ncp_symlink(struct inode*, struct dentry*, const char*); -#endif - /* * Fill in the ncpfs-specific information in the inode. */ diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index ad8f167e54b..3a97c95e1ca 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -389,11 +389,11 @@ static int __ncp_ioctl(struct inode *inode, struct file *filp, struct dentry* dentry = inode->i_sb->s_root; if (dentry) { - struct inode* inode = dentry->d_inode; + struct inode* s_inode = dentry->d_inode; - if (inode) { - sr.volNumber = NCP_FINFO(inode)->volNumber; - sr.dirEntNum = NCP_FINFO(inode)->dirEntNum; + if (s_inode) { + sr.volNumber = NCP_FINFO(s_inode)->volNumber; + sr.dirEntNum = NCP_FINFO(s_inode)->dirEntNum; sr.namespace = server->name_space[sr.volNumber]; } else DPRINTK("ncpfs: s_root->d_inode==NULL\n"); @@ -439,12 +439,12 @@ static int __ncp_ioctl(struct inode *inode, struct file *filp, dentry = inode->i_sb->s_root; server->root_setuped = 1; if (dentry) { - struct inode* inode = dentry->d_inode; + struct inode* s_inode = dentry->d_inode; if (inode) { - NCP_FINFO(inode)->volNumber = vnum; - NCP_FINFO(inode)->dirEntNum = de; - NCP_FINFO(inode)->DosDirNum = dosde; + NCP_FINFO(s_inode)->volNumber = vnum; + NCP_FINFO(s_inode)->dirEntNum = de; + NCP_FINFO(s_inode)->DosDirNum = dosde; } else DPRINTK("ncpfs: s_root->d_inode==NULL\n"); } else @@ -519,7 +519,6 @@ static int __ncp_ioctl(struct inode *inode, struct file *filp, } { struct ncp_lock_ioctl rqdata; - int result; if (copy_from_user(&rqdata, argp, sizeof(rqdata))) return -EFAULT; diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c index df6d60bdfcd..97645f11211 100644 --- a/fs/ncpfs/ncplib_kernel.c +++ b/fs/ncpfs/ncplib_kernel.c @@ -102,48 +102,47 @@ static inline void ncp_init_request_s(struct ncp_server *server, int subfunction } static inline char * - ncp_reply_data(struct ncp_server *server, int offset) +ncp_reply_data(struct ncp_server *server, int offset) { return &(server->packet[sizeof(struct ncp_reply_header) + offset]); } -static inline __u8 BVAL(void* data) +static inline u8 BVAL(void *data) { - return get_unaligned((__u8*)data); + return *(u8 *)data; } -static __u8 - ncp_reply_byte(struct ncp_server *server, int offset) +static u8 ncp_reply_byte(struct ncp_server *server, int offset) { - return get_unaligned((__u8 *) ncp_reply_data(server, offset)); + return *(u8 *)ncp_reply_data(server, offset); } -static inline __u16 WVAL_LH(void* data) +static inline u16 WVAL_LH(void *data) { - return le16_to_cpu(get_unaligned((__le16*)data)); + return get_unaligned_le16(data); } -static __u16 - ncp_reply_le16(struct ncp_server *server, int offset) +static u16 +ncp_reply_le16(struct ncp_server *server, int offset) { - return le16_to_cpu(get_unaligned((__le16 *) ncp_reply_data(server, offset))); + return get_unaligned_le16(ncp_reply_data(server, offset)); } -static __u16 - ncp_reply_be16(struct ncp_server *server, int offset) +static u16 +ncp_reply_be16(struct ncp_server *server, int offset) { - return be16_to_cpu(get_unaligned((__be16 *) ncp_reply_data(server, offset))); + return get_unaligned_be16(ncp_reply_data(server, offset)); } -static inline __u32 DVAL_LH(void* data) +static inline u32 DVAL_LH(void *data) { - return le32_to_cpu(get_unaligned((__le32*)data)); + return get_unaligned_le32(data); } static __le32 - ncp_reply_dword(struct ncp_server *server, int offset) +ncp_reply_dword(struct ncp_server *server, int offset) { - return get_unaligned((__le32 *) ncp_reply_data(server, offset)); + return get_unaligned((__le32 *)ncp_reply_data(server, offset)); } static inline __u32 ncp_reply_dword_lh(struct ncp_server* server, int offset) { @@ -1006,8 +1005,8 @@ ncp_read_bounce(struct ncp_server *server, const char *file_id, result = ncp_request2(server, 72, bounce, bufsize); ncp_unlock_server(server); if (!result) { - int len = be16_to_cpu(get_unaligned((__be16*)((char*)bounce + - sizeof(struct ncp_reply_header)))); + int len = get_unaligned_be16((char *)bounce + + sizeof(struct ncp_reply_header)); result = -EIO; if (len <= to_read) { char* source; diff --git a/fs/ncpfs/ncpsign_kernel.c b/fs/ncpfs/ncpsign_kernel.c index 749a18d3359..7c0b5c21e6c 100644 --- a/fs/ncpfs/ncpsign_kernel.c +++ b/fs/ncpfs/ncpsign_kernel.c @@ -55,7 +55,7 @@ static void nwsign(char *r_data1, char *r_data2, char *outdata) { unsigned int w0,w1,w2,w3; static int rbit[4]={0, 2, 1, 3}; #ifdef __i386__ - unsigned int *data2=(int *)r_data2; + unsigned int *data2=(unsigned int *)r_data2; #else unsigned int data2[16]; for (i=0;i<16;i++) diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index df0f41e0988..ac6170c594a 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_NFS_FS) += nfs.o nfs-y := client.o dir.o file.o getroot.o inode.o super.o nfs2xdr.o \ - pagelist.o proc.o read.o symlink.o unlink.o \ + direct.o pagelist.o proc.o read.o symlink.o unlink.o \ write.o namespace.o mount_clnt.o nfs-$(CONFIG_ROOT_NFS) += nfsroot.o nfs-$(CONFIG_NFS_V3) += nfs3proc.o nfs3xdr.o @@ -14,5 +14,4 @@ nfs-$(CONFIG_NFS_V4) += nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o \ delegation.o idmap.o \ callback.o callback_xdr.o callback_proc.o \ nfs4namespace.o -nfs-$(CONFIG_NFS_DIRECTIO) += direct.o nfs-$(CONFIG_SYSCTL) += sysctl.o diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 66648dd92d9..5606ae3d72d 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -15,6 +15,7 @@ #include <linux/nfs_fs.h> #include <linux/mutex.h> #include <linux/freezer.h> +#include <linux/kthread.h> #include <net/inet_sock.h> @@ -27,9 +28,7 @@ struct nfs_callback_data { unsigned int users; struct svc_serv *serv; - pid_t pid; - struct completion started; - struct completion stopped; + struct task_struct *task; }; static struct nfs_callback_data nfs_callback_info; @@ -57,48 +56,44 @@ module_param_call(callback_tcpport, param_set_port, param_get_int, /* * This is the callback kernel thread. */ -static void nfs_callback_svc(struct svc_rqst *rqstp) +static int +nfs_callback_svc(void *vrqstp) { - int err; + int err, preverr = 0; + struct svc_rqst *rqstp = vrqstp; - __module_get(THIS_MODULE); - lock_kernel(); - - nfs_callback_info.pid = current->pid; - daemonize("nfsv4-svc"); - /* Process request with signals blocked, but allow SIGKILL. */ - allow_signal(SIGKILL); set_freezable(); - complete(&nfs_callback_info.started); - - for(;;) { - if (signalled()) { - if (nfs_callback_info.users == 0) - break; - flush_signals(current); - } + /* + * FIXME: do we really need to run this under the BKL? If so, please + * add a comment about what it's intended to protect. + */ + lock_kernel(); + while (!kthread_should_stop()) { /* * Listen for a request on the socket */ err = svc_recv(rqstp, MAX_SCHEDULE_TIMEOUT); - if (err == -EAGAIN || err == -EINTR) + if (err == -EAGAIN || err == -EINTR) { + preverr = err; continue; + } if (err < 0) { - printk(KERN_WARNING - "%s: terminating on error %d\n", - __FUNCTION__, -err); - break; + if (err != preverr) { + printk(KERN_WARNING "%s: unexpected error " + "from svc_recv (%d)\n", __func__, err); + preverr = err; + } + schedule_timeout_uninterruptible(HZ); + continue; } + preverr = err; svc_process(rqstp); } - - flush_signals(current); - svc_exit_thread(rqstp); - nfs_callback_info.pid = 0; - complete(&nfs_callback_info.stopped); unlock_kernel(); - module_put_and_exit(0); + nfs_callback_info.task = NULL; + svc_exit_thread(rqstp); + return 0; } /* @@ -107,14 +102,13 @@ static void nfs_callback_svc(struct svc_rqst *rqstp) int nfs_callback_up(void) { struct svc_serv *serv = NULL; + struct svc_rqst *rqstp; int ret = 0; lock_kernel(); mutex_lock(&nfs_callback_mutex); - if (nfs_callback_info.users++ || nfs_callback_info.pid != 0) + if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) goto out; - init_completion(&nfs_callback_info.started); - init_completion(&nfs_callback_info.stopped); serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); ret = -ENOMEM; if (!serv) @@ -127,15 +121,28 @@ int nfs_callback_up(void) nfs_callback_tcpport = ret; dprintk("Callback port = 0x%x\n", nfs_callback_tcpport); - ret = svc_create_thread(nfs_callback_svc, serv); - if (ret < 0) + rqstp = svc_prepare_thread(serv, &serv->sv_pools[0]); + if (IS_ERR(rqstp)) { + ret = PTR_ERR(rqstp); goto out_err; + } + + svc_sock_update_bufs(serv); nfs_callback_info.serv = serv; - wait_for_completion(&nfs_callback_info.started); + + nfs_callback_info.task = kthread_run(nfs_callback_svc, rqstp, + "nfsv4-svc"); + if (IS_ERR(nfs_callback_info.task)) { + ret = PTR_ERR(nfs_callback_info.task); + nfs_callback_info.serv = NULL; + nfs_callback_info.task = NULL; + svc_exit_thread(rqstp); + goto out_err; + } out: /* * svc_create creates the svc_serv with sv_nrthreads == 1, and then - * svc_create_thread increments that. So we need to call svc_destroy + * svc_prepare_thread increments that. So we need to call svc_destroy * on both success and failure so that the refcount is 1 when the * thread exits. */ @@ -152,19 +159,15 @@ out_err: } /* - * Kill the server process if it is not already up. + * Kill the server process if it is not already down. */ void nfs_callback_down(void) { lock_kernel(); mutex_lock(&nfs_callback_mutex); nfs_callback_info.users--; - do { - if (nfs_callback_info.users != 0 || nfs_callback_info.pid == 0) - break; - if (kill_proc(nfs_callback_info.pid, SIGKILL, 1) < 0) - break; - } while (wait_for_completion_timeout(&nfs_callback_info.stopped, 5*HZ) == 0); + if (nfs_callback_info.users == 0 && nfs_callback_info.task != NULL) + kthread_stop(nfs_callback_info.task); mutex_unlock(&nfs_callback_mutex); unlock_kernel(); } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index c5c0175898f..89ac5bb0401 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -112,6 +112,7 @@ struct nfs_client_initdata { static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) { struct nfs_client *clp; + struct rpc_cred *cred; if ((clp = kzalloc(sizeof(*clp), GFP_KERNEL)) == NULL) goto error_0; @@ -150,6 +151,9 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ clp->cl_boot_time = CURRENT_TIME; clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; #endif + cred = rpc_lookup_machine_cred(); + if (!IS_ERR(cred)) + clp->cl_machine_cred = cred; return clp; @@ -170,6 +174,8 @@ static void nfs4_shutdown_client(struct nfs_client *clp) BUG_ON(!RB_EMPTY_ROOT(&clp->cl_state_owners)); if (__test_and_clear_bit(NFS_CS_IDMAP, &clp->cl_res_state)) nfs_idmap_delete(clp); + + rpc_destroy_wait_queue(&clp->cl_rpcwaitq); #endif } @@ -189,6 +195,9 @@ static void nfs_free_client(struct nfs_client *clp) if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) nfs_callback_down(); + if (clp->cl_machine_cred != NULL) + put_rpccred(clp->cl_machine_cred); + kfree(clp->cl_hostname); kfree(clp); @@ -680,10 +689,22 @@ static int nfs_init_server(struct nfs_server *server, if (error < 0) goto error; + server->port = data->nfs_server.port; + error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); if (error < 0) goto error; + /* Preserve the values of mount_server-related mount options */ + if (data->mount_server.addrlen) { + memcpy(&server->mountd_address, &data->mount_server.address, + data->mount_server.addrlen); + server->mountd_addrlen = data->mount_server.addrlen; + } + server->mountd_version = data->mount_server.version; + server->mountd_port = data->mount_server.port; + server->mountd_protocol = data->mount_server.protocol; + server->namelen = data->namlen; /* Create a client RPC handle for the NFSv3 ACL management interface */ nfs_init_server_aclclient(server); @@ -1062,6 +1083,8 @@ static int nfs4_init_server(struct nfs_server *server, server->acdirmin = data->acdirmin * HZ; server->acdirmax = data->acdirmax * HZ; + server->port = data->nfs_server.port; + error = nfs_init_server_rpcclient(server, &timeparms, data->auth_flavors[0]); error: @@ -1298,6 +1321,7 @@ static const struct file_operations nfs_server_list_fops = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, + .owner = THIS_MODULE, }; static int nfs_volume_list_open(struct inode *inode, struct file *file); @@ -1318,6 +1342,7 @@ static const struct file_operations nfs_volume_list_fops = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, + .owner = THIS_MODULE, }; /* @@ -1477,33 +1502,29 @@ int __init nfs_fs_proc_init(void) { struct proc_dir_entry *p; - proc_fs_nfs = proc_mkdir("nfsfs", proc_root_fs); + proc_fs_nfs = proc_mkdir("fs/nfsfs", NULL); if (!proc_fs_nfs) goto error_0; proc_fs_nfs->owner = THIS_MODULE; /* a file of servers with which we're dealing */ - p = create_proc_entry("servers", S_IFREG|S_IRUGO, proc_fs_nfs); + p = proc_create("servers", S_IFREG|S_IRUGO, + proc_fs_nfs, &nfs_server_list_fops); if (!p) goto error_1; - p->proc_fops = &nfs_server_list_fops; - p->owner = THIS_MODULE; - /* a file of volumes that we have mounted */ - p = create_proc_entry("volumes", S_IFREG|S_IRUGO, proc_fs_nfs); + p = proc_create("volumes", S_IFREG|S_IRUGO, + proc_fs_nfs, &nfs_volume_list_fops); if (!p) goto error_2; - - p->proc_fops = &nfs_volume_list_fops; - p->owner = THIS_MODULE; return 0; error_2: remove_proc_entry("servers", proc_fs_nfs); error_1: - remove_proc_entry("nfsfs", proc_root_fs); + remove_proc_entry("fs/nfsfs", NULL); error_0: return -ENOMEM; } @@ -1515,7 +1536,7 @@ void nfs_fs_proc_exit(void) { remove_proc_entry("volumes", proc_fs_nfs); remove_proc_entry("servers", proc_fs_nfs); - remove_proc_entry("nfsfs", proc_root_fs); + remove_proc_entry("fs/nfsfs", NULL); } #endif /* CONFIG_PROC_FS */ diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d9e30ac2798..f288b3ecab4 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1967,7 +1967,7 @@ force_lookup: if (!NFS_PROTO(inode)->access) goto out_notsup; - cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0); + cred = rpc_lookup_cred(); if (!IS_ERR(cred)) { res = nfs_do_access(inode, cred, mask); put_rpccred(cred); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 16844f98f50..4757a2b326a 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -229,14 +229,20 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq) static void nfs_direct_read_result(struct rpc_task *task, void *calldata) { struct nfs_read_data *data = calldata; - struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; - if (nfs_readpage_result(task, data) != 0) - return; + nfs_readpage_result(task, data); +} + +static void nfs_direct_read_release(void *calldata) +{ + + struct nfs_read_data *data = calldata; + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + int status = data->task.tk_status; spin_lock(&dreq->lock); - if (unlikely(task->tk_status < 0)) { - dreq->error = task->tk_status; + if (unlikely(status < 0)) { + dreq->error = status; spin_unlock(&dreq->lock); } else { dreq->count += data->res.count; @@ -249,11 +255,12 @@ static void nfs_direct_read_result(struct rpc_task *task, void *calldata) if (put_dreq(dreq)) nfs_direct_complete(dreq); + nfs_readdata_release(calldata); } static const struct rpc_call_ops nfs_read_direct_ops = { .rpc_call_done = nfs_direct_read_result, - .rpc_release = nfs_readdata_release, + .rpc_release = nfs_direct_read_release, }; /* @@ -280,6 +287,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, .rpc_client = NFS_CLIENT(inode), .rpc_message = &msg, .callback_ops = &nfs_read_direct_ops, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; unsigned int pgbase; @@ -323,7 +331,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->inode = inode; data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); - data->args.context = ctx; + data->args.context = get_nfs_open_context(ctx); data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -339,8 +347,9 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, NFS_PROTO(inode)->read_setup(data, &msg); task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); + if (IS_ERR(task)) + break; + rpc_put_task(task); dprintk("NFS: %5u initiated direct read call " "(req %s/%Ld, %zu bytes @ offset %Lu)\n", @@ -446,6 +455,7 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) struct rpc_task_setup task_setup_data = { .rpc_client = NFS_CLIENT(inode), .callback_ops = &nfs_write_direct_ops, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; @@ -499,27 +509,34 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) static void nfs_direct_commit_result(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; - struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; /* Call the NFS version-specific code */ - if (NFS_PROTO(data->inode)->commit_done(task, data) != 0) - return; - if (unlikely(task->tk_status < 0)) { + NFS_PROTO(data->inode)->commit_done(task, data); +} + +static void nfs_direct_commit_release(void *calldata) +{ + struct nfs_write_data *data = calldata; + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + int status = data->task.tk_status; + + if (status < 0) { dprintk("NFS: %5u commit failed with error %d.\n", - task->tk_pid, task->tk_status); + data->task.tk_pid, status); dreq->flags = NFS_ODIRECT_RESCHED_WRITES; } else if (memcmp(&dreq->verf, &data->verf, sizeof(data->verf))) { - dprintk("NFS: %5u commit verify failed\n", task->tk_pid); + dprintk("NFS: %5u commit verify failed\n", data->task.tk_pid); dreq->flags = NFS_ODIRECT_RESCHED_WRITES; } - dprintk("NFS: %5u commit returned %d\n", task->tk_pid, task->tk_status); + dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); nfs_direct_write_complete(dreq, data->inode); + nfs_commitdata_release(calldata); } static const struct rpc_call_ops nfs_commit_direct_ops = { .rpc_call_done = nfs_direct_commit_result, - .rpc_release = nfs_commit_release, + .rpc_release = nfs_direct_commit_release, }; static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) @@ -537,6 +554,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) .rpc_message = &msg, .callback_ops = &nfs_commit_direct_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; @@ -546,6 +564,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) data->args.fh = NFS_FH(data->inode); data->args.offset = 0; data->args.count = 0; + data->args.context = get_nfs_open_context(dreq->ctx); data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; @@ -585,7 +604,7 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode static void nfs_alloc_commit_data(struct nfs_direct_req *dreq) { - dreq->commit_data = nfs_commit_alloc(); + dreq->commit_data = nfs_commitdata_alloc(); if (dreq->commit_data != NULL) dreq->commit_data->req = (struct nfs_page *) dreq; } @@ -606,11 +625,20 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode static void nfs_direct_write_result(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; - struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; - int status = task->tk_status; if (nfs_writeback_done(task, data) != 0) return; +} + +/* + * NB: Return the value of the first error return code. Subsequent + * errors after the first one are ignored. + */ +static void nfs_direct_write_release(void *calldata) +{ + struct nfs_write_data *data = calldata; + struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; + int status = data->task.tk_status; spin_lock(&dreq->lock); @@ -632,23 +660,13 @@ static void nfs_direct_write_result(struct rpc_task *task, void *calldata) break; case NFS_ODIRECT_DO_COMMIT: if (memcmp(&dreq->verf, &data->verf, sizeof(dreq->verf))) { - dprintk("NFS: %5u write verify failed\n", task->tk_pid); + dprintk("NFS: %5u write verify failed\n", data->task.tk_pid); dreq->flags = NFS_ODIRECT_RESCHED_WRITES; } } } out_unlock: spin_unlock(&dreq->lock); -} - -/* - * NB: Return the value of the first error return code. Subsequent - * errors after the first one are ignored. - */ -static void nfs_direct_write_release(void *calldata) -{ - struct nfs_write_data *data = calldata; - struct nfs_direct_req *dreq = (struct nfs_direct_req *) data->req; if (put_dreq(dreq)) nfs_direct_write_complete(dreq, data->inode); @@ -682,6 +700,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, .rpc_client = NFS_CLIENT(inode), .rpc_message = &msg, .callback_ops = &nfs_write_direct_ops, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; size_t wsize = NFS_SERVER(inode)->wsize; @@ -728,7 +747,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->inode = inode; data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); - data->args.context = ctx; + data->args.context = get_nfs_open_context(ctx); data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -745,8 +764,9 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, NFS_PROTO(inode)->write_setup(data, &msg); task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); + if (IS_ERR(task)) + break; + rpc_put_task(task); dprintk("NFS: %5u initiated direct write call " "(req %s/%Ld, %zu bytes @ offset %Lu)\n", diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 5d2e9d9a4e2..3536b01164f 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -238,10 +238,8 @@ nfs_file_read(struct kiocb *iocb, const struct iovec *iov, ssize_t result; size_t count = iov_length(iov, nr_segs); -#ifdef CONFIG_NFS_DIRECTIO if (iocb->ki_filp->f_flags & O_DIRECT) return nfs_file_direct_read(iocb, iov, nr_segs, pos); -#endif dfprintk(VFS, "nfs: read(%s/%s, %lu@%lu)\n", dentry->d_parent->d_name.name, dentry->d_name.name, @@ -387,9 +385,7 @@ const struct address_space_operations nfs_file_aops = { .write_end = nfs_write_end, .invalidatepage = nfs_invalidate_page, .releasepage = nfs_release_page, -#ifdef CONFIG_NFS_DIRECTIO .direct_IO = nfs_direct_IO, -#endif .launder_page = nfs_launder_page, }; @@ -447,10 +443,8 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, ssize_t result; size_t count = iov_length(iov, nr_segs); -#ifdef CONFIG_NFS_DIRECTIO if (iocb->ki_filp->f_flags & O_DIRECT) return nfs_file_direct_write(iocb, iov, nr_segs, pos); -#endif dfprintk(VFS, "nfs: write(%s/%s(%ld), %lu@%Ld)\n", dentry->d_parent->d_name.name, dentry->d_name.name, @@ -576,17 +570,9 @@ static int do_setlk(struct file *filp, int cmd, struct file_lock *fl) lock_kernel(); /* Use local locking if mounted with "-onolock" */ - if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) { + if (!(NFS_SERVER(inode)->flags & NFS_MOUNT_NONLM)) status = NFS_PROTO(inode)->lock(filp, cmd, fl); - /* If we were signalled we still need to ensure that - * we clean up any state on the server. We therefore - * record the lock call as having succeeded in order to - * ensure that locks_remove_posix() cleans it out when - * the process exits. - */ - if (status == -EINTR || status == -ERESTARTSYS) - do_vfs_lock(filp, fl); - } else + else status = do_vfs_lock(filp, fl); unlock_kernel(); if (status < 0) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 6f88d7c77ac..5cb3345eb69 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -523,8 +523,12 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait) { - struct inode *inode = ctx->path.dentry->d_inode; + struct inode *inode; + if (ctx == NULL) + return; + + inode = ctx->path.dentry->d_inode; if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) return; list_del(&ctx->list); @@ -610,7 +614,7 @@ int nfs_open(struct inode *inode, struct file *filp) struct nfs_open_context *ctx; struct rpc_cred *cred; - cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0); + cred = rpc_lookup_cred(); if (IS_ERR(cred)) return PTR_ERR(cred); ctx = alloc_nfs_open_context(filp->f_path.mnt, filp->f_path.dentry, cred); @@ -1218,6 +1222,36 @@ static void nfs_destroy_inodecache(void) kmem_cache_destroy(nfs_inode_cachep); } +struct workqueue_struct *nfsiod_workqueue; + +/* + * start up the nfsiod workqueue + */ +static int nfsiod_start(void) +{ + struct workqueue_struct *wq; + dprintk("RPC: creating workqueue nfsiod\n"); + wq = create_singlethread_workqueue("nfsiod"); + if (wq == NULL) + return -ENOMEM; + nfsiod_workqueue = wq; + return 0; +} + +/* + * Destroy the nfsiod workqueue + */ +static void nfsiod_stop(void) +{ + struct workqueue_struct *wq; + + wq = nfsiod_workqueue; + if (wq == NULL) + return; + nfsiod_workqueue = NULL; + destroy_workqueue(wq); +} + /* * Initialize NFS */ @@ -1225,6 +1259,10 @@ static int __init init_nfs_fs(void) { int err; + err = nfsiod_start(); + if (err) + goto out6; + err = nfs_fs_proc_init(); if (err) goto out5; @@ -1271,6 +1309,8 @@ out3: out4: nfs_fs_proc_exit(); out5: + nfsiod_stop(); +out6: return err; } @@ -1286,6 +1326,7 @@ static void __exit exit_nfs_fs(void) #endif unregister_nfs_fs(); nfs_fs_proc_exit(); + nfsiod_stop(); } /* Not quite true; I just maintain it */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 931992763e6..04ae867dddb 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -46,9 +46,9 @@ struct nfs_parsed_mount_data { struct sockaddr_storage address; size_t addrlen; char *hostname; - unsigned int version; + u32 version; unsigned short port; - int protocol; + unsigned short protocol; } mount_server; struct { @@ -56,7 +56,8 @@ struct nfs_parsed_mount_data { size_t addrlen; char *hostname; char *export_path; - int protocol; + unsigned short port; + unsigned short protocol; } nfs_server; struct security_mnt_opts lsm_opts; @@ -115,13 +116,8 @@ extern void nfs_destroy_readpagecache(void); extern int __init nfs_init_writepagecache(void); extern void nfs_destroy_writepagecache(void); -#ifdef CONFIG_NFS_DIRECTIO extern int __init nfs_init_directcache(void); extern void nfs_destroy_directcache(void); -#else -#define nfs_init_directcache() (0) -#define nfs_destroy_directcache() do {} while(0) -#endif /* nfs2xdr.c */ extern int nfs_stat_to_errno(int); @@ -146,6 +142,7 @@ extern struct rpc_procinfo nfs4_procedures[]; extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask); /* inode.c */ +extern struct workqueue_struct *nfsiod_workqueue; extern struct inode *nfs_alloc_inode(struct super_block *sb); extern void nfs_destroy_inode(struct inode *); extern int nfs_write_inode(struct inode *,int); diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 607f6eb9cdb..af4d0f1e402 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -20,7 +20,7 @@ static void nfs_expire_automounts(struct work_struct *work); -LIST_HEAD(nfs_automount_list); +static LIST_HEAD(nfs_automount_list); static DECLARE_DELAYED_WORK(nfs_automount_task, nfs_expire_automounts); int nfs_mountpoint_expiry_timeout = 500 * HZ; diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 1f7ea675e0c..28bab67d151 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -267,7 +267,7 @@ nfs_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) int status; if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); p = xdr_decode_fattr(p, res->fattr); count = ntohl(*p++); @@ -428,11 +428,11 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) size_t hdrlen; unsigned int pglen, recvd; u32 len; - int status, nr; + int status, nr = 0; __be32 *end, *entry, *kaddr; if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); hdrlen = (u8 *) p - (u8 *) iov->iov_base; if (iov->iov_len < hdrlen) { @@ -452,7 +452,12 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) kaddr = p = kmap_atomic(*page, KM_USER0); end = (__be32 *)((char *)p + pglen); entry = p; - for (nr = 0; *p++; nr++) { + + /* Make sure the packet actually has a value_follows and EOF entry */ + if ((entry + 1) > end) + goto short_pkt; + + for (; *p++; nr++) { if (p + 2 > end) goto short_pkt; p++; /* fileid */ @@ -467,18 +472,32 @@ nfs_xdr_readdirres(struct rpc_rqst *req, __be32 *p, void *dummy) goto short_pkt; entry = p; } - if (!nr && (entry[0] != 0 || entry[1] == 0)) - goto short_pkt; + + /* + * Apparently some server sends responses that are a valid size, but + * contain no entries, and have value_follows==0 and EOF==0. For + * those, just set the EOF marker. + */ + if (!nr && entry[1] == 0) { + dprintk("NFS: readdir reply truncated!\n"); + entry[1] = 1; + } out: kunmap_atomic(kaddr, KM_USER0); return nr; short_pkt: + /* + * When we get a short packet there are 2 possibilities. We can + * return an error, or fix up the response to look like a valid + * response and return what we have so far. If there are no + * entries and the packet was short, then return -EIO. If there + * are valid entries in the response, return them and pretend that + * the call was successful, but incomplete. The caller can retry the + * readdir starting at the last cookie. + */ entry[0] = entry[1] = 0; - /* truncate listing ? */ - if (!nr) { - dprintk("NFS: readdir reply truncated!\n"); - entry[1] = 1; - } + if (!nr) + nr = -errno_NFSERR_IO; goto out; err_unmap: nr = -errno_NFSERR_IO; @@ -518,7 +537,7 @@ nfs_xdr_stat(struct rpc_rqst *req, __be32 *p, void *dummy) int status; if ((status = ntohl(*p++)) != 0) - status = -nfs_stat_to_errno(status); + status = nfs_stat_to_errno(status); return status; } @@ -532,7 +551,7 @@ nfs_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) int status; if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); xdr_decode_fattr(p, fattr); return 0; } @@ -547,7 +566,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res) int status; if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); p = xdr_decode_fhandle(p, res->fh); xdr_decode_fattr(p, res->fattr); return 0; @@ -585,7 +604,7 @@ nfs_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, void *dummy) int status; if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); /* Convert length of symlink */ len = ntohl(*p++); if (len >= rcvbuf->page_len) { @@ -634,7 +653,7 @@ nfs_xdr_statfsres(struct rpc_rqst *req, __be32 *p, struct nfs2_fsstat *res) int status; if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); res->tsize = ntohl(*p++); res->bsize = ntohl(*p++); @@ -653,39 +672,39 @@ static struct { int errno; } nfs_errtbl[] = { { NFS_OK, 0 }, - { NFSERR_PERM, EPERM }, - { NFSERR_NOENT, ENOENT }, - { NFSERR_IO, errno_NFSERR_IO }, - { NFSERR_NXIO, ENXIO }, -/* { NFSERR_EAGAIN, EAGAIN }, */ - { NFSERR_ACCES, EACCES }, - { NFSERR_EXIST, EEXIST }, - { NFSERR_XDEV, EXDEV }, - { NFSERR_NODEV, ENODEV }, - { NFSERR_NOTDIR, ENOTDIR }, - { NFSERR_ISDIR, EISDIR }, - { NFSERR_INVAL, EINVAL }, - { NFSERR_FBIG, EFBIG }, - { NFSERR_NOSPC, ENOSPC }, - { NFSERR_ROFS, EROFS }, - { NFSERR_MLINK, EMLINK }, - { NFSERR_NAMETOOLONG, ENAMETOOLONG }, - { NFSERR_NOTEMPTY, ENOTEMPTY }, - { NFSERR_DQUOT, EDQUOT }, - { NFSERR_STALE, ESTALE }, - { NFSERR_REMOTE, EREMOTE }, + { NFSERR_PERM, -EPERM }, + { NFSERR_NOENT, -ENOENT }, + { NFSERR_IO, -errno_NFSERR_IO}, + { NFSERR_NXIO, -ENXIO }, +/* { NFSERR_EAGAIN, -EAGAIN }, */ + { NFSERR_ACCES, -EACCES }, + { NFSERR_EXIST, -EEXIST }, + { NFSERR_XDEV, -EXDEV }, + { NFSERR_NODEV, -ENODEV }, + { NFSERR_NOTDIR, -ENOTDIR }, + { NFSERR_ISDIR, -EISDIR }, + { NFSERR_INVAL, -EINVAL }, + { NFSERR_FBIG, -EFBIG }, + { NFSERR_NOSPC, -ENOSPC }, + { NFSERR_ROFS, -EROFS }, + { NFSERR_MLINK, -EMLINK }, + { NFSERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFSERR_NOTEMPTY, -ENOTEMPTY }, + { NFSERR_DQUOT, -EDQUOT }, + { NFSERR_STALE, -ESTALE }, + { NFSERR_REMOTE, -EREMOTE }, #ifdef EWFLUSH - { NFSERR_WFLUSH, EWFLUSH }, + { NFSERR_WFLUSH, -EWFLUSH }, #endif - { NFSERR_BADHANDLE, EBADHANDLE }, - { NFSERR_NOT_SYNC, ENOTSYNC }, - { NFSERR_BAD_COOKIE, EBADCOOKIE }, - { NFSERR_NOTSUPP, ENOTSUPP }, - { NFSERR_TOOSMALL, ETOOSMALL }, - { NFSERR_SERVERFAULT, ESERVERFAULT }, - { NFSERR_BADTYPE, EBADTYPE }, - { NFSERR_JUKEBOX, EJUKEBOX }, - { -1, EIO } + { NFSERR_BADHANDLE, -EBADHANDLE }, + { NFSERR_NOT_SYNC, -ENOTSYNC }, + { NFSERR_BAD_COOKIE, -EBADCOOKIE }, + { NFSERR_NOTSUPP, -ENOTSUPP }, + { NFSERR_TOOSMALL, -ETOOSMALL }, + { NFSERR_SERVERFAULT, -ESERVERFAULT }, + { NFSERR_BADTYPE, -EBADTYPE }, + { NFSERR_JUKEBOX, -EJUKEBOX }, + { -1, -EIO } }; /* diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 3917e2fa4e4..11cdddec143 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -508,14 +508,14 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res struct page **page; size_t hdrlen; u32 len, recvd, pglen; - int status, nr; + int status, nr = 0; __be32 *entry, *end, *kaddr; status = ntohl(*p++); /* Decode post_op_attrs */ p = xdr_decode_post_op_attr(p, res->dir_attr); if (status) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); /* Decode verifier cookie */ if (res->verf) { res->verf[0] = *p++; @@ -542,7 +542,12 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res kaddr = p = kmap_atomic(*page, KM_USER0); end = (__be32 *)((char *)p + pglen); entry = p; - for (nr = 0; *p++; nr++) { + + /* Make sure the packet actually has a value_follows and EOF entry */ + if ((entry + 1) > end) + goto short_pkt; + + for (; *p++; nr++) { if (p + 3 > end) goto short_pkt; p += 2; /* inode # */ @@ -581,18 +586,32 @@ nfs3_xdr_readdirres(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirres *res goto short_pkt; entry = p; } - if (!nr && (entry[0] != 0 || entry[1] == 0)) - goto short_pkt; + + /* + * Apparently some server sends responses that are a valid size, but + * contain no entries, and have value_follows==0 and EOF==0. For + * those, just set the EOF marker. + */ + if (!nr && entry[1] == 0) { + dprintk("NFS: readdir reply truncated!\n"); + entry[1] = 1; + } out: kunmap_atomic(kaddr, KM_USER0); return nr; short_pkt: + /* + * When we get a short packet there are 2 possibilities. We can + * return an error, or fix up the response to look like a valid + * response and return what we have so far. If there are no + * entries and the packet was short, then return -EIO. If there + * are valid entries in the response, return them and pretend that + * the call was successful, but incomplete. The caller can retry the + * readdir starting at the last cookie. + */ entry[0] = entry[1] = 0; - /* truncate listing ? */ - if (!nr) { - dprintk("NFS: readdir reply truncated!\n"); - entry[1] = 1; - } + if (!nr) + nr = -errno_NFSERR_IO; goto out; err_unmap: nr = -errno_NFSERR_IO; @@ -732,7 +751,7 @@ nfs3_xdr_attrstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) int status; if ((status = ntohl(*p++))) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); xdr_decode_fattr(p, fattr); return 0; } @@ -747,7 +766,7 @@ nfs3_xdr_wccstat(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) int status; if ((status = ntohl(*p++))) - status = -nfs_stat_to_errno(status); + status = nfs_stat_to_errno(status); xdr_decode_wcc_data(p, fattr); return status; } @@ -767,7 +786,7 @@ nfs3_xdr_lookupres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) int status; if ((status = ntohl(*p++))) { - status = -nfs_stat_to_errno(status); + status = nfs_stat_to_errno(status); } else { if (!(p = xdr_decode_fhandle(p, res->fh))) return -errno_NFSERR_IO; @@ -787,7 +806,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res) p = xdr_decode_post_op_attr(p, res->fattr); if (status) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); res->access = ntohl(*p++); return 0; } @@ -824,7 +843,7 @@ nfs3_xdr_readlinkres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) p = xdr_decode_post_op_attr(p, fattr); if (status != 0) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); /* Convert length of symlink */ len = ntohl(*p++); @@ -872,7 +891,7 @@ nfs3_xdr_readres(struct rpc_rqst *req, __be32 *p, struct nfs_readres *res) p = xdr_decode_post_op_attr(p, res->fattr); if (status != 0) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); /* Decode reply count and EOF flag. NFSv3 is somewhat redundant * in that it puts the count both in the res struct and in the @@ -922,7 +941,7 @@ nfs3_xdr_writeres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) p = xdr_decode_wcc_data(p, res->fattr); if (status != 0) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); res->count = ntohl(*p++); res->verf->committed = (enum nfs3_stable_how)ntohl(*p++); @@ -953,7 +972,7 @@ nfs3_xdr_createres(struct rpc_rqst *req, __be32 *p, struct nfs3_diropres *res) res->fattr->valid = 0; } } else { - status = -nfs_stat_to_errno(status); + status = nfs_stat_to_errno(status); } p = xdr_decode_wcc_data(p, res->dir_attr); return status; @@ -968,7 +987,7 @@ nfs3_xdr_renameres(struct rpc_rqst *req, __be32 *p, struct nfs3_renameres *res) int status; if ((status = ntohl(*p++)) != 0) - status = -nfs_stat_to_errno(status); + status = nfs_stat_to_errno(status); p = xdr_decode_wcc_data(p, res->fromattr); p = xdr_decode_wcc_data(p, res->toattr); return status; @@ -983,7 +1002,7 @@ nfs3_xdr_linkres(struct rpc_rqst *req, __be32 *p, struct nfs3_linkres *res) int status; if ((status = ntohl(*p++)) != 0) - status = -nfs_stat_to_errno(status); + status = nfs_stat_to_errno(status); p = xdr_decode_post_op_attr(p, res->fattr); p = xdr_decode_wcc_data(p, res->dir_attr); return status; @@ -1001,7 +1020,7 @@ nfs3_xdr_fsstatres(struct rpc_rqst *req, __be32 *p, struct nfs_fsstat *res) p = xdr_decode_post_op_attr(p, res->fattr); if (status != 0) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); p = xdr_decode_hyper(p, &res->tbytes); p = xdr_decode_hyper(p, &res->fbytes); @@ -1026,7 +1045,7 @@ nfs3_xdr_fsinfores(struct rpc_rqst *req, __be32 *p, struct nfs_fsinfo *res) p = xdr_decode_post_op_attr(p, res->fattr); if (status != 0) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); res->rtmax = ntohl(*p++); res->rtpref = ntohl(*p++); @@ -1054,7 +1073,7 @@ nfs3_xdr_pathconfres(struct rpc_rqst *req, __be32 *p, struct nfs_pathconf *res) p = xdr_decode_post_op_attr(p, res->fattr); if (status != 0) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); res->max_link = ntohl(*p++); res->max_namelen = ntohl(*p++); @@ -1073,7 +1092,7 @@ nfs3_xdr_commitres(struct rpc_rqst *req, __be32 *p, struct nfs_writeres *res) status = ntohl(*p++); p = xdr_decode_wcc_data(p, res->fattr); if (status != 0) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); res->verf->verifier[0] = *p++; res->verf->verifier[1] = *p++; @@ -1095,7 +1114,7 @@ nfs3_xdr_getaclres(struct rpc_rqst *req, __be32 *p, int err, base; if (status != 0) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); p = xdr_decode_post_op_attr(p, res->fattr); res->mask = ntohl(*p++); if (res->mask & ~(NFS_ACL|NFS_ACLCNT|NFS_DFACL|NFS_DFACLCNT)) @@ -1122,7 +1141,7 @@ nfs3_xdr_setaclres(struct rpc_rqst *req, __be32 *p, struct nfs_fattr *fattr) int status = ntohl(*p++); if (status) - return -nfs_stat_to_errno(status); + return nfs_stat_to_errno(status); xdr_decode_post_op_attr(p, fattr); return 0; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7ce07862c2f..dbc09271af0 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -51,6 +51,7 @@ #include "nfs4_fs.h" #include "delegation.h" +#include "internal.h" #include "iostat.h" #define NFSDBG_FACILITY NFSDBG_PROC @@ -239,6 +240,8 @@ static void nfs4_init_opendata_res(struct nfs4_opendata *p) { p->o_res.f_attr = &p->f_attr; p->o_res.dir_attr = &p->dir_attr; + p->o_res.seqid = p->o_arg.seqid; + p->c_res.seqid = p->c_arg.seqid; p->o_res.server = p->o_arg.server; nfs_fattr_init(&p->f_attr); nfs_fattr_init(&p->dir_attr); @@ -729,7 +732,6 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) renew_lease(data->o_res.server, data->timestamp); data->rpc_done = 1; } - nfs_increment_open_seqid(data->rpc_status, data->c_arg.seqid); } static void nfs4_open_confirm_release(void *calldata) @@ -773,6 +775,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data) .rpc_message = &msg, .callback_ops = &nfs4_open_confirm_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; int status; @@ -858,7 +861,6 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata) if (!(data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM)) nfs_confirm_seqid(&data->owner->so_seqid, 0); } - nfs_increment_open_seqid(data->rpc_status, data->o_arg.seqid); data->rpc_done = 1; } @@ -910,6 +912,7 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) .rpc_message = &msg, .callback_ops = &nfs4_open_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; int status; @@ -979,11 +982,8 @@ static int _nfs4_open_expired(struct nfs_open_context *ctx, struct nfs4_state *s if (IS_ERR(opendata)) return PTR_ERR(opendata); ret = nfs4_open_recover(opendata, state); - if (ret == -ESTALE) { - /* Invalidate the state owner so we don't ever use it again */ - nfs4_drop_state_owner(state->owner); + if (ret == -ESTALE) d_drop(ctx->path.dentry); - } nfs4_opendata_put(opendata); return ret; } @@ -1226,7 +1226,6 @@ static void nfs4_close_done(struct rpc_task *task, void *data) /* hmm. we are done with the inode, and in the process of freeing * the state_owner. we keep this around to process errors */ - nfs_increment_open_seqid(task->tk_status, calldata->arg.seqid); switch (task->tk_status) { case 0: nfs_set_open_stateid(state, &calldata->res.stateid, 0); @@ -1315,6 +1314,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) .rpc_client = server->client, .rpc_message = &msg, .callback_ops = &nfs4_close_ops, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; int status = -ENOMEM; @@ -1332,6 +1332,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait) goto out_free_calldata; calldata->arg.bitmask = server->attr_bitmask; calldata->res.fattr = &calldata->fattr; + calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; calldata->path.mnt = mntget(path->mnt); calldata->path.dentry = dget(path->dentry); @@ -1404,7 +1405,7 @@ nfs4_atomic_open(struct inode *dir, struct dentry *dentry, struct nameidata *nd) BUG_ON(nd->intent.open.flags & O_CREAT); } - cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); + cred = rpc_lookup_cred(); if (IS_ERR(cred)) return (struct dentry *)cred; parent = dentry->d_parent; @@ -1439,7 +1440,7 @@ nfs4_open_revalidate(struct inode *dir, struct dentry *dentry, int openflags, st struct rpc_cred *cred; struct nfs4_state *state; - cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); + cred = rpc_lookup_cred(); if (IS_ERR(cred)) return PTR_ERR(cred); state = nfs4_do_open(dir, &path, openflags, NULL, cred); @@ -1656,7 +1657,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, nfs_fattr_init(fattr); - cred = rpcauth_lookupcred(NFS_CLIENT(inode)->cl_auth, 0); + cred = rpc_lookup_cred(); if (IS_ERR(cred)) return PTR_ERR(cred); @@ -1892,7 +1893,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, struct rpc_cred *cred; int status = 0; - cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); + cred = rpc_lookup_cred(); if (IS_ERR(cred)) { status = PTR_ERR(cred); goto out; @@ -2761,10 +2762,10 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server) case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: - rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL, NULL); + rpc_sleep_on(&clp->cl_rpcwaitq, task, NULL); nfs4_schedule_state_recovery(clp); if (test_bit(NFS4CLNT_STATE_RECOVER, &clp->cl_state) == 0) - rpc_wake_up_task(task); + rpc_wake_up_queued_task(&clp->cl_rpcwaitq, task); task->tk_status = 0; return -EAGAIN; case -NFS4ERR_DELAY: @@ -2884,7 +2885,7 @@ int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short po RPC_DISPLAY_ADDR), rpc_peeraddr2str(clp->cl_rpcclient, RPC_DISPLAY_PROTO), - cred->cr_ops->cr_name, + clp->cl_rpcclient->cl_auth->au_ops->au_name, clp->cl_id_uniquifier); setclientid.sc_netid_len = scnprintf(setclientid.sc_netid, sizeof(setclientid.sc_netid), @@ -3158,6 +3159,7 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->arg.fh = NFS_FH(inode); p->arg.fl = &p->fl; p->arg.seqid = seqid; + p->res.seqid = seqid; p->arg.stateid = &lsp->ls_stateid; p->lsp = lsp; atomic_inc(&lsp->ls_count); @@ -3183,7 +3185,6 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) if (RPC_ASSASSINATED(task)) return; - nfs_increment_lock_seqid(task->tk_status, calldata->arg.seqid); switch (task->tk_status) { case 0: memcpy(calldata->lsp->ls_stateid.data, @@ -3235,6 +3236,7 @@ static struct rpc_task *nfs4_do_unlck(struct file_lock *fl, .rpc_client = NFS_CLIENT(lsp->ls_state->inode), .rpc_message = &msg, .callback_ops = &nfs4_locku_ops, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; @@ -3261,6 +3263,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * struct nfs4_lock_state *lsp; struct rpc_task *task; int status = 0; + unsigned char fl_flags = request->fl_flags; status = nfs4_set_lock_state(state, request); /* Unlock _before_ we do the RPC call */ @@ -3284,6 +3287,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock * status = nfs4_wait_for_completion_rpc_task(task); rpc_put_task(task); out: + request->fl_flags = fl_flags; return status; } @@ -3320,6 +3324,7 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->arg.lock_stateid = &lsp->ls_stateid; p->arg.lock_owner.clientid = server->nfs_client->cl_clientid; p->arg.lock_owner.id = lsp->ls_id.id; + p->res.lock_seqid = p->arg.lock_seqid; p->lsp = lsp; atomic_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); @@ -3346,6 +3351,7 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) return; data->arg.open_stateid = &state->stateid; data->arg.new_lock_owner = 1; + data->res.open_seqid = data->arg.open_seqid; } else data->arg.new_lock_owner = 0; data->timestamp = jiffies; @@ -3363,7 +3369,6 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) if (RPC_ASSASSINATED(task)) goto out; if (data->arg.new_lock_owner != 0) { - nfs_increment_open_seqid(data->rpc_status, data->arg.open_seqid); if (data->rpc_status == 0) nfs_confirm_seqid(&data->lsp->ls_seqid, 0); else @@ -3375,7 +3380,6 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) data->lsp->ls_flags |= NFS_LOCK_INITIALIZED; renew_lease(NFS_SERVER(data->ctx->path.dentry->d_inode), data->timestamp); } - nfs_increment_lock_seqid(data->rpc_status, data->arg.lock_seqid); out: dprintk("%s: done, ret = %d!\n", __FUNCTION__, data->rpc_status); } @@ -3419,6 +3423,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f .rpc_client = NFS_CLIENT(state->inode), .rpc_message = &msg, .callback_ops = &nfs4_lock_ops, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC, }; int ret; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index b962397004c..46eb624e4f1 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -71,6 +71,29 @@ static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred) return status; } +static struct rpc_cred *nfs4_get_machine_cred(struct nfs_client *clp) +{ + struct rpc_cred *cred = NULL; + + spin_lock(&clp->cl_lock); + if (clp->cl_machine_cred != NULL) + cred = get_rpccred(clp->cl_machine_cred); + spin_unlock(&clp->cl_lock); + return cred; +} + +static void nfs4_clear_machine_cred(struct nfs_client *clp) +{ + struct rpc_cred *cred; + + spin_lock(&clp->cl_lock); + cred = clp->cl_machine_cred; + clp->cl_machine_cred = NULL; + spin_unlock(&clp->cl_lock); + if (cred != NULL) + put_rpccred(cred); +} + struct rpc_cred *nfs4_get_renew_cred(struct nfs_client *clp) { struct nfs4_state_owner *sp; @@ -91,13 +114,18 @@ static struct rpc_cred *nfs4_get_setclientid_cred(struct nfs_client *clp) { struct nfs4_state_owner *sp; struct rb_node *pos; + struct rpc_cred *cred; + cred = nfs4_get_machine_cred(clp); + if (cred != NULL) + goto out; pos = rb_first(&clp->cl_state_owners); if (pos != NULL) { sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); - return get_rpccred(sp->so_cred); + cred = get_rpccred(sp->so_cred); } - return NULL; +out: + return cred; } static void nfs_alloc_unique_id(struct rb_root *root, struct nfs_unique_id *new, @@ -292,8 +320,10 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct spin_unlock(&clp->cl_lock); if (sp == new) get_rpccred(cred); - else + else { + rpc_destroy_wait_queue(&new->so_sequence.wait); kfree(new); + } return sp; } @@ -310,6 +340,7 @@ void nfs4_put_state_owner(struct nfs4_state_owner *sp) return; nfs4_remove_state_owner(clp, sp); spin_unlock(&clp->cl_lock); + rpc_destroy_wait_queue(&sp->so_sequence.wait); put_rpccred(cred); kfree(sp); } @@ -529,6 +560,7 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) spin_lock(&clp->cl_lock); nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); spin_unlock(&clp->cl_lock); + rpc_destroy_wait_queue(&lsp->ls_sequence.wait); kfree(lsp); } @@ -731,7 +763,7 @@ int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task) list_add_tail(&seqid->list, &sequence->list); if (list_first_entry(&sequence->list, struct nfs_seqid, list) == seqid) goto unlock; - rpc_sleep_on(&sequence->wait, task, NULL, NULL); + rpc_sleep_on(&sequence->wait, task, NULL); status = -EAGAIN; unlock: spin_unlock(&sequence->lock); @@ -920,10 +952,10 @@ restart_loop: if (cred != NULL) { /* Yes there are: try to renew the old lease */ status = nfs4_proc_renew(clp, cred); + put_rpccred(cred); switch (status) { case 0: case -NFS4ERR_CB_PATH_DOWN: - put_rpccred(cred); goto out; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_LEASE_MOVED: @@ -932,14 +964,19 @@ restart_loop: } else { /* "reboot" to ensure we clear all state on the server */ clp->cl_boot_time = CURRENT_TIME; - cred = nfs4_get_setclientid_cred(clp); } /* We're going to have to re-establish a clientid */ nfs4_state_mark_reclaim(clp); status = -ENOENT; + cred = nfs4_get_setclientid_cred(clp); if (cred != NULL) { status = nfs4_init_client(clp, cred); put_rpccred(cred); + /* Handle case where the user hasn't set up machine creds */ + if (status == -EACCES && cred == clp->cl_machine_cred) { + nfs4_clear_machine_cred(clp); + goto restart_loop; + } } if (status) goto out_error; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index db1ed9c46ed..5a2d64927b3 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -110,7 +110,7 @@ static int nfs4_stat_to_errno(int); #define decode_savefh_maxsz (op_decode_hdr_maxsz) #define encode_restorefh_maxsz (op_encode_hdr_maxsz) #define decode_restorefh_maxsz (op_decode_hdr_maxsz) -#define encode_fsinfo_maxsz (op_encode_hdr_maxsz + 2) +#define encode_fsinfo_maxsz (encode_getattr_maxsz) #define decode_fsinfo_maxsz (op_decode_hdr_maxsz + 11) #define encode_renew_maxsz (op_encode_hdr_maxsz + 3) #define decode_renew_maxsz (op_decode_hdr_maxsz) @@ -1191,8 +1191,8 @@ static int encode_readdir(struct xdr_stream *xdr, const struct nfs4_readdir_arg attrs[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID; WRITE32(attrs[0] & readdir->bitmask[0]); WRITE32(attrs[1] & readdir->bitmask[1]); - dprintk("%s: cookie = %Lu, verifier = 0x%x%x, bitmap = 0x%x%x\n", - __FUNCTION__, + dprintk("%s: cookie = %Lu, verifier = %08x:%08x, bitmap = %08x:%08x\n", + __func__, (unsigned long long)readdir->cookie, ((u32 *)readdir->verifier.data)[0], ((u32 *)readdir->verifier.data)[1], @@ -2241,7 +2241,7 @@ static int decode_op_hdr(struct xdr_stream *xdr, enum nfs_opnum4 expected) } READ32(nfserr); if (nfserr != NFS_OK) - return -nfs4_stat_to_errno(nfserr); + return nfs4_stat_to_errno(nfserr); return 0; } @@ -2291,7 +2291,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3 bitmap[0] &= ~FATTR4_WORD0_SUPPORTED_ATTRS; } else bitmask[0] = bitmask[1] = 0; - dprintk("%s: bitmask=0x%x%x\n", __FUNCTION__, bitmask[0], bitmask[1]); + dprintk("%s: bitmask=%08x:%08x\n", __func__, bitmask[0], bitmask[1]); return 0; } @@ -3005,6 +3005,8 @@ static int decode_close(struct xdr_stream *xdr, struct nfs_closeres *res) int status; status = decode_op_hdr(xdr, OP_CLOSE); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); if (status) return status; READ_BUF(NFS4_STATEID_SIZE); @@ -3296,11 +3298,17 @@ static int decode_lock(struct xdr_stream *xdr, struct nfs_lock_res *res) int status; status = decode_op_hdr(xdr, OP_LOCK); + if (status == -EIO) + goto out; if (status == 0) { READ_BUF(NFS4_STATEID_SIZE); COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); } else if (status == -NFS4ERR_DENIED) - return decode_lock_denied(xdr, NULL); + status = decode_lock_denied(xdr, NULL); + if (res->open_seqid != NULL) + nfs_increment_open_seqid(status, res->open_seqid); + nfs_increment_lock_seqid(status, res->lock_seqid); +out: return status; } @@ -3319,6 +3327,8 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) int status; status = decode_op_hdr(xdr, OP_LOCKU); + if (status != -EIO) + nfs_increment_lock_seqid(status, res->seqid); if (status == 0) { READ_BUF(NFS4_STATEID_SIZE); COPYMEM(res->stateid.data, NFS4_STATEID_SIZE); @@ -3384,6 +3394,8 @@ static int decode_open(struct xdr_stream *xdr, struct nfs_openres *res) int status; status = decode_op_hdr(xdr, OP_OPEN); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); if (status) return status; READ_BUF(NFS4_STATEID_SIZE); @@ -3416,6 +3428,8 @@ static int decode_open_confirm(struct xdr_stream *xdr, struct nfs_open_confirmre int status; status = decode_op_hdr(xdr, OP_OPEN_CONFIRM); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); if (status) return status; READ_BUF(NFS4_STATEID_SIZE); @@ -3429,6 +3443,8 @@ static int decode_open_downgrade(struct xdr_stream *xdr, struct nfs_closeres *re int status; status = decode_op_hdr(xdr, OP_OPEN_DOWNGRADE); + if (status != -EIO) + nfs_increment_open_seqid(status, res->seqid); if (status) return status; READ_BUF(NFS4_STATEID_SIZE); @@ -3481,7 +3497,7 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n size_t hdrlen; u32 recvd, pglen = rcvbuf->page_len; __be32 *end, *entry, *p, *kaddr; - unsigned int nr; + unsigned int nr = 0; int status; status = decode_op_hdr(xdr, OP_READDIR); @@ -3489,8 +3505,8 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n return status; READ_BUF(8); COPYMEM(readdir->verifier.data, 8); - dprintk("%s: verifier = 0x%x%x\n", - __FUNCTION__, + dprintk("%s: verifier = %08x:%08x\n", + __func__, ((u32 *)readdir->verifier.data)[0], ((u32 *)readdir->verifier.data)[1]); @@ -3505,7 +3521,12 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n kaddr = p = kmap_atomic(page, KM_USER0); end = p + ((pglen + readdir->pgbase) >> 2); entry = p; - for (nr = 0; *p++; nr++) { + + /* Make sure the packet actually has a value_follows and EOF entry */ + if ((entry + 1) > end) + goto short_pkt; + + for (; *p++; nr++) { u32 len, attrlen, xlen; if (end - p < 3) goto short_pkt; @@ -3532,20 +3553,32 @@ static int decode_readdir(struct xdr_stream *xdr, struct rpc_rqst *req, struct n p += attrlen; /* attributes */ entry = p; } - if (!nr && (entry[0] != 0 || entry[1] == 0)) - goto short_pkt; + /* + * Apparently some server sends responses that are a valid size, but + * contain no entries, and have value_follows==0 and EOF==0. For + * those, just set the EOF marker. + */ + if (!nr && entry[1] == 0) { + dprintk("NFS: readdir reply truncated!\n"); + entry[1] = 1; + } out: kunmap_atomic(kaddr, KM_USER0); return 0; short_pkt: + /* + * When we get a short packet there are 2 possibilities. We can + * return an error, or fix up the response to look like a valid + * response and return what we have so far. If there are no + * entries and the packet was short, then return -EIO. If there + * are valid entries in the response, return them and pretend that + * the call was successful, but incomplete. The caller can retry the + * readdir starting at the last cookie. + */ dprintk("%s: short packet at entry %d\n", __FUNCTION__, nr); entry[0] = entry[1] = 0; - /* truncate listing ? */ - if (!nr) { - dprintk("NFS: readdir reply truncated!\n"); - entry[1] = 1; - } - goto out; + if (nr) + goto out; err_unmap: kunmap_atomic(kaddr, KM_USER0); return -errno_NFSERR_IO; @@ -3727,7 +3760,7 @@ static int decode_setclientid(struct xdr_stream *xdr, struct nfs_client *clp) READ_BUF(len); return -NFSERR_CLID_INUSE; } else - return -nfs4_stat_to_errno(nfserr); + return nfs4_stat_to_errno(nfserr); return 0; } @@ -4389,7 +4422,7 @@ static int nfs4_xdr_dec_fsinfo(struct rpc_rqst *req, __be32 *p, struct nfs_fsinf if (!status) status = decode_fsinfo(&xdr, fsinfo); if (!status) - status = -nfs4_stat_to_errno(hdr.status); + status = nfs4_stat_to_errno(hdr.status); return status; } @@ -4479,7 +4512,7 @@ static int nfs4_xdr_dec_setclientid(struct rpc_rqst *req, __be32 *p, if (!status) status = decode_setclientid(&xdr, clp); if (!status) - status = -nfs4_stat_to_errno(hdr.status); + status = nfs4_stat_to_errno(hdr.status); return status; } @@ -4501,7 +4534,7 @@ static int nfs4_xdr_dec_setclientid_confirm(struct rpc_rqst *req, __be32 *p, str if (!status) status = decode_fsinfo(&xdr, fsinfo); if (!status) - status = -nfs4_stat_to_errno(hdr.status); + status = nfs4_stat_to_errno(hdr.status); return status; } @@ -4611,42 +4644,42 @@ static struct { int errno; } nfs_errtbl[] = { { NFS4_OK, 0 }, - { NFS4ERR_PERM, EPERM }, - { NFS4ERR_NOENT, ENOENT }, - { NFS4ERR_IO, errno_NFSERR_IO }, - { NFS4ERR_NXIO, ENXIO }, - { NFS4ERR_ACCESS, EACCES }, - { NFS4ERR_EXIST, EEXIST }, - { NFS4ERR_XDEV, EXDEV }, - { NFS4ERR_NOTDIR, ENOTDIR }, - { NFS4ERR_ISDIR, EISDIR }, - { NFS4ERR_INVAL, EINVAL }, - { NFS4ERR_FBIG, EFBIG }, - { NFS4ERR_NOSPC, ENOSPC }, - { NFS4ERR_ROFS, EROFS }, - { NFS4ERR_MLINK, EMLINK }, - { NFS4ERR_NAMETOOLONG, ENAMETOOLONG }, - { NFS4ERR_NOTEMPTY, ENOTEMPTY }, - { NFS4ERR_DQUOT, EDQUOT }, - { NFS4ERR_STALE, ESTALE }, - { NFS4ERR_BADHANDLE, EBADHANDLE }, - { NFS4ERR_BADOWNER, EINVAL }, - { NFS4ERR_BADNAME, EINVAL }, - { NFS4ERR_BAD_COOKIE, EBADCOOKIE }, - { NFS4ERR_NOTSUPP, ENOTSUPP }, - { NFS4ERR_TOOSMALL, ETOOSMALL }, - { NFS4ERR_SERVERFAULT, ESERVERFAULT }, - { NFS4ERR_BADTYPE, EBADTYPE }, - { NFS4ERR_LOCKED, EAGAIN }, - { NFS4ERR_RESOURCE, EREMOTEIO }, - { NFS4ERR_SYMLINK, ELOOP }, - { NFS4ERR_OP_ILLEGAL, EOPNOTSUPP }, - { NFS4ERR_DEADLOCK, EDEADLK }, - { NFS4ERR_WRONGSEC, EPERM }, /* FIXME: this needs + { NFS4ERR_PERM, -EPERM }, + { NFS4ERR_NOENT, -ENOENT }, + { NFS4ERR_IO, -errno_NFSERR_IO}, + { NFS4ERR_NXIO, -ENXIO }, + { NFS4ERR_ACCESS, -EACCES }, + { NFS4ERR_EXIST, -EEXIST }, + { NFS4ERR_XDEV, -EXDEV }, + { NFS4ERR_NOTDIR, -ENOTDIR }, + { NFS4ERR_ISDIR, -EISDIR }, + { NFS4ERR_INVAL, -EINVAL }, + { NFS4ERR_FBIG, -EFBIG }, + { NFS4ERR_NOSPC, -ENOSPC }, + { NFS4ERR_ROFS, -EROFS }, + { NFS4ERR_MLINK, -EMLINK }, + { NFS4ERR_NAMETOOLONG, -ENAMETOOLONG }, + { NFS4ERR_NOTEMPTY, -ENOTEMPTY }, + { NFS4ERR_DQUOT, -EDQUOT }, + { NFS4ERR_STALE, -ESTALE }, + { NFS4ERR_BADHANDLE, -EBADHANDLE }, + { NFS4ERR_BADOWNER, -EINVAL }, + { NFS4ERR_BADNAME, -EINVAL }, + { NFS4ERR_BAD_COOKIE, -EBADCOOKIE }, + { NFS4ERR_NOTSUPP, -ENOTSUPP }, + { NFS4ERR_TOOSMALL, -ETOOSMALL }, + { NFS4ERR_SERVERFAULT, -ESERVERFAULT }, + { NFS4ERR_BADTYPE, -EBADTYPE }, + { NFS4ERR_LOCKED, -EAGAIN }, + { NFS4ERR_RESOURCE, -EREMOTEIO }, + { NFS4ERR_SYMLINK, -ELOOP }, + { NFS4ERR_OP_ILLEGAL, -EOPNOTSUPP }, + { NFS4ERR_DEADLOCK, -EDEADLK }, + { NFS4ERR_WRONGSEC, -EPERM }, /* FIXME: this needs * to be handled by a * middle-layer. */ - { -1, EIO } + { -1, -EIO } }; /* @@ -4663,14 +4696,14 @@ nfs4_stat_to_errno(int stat) } if (stat <= 10000 || stat > 10100) { /* The server is looney tunes. */ - return ESERVERFAULT; + return -ESERVERFAULT; } /* If we cannot translate the error, the recovery routines should * handle it. * Note: remaining NFSv4 error codes have values > 10000, so should * not conflict with native Linux error codes. */ - return stat; + return -stat; } #define PROC(proc, argtype, restype) \ diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 5a70be589bb..16f57e0af99 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -58,22 +58,19 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) return p; } -static void nfs_readdata_rcu_free(struct rcu_head *head) +static void nfs_readdata_free(struct nfs_read_data *p) { - struct nfs_read_data *p = container_of(head, struct nfs_read_data, task.u.tk_rcu); if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_rdata_mempool); } -static void nfs_readdata_free(struct nfs_read_data *rdata) -{ - call_rcu_bh(&rdata->task.u.tk_rcu, nfs_readdata_rcu_free); -} - void nfs_readdata_release(void *data) { - nfs_readdata_free(data); + struct nfs_read_data *rdata = data; + + put_nfs_open_context(rdata->args.context); + nfs_readdata_free(rdata); } static @@ -156,7 +153,7 @@ static void nfs_readpage_release(struct nfs_page *req) /* * Set up the NFS read request struct */ -static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, +static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, const struct rpc_call_ops *call_ops, unsigned int count, unsigned int offset) { @@ -174,6 +171,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, .rpc_message = &msg, .callback_ops = call_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = RPC_TASK_ASYNC | swap_flags, }; @@ -186,7 +184,7 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, data->args.pgbase = req->wb_pgbase + offset; data->args.pages = data->pagevec; data->args.count = count; - data->args.context = req->wb_context; + data->args.context = get_nfs_open_context(req->wb_context); data->res.fattr = &data->fattr; data->res.count = count; @@ -204,8 +202,10 @@ static void nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, (unsigned long long)data->args.offset); task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; } static void @@ -242,6 +242,7 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne size_t rsize = NFS_SERVER(inode)->rsize, nbytes; unsigned int offset; int requests = 0; + int ret = 0; LIST_HEAD(list); nfs_list_remove_request(req); @@ -253,7 +254,6 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne data = nfs_readdata_alloc(1); if (!data) goto out_bad; - INIT_LIST_HEAD(&data->pages); list_add(&data->pages, &list); requests++; nbytes -= len; @@ -264,6 +264,8 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne offset = 0; nbytes = count; do { + int ret2; + data = list_entry(list.next, struct nfs_read_data, pages); list_del_init(&data->pages); @@ -271,13 +273,15 @@ static int nfs_pagein_multi(struct inode *inode, struct list_head *head, unsigne if (nbytes < rsize) rsize = nbytes; - nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, + ret2 = nfs_read_rpcsetup(req, data, &nfs_read_partial_ops, rsize, offset); + if (ret == 0) + ret = ret2; offset += rsize; nbytes -= rsize; } while (nbytes != 0); - return 0; + return ret; out_bad: while (!list_empty(&list)) { @@ -295,12 +299,12 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned struct nfs_page *req; struct page **pages; struct nfs_read_data *data; + int ret = -ENOMEM; data = nfs_readdata_alloc(npages); if (!data) goto out_bad; - INIT_LIST_HEAD(&data->pages); pages = data->pagevec; while (!list_empty(head)) { req = nfs_list_entry(head->next); @@ -311,11 +315,10 @@ static int nfs_pagein_one(struct inode *inode, struct list_head *head, unsigned } req = nfs_list_entry(data->pages.next); - nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0); - return 0; + return nfs_read_rpcsetup(req, data, &nfs_read_full_ops, count, 0); out_bad: nfs_async_read_error(head); - return -ENOMEM; + return ret; } /* @@ -342,26 +345,25 @@ int nfs_readpage_result(struct rpc_task *task, struct nfs_read_data *data) return 0; } -static int nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data) +static void nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data) { struct nfs_readargs *argp = &data->args; struct nfs_readres *resp = &data->res; if (resp->eof || resp->count == argp->count) - return 0; + return; /* This is a short read! */ nfs_inc_stats(data->inode, NFSIOS_SHORTREAD); /* Has the server at least made some progress? */ if (resp->count == 0) - return 0; + return; /* Yes, so retry the read at the end of the data */ argp->offset += resp->count; argp->pgbase += resp->count; argp->count -= resp->count; rpc_restart_call(task); - return -EAGAIN; } /* @@ -370,29 +372,37 @@ static int nfs_readpage_retry(struct rpc_task *task, struct nfs_read_data *data) static void nfs_readpage_result_partial(struct rpc_task *task, void *calldata) { struct nfs_read_data *data = calldata; - struct nfs_page *req = data->req; - struct page *page = req->wb_page; if (nfs_readpage_result(task, data) != 0) return; + if (task->tk_status < 0) + return; - if (likely(task->tk_status >= 0)) { - nfs_readpage_truncate_uninitialised_page(data); - if (nfs_readpage_retry(task, data) != 0) - return; - } - if (unlikely(task->tk_status < 0)) + nfs_readpage_truncate_uninitialised_page(data); + nfs_readpage_retry(task, data); +} + +static void nfs_readpage_release_partial(void *calldata) +{ + struct nfs_read_data *data = calldata; + struct nfs_page *req = data->req; + struct page *page = req->wb_page; + int status = data->task.tk_status; + + if (status < 0) SetPageError(page); + if (atomic_dec_and_test(&req->wb_complete)) { if (!PageError(page)) SetPageUptodate(page); nfs_readpage_release(req); } + nfs_readdata_release(calldata); } static const struct rpc_call_ops nfs_read_partial_ops = { .rpc_call_done = nfs_readpage_result_partial, - .rpc_release = nfs_readdata_release, + .rpc_release = nfs_readpage_release_partial, }; static void nfs_readpage_set_pages_uptodate(struct nfs_read_data *data) @@ -427,29 +437,35 @@ static void nfs_readpage_result_full(struct rpc_task *task, void *calldata) if (nfs_readpage_result(task, data) != 0) return; + if (task->tk_status < 0) + return; /* * Note: nfs_readpage_retry may change the values of * data->args. In the multi-page case, we therefore need * to ensure that we call nfs_readpage_set_pages_uptodate() * first. */ - if (likely(task->tk_status >= 0)) { - nfs_readpage_truncate_uninitialised_page(data); - nfs_readpage_set_pages_uptodate(data); - if (nfs_readpage_retry(task, data) != 0) - return; - } + nfs_readpage_truncate_uninitialised_page(data); + nfs_readpage_set_pages_uptodate(data); + nfs_readpage_retry(task, data); +} + +static void nfs_readpage_release_full(void *calldata) +{ + struct nfs_read_data *data = calldata; + while (!list_empty(&data->pages)) { struct nfs_page *req = nfs_list_entry(data->pages.next); nfs_list_remove_request(req); nfs_readpage_release(req); } + nfs_readdata_release(calldata); } static const struct rpc_call_ops nfs_read_full_ops = { .rpc_call_done = nfs_readpage_result_full, - .rpc_release = nfs_readdata_release, + .rpc_release = nfs_readpage_release_full, }; /* diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f9219024f31..7226a506f3c 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -198,7 +198,7 @@ static match_table_t nfs_secflavor_tokens = { }; -static void nfs_umount_begin(struct vfsmount *, int); +static void nfs_umount_begin(struct super_block *); static int nfs_statfs(struct dentry *, struct kstatfs *); static int nfs_show_options(struct seq_file *, struct vfsmount *); static int nfs_show_stats(struct seq_file *, struct vfsmount *); @@ -441,10 +441,52 @@ static const char *nfs_pseudoflavour_to_name(rpc_authflavor_t flavour) return sec_flavours[i].str; } +static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) +{ + struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address; + + switch (sap->sa_family) { + case AF_INET: { + struct sockaddr_in *sin = (struct sockaddr_in *)sap; + seq_printf(m, ",mountaddr=" NIPQUAD_FMT, + NIPQUAD(sin->sin_addr.s_addr)); + break; + } + case AF_INET6: { + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + seq_printf(m, ",mountaddr=" NIP6_FMT, + NIP6(sin6->sin6_addr)); + break; + } + default: + if (showdefaults) + seq_printf(m, ",mountaddr=unspecified"); + } + + if (nfss->mountd_version || showdefaults) + seq_printf(m, ",mountvers=%u", nfss->mountd_version); + if (nfss->mountd_port || showdefaults) + seq_printf(m, ",mountport=%u", nfss->mountd_port); + + switch (nfss->mountd_protocol) { + case IPPROTO_UDP: + seq_printf(m, ",mountproto=udp"); + break; + case IPPROTO_TCP: + seq_printf(m, ",mountproto=tcp"); + break; + default: + if (showdefaults) + seq_printf(m, ",mountproto=auto"); + } +} + /* * Describe the mount options in force on this server representation */ -static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, int showdefaults) +static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, + int showdefaults) { static const struct proc_nfs_info { int flag; @@ -452,6 +494,8 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, const char *nostr; } nfs_info[] = { { NFS_MOUNT_SOFT, ",soft", ",hard" }, + { NFS_MOUNT_INTR, ",intr", ",nointr" }, + { NFS_MOUNT_POSIX, ",posix", "" }, { NFS_MOUNT_NOCTO, ",nocto", "" }, { NFS_MOUNT_NOAC, ",noac", "" }, { NFS_MOUNT_NONLM, ",nolock", "" }, @@ -462,18 +506,22 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, }; const struct proc_nfs_info *nfs_infop; struct nfs_client *clp = nfss->nfs_client; - - seq_printf(m, ",vers=%d", clp->rpc_ops->version); - seq_printf(m, ",rsize=%d", nfss->rsize); - seq_printf(m, ",wsize=%d", nfss->wsize); + u32 version = clp->rpc_ops->version; + + seq_printf(m, ",vers=%u", version); + seq_printf(m, ",rsize=%u", nfss->rsize); + seq_printf(m, ",wsize=%u", nfss->wsize); + if (nfss->bsize != 0) + seq_printf(m, ",bsize=%u", nfss->bsize); + seq_printf(m, ",namlen=%u", nfss->namelen); if (nfss->acregmin != 3*HZ || showdefaults) - seq_printf(m, ",acregmin=%d", nfss->acregmin/HZ); + seq_printf(m, ",acregmin=%u", nfss->acregmin/HZ); if (nfss->acregmax != 60*HZ || showdefaults) - seq_printf(m, ",acregmax=%d", nfss->acregmax/HZ); + seq_printf(m, ",acregmax=%u", nfss->acregmax/HZ); if (nfss->acdirmin != 30*HZ || showdefaults) - seq_printf(m, ",acdirmin=%d", nfss->acdirmin/HZ); + seq_printf(m, ",acdirmin=%u", nfss->acdirmin/HZ); if (nfss->acdirmax != 60*HZ || showdefaults) - seq_printf(m, ",acdirmax=%d", nfss->acdirmax/HZ); + seq_printf(m, ",acdirmax=%u", nfss->acdirmax/HZ); for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) { if (nfss->flags & nfs_infop->flag) seq_puts(m, nfs_infop->str); @@ -482,9 +530,24 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, } seq_printf(m, ",proto=%s", rpc_peeraddr2str(nfss->client, RPC_DISPLAY_PROTO)); + if (version == 4) { + if (nfss->port != NFS_PORT) + seq_printf(m, ",port=%u", nfss->port); + } else + if (nfss->port) + seq_printf(m, ",port=%u", nfss->port); + seq_printf(m, ",timeo=%lu", 10U * nfss->client->cl_timeout->to_initval / HZ); seq_printf(m, ",retrans=%u", nfss->client->cl_timeout->to_retries); seq_printf(m, ",sec=%s", nfs_pseudoflavour_to_name(nfss->client->cl_auth->au_flavor)); + + if (version != 4) + nfs_show_mountd_options(m, nfss, showdefaults); + +#ifdef CONFIG_NFS_V4 + if (clp->rpc_ops->version == 4) + seq_printf(m, ",clientaddr=%s", clp->cl_ipaddr); +#endif } /* @@ -529,10 +592,10 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) seq_printf(m, "\n\tcaps:\t"); seq_printf(m, "caps=0x%x", nfss->caps); - seq_printf(m, ",wtmult=%d", nfss->wtmult); - seq_printf(m, ",dtsize=%d", nfss->dtsize); - seq_printf(m, ",bsize=%d", nfss->bsize); - seq_printf(m, ",namelen=%d", nfss->namelen); + seq_printf(m, ",wtmult=%u", nfss->wtmult); + seq_printf(m, ",dtsize=%u", nfss->dtsize); + seq_printf(m, ",bsize=%u", nfss->bsize); + seq_printf(m, ",namlen=%u", nfss->namelen); #ifdef CONFIG_NFS_V4 if (nfss->nfs_client->rpc_ops->version == 4) { @@ -546,9 +609,9 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) /* * Display security flavor in effect for this mount */ - seq_printf(m, "\n\tsec:\tflavor=%d", auth->au_ops->au_flavor); + seq_printf(m, "\n\tsec:\tflavor=%u", auth->au_ops->au_flavor); if (auth->au_flavor) - seq_printf(m, ",pseudoflavor=%d", auth->au_flavor); + seq_printf(m, ",pseudoflavor=%u", auth->au_flavor); /* * Display superblock I/O counters @@ -584,13 +647,11 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt) * Begin unmount by attempting to remove all automounted mountpoints we added * in response to xdev traversals and referrals */ -static void nfs_umount_begin(struct vfsmount *vfsmnt, int flags) +static void nfs_umount_begin(struct super_block *sb) { - struct nfs_server *server = NFS_SB(vfsmnt->mnt_sb); + struct nfs_server *server = NFS_SB(sb); struct rpc_clnt *rpc; - if (!(flags & MNT_FORCE)) - return; /* -EIO all pending I/O */ rpc = server->client_acl; if (!IS_ERR(rpc)) @@ -683,7 +744,6 @@ static int nfs_parse_mount_options(char *raw, struct nfs_parsed_mount_data *mnt) { char *p, *string, *secdata; - unsigned short port = 0; int rc; if (!raw) { @@ -798,7 +858,7 @@ static int nfs_parse_mount_options(char *raw, return 0; if (option < 0 || option > 65535) return 0; - port = option; + mnt->nfs_server.port = option; break; case Opt_rsize: if (match_int(args, &mnt->rsize)) @@ -1048,7 +1108,8 @@ static int nfs_parse_mount_options(char *raw, } } - nfs_set_port((struct sockaddr *)&mnt->nfs_server.address, port); + nfs_set_port((struct sockaddr *)&mnt->nfs_server.address, + mnt->nfs_server.port); return 1; @@ -1169,7 +1230,9 @@ static int nfs_validate_mount_data(void *options, args->acregmax = 60; args->acdirmin = 30; args->acdirmax = 60; + args->mount_server.port = 0; /* autobind unless user sets port */ args->mount_server.protocol = XPRT_TRANSPORT_UDP; + args->nfs_server.port = 0; /* autobind unless user sets port */ args->nfs_server.protocol = XPRT_TRANSPORT_TCP; switch (data->version) { @@ -1208,7 +1271,6 @@ static int nfs_validate_mount_data(void *options, args->flags = data->flags; args->rsize = data->rsize; args->wsize = data->wsize; - args->flags = data->flags; args->timeo = data->timeo; args->retrans = data->retrans; args->acregmin = data->acregmin; @@ -1230,6 +1292,8 @@ static int nfs_validate_mount_data(void *options, args->namlen = data->namlen; args->bsize = data->bsize; args->auth_flavors[0] = data->pseudoflavor; + if (!args->nfs_server.hostname) + goto out_nomem; /* * The legacy version 6 binary mount data from userspace has a @@ -1276,6 +1340,8 @@ static int nfs_validate_mount_data(void *options, len = c - dev_name; /* N.B. caller will free nfs_server.hostname in all cases */ args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL); + if (!args->nfs_server.hostname) + goto out_nomem; c++; if (strlen(c) > NFS_MAXPATHLEN) @@ -1319,6 +1385,10 @@ out_v3_not_compiled: return -EPROTONOSUPPORT; #endif /* !CONFIG_NFS_V3 */ +out_nomem: + dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); + return -ENOMEM; + out_no_address: dfprintk(MOUNT, "NFS: mount program didn't pass remote address\n"); return -EINVAL; @@ -1505,6 +1575,11 @@ static int nfs_compare_super(struct super_block *sb, void *data) return nfs_compare_mount_options(sb, server, mntflags); } +static int nfs_bdi_register(struct nfs_server *server) +{ + return bdi_register_dev(&server->backing_dev_info, server->s_dev); +} + static int nfs_get_sb(struct file_system_type *fs_type, int flags, const char *dev_name, void *raw_data, struct vfsmount *mnt) { @@ -1547,6 +1622,10 @@ static int nfs_get_sb(struct file_system_type *fs_type, if (s->s_fs_info != server) { nfs_free_server(server); server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) + goto error_splat_super; } if (!s->s_root) { @@ -1594,6 +1673,7 @@ static void nfs_kill_super(struct super_block *s) { struct nfs_server *server = NFS_SB(s); + bdi_unregister(&server->backing_dev_info); kill_anon_super(s); nfs_free_server(server); } @@ -1638,6 +1718,10 @@ static int nfs_xdev_get_sb(struct file_system_type *fs_type, int flags, if (s->s_fs_info != server) { nfs_free_server(server); server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) + goto error_splat_super; } if (!s->s_root) { @@ -1706,28 +1790,6 @@ static void nfs4_fill_super(struct super_block *sb) } /* - * If the user didn't specify a port, set the port number to - * the NFS version 4 default port. - */ -static void nfs4_default_port(struct sockaddr *sap) -{ - switch (sap->sa_family) { - case AF_INET: { - struct sockaddr_in *ap = (struct sockaddr_in *)sap; - if (ap->sin_port == 0) - ap->sin_port = htons(NFS_PORT); - break; - } - case AF_INET6: { - struct sockaddr_in6 *ap = (struct sockaddr_in6 *)sap; - if (ap->sin6_port == 0) - ap->sin6_port = htons(NFS_PORT); - break; - } - } -} - -/* * Validate NFSv4 mount options */ static int nfs4_validate_mount_data(void *options, @@ -1751,6 +1813,7 @@ static int nfs4_validate_mount_data(void *options, args->acregmax = 60; args->acdirmin = 30; args->acdirmax = 60; + args->nfs_server.port = NFS_PORT; /* 2049 unless user set port= */ args->nfs_server.protocol = XPRT_TRANSPORT_TCP; switch (data->version) { @@ -1767,9 +1830,6 @@ static int nfs4_validate_mount_data(void *options, &args->nfs_server.address)) goto out_no_address; - nfs4_default_port((struct sockaddr *) - &args->nfs_server.address); - switch (data->auth_flavourlen) { case 0: args->auth_flavors[0] = RPC_AUTH_UNIX; @@ -1827,9 +1887,6 @@ static int nfs4_validate_mount_data(void *options, &args->nfs_server.address)) return -EINVAL; - nfs4_default_port((struct sockaddr *) - &args->nfs_server.address); - switch (args->auth_flavor_len) { case 0: args->auth_flavors[0] = RPC_AUTH_UNIX; @@ -1852,12 +1909,16 @@ static int nfs4_validate_mount_data(void *options, return -ENAMETOOLONG; /* N.B. caller will free nfs_server.hostname in all cases */ args->nfs_server.hostname = kstrndup(dev_name, len, GFP_KERNEL); + if (!args->nfs_server.hostname) + goto out_nomem; c++; /* step over the ':' */ len = strlen(c); if (len > NFS4_MAXPATHLEN) return -ENAMETOOLONG; args->nfs_server.export_path = kstrndup(c, len, GFP_KERNEL); + if (!args->nfs_server.export_path) + goto out_nomem; dprintk("NFS: MNTPATH: '%s'\n", args->nfs_server.export_path); @@ -1879,6 +1940,10 @@ out_inval_auth: data->auth_flavourlen); return -EINVAL; +out_nomem: + dfprintk(MOUNT, "NFS4: not enough memory to handle mount options\n"); + return -ENOMEM; + out_no_address: dfprintk(MOUNT, "NFS4: mount program didn't pass remote address\n"); return -EINVAL; @@ -1933,6 +1998,10 @@ static int nfs4_get_sb(struct file_system_type *fs_type, if (s->s_fs_info != server) { nfs_free_server(server); server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) + goto error_splat_super; } if (!s->s_root) { @@ -2019,6 +2088,10 @@ static int nfs4_xdev_get_sb(struct file_system_type *fs_type, int flags, if (s->s_fs_info != server) { nfs_free_server(server); server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) + goto error_splat_super; } if (!s->s_root) { @@ -2098,6 +2171,10 @@ static int nfs4_referral_get_sb(struct file_system_type *fs_type, int flags, if (s->s_fs_info != server) { nfs_free_server(server); server = NULL; + } else { + error = nfs_bdi_register(server); + if (error) + goto error_splat_super; } if (!s->s_root) { diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 83e865a16ad..412738dbfbc 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -10,7 +10,6 @@ * nfs symlink handling code */ -#define NFS_NEED_XDR_TYPES #include <linux/time.h> #include <linux/errno.h> #include <linux/sunrpc/clnt.h> diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index 75741536342..3adf8b26646 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -234,7 +234,7 @@ nfs_async_unlink(struct inode *dir, struct dentry *dentry) if (data == NULL) goto out; - data->cred = rpcauth_lookupcred(NFS_CLIENT(dir)->cl_auth, 0); + data->cred = rpc_lookup_cred(); if (IS_ERR(data->cred)) { status = PTR_ERR(data->cred); goto out_free; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index bed63416a55..1ade11d1ba0 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -48,7 +48,7 @@ static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; static mempool_t *nfs_commit_mempool; -struct nfs_write_data *nfs_commit_alloc(void) +struct nfs_write_data *nfs_commitdata_alloc(void) { struct nfs_write_data *p = mempool_alloc(nfs_commit_mempool, GFP_NOFS); @@ -59,19 +59,13 @@ struct nfs_write_data *nfs_commit_alloc(void) return p; } -static void nfs_commit_rcu_free(struct rcu_head *head) +void nfs_commit_free(struct nfs_write_data *p) { - struct nfs_write_data *p = container_of(head, struct nfs_write_data, task.u.tk_rcu); if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_commit_mempool); } -void nfs_commit_free(struct nfs_write_data *wdata) -{ - call_rcu_bh(&wdata->task.u.tk_rcu, nfs_commit_rcu_free); -} - struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) { struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, GFP_NOFS); @@ -93,21 +87,18 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) return p; } -static void nfs_writedata_rcu_free(struct rcu_head *head) +static void nfs_writedata_free(struct nfs_write_data *p) { - struct nfs_write_data *p = container_of(head, struct nfs_write_data, task.u.tk_rcu); if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_wdata_mempool); } -static void nfs_writedata_free(struct nfs_write_data *wdata) +void nfs_writedata_release(void *data) { - call_rcu_bh(&wdata->task.u.tk_rcu, nfs_writedata_rcu_free); -} + struct nfs_write_data *wdata = data; -void nfs_writedata_release(void *wdata) -{ + put_nfs_open_context(wdata->args.context); nfs_writedata_free(wdata); } @@ -291,8 +282,6 @@ static int nfs_page_async_flush(struct nfs_pageio_descriptor *pgio, spin_unlock(&inode->i_lock); if (!nfs_pageio_add_request(pgio, req)) { nfs_redirty_request(req); - nfs_end_page_writeback(page); - nfs_clear_page_tag_locked(req); return pgio->pg_error; } return 0; @@ -366,15 +355,13 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) /* * Insert a write request into an inode */ -static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) +static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) { struct nfs_inode *nfsi = NFS_I(inode); int error; error = radix_tree_insert(&nfsi->nfs_page_tree, req->wb_index, req); - BUG_ON(error == -EEXIST); - if (error) - return error; + BUG_ON(error); if (!nfsi->npages) { igrab(inode); if (nfs_have_delegation(inode, FMODE_WRITE)) @@ -384,8 +371,8 @@ static int nfs_inode_add_request(struct inode *inode, struct nfs_page *req) set_page_private(req->wb_page, (unsigned long)req); nfsi->npages++; kref_get(&req->wb_kref); - radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, NFS_PAGE_TAG_LOCKED); - return 0; + radix_tree_tag_set(&nfsi->nfs_page_tree, req->wb_index, + NFS_PAGE_TAG_LOCKED); } /* @@ -413,7 +400,7 @@ static void nfs_inode_remove_request(struct nfs_page *req) } static void -nfs_redirty_request(struct nfs_page *req) +nfs_mark_request_dirty(struct nfs_page *req) { __set_page_dirty_nobuffers(req->wb_page); } @@ -467,7 +454,7 @@ int nfs_reschedule_unstable_write(struct nfs_page *req) return 1; } if (test_and_clear_bit(PG_NEED_RESCHED, &req->wb_flags)) { - nfs_redirty_request(req); + nfs_mark_request_dirty(req); return 1; } return 0; @@ -597,6 +584,13 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, /* Loop over all inode entries and see if we find * A request for the page we wish to update */ + if (new) { + if (radix_tree_preload(GFP_NOFS)) { + nfs_release_request(new); + return ERR_PTR(-ENOMEM); + } + } + spin_lock(&inode->i_lock); req = nfs_page_find_request_locked(page); if (req) { @@ -607,28 +601,27 @@ static struct nfs_page * nfs_update_request(struct nfs_open_context* ctx, error = nfs_wait_on_request(req); nfs_release_request(req); if (error < 0) { - if (new) + if (new) { + radix_tree_preload_end(); nfs_release_request(new); + } return ERR_PTR(error); } continue; } spin_unlock(&inode->i_lock); - if (new) + if (new) { + radix_tree_preload_end(); nfs_release_request(new); + } break; } if (new) { - int error; nfs_lock_request_dontget(new); - error = nfs_inode_add_request(inode, new); - if (error) { - spin_unlock(&inode->i_lock); - nfs_unlock_request(new); - return ERR_PTR(error); - } + nfs_inode_add_request(inode, new); spin_unlock(&inode->i_lock); + radix_tree_preload_end(); req = new; goto zero_page; } @@ -785,7 +778,7 @@ static int flush_task_priority(int how) /* * Set up the argument/result storage required for the RPC call. */ -static void nfs_write_rpcsetup(struct nfs_page *req, +static int nfs_write_rpcsetup(struct nfs_page *req, struct nfs_write_data *data, const struct rpc_call_ops *call_ops, unsigned int count, unsigned int offset, @@ -806,6 +799,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req, .rpc_message = &msg, .callback_ops = call_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = flags, .priority = priority, }; @@ -822,7 +816,7 @@ static void nfs_write_rpcsetup(struct nfs_page *req, data->args.pgbase = req->wb_pgbase + offset; data->args.pages = data->pagevec; data->args.count = count; - data->args.context = req->wb_context; + data->args.context = get_nfs_open_context(req->wb_context); data->args.stable = NFS_UNSTABLE; if (how & FLUSH_STABLE) { data->args.stable = NFS_DATA_SYNC; @@ -847,8 +841,21 @@ static void nfs_write_rpcsetup(struct nfs_page *req, (unsigned long long)data->args.offset); task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; +} + +/* If a nfs_flush_* function fails, it should remove reqs from @head and + * call this on each, which will prepare them to be retried on next + * writeback using standard nfs. + */ +static void nfs_redirty_request(struct nfs_page *req) +{ + nfs_mark_request_dirty(req); + nfs_end_page_writeback(req->wb_page); + nfs_clear_page_tag_locked(req); } /* @@ -863,6 +870,7 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned size_t wsize = NFS_SERVER(inode)->wsize, nbytes; unsigned int offset; int requests = 0; + int ret = 0; LIST_HEAD(list); nfs_list_remove_request(req); @@ -884,6 +892,8 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned offset = 0; nbytes = count; do { + int ret2; + data = list_entry(list.next, struct nfs_write_data, pages); list_del_init(&data->pages); @@ -891,13 +901,15 @@ static int nfs_flush_multi(struct inode *inode, struct list_head *head, unsigned if (nbytes < wsize) wsize = nbytes; - nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, + ret2 = nfs_write_rpcsetup(req, data, &nfs_write_partial_ops, wsize, offset, how); + if (ret == 0) + ret = ret2; offset += wsize; nbytes -= wsize; } while (nbytes != 0); - return 0; + return ret; out_bad: while (!list_empty(&list)) { @@ -906,8 +918,6 @@ out_bad: nfs_writedata_release(data); } nfs_redirty_request(req); - nfs_end_page_writeback(req->wb_page); - nfs_clear_page_tag_locked(req); return -ENOMEM; } @@ -940,16 +950,12 @@ static int nfs_flush_one(struct inode *inode, struct list_head *head, unsigned i req = nfs_list_entry(data->pages.next); /* Set up the argument struct */ - nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how); - - return 0; + return nfs_write_rpcsetup(req, data, &nfs_write_full_ops, count, 0, how); out_bad: while (!list_empty(head)) { req = nfs_list_entry(head->next); nfs_list_remove_request(req); nfs_redirty_request(req); - nfs_end_page_writeback(req->wb_page); - nfs_clear_page_tag_locked(req); } return -ENOMEM; } @@ -972,7 +978,6 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; struct nfs_page *req = data->req; - struct page *page = req->wb_page; dprintk("NFS: write (%s/%Ld %d@%Ld)", req->wb_context->path.dentry->d_inode->i_sb->s_id, @@ -980,13 +985,20 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) req->wb_bytes, (long long)req_offset(req)); - if (nfs_writeback_done(task, data) != 0) - return; + nfs_writeback_done(task, data); +} - if (task->tk_status < 0) { +static void nfs_writeback_release_partial(void *calldata) +{ + struct nfs_write_data *data = calldata; + struct nfs_page *req = data->req; + struct page *page = req->wb_page; + int status = data->task.tk_status; + + if (status < 0) { nfs_set_pageerror(page); - nfs_context_set_write_error(req->wb_context, task->tk_status); - dprintk(", error = %d\n", task->tk_status); + nfs_context_set_write_error(req->wb_context, status); + dprintk(", error = %d\n", status); goto out; } @@ -1011,11 +1023,12 @@ static void nfs_writeback_done_partial(struct rpc_task *task, void *calldata) out: if (atomic_dec_and_test(&req->wb_complete)) nfs_writepage_release(req); + nfs_writedata_release(calldata); } static const struct rpc_call_ops nfs_write_partial_ops = { .rpc_call_done = nfs_writeback_done_partial, - .rpc_release = nfs_writedata_release, + .rpc_release = nfs_writeback_release_partial, }; /* @@ -1028,17 +1041,21 @@ static const struct rpc_call_ops nfs_write_partial_ops = { static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; - struct nfs_page *req; - struct page *page; - if (nfs_writeback_done(task, data) != 0) - return; + nfs_writeback_done(task, data); +} + +static void nfs_writeback_release_full(void *calldata) +{ + struct nfs_write_data *data = calldata; + int status = data->task.tk_status; /* Update attributes as result of writeback. */ while (!list_empty(&data->pages)) { - req = nfs_list_entry(data->pages.next); + struct nfs_page *req = nfs_list_entry(data->pages.next); + struct page *page = req->wb_page; + nfs_list_remove_request(req); - page = req->wb_page; dprintk("NFS: write (%s/%Ld %d@%Ld)", req->wb_context->path.dentry->d_inode->i_sb->s_id, @@ -1046,10 +1063,10 @@ static void nfs_writeback_done_full(struct rpc_task *task, void *calldata) req->wb_bytes, (long long)req_offset(req)); - if (task->tk_status < 0) { + if (status < 0) { nfs_set_pageerror(page); - nfs_context_set_write_error(req->wb_context, task->tk_status); - dprintk(", error = %d\n", task->tk_status); + nfs_context_set_write_error(req->wb_context, status); + dprintk(", error = %d\n", status); goto remove_request; } @@ -1069,11 +1086,12 @@ remove_request: next: nfs_clear_page_tag_locked(req); } + nfs_writedata_release(calldata); } static const struct rpc_call_ops nfs_write_full_ops = { .rpc_call_done = nfs_writeback_done_full, - .rpc_release = nfs_writedata_release, + .rpc_release = nfs_writeback_release_full, }; @@ -1159,15 +1177,18 @@ int nfs_writeback_done(struct rpc_task *task, struct nfs_write_data *data) #if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4) -void nfs_commit_release(void *wdata) +void nfs_commitdata_release(void *data) { + struct nfs_write_data *wdata = data; + + put_nfs_open_context(wdata->args.context); nfs_commit_free(wdata); } /* * Set up the argument/result storage required for the RPC call. */ -static void nfs_commit_rpcsetup(struct list_head *head, +static int nfs_commit_rpcsetup(struct list_head *head, struct nfs_write_data *data, int how) { @@ -1187,6 +1208,7 @@ static void nfs_commit_rpcsetup(struct list_head *head, .rpc_message = &msg, .callback_ops = &nfs_commit_ops, .callback_data = data, + .workqueue = nfsiod_workqueue, .flags = flags, .priority = priority, }; @@ -1203,6 +1225,7 @@ static void nfs_commit_rpcsetup(struct list_head *head, /* Note: we always request a commit of the entire inode */ data->args.offset = 0; data->args.count = 0; + data->args.context = get_nfs_open_context(first->wb_context); data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; @@ -1214,8 +1237,10 @@ static void nfs_commit_rpcsetup(struct list_head *head, dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid); task = rpc_run_task(&task_setup_data); - if (!IS_ERR(task)) - rpc_put_task(task); + if (IS_ERR(task)) + return PTR_ERR(task); + rpc_put_task(task); + return 0; } /* @@ -1227,15 +1252,13 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) struct nfs_write_data *data; struct nfs_page *req; - data = nfs_commit_alloc(); + data = nfs_commitdata_alloc(); if (!data) goto out_bad; /* Set up the argument struct */ - nfs_commit_rpcsetup(head, data, how); - - return 0; + return nfs_commit_rpcsetup(head, data, how); out_bad: while (!list_empty(head)) { req = nfs_list_entry(head->next); @@ -1255,7 +1278,6 @@ nfs_commit_list(struct inode *inode, struct list_head *head, int how) static void nfs_commit_done(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; - struct nfs_page *req; dprintk("NFS: %5u nfs_commit_done (status %d)\n", task->tk_pid, task->tk_status); @@ -1263,6 +1285,13 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) /* Call the NFS version-specific code */ if (NFS_PROTO(data->inode)->commit_done(task, data) != 0) return; +} + +static void nfs_commit_release(void *calldata) +{ + struct nfs_write_data *data = calldata; + struct nfs_page *req; + int status = data->task.tk_status; while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); @@ -1277,10 +1306,10 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) (long long)NFS_FILEID(req->wb_context->path.dentry->d_inode), req->wb_bytes, (long long)req_offset(req)); - if (task->tk_status < 0) { - nfs_context_set_write_error(req->wb_context, task->tk_status); + if (status < 0) { + nfs_context_set_write_error(req->wb_context, status); nfs_inode_remove_request(req); - dprintk(", error = %d\n", task->tk_status); + dprintk(", error = %d\n", status); goto next; } @@ -1297,10 +1326,11 @@ static void nfs_commit_done(struct rpc_task *task, void *calldata) } /* We have a mismatch. Write the page again */ dprintk(" mismatch\n"); - nfs_redirty_request(req); + nfs_mark_request_dirty(req); next: nfs_clear_page_tag_locked(req); } + nfs_commitdata_release(calldata); } static const struct rpc_call_ops nfs_commit_ops = { @@ -1487,18 +1517,19 @@ static int nfs_wb_page_priority(struct inode *inode, struct page *page, }; int ret; - BUG_ON(!PageLocked(page)); - if (clear_page_dirty_for_io(page)) { - ret = nfs_writepage_locked(page, &wbc); + do { + if (clear_page_dirty_for_io(page)) { + ret = nfs_writepage_locked(page, &wbc); + if (ret < 0) + goto out_error; + } else if (!PagePrivate(page)) + break; + ret = nfs_sync_mapping_wait(page->mapping, &wbc, how); if (ret < 0) - goto out; - } - if (!PagePrivate(page)) - return 0; - ret = nfs_sync_mapping_wait(page->mapping, &wbc, how); - if (ret >= 0) - return 0; -out: + goto out_error; + } while (PagePrivate(page)); + return 0; +out_error: __mark_inode_dirty(inode, I_DIRTY_PAGES); return ret; } diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index d13403e3362..294992e9bf6 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -10,6 +10,7 @@ #include <linux/sunrpc/svcauth.h> #include <linux/nfsd/nfsd.h> #include <linux/nfsd/export.h> +#include "auth.h" int nfsexp_flags(struct svc_rqst *rqstp, struct svc_export *exp) { diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 8a6f7c924c7..33bfcf09db4 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -35,6 +35,7 @@ #include <linux/lockd/bind.h> #include <linux/sunrpc/msg_prot.h> #include <linux/sunrpc/gss_api.h> +#include <net/ipv6.h> #define NFSDDBG_FACILITY NFSDDBG_EXPORT @@ -1548,6 +1549,7 @@ exp_addclient(struct nfsctl_client *ncp) { struct auth_domain *dom; int i, err; + struct in6_addr addr6; /* First, consistency check. */ err = -EINVAL; @@ -1566,9 +1568,10 @@ exp_addclient(struct nfsctl_client *ncp) goto out_unlock; /* Insert client into hashtable. */ - for (i = 0; i < ncp->cl_naddr; i++) - auth_unix_add_addr(ncp->cl_addrlist[i], dom); - + for (i = 0; i < ncp->cl_naddr; i++) { + ipv6_addr_set_v4mapped(ncp->cl_addrlist[i].s_addr, &addr6); + auth_unix_add_addr(&addr6, dom); + } auth_unix_forget_old(dom); auth_domain_put(dom); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index aae2b29ae2c..0b3ffa9840c 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -104,7 +104,7 @@ xdr_writemem(__be32 *p, const void *ptr, int nbytes) } while (0) #define RESERVE_SPACE(nbytes) do { \ p = xdr_reserve_space(xdr, nbytes); \ - if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __FUNCTION__); \ + if (!p) dprintk("NFSD: RESERVE_SPACE(%d) failed in function %s\n", (int) (nbytes), __func__); \ BUG_ON(!p); \ } while (0) @@ -134,7 +134,7 @@ xdr_error: \ p = xdr_inline_decode(xdr, nbytes); \ if (!p) { \ dprintk("NFSD: %s: reply buffer overflowed in line %d.\n", \ - __FUNCTION__, __LINE__); \ + __func__, __LINE__); \ return -EIO; \ } \ } while (0) @@ -344,6 +344,21 @@ static struct rpc_version * nfs_cb_version[] = { &nfs_cb_version4, }; +static struct rpc_program cb_program; + +static struct rpc_stat cb_stats = { + .program = &cb_program +}; + +#define NFS4_CALLBACK 0x40000000 +static struct rpc_program cb_program = { + .name = "nfs4_cb", + .number = NFS4_CALLBACK, + .nrvers = ARRAY_SIZE(nfs_cb_version), + .version = nfs_cb_version, + .stats = &cb_stats, +}; + /* Reference counting, callback cleanup, etc., all look racy as heck. * And why is cb_set an atomic? */ @@ -358,13 +373,12 @@ static int do_probe_callback(void *data) .to_maxval = (NFSD_LEASE_TIME/2) * HZ, .to_exponential = 1, }; - struct rpc_program * program = &cb->cb_program; struct rpc_create_args args = { .protocol = IPPROTO_TCP, .address = (struct sockaddr *)&addr, .addrsize = sizeof(addr), .timeout = &timeparms, - .program = program, + .program = &cb_program, .version = nfs_cb_version[1]->number, .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ .flags = (RPC_CLNT_CREATE_NOPING), @@ -382,16 +396,8 @@ static int do_probe_callback(void *data) addr.sin_port = htons(cb->cb_port); addr.sin_addr.s_addr = htonl(cb->cb_addr); - /* Initialize rpc_program */ - program->name = "nfs4_cb"; - program->number = cb->cb_prog; - program->nrvers = ARRAY_SIZE(nfs_cb_version); - program->version = nfs_cb_version; - program->stats = &cb->cb_stat; - /* Initialize rpc_stat */ - memset(program->stats, 0, sizeof(cb->cb_stat)); - program->stats->program = program; + memset(args.program->stats, 0, sizeof(struct rpc_stat)); /* Create RPC client */ client = rpc_create(&args); diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index 996bd88b75b..5b398421b05 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -202,7 +202,7 @@ static struct cache_detail idtoname_cache = { .alloc = ent_alloc, }; -int +static int idtoname_parse(struct cache_detail *cd, char *buf, int buflen) { struct ent ent, *res; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 81a75f3081f..8799b870818 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1639,6 +1639,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta locks_init_lock(&fl); fl.fl_lmops = &nfsd_lease_mng_ops; fl.fl_flags = FL_LEASE; + fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; fl.fl_end = OFFSET_MAX; fl.fl_owner = (fl_owner_t)dp; fl.fl_file = stp->st_vfs_file; @@ -1647,8 +1648,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta /* vfs_setlease checks to see if delegation should be handed out. * the lock_manager callbacks fl_mylease and fl_change are used */ - if ((status = vfs_setlease(stp->st_vfs_file, - flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK, &flp))) { + if ((status = vfs_setlease(stp->st_vfs_file, fl.fl_type, &flp))) { dprintk("NFSD: setlease failed [%d], no delegation\n", status); unhash_delegation(dp); flag = NFS4_OPEN_DELEGATE_NONE; @@ -1763,10 +1763,6 @@ out: return status; } -static struct workqueue_struct *laundry_wq; -static void laundromat_main(struct work_struct *); -static DECLARE_DELAYED_WORK(laundromat_work, laundromat_main); - __be32 nfsd4_renew(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, clientid_t *clid) @@ -1874,7 +1870,11 @@ nfs4_laundromat(void) return clientid_val; } -void +static struct workqueue_struct *laundry_wq; +static void laundromat_main(struct work_struct *); +static DECLARE_DELAYED_WORK(laundromat_work, laundromat_main); + +static void laundromat_main(struct work_struct *not_used) { time_t t; @@ -1975,6 +1975,26 @@ io_during_grace_disallowed(struct inode *inode, int flags) && mandatory_lock(inode); } +static int check_stateid_generation(stateid_t *in, stateid_t *ref) +{ + /* If the client sends us a stateid from the future, it's buggy: */ + if (in->si_generation > ref->si_generation) + return nfserr_bad_stateid; + /* + * The following, however, can happen. For example, if the + * client sends an open and some IO at the same time, the open + * may bump si_generation while the IO is still in flight. + * Thanks to hard links and renames, the client never knows what + * file an open will affect. So it could avoid that situation + * only by serializing all opens and IO from the same open + * owner. To recover from the old_stateid error, the client + * will just have to retry the IO: + */ + if (in->si_generation < ref->si_generation) + return nfserr_old_stateid; + return nfs_ok; +} + /* * Checks for stateid operations */ @@ -2023,12 +2043,8 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl goto out; stidp = &stp->st_stateid; } - if (stateid->si_generation > stidp->si_generation) - goto out; - - /* OLD STATEID */ - status = nfserr_old_stateid; - if (stateid->si_generation < stidp->si_generation) + status = check_stateid_generation(stateid, stidp); + if (status) goto out; if (stp) { if ((status = nfs4_check_openmode(stp,flags))) @@ -2036,7 +2052,7 @@ nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int fl renew_client(stp->st_stateowner->so_client); if (filpp) *filpp = stp->st_vfs_file; - } else if (dp) { + } else { if ((status = nfs4_check_delegmode(dp, flags))) goto out; renew_client(dp->dl_client); @@ -2065,6 +2081,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei { struct nfs4_stateid *stp; struct nfs4_stateowner *sop; + __be32 status; dprintk("NFSD: preprocess_seqid_op: seqid=%d " "stateid = (%08x/%08x/%08x/%08x)\n", seqid, @@ -2127,7 +2144,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei } } - if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp)) { + if (nfs4_check_fh(current_fh, stp)) { dprintk("NFSD: preprocess_seqid_op: fh-stateid mismatch!\n"); return nfserr_bad_stateid; } @@ -2150,15 +2167,9 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei " confirmed yet!\n"); return nfserr_bad_stateid; } - if (stateid->si_generation > stp->st_stateid.si_generation) { - dprintk("NFSD: preprocess_seqid_op: future stateid?!\n"); - return nfserr_bad_stateid; - } - - if (stateid->si_generation < stp->st_stateid.si_generation) { - dprintk("NFSD: preprocess_seqid_op: old stateid!\n"); - return nfserr_old_stateid; - } + status = check_stateid_generation(stateid, &stp->st_stateid); + if (status) + return status; renew_client(sop->so_client); return nfs_ok; @@ -2194,7 +2205,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, oc->oc_seqid, &oc->oc_req_stateid, - CHECK_FH | CONFIRM | OPEN_STATE, + CONFIRM | OPEN_STATE, &oc->oc_stateowner, &stp, NULL))) goto out; @@ -2265,7 +2276,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, od->od_seqid, &od->od_stateid, - CHECK_FH | OPEN_STATE, + OPEN_STATE, &od->od_stateowner, &stp, NULL))) goto out; @@ -2318,7 +2329,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, close->cl_seqid, &close->cl_stateid, - CHECK_FH | OPEN_STATE | CLOSE_STATE, + OPEN_STATE | CLOSE_STATE, &close->cl_stateowner, &stp, NULL))) goto out; status = nfs_ok; @@ -2623,7 +2634,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_seqid_op(&cstate->current_fh, lock->lk_new_open_seqid, &lock->lk_new_open_stateid, - CHECK_FH | OPEN_STATE, + OPEN_STATE, &lock->lk_replay_owner, &open_stp, lock); if (status) @@ -2650,7 +2661,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfs4_preprocess_seqid_op(&cstate->current_fh, lock->lk_old_lock_seqid, &lock->lk_old_lock_stateid, - CHECK_FH | LOCK_STATE, + LOCK_STATE, &lock->lk_replay_owner, &lock_stp, lock); if (status) goto out; @@ -2701,9 +2712,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * Note: locks.c uses the BKL to protect the inode's lock list. */ - /* XXX?: Just to divert the locks_release_private at the start of - * locks_copy_lock: */ - locks_init_lock(&conflock); err = vfs_lock_file(filp, cmd, &file_lock, &conflock); switch (-err) { case 0: /* success! */ @@ -2847,7 +2855,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, locku->lu_seqid, &locku->lu_stateid, - CHECK_FH | LOCK_STATE, + LOCK_STATE, &locku->lu_stateowner, &stp, NULL))) goto out; diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 0e6a179ecca..c513bbdf2d3 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -376,20 +376,6 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia goto xdr_error; } } - if (bmval[1] & FATTR4_WORD1_TIME_METADATA) { - /* We require the high 32 bits of 'seconds' to be 0, and we ignore - all 32 bits of 'nseconds'. */ - READ_BUF(12); - len += 12; - READ32(dummy32); - if (dummy32) - return nfserr_inval; - READ32(iattr->ia_ctime.tv_sec); - READ32(iattr->ia_ctime.tv_nsec); - if (iattr->ia_ctime.tv_nsec >= (u32)1000000000) - return nfserr_inval; - iattr->ia_valid |= ATTR_CTIME; - } if (bmval[1] & FATTR4_WORD1_TIME_MODIFY_SET) { READ_BUF(4); len += 4; @@ -1867,6 +1853,15 @@ out_serverfault: goto out; } +static inline int attributes_need_mount(u32 *bmval) +{ + if (bmval[0] & ~(FATTR4_WORD0_RDATTR_ERROR | FATTR4_WORD0_LEASE_TIME)) + return 1; + if (bmval[1] & ~FATTR4_WORD1_MOUNTED_ON_FILEID) + return 1; + return 0; +} + static __be32 nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, const char *name, int namlen, __be32 *p, int *buflen) @@ -1888,9 +1883,7 @@ nfsd4_encode_dirent_fattr(struct nfsd4_readdir *cd, * we will not follow the cross mount and will fill the attribtutes * directly from the mountpoint dentry. */ - if (d_mountpoint(dentry) && - (cd->rd_bmval[0] & ~FATTR4_WORD0_RDATTR_ERROR) == 0 && - (cd->rd_bmval[1] & ~FATTR4_WORD1_MOUNTED_ON_FILEID) == 0) + if (d_mountpoint(dentry) && !attributes_need_mount(cd->rd_bmval)) ignore_crossmnt = 1; else if (d_mountpoint(dentry)) { int err; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 8516137cdbb..5ac00c4fee9 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -22,6 +22,7 @@ #include <linux/seq_file.h> #include <linux/pagemap.h> #include <linux/init.h> +#include <linux/inet.h> #include <linux/string.h> #include <linux/smp_lock.h> #include <linux/ctype.h> @@ -35,8 +36,10 @@ #include <linux/nfsd/cache.h> #include <linux/nfsd/xdr.h> #include <linux/nfsd/syscall.h> +#include <linux/lockd/lockd.h> #include <asm/uaccess.h> +#include <net/ipv6.h> /* * We have a single directory with 9 nodes in it. @@ -52,6 +55,8 @@ enum { NFSD_Getfs, NFSD_List, NFSD_Fh, + NFSD_FO_UnlockIP, + NFSD_FO_UnlockFS, NFSD_Threads, NFSD_Pool_Threads, NFSD_Versions, @@ -88,6 +93,9 @@ static ssize_t write_leasetime(struct file *file, char *buf, size_t size); static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); #endif +static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size); +static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size); + static ssize_t (*write_op[])(struct file *, char *, size_t) = { [NFSD_Svc] = write_svc, [NFSD_Add] = write_add, @@ -97,6 +105,8 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = { [NFSD_Getfd] = write_getfd, [NFSD_Getfs] = write_getfs, [NFSD_Fh] = write_filehandle, + [NFSD_FO_UnlockIP] = failover_unlock_ip, + [NFSD_FO_UnlockFS] = failover_unlock_fs, [NFSD_Threads] = write_threads, [NFSD_Pool_Threads] = write_pool_threads, [NFSD_Versions] = write_versions, @@ -149,7 +159,6 @@ static const struct file_operations transaction_ops = { .release = simple_transaction_release, }; -extern struct seq_operations nfs_exports_op; static int exports_open(struct inode *inode, struct file *file) { return seq_open(file, &nfs_exports_op); @@ -160,6 +169,7 @@ static const struct file_operations exports_operations = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, + .owner = THIS_MODULE, }; /*----------------------------------------------------------------------------*/ @@ -222,6 +232,7 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size) struct auth_domain *clp; int err = 0; struct knfsd_fh *res; + struct in6_addr in6; if (size < sizeof(*data)) return -EINVAL; @@ -236,7 +247,11 @@ static ssize_t write_getfs(struct file *file, char *buf, size_t size) res = (struct knfsd_fh*)buf; exp_readlock(); - if (!(clp = auth_unix_lookup(sin->sin_addr))) + + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); + + clp = auth_unix_lookup(&in6); + if (!clp) err = -EPERM; else { err = exp_rootfh(clp, data->gd_path, res, data->gd_maxlen); @@ -257,6 +272,7 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size) int err = 0; struct knfsd_fh fh; char *res; + struct in6_addr in6; if (size < sizeof(*data)) return -EINVAL; @@ -271,7 +287,11 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size) res = buf; sin = (struct sockaddr_in *)&data->gd_addr; exp_readlock(); - if (!(clp = auth_unix_lookup(sin->sin_addr))) + + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, &in6); + + clp = auth_unix_lookup(&in6); + if (!clp) err = -EPERM; else { err = exp_rootfh(clp, data->gd_path, &fh, NFS_FHSIZE); @@ -288,6 +308,58 @@ static ssize_t write_getfd(struct file *file, char *buf, size_t size) return err; } +static ssize_t failover_unlock_ip(struct file *file, char *buf, size_t size) +{ + __be32 server_ip; + char *fo_path, c; + int b1, b2, b3, b4; + + /* sanity check */ + if (size == 0) + return -EINVAL; + + if (buf[size-1] != '\n') + return -EINVAL; + + fo_path = buf; + if (qword_get(&buf, fo_path, size) < 0) + return -EINVAL; + + /* get ipv4 address */ + if (sscanf(fo_path, "%u.%u.%u.%u%c", &b1, &b2, &b3, &b4, &c) != 4) + return -EINVAL; + server_ip = htonl((((((b1<<8)|b2)<<8)|b3)<<8)|b4); + + return nlmsvc_unlock_all_by_ip(server_ip); +} + +static ssize_t failover_unlock_fs(struct file *file, char *buf, size_t size) +{ + struct nameidata nd; + char *fo_path; + int error; + + /* sanity check */ + if (size == 0) + return -EINVAL; + + if (buf[size-1] != '\n') + return -EINVAL; + + fo_path = buf; + if (qword_get(&buf, fo_path, size) < 0) + return -EINVAL; + + error = path_lookup(fo_path, 0, &nd); + if (error) + return error; + + error = nlmsvc_unlock_all_by_sb(nd.path.mnt->mnt_sb); + + path_put(&nd.path); + return error; +} + static ssize_t write_filehandle(struct file *file, char *buf, size_t size) { /* request is: @@ -347,8 +419,6 @@ static ssize_t write_filehandle(struct file *file, char *buf, size_t size) return mesg - buf; } -extern int nfsd_nrthreads(void); - static ssize_t write_threads(struct file *file, char *buf, size_t size) { /* if size > 0, look for a number of threads and call nfsd_svc @@ -371,10 +441,6 @@ static ssize_t write_threads(struct file *file, char *buf, size_t size) return strlen(buf); } -extern int nfsd_nrpools(void); -extern int nfsd_get_nrthreads(int n, int *); -extern int nfsd_set_nrthreads(int n, int *); - static ssize_t write_pool_threads(struct file *file, char *buf, size_t size) { /* if size > 0, look for an array of number of threads per node @@ -696,6 +762,10 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Getfd] = {".getfd", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Getfs] = {".getfs", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_List] = {"exports", &exports_operations, S_IRUGO}, + [NFSD_FO_UnlockIP] = {"unlock_ip", + &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_FO_UnlockFS] = {"unlock_filesystem", + &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, @@ -732,10 +802,9 @@ static int create_proc_exports_entry(void) entry = proc_mkdir("fs/nfs", NULL); if (!entry) return -ENOMEM; - entry = create_proc_entry("fs/nfs/exports", 0, NULL); + entry = proc_create("exports", 0, entry, &exports_operations); if (!entry) return -ENOMEM; - entry->proc_fops = &exports_operations; return 0; } #else /* CONFIG_PROC_FS */ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 3e6b3f41ee1..100ae564116 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -113,6 +113,124 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, } /* + * Use the given filehandle to look up the corresponding export and + * dentry. On success, the results are used to set fh_export and + * fh_dentry. + */ +static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) +{ + struct knfsd_fh *fh = &fhp->fh_handle; + struct fid *fid = NULL, sfid; + struct svc_export *exp; + struct dentry *dentry; + int fileid_type; + int data_left = fh->fh_size/4; + __be32 error; + + error = nfserr_stale; + if (rqstp->rq_vers > 2) + error = nfserr_badhandle; + if (rqstp->rq_vers == 4 && fh->fh_size == 0) + return nfserr_nofilehandle; + + if (fh->fh_version == 1) { + int len; + + if (--data_left < 0) + return error; + if (fh->fh_auth_type != 0) + return error; + len = key_len(fh->fh_fsid_type) / 4; + if (len == 0) + return error; + if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { + /* deprecated, convert to type 3 */ + len = key_len(FSID_ENCODE_DEV)/4; + fh->fh_fsid_type = FSID_ENCODE_DEV; + fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl(fh->fh_fsid[0]), ntohl(fh->fh_fsid[1]))); + fh->fh_fsid[1] = fh->fh_fsid[2]; + } + data_left -= len; + if (data_left < 0) + return error; + exp = rqst_exp_find(rqstp, fh->fh_fsid_type, fh->fh_auth); + fid = (struct fid *)(fh->fh_auth + len); + } else { + __u32 tfh[2]; + dev_t xdev; + ino_t xino; + + if (fh->fh_size != NFS_FHSIZE) + return error; + /* assume old filehandle format */ + xdev = old_decode_dev(fh->ofh_xdev); + xino = u32_to_ino_t(fh->ofh_xino); + mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL); + exp = rqst_exp_find(rqstp, FSID_DEV, tfh); + } + + error = nfserr_stale; + if (PTR_ERR(exp) == -ENOENT) + return error; + + if (IS_ERR(exp)) + return nfserrno(PTR_ERR(exp)); + + error = nfsd_setuser_and_check_port(rqstp, exp); + if (error) + goto out; + + /* + * Look up the dentry using the NFS file handle. + */ + error = nfserr_stale; + if (rqstp->rq_vers > 2) + error = nfserr_badhandle; + + if (fh->fh_version != 1) { + sfid.i32.ino = fh->ofh_ino; + sfid.i32.gen = fh->ofh_generation; + sfid.i32.parent_ino = fh->ofh_dirino; + fid = &sfid; + data_left = 3; + if (fh->ofh_dirino == 0) + fileid_type = FILEID_INO32_GEN; + else + fileid_type = FILEID_INO32_GEN_PARENT; + } else + fileid_type = fh->fh_fileid_type; + + if (fileid_type == FILEID_ROOT) + dentry = dget(exp->ex_path.dentry); + else { + dentry = exportfs_decode_fh(exp->ex_path.mnt, fid, + data_left, fileid_type, + nfsd_acceptable, exp); + } + if (dentry == NULL) + goto out; + if (IS_ERR(dentry)) { + if (PTR_ERR(dentry) != -EINVAL) + error = nfserrno(PTR_ERR(dentry)); + goto out; + } + + if (S_ISDIR(dentry->d_inode->i_mode) && + (dentry->d_flags & DCACHE_DISCONNECTED)) { + printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n", + dentry->d_parent->d_name.name, dentry->d_name.name); + } + + fhp->fh_dentry = dentry; + fhp->fh_export = exp; + nfsd_nr_verified++; + return 0; +out: + exp_put(exp); + return error; +} + +/* * Perform sanity checks on the dentry in a client's file handle. * * Note that the file handle dentry may need to be freed even after @@ -124,115 +242,18 @@ static __be32 nfsd_setuser_and_check_port(struct svc_rqst *rqstp, __be32 fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) { - struct knfsd_fh *fh = &fhp->fh_handle; - struct svc_export *exp = NULL; + struct svc_export *exp; struct dentry *dentry; - __be32 error = 0; + __be32 error; dprintk("nfsd: fh_verify(%s)\n", SVCFH_fmt(fhp)); if (!fhp->fh_dentry) { - struct fid *fid = NULL, sfid; - int fileid_type; - int data_left = fh->fh_size/4; - - error = nfserr_stale; - if (rqstp->rq_vers > 2) - error = nfserr_badhandle; - if (rqstp->rq_vers == 4 && fh->fh_size == 0) - return nfserr_nofilehandle; - - if (fh->fh_version == 1) { - int len; - if (--data_left<0) goto out; - switch (fh->fh_auth_type) { - case 0: break; - default: goto out; - } - len = key_len(fh->fh_fsid_type) / 4; - if (len == 0) goto out; - if (fh->fh_fsid_type == FSID_MAJOR_MINOR) { - /* deprecated, convert to type 3 */ - len = key_len(FSID_ENCODE_DEV)/4; - fh->fh_fsid_type = FSID_ENCODE_DEV; - fh->fh_fsid[0] = new_encode_dev(MKDEV(ntohl(fh->fh_fsid[0]), ntohl(fh->fh_fsid[1]))); - fh->fh_fsid[1] = fh->fh_fsid[2]; - } - if ((data_left -= len)<0) goto out; - exp = rqst_exp_find(rqstp, fh->fh_fsid_type, - fh->fh_auth); - fid = (struct fid *)(fh->fh_auth + len); - } else { - __u32 tfh[2]; - dev_t xdev; - ino_t xino; - if (fh->fh_size != NFS_FHSIZE) - goto out; - /* assume old filehandle format */ - xdev = old_decode_dev(fh->ofh_xdev); - xino = u32_to_ino_t(fh->ofh_xino); - mk_fsid(FSID_DEV, tfh, xdev, xino, 0, NULL); - exp = rqst_exp_find(rqstp, FSID_DEV, tfh); - } - - error = nfserr_stale; - if (PTR_ERR(exp) == -ENOENT) - goto out; - - if (IS_ERR(exp)) { - error = nfserrno(PTR_ERR(exp)); - goto out; - } - - error = nfsd_setuser_and_check_port(rqstp, exp); + error = nfsd_set_fh_dentry(rqstp, fhp); if (error) goto out; - - /* - * Look up the dentry using the NFS file handle. - */ - error = nfserr_stale; - if (rqstp->rq_vers > 2) - error = nfserr_badhandle; - - if (fh->fh_version != 1) { - sfid.i32.ino = fh->ofh_ino; - sfid.i32.gen = fh->ofh_generation; - sfid.i32.parent_ino = fh->ofh_dirino; - fid = &sfid; - data_left = 3; - if (fh->ofh_dirino == 0) - fileid_type = FILEID_INO32_GEN; - else - fileid_type = FILEID_INO32_GEN_PARENT; - } else - fileid_type = fh->fh_fileid_type; - - if (fileid_type == FILEID_ROOT) - dentry = dget(exp->ex_path.dentry); - else { - dentry = exportfs_decode_fh(exp->ex_path.mnt, fid, - data_left, fileid_type, - nfsd_acceptable, exp); - } - if (dentry == NULL) - goto out; - if (IS_ERR(dentry)) { - if (PTR_ERR(dentry) != -EINVAL) - error = nfserrno(PTR_ERR(dentry)); - goto out; - } - - if (S_ISDIR(dentry->d_inode->i_mode) && - (dentry->d_flags & DCACHE_DISCONNECTED)) { - printk("nfsd: find_fh_dentry returned a DISCONNECTED directory: %s/%s\n", - dentry->d_parent->d_name.name, dentry->d_name.name); - } - - fhp->fh_dentry = dentry; - fhp->fh_export = exp; - nfsd_nr_verified++; - cache_get(&exp->h); + dentry = fhp->fh_dentry; + exp = fhp->fh_export; } else { /* * just rechecking permissions @@ -242,7 +263,6 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) dprintk("nfsd: fh_verify - just checking\n"); dentry = fhp->fh_dentry; exp = fhp->fh_export; - cache_get(&exp->h); /* * Set user creds for this exportpoint; necessary even * in the "just checking" case because this may be a @@ -281,8 +301,6 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) access, ntohl(error)); } out: - if (exp && !IS_ERR(exp)) - exp_put(exp); if (error == nfserr_stale) nfsdstats.fh_stale++; return error; diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 9647b0f7bc0..941041f4b13 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -244,7 +244,6 @@ static int nfsd_init_socks(int port) if (error < 0) return error; -#ifdef CONFIG_NFSD_TCP error = lockd_up(IPPROTO_TCP); if (error >= 0) { error = svc_create_xprt(nfsd_serv, "tcp", port, @@ -254,7 +253,6 @@ static int nfsd_init_socks(int port) } if (error < 0) return error; -#endif return 0; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 304bf5f643c..a3a291f771f 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -264,7 +264,6 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, struct inode *inode; int accmode = MAY_SATTR; int ftype = 0; - int imode; __be32 err; int host_err; int size_change = 0; @@ -360,25 +359,25 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, DQUOT_INIT(inode); } - imode = inode->i_mode; + /* sanitize the mode change */ if (iap->ia_valid & ATTR_MODE) { iap->ia_mode &= S_IALLUGO; - imode = iap->ia_mode |= (imode & ~S_IALLUGO); - /* if changing uid/gid revoke setuid/setgid in mode */ - if ((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) { - iap->ia_valid |= ATTR_KILL_PRIV; + iap->ia_mode |= (inode->i_mode & ~S_IALLUGO); + } + + /* Revoke setuid/setgid on chown */ + if (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || + ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)) { + iap->ia_valid |= ATTR_KILL_PRIV; + if (iap->ia_valid & ATTR_MODE) { + /* we're setting mode too, just clear the s*id bits */ iap->ia_mode &= ~S_ISUID; + if (iap->ia_mode & S_IXGRP) + iap->ia_mode &= ~S_ISGID; + } else { + /* set ATTR_KILL_* bits and let VFS handle it */ + iap->ia_valid |= (ATTR_KILL_SUID | ATTR_KILL_SGID); } - if ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid) - iap->ia_mode &= ~S_ISGID; - } else { - /* - * Revoke setuid/setgid bit on chown/chgrp - */ - if ((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) - iap->ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV; - if ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid) - iap->ia_valid |= ATTR_KILL_SGID; } /* Change the attributes. */ @@ -988,7 +987,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, * flushing the data to disk is handled separately below. */ - if (file->f_op->fsync == 0) {/* COMMIT3 cannot work */ + if (!file->f_op->fsync) {/* COMMIT3 cannot work */ stable = 2; *stablep = 2; /* FILE_SYNC */ } @@ -1152,7 +1151,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, } #endif /* CONFIG_NFSD_V3 */ -__be32 +static __be32 nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp, struct iattr *iap) { diff --git a/fs/ntfs/debug.h b/fs/ntfs/debug.h index 8ac37c33d12..5e6724c1afd 100644 --- a/fs/ntfs/debug.h +++ b/fs/ntfs/debug.h @@ -45,7 +45,7 @@ static void ntfs_debug(const char *f, ...); extern void __ntfs_debug (const char *file, int line, const char *function, const char *format, ...) __attribute__ ((format (printf, 4, 5))); #define ntfs_debug(f, a...) \ - __ntfs_debug(__FILE__, __LINE__, __FUNCTION__, f, ##a) + __ntfs_debug(__FILE__, __LINE__, __func__, f, ##a) extern void ntfs_debug_dump_runlist(const runlist_element *rl); @@ -58,10 +58,10 @@ extern void ntfs_debug_dump_runlist(const runlist_element *rl); extern void __ntfs_warning(const char *function, const struct super_block *sb, const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); -#define ntfs_warning(sb, f, a...) __ntfs_warning(__FUNCTION__, sb, f, ##a) +#define ntfs_warning(sb, f, a...) __ntfs_warning(__func__, sb, f, ##a) extern void __ntfs_error(const char *function, const struct super_block *sb, const char *fmt, ...) __attribute__ ((format (printf, 3, 4))); -#define ntfs_error(sb, f, a...) __ntfs_error(__FUNCTION__, sb, f, ##a) +#define ntfs_error(sb, f, a...) __ntfs_error(__func__, sb, f, ##a) #endif /* _LINUX_NTFS_DEBUG_H */ diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 2ad5c8b104b..790defb847e 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -1191,7 +1191,7 @@ static int ntfs_mft_bitmap_find_and_alloc_free_rec_nolock(ntfs_volume *vol, if (size) { page = ntfs_map_page(mftbmp_mapping, ofs >> PAGE_CACHE_SHIFT); - if (unlikely(IS_ERR(page))) { + if (IS_ERR(page)) { ntfs_error(vol->sb, "Failed to read mft " "bitmap, aborting."); return PTR_ERR(page); @@ -2118,7 +2118,7 @@ static int ntfs_mft_record_format(const ntfs_volume *vol, const s64 mft_no) } /* Read, map, and pin the page containing the mft record. */ page = ntfs_map_page(mft_vi->i_mapping, index); - if (unlikely(IS_ERR(page))) { + if (IS_ERR(page)) { ntfs_error(vol->sb, "Failed to map page containing mft record " "to format 0x%llx.", (long long)mft_no); return PTR_ERR(page); @@ -2519,7 +2519,7 @@ mft_rec_already_initialized: ofs = (bit << vol->mft_record_size_bits) & ~PAGE_CACHE_MASK; /* Read, map, and pin the page containing the mft record. */ page = ntfs_map_page(vol->mft_ino->i_mapping, index); - if (unlikely(IS_ERR(page))) { + if (IS_ERR(page)) { ntfs_error(vol->sb, "Failed to map page containing allocated " "mft record 0x%llx.", (long long)bit); err = PTR_ERR(page); diff --git a/fs/ocfs2/cluster/sys.c b/fs/ocfs2/cluster/sys.c index 98429fd6849..bc702dab5d1 100644 --- a/fs/ocfs2/cluster/sys.c +++ b/fs/ocfs2/cluster/sys.c @@ -65,7 +65,7 @@ int o2cb_sys_init(void) { int ret; - o2cb_kset = kset_create_and_add("o2cb", NULL, NULL); + o2cb_kset = kset_create_and_add("o2cb", NULL, fs_kobj); if (!o2cb_kset) return -ENOMEM; diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 5f6d858770a..1b81dcba175 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -44,7 +44,8 @@ #define MLOG_MASK_PREFIX ML_DLM #include "cluster/masklog.h" -int stringify_lockname(const char *lockname, int locklen, char *buf, int len); +static int stringify_lockname(const char *lockname, int locklen, char *buf, + int len); void dlm_print_one_lock_resource(struct dlm_lock_resource *res) { @@ -251,7 +252,8 @@ EXPORT_SYMBOL_GPL(dlm_errname); * * For more on lockname formats, please refer to dlmglue.c and ocfs2_lockid.h. */ -int stringify_lockname(const char *lockname, int locklen, char *buf, int len) +static int stringify_lockname(const char *lockname, int locklen, char *buf, + int len) { int out = 0; __be64 inode_blkno_be; @@ -368,7 +370,7 @@ static void dlm_debug_free(struct kref *kref) kfree(dc); } -void dlm_debug_put(struct dlm_debug_ctxt *dc) +static void dlm_debug_put(struct dlm_debug_ctxt *dc) { if (dc) kref_put(&dc->debug_refcnt, dlm_debug_free); diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 61a000f8524..e48aba698b7 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -327,7 +327,7 @@ clear_fields: static struct backing_dev_info dlmfs_backing_dev_info = { .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; static struct inode *dlmfs_get_root_inode(struct super_block *sb) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 9154c82d325..57e0d30cde9 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1048,6 +1048,10 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) mlog_entry("(0x%p, '%.*s')\n", dentry, dentry->d_name.len, dentry->d_name.name); + /* ensuring we don't even attempt to truncate a symlink */ + if (S_ISLNK(inode->i_mode)) + attr->ia_valid &= ~ATTR_SIZE; + if (attr->ia_valid & ATTR_MODE) mlog(0, "mode change: %d\n", attr->ia_mode); if (attr->ia_valid & ATTR_UID) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index ce0dc147602..be774bdc8b3 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -260,7 +260,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) bh = osb->local_alloc_bh; alloc = (struct ocfs2_dinode *) bh->b_data; - alloc_copy = kmalloc(bh->b_size, GFP_KERNEL); + alloc_copy = kmalloc(bh->b_size, GFP_NOFS); if (!alloc_copy) { status = -ENOMEM; goto out_commit; @@ -931,7 +931,7 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, * local alloc shutdown won't try to double free main bitmap * bits. Make a copy so the sync function knows which bits to * free. */ - alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_KERNEL); + alloc_copy = kmalloc(osb->local_alloc_bh->b_size, GFP_NOFS); if (!alloc_copy) { status = -ENOMEM; mlog_errno(status); diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index ac1d74c63bf..bbd1667aa7d 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -385,7 +385,7 @@ static int o2cb_cluster_this_node(unsigned int *node) return 0; } -struct ocfs2_stack_operations o2cb_stack_ops = { +static struct ocfs2_stack_operations o2cb_stack_ops = { .connect = o2cb_cluster_connect, .disconnect = o2cb_cluster_disconnect, .hangup = o2cb_cluster_hangup, diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 7428663f9cb..b503772cd0e 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -635,7 +635,7 @@ static const struct file_operations ocfs2_control_fops = { .owner = THIS_MODULE, }; -struct miscdevice ocfs2_control_device = { +static struct miscdevice ocfs2_control_device = { .minor = MISC_DYNAMIC_MINOR, .name = "ocfs2_control", .fops = &ocfs2_control_fops, diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 7134007ba22..ba9dbb51d25 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -167,9 +167,11 @@ const struct inode_operations ocfs2_symlink_inode_operations = { .readlink = page_readlink, .follow_link = ocfs2_follow_link, .getattr = ocfs2_getattr, + .setattr = ocfs2_setattr, }; const struct inode_operations ocfs2_fast_symlink_inode_operations = { .readlink = ocfs2_readlink, .follow_link = ocfs2_follow_link, .getattr = ocfs2_getattr, + .setattr = ocfs2_setattr, }; diff --git a/fs/open.c b/fs/open.c index b70e7666bb2..a1450086e92 100644 --- a/fs/open.c +++ b/fs/open.c @@ -7,6 +7,7 @@ #include <linux/string.h> #include <linux/mm.h> #include <linux/file.h> +#include <linux/fdtable.h> #include <linux/quotaops.h> #include <linux/fsnotify.h> #include <linux/module.h> @@ -837,7 +838,7 @@ static struct file *__dentry_open(struct dentry *dentry, struct vfsmount *mnt, if (f->f_flags & O_DIRECT) { if (!f->f_mapping->a_ops || ((!f->f_mapping->a_ops->direct_IO) && - (!f->f_mapping->a_ops->get_xip_page))) { + (!f->f_mapping->a_ops->get_xip_mem))) { fput(f); f = ERR_PTR(-EINVAL); } diff --git a/fs/partitions/ldm.c b/fs/partitions/ldm.c index e7dd1d4e347..0fdda2e8a4c 100644 --- a/fs/partitions/ldm.c +++ b/fs/partitions/ldm.c @@ -41,12 +41,12 @@ #ifndef CONFIG_LDM_DEBUG #define ldm_debug(...) do {} while (0) #else -#define ldm_debug(f, a...) _ldm_printk (KERN_DEBUG, __FUNCTION__, f, ##a) +#define ldm_debug(f, a...) _ldm_printk (KERN_DEBUG, __func__, f, ##a) #endif -#define ldm_crit(f, a...) _ldm_printk (KERN_CRIT, __FUNCTION__, f, ##a) -#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __FUNCTION__, f, ##a) -#define ldm_info(f, a...) _ldm_printk (KERN_INFO, __FUNCTION__, f, ##a) +#define ldm_crit(f, a...) _ldm_printk (KERN_CRIT, __func__, f, ##a) +#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a) +#define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a) __attribute__ ((format (printf, 3, 4))) static void _ldm_printk (const char *level, const char *function, diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c index 5567ec0d03a..796511886f2 100644 --- a/fs/partitions/msdos.c +++ b/fs/partitions/msdos.c @@ -18,7 +18,7 @@ * * Re-organised Feb 1998 Russell King */ - +#include <linux/msdos_fs.h> #include "check.h" #include "msdos.h" @@ -419,6 +419,7 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) Sector sect; unsigned char *data; struct partition *p; + struct fat_boot_sector *fb; int slot; data = read_dev_sector(bdev, 0, §); @@ -444,8 +445,21 @@ int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) p = (struct partition *) (data + 0x1be); for (slot = 1; slot <= 4; slot++, p++) { if (p->boot_ind != 0 && p->boot_ind != 0x80) { - put_dev_sector(sect); - return 0; + /* + * Even without a valid boot inidicator value + * its still possible this is valid FAT filesystem + * without a partition table. + */ + fb = (struct fat_boot_sector *) data; + if (slot == 1 && fb->reserved && fb->fats + && fat_valid_media(fb->media)) { + printk("\n"); + put_dev_sector(sect); + return 1; + } else { + put_dev_sector(sect); + return 0; + } } } diff --git a/fs/pipe.c b/fs/pipe.c index f73492b6817..3499f9ff631 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1076,6 +1076,23 @@ int do_pipe(int *fd) } /* + * sys_pipe() is the normal C calling standard for creating + * a pipe. It's not the way Unix traditionally does this, though. + */ +asmlinkage long __weak sys_pipe(int __user *fildes) +{ + int fd[2]; + int error; + + error = do_pipe(fd); + if (!error) { + if (copy_to_user(fildes, fd, sizeof(fd))) + error = -EFAULT; + } + return error; +} + +/* * pipefs should _never_ be mounted by userland - too much of security hassle, * no real gain from having the whole whorehouse mounted. So we don't need * any operations on the root directory. However, we need a non-trivial diff --git a/fs/pnode.c b/fs/pnode.c index f968e35d978..8d5f392ec3d 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -28,6 +28,57 @@ static inline struct vfsmount *next_slave(struct vfsmount *p) return list_entry(p->mnt_slave.next, struct vfsmount, mnt_slave); } +/* + * Return true if path is reachable from root + * + * namespace_sem is held, and mnt is attached + */ +static bool is_path_reachable(struct vfsmount *mnt, struct dentry *dentry, + const struct path *root) +{ + while (mnt != root->mnt && mnt->mnt_parent != mnt) { + dentry = mnt->mnt_mountpoint; + mnt = mnt->mnt_parent; + } + return mnt == root->mnt && is_subdir(dentry, root->dentry); +} + +static struct vfsmount *get_peer_under_root(struct vfsmount *mnt, + struct mnt_namespace *ns, + const struct path *root) +{ + struct vfsmount *m = mnt; + + do { + /* Check the namespace first for optimization */ + if (m->mnt_ns == ns && is_path_reachable(m, m->mnt_root, root)) + return m; + + m = next_peer(m); + } while (m != mnt); + + return NULL; +} + +/* + * Get ID of closest dominating peer group having a representative + * under the given root. + * + * Caller must hold namespace_sem + */ +int get_dominating_id(struct vfsmount *mnt, const struct path *root) +{ + struct vfsmount *m; + + for (m = mnt->mnt_master; m != NULL; m = m->mnt_master) { + struct vfsmount *d = get_peer_under_root(m, mnt->mnt_ns, root); + if (d) + return d->mnt_group_id; + } + + return 0; +} + static int do_make_slave(struct vfsmount *mnt) { struct vfsmount *peer_mnt = mnt, *master = mnt->mnt_master; @@ -46,7 +97,11 @@ static int do_make_slave(struct vfsmount *mnt) if (peer_mnt == mnt) peer_mnt = NULL; } + if (IS_MNT_SHARED(mnt) && list_empty(&mnt->mnt_share)) + mnt_release_group_id(mnt); + list_del_init(&mnt->mnt_share); + mnt->mnt_group_id = 0; if (peer_mnt) master = peer_mnt; @@ -68,7 +123,6 @@ static int do_make_slave(struct vfsmount *mnt) } mnt->mnt_master = master; CLEAR_MNT_SHARED(mnt); - INIT_LIST_HEAD(&mnt->mnt_slave_list); return 0; } diff --git a/fs/pnode.h b/fs/pnode.h index 973c3f825e7..958665d662a 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -36,4 +36,5 @@ int propagate_mnt(struct vfsmount *, struct dentry *, struct vfsmount *, int propagate_umount(struct list_head *); int propagate_mount_busy(struct vfsmount *, int); void mnt_release_group_id(struct vfsmount *); +int get_dominating_id(struct vfsmount *mnt, const struct path *root); #endif /* _LINUX_PNODE_H */ diff --git a/fs/proc/array.c b/fs/proc/array.c index 07d6c4853fe..dca997a93bf 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -73,6 +73,7 @@ #include <linux/signal.h> #include <linux/highmem.h> #include <linux/file.h> +#include <linux/fdtable.h> #include <linux/times.h> #include <linux/cpuset.h> #include <linux/rcupdate.h> @@ -425,12 +426,13 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, cutime = cstime = utime = stime = cputime_zero; cgtime = gtime = cputime_zero; - rcu_read_lock(); if (lock_task_sighand(task, &flags)) { struct signal_struct *sig = task->signal; if (sig->tty) { - tty_pgrp = pid_nr_ns(sig->tty->pgrp, ns); + struct pid *pgrp = tty_get_pgrp(sig->tty); + tty_pgrp = pid_nr_ns(pgrp, ns); + put_pid(pgrp); tty_nr = new_encode_dev(tty_devnum(sig->tty)); } @@ -469,7 +471,6 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, unlock_task_sighand(task, &flags); } - rcu_read_unlock(); if (!whole || num_threads < 2) wchan = get_wchan(task); diff --git a/fs/proc/base.c b/fs/proc/base.c index 7313c62e3e9..808cbdc193d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -56,6 +56,7 @@ #include <linux/init.h> #include <linux/capability.h> #include <linux/file.h> +#include <linux/fdtable.h> #include <linux/string.h> #include <linux/seq_file.h> #include <linux/namei.h> @@ -195,12 +196,32 @@ static int proc_root_link(struct inode *inode, struct path *path) return result; } -#define MAY_PTRACE(task) \ - (task == current || \ - (task->parent == current && \ - (task->ptrace & PT_PTRACED) && \ - (task_is_stopped_or_traced(task)) && \ - security_ptrace(current,task) == 0)) +/* + * Return zero if current may access user memory in @task, -error if not. + */ +static int check_mem_permission(struct task_struct *task) +{ + /* + * A task can always look at itself, in case it chooses + * to use system calls instead of load instructions. + */ + if (task == current) + return 0; + + /* + * If current is actively ptrace'ing, and would also be + * permitted to freshly attach with ptrace now, permit it. + */ + if (task->parent == current && (task->ptrace & PT_PTRACED) && + task_is_stopped_or_traced(task) && + ptrace_may_attach(task)) + return 0; + + /* + * Noone else is allowed. + */ + return -EPERM; +} struct mm_struct *mm_for_maps(struct task_struct *task) { @@ -502,17 +523,14 @@ static const struct inode_operations proc_def_inode_operations = { .setattr = proc_setattr, }; -extern const struct seq_operations mounts_op; -struct proc_mounts { - struct seq_file m; - int event; -}; - -static int mounts_open(struct inode *inode, struct file *file) +static int mounts_open_common(struct inode *inode, struct file *file, + const struct seq_operations *op) { struct task_struct *task = get_proc_task(inode); struct nsproxy *nsp; struct mnt_namespace *ns = NULL; + struct fs_struct *fs = NULL; + struct path root; struct proc_mounts *p; int ret = -EINVAL; @@ -525,40 +543,61 @@ static int mounts_open(struct inode *inode, struct file *file) get_mnt_ns(ns); } rcu_read_unlock(); - + if (ns) + fs = get_fs_struct(task); put_task_struct(task); } - if (ns) { - ret = -ENOMEM; - p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); - if (p) { - file->private_data = &p->m; - ret = seq_open(file, &mounts_op); - if (!ret) { - p->m.private = ns; - p->event = ns->event; - return 0; - } - kfree(p); - } - put_mnt_ns(ns); - } + if (!ns) + goto err; + if (!fs) + goto err_put_ns; + + read_lock(&fs->lock); + root = fs->root; + path_get(&root); + read_unlock(&fs->lock); + put_fs_struct(fs); + + ret = -ENOMEM; + p = kmalloc(sizeof(struct proc_mounts), GFP_KERNEL); + if (!p) + goto err_put_path; + + file->private_data = &p->m; + ret = seq_open(file, op); + if (ret) + goto err_free; + + p->m.private = p; + p->ns = ns; + p->root = root; + p->event = ns->event; + + return 0; + + err_free: + kfree(p); + err_put_path: + path_put(&root); + err_put_ns: + put_mnt_ns(ns); + err: return ret; } static int mounts_release(struct inode *inode, struct file *file) { - struct seq_file *m = file->private_data; - struct mnt_namespace *ns = m->private; - put_mnt_ns(ns); + struct proc_mounts *p = file->private_data; + path_put(&p->root); + put_mnt_ns(p->ns); return seq_release(inode, file); } static unsigned mounts_poll(struct file *file, poll_table *wait) { struct proc_mounts *p = file->private_data; - struct mnt_namespace *ns = p->m.private; + struct mnt_namespace *ns = p->ns; unsigned res = 0; poll_wait(file, &ns->poll, wait); @@ -573,6 +612,11 @@ static unsigned mounts_poll(struct file *file, poll_table *wait) return res; } +static int mounts_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, &mounts_op); +} + static const struct file_operations proc_mounts_operations = { .open = mounts_open, .read = seq_read, @@ -581,38 +625,22 @@ static const struct file_operations proc_mounts_operations = { .poll = mounts_poll, }; -extern const struct seq_operations mountstats_op; -static int mountstats_open(struct inode *inode, struct file *file) +static int mountinfo_open(struct inode *inode, struct file *file) { - int ret = seq_open(file, &mountstats_op); - - if (!ret) { - struct seq_file *m = file->private_data; - struct nsproxy *nsp; - struct mnt_namespace *mnt_ns = NULL; - struct task_struct *task = get_proc_task(inode); - - if (task) { - rcu_read_lock(); - nsp = task_nsproxy(task); - if (nsp) { - mnt_ns = nsp->mnt_ns; - if (mnt_ns) - get_mnt_ns(mnt_ns); - } - rcu_read_unlock(); + return mounts_open_common(inode, file, &mountinfo_op); +} - put_task_struct(task); - } +static const struct file_operations proc_mountinfo_operations = { + .open = mountinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = mounts_release, + .poll = mounts_poll, +}; - if (mnt_ns) - m->private = mnt_ns; - else { - seq_release(inode, file); - ret = -EINVAL; - } - } - return ret; +static int mountstats_open(struct inode *inode, struct file *file) +{ + return mounts_open_common(inode, file, &mountstats_op); } static const struct file_operations proc_mountstats_operations = { @@ -715,7 +743,7 @@ static ssize_t mem_read(struct file * file, char __user * buf, if (!task) goto out_no_task; - if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) + if (check_mem_permission(task)) goto out; ret = -ENOMEM; @@ -741,7 +769,7 @@ static ssize_t mem_read(struct file * file, char __user * buf, this_len = (count > PAGE_SIZE) ? PAGE_SIZE : count; retval = access_process_vm(task, src, page, this_len, 0); - if (!retval || !MAY_PTRACE(task) || !ptrace_may_attach(task)) { + if (!retval || check_mem_permission(task)) { if (!ret) ret = -EIO; break; @@ -785,7 +813,7 @@ static ssize_t mem_write(struct file * file, const char __user *buf, if (!task) goto out_no_task; - if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) + if (check_mem_permission(task)) goto out; copied = -ENOMEM; @@ -1174,6 +1202,81 @@ static const struct file_operations proc_pid_sched_operations = { #endif +/* + * We added or removed a vma mapping the executable. The vmas are only mapped + * during exec and are not mapped with the mmap system call. + * Callers must hold down_write() on the mm's mmap_sem for these + */ +void added_exe_file_vma(struct mm_struct *mm) +{ + mm->num_exe_file_vmas++; +} + +void removed_exe_file_vma(struct mm_struct *mm) +{ + mm->num_exe_file_vmas--; + if ((mm->num_exe_file_vmas == 0) && mm->exe_file){ + fput(mm->exe_file); + mm->exe_file = NULL; + } + +} + +void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) +{ + if (new_exe_file) + get_file(new_exe_file); + if (mm->exe_file) + fput(mm->exe_file); + mm->exe_file = new_exe_file; + mm->num_exe_file_vmas = 0; +} + +struct file *get_mm_exe_file(struct mm_struct *mm) +{ + struct file *exe_file; + + /* We need mmap_sem to protect against races with removal of + * VM_EXECUTABLE vmas */ + down_read(&mm->mmap_sem); + exe_file = mm->exe_file; + if (exe_file) + get_file(exe_file); + up_read(&mm->mmap_sem); + return exe_file; +} + +void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm) +{ + /* It's safe to write the exe_file pointer without exe_file_lock because + * this is called during fork when the task is not yet in /proc */ + newmm->exe_file = get_mm_exe_file(oldmm); +} + +static int proc_exe_link(struct inode *inode, struct path *exe_path) +{ + struct task_struct *task; + struct mm_struct *mm; + struct file *exe_file; + + task = get_proc_task(inode); + if (!task) + return -ENOENT; + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + return -ENOENT; + exe_file = get_mm_exe_file(mm); + mmput(mm); + if (exe_file) { + *exe_path = exe_file->f_path; + path_get(&exe_file->f_path); + fput(exe_file); + return 0; + } else + return -ENOENT; +} + static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; @@ -2309,6 +2412,7 @@ static const struct pid_entry tgid_base_stuff[] = { LNK("root", root), LNK("exe", exe), REG("mounts", S_IRUGO, mounts), + REG("mountinfo", S_IRUGO, mountinfo), REG("mountstats", S_IRUSR, mountstats), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, clear_refs), @@ -2641,6 +2745,7 @@ static const struct pid_entry tid_base_stuff[] = { LNK("root", root), LNK("exe", exe), REG("mounts", S_IRUGO, mounts), + REG("mountinfo", S_IRUGO, mountinfo), #ifdef CONFIG_PROC_PAGE_MONITOR REG("clear_refs", S_IWUSR, clear_refs), REG("smaps", S_IRUGO, smaps), diff --git a/fs/proc/generic.c b/fs/proc/generic.c index a36ad3c75cf..43e54e86cef 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -69,12 +69,7 @@ proc_file_read(struct file *file, char __user *buf, size_t nbytes, count = min_t(size_t, PROC_BLOCK_SIZE, nbytes); start = NULL; - if (dp->get_info) { - /* Handle old net routines */ - n = dp->get_info(page, &start, *ppos, count); - if (n < count) - eof = 1; - } else if (dp->read_proc) { + if (dp->read_proc) { /* * How to be a proc read function * ------------------------------ @@ -277,8 +272,11 @@ static int xlate_proc_name(const char *name, int len; int rtn = 0; + de = *ret; + if (!de) + de = &proc_root; + spin_lock(&proc_subdir_lock); - de = &proc_root; while (1) { next = strchr(cp, '/'); if (!next) @@ -385,20 +383,18 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir, lock_kernel(); spin_lock(&proc_subdir_lock); - if (de) { - for (de = de->subdir; de ; de = de->next) { - if (de->namelen != dentry->d_name.len) - continue; - if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { - unsigned int ino; + for (de = de->subdir; de ; de = de->next) { + if (de->namelen != dentry->d_name.len) + continue; + if (!memcmp(dentry->d_name.name, de->name, de->namelen)) { + unsigned int ino; - ino = de->low_ino; - de_get(de); - spin_unlock(&proc_subdir_lock); - error = -EINVAL; - inode = proc_get_inode(dir->i_sb, ino, de); - goto out_unlock; - } + ino = de->low_ino; + de_get(de); + spin_unlock(&proc_subdir_lock); + error = -EINVAL; + inode = proc_get_inode(dir->i_sb, ino, de); + goto out_unlock; } } spin_unlock(&proc_subdir_lock); @@ -410,7 +406,8 @@ out_unlock: d_add(dentry, inode); return NULL; } - de_put(de); + if (de) + de_put(de); return ERR_PTR(error); } @@ -440,10 +437,6 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent, lock_kernel(); ino = inode->i_ino; - if (!de) { - ret = -EINVAL; - goto out; - } i = filp->f_pos; switch (i) { case 0: @@ -582,7 +575,7 @@ static struct proc_dir_entry *__proc_create(struct proc_dir_entry **parent, /* make sure name is valid */ if (!name || !strlen(name)) goto out; - if (!(*parent) && xlate_proc_name(name, parent, &fn) != 0) + if (xlate_proc_name(name, parent, &fn) != 0) goto out; /* At this point there must not be any '/' characters beyond *fn */ @@ -648,6 +641,23 @@ struct proc_dir_entry *proc_mkdir_mode(const char *name, mode_t mode, return ent; } +struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name, + struct proc_dir_entry *parent) +{ + struct proc_dir_entry *ent; + + ent = __proc_create(&parent, name, S_IFDIR | S_IRUGO | S_IXUGO, 2); + if (ent) { + ent->data = net; + if (proc_register(parent, ent) < 0) { + kfree(ent); + ent = NULL; + } + } + return ent; +} +EXPORT_SYMBOL_GPL(proc_net_mkdir); + struct proc_dir_entry *proc_mkdir(const char *name, struct proc_dir_entry *parent) { @@ -682,9 +692,10 @@ struct proc_dir_entry *create_proc_entry(const char *name, mode_t mode, return ent; } -struct proc_dir_entry *proc_create(const char *name, mode_t mode, - struct proc_dir_entry *parent, - const struct file_operations *proc_fops) +struct proc_dir_entry *proc_create_data(const char *name, mode_t mode, + struct proc_dir_entry *parent, + const struct file_operations *proc_fops, + void *data) { struct proc_dir_entry *pde; nlink_t nlink; @@ -705,6 +716,7 @@ struct proc_dir_entry *proc_create(const char *name, mode_t mode, if (!pde) goto out; pde->proc_fops = proc_fops; + pde->data = data; if (proc_register(parent, pde) < 0) goto out_free; return pde; @@ -734,55 +746,58 @@ void free_proc_entry(struct proc_dir_entry *de) void remove_proc_entry(const char *name, struct proc_dir_entry *parent) { struct proc_dir_entry **p; - struct proc_dir_entry *de; + struct proc_dir_entry *de = NULL; const char *fn = name; int len; - if (!parent && xlate_proc_name(name, &parent, &fn) != 0) - goto out; + if (xlate_proc_name(name, &parent, &fn) != 0) + return; len = strlen(fn); spin_lock(&proc_subdir_lock); for (p = &parent->subdir; *p; p=&(*p)->next ) { - if (!proc_match(len, fn, *p)) - continue; - de = *p; - *p = de->next; - de->next = NULL; - - spin_lock(&de->pde_unload_lock); - /* - * Stop accepting new callers into module. If you're - * dynamically allocating ->proc_fops, save a pointer somewhere. - */ - de->proc_fops = NULL; - /* Wait until all existing callers into module are done. */ - if (de->pde_users > 0) { - DECLARE_COMPLETION_ONSTACK(c); - - if (!de->pde_unload_completion) - de->pde_unload_completion = &c; - - spin_unlock(&de->pde_unload_lock); - spin_unlock(&proc_subdir_lock); + if (proc_match(len, fn, *p)) { + de = *p; + *p = de->next; + de->next = NULL; + break; + } + } + spin_unlock(&proc_subdir_lock); + if (!de) + return; - wait_for_completion(de->pde_unload_completion); + spin_lock(&de->pde_unload_lock); + /* + * Stop accepting new callers into module. If you're + * dynamically allocating ->proc_fops, save a pointer somewhere. + */ + de->proc_fops = NULL; + /* Wait until all existing callers into module are done. */ + if (de->pde_users > 0) { + DECLARE_COMPLETION_ONSTACK(c); + + if (!de->pde_unload_completion) + de->pde_unload_completion = &c; - spin_lock(&proc_subdir_lock); - goto continue_removing; - } spin_unlock(&de->pde_unload_lock); + wait_for_completion(de->pde_unload_completion); + + goto continue_removing; + } + spin_unlock(&de->pde_unload_lock); + continue_removing: - if (S_ISDIR(de->mode)) - parent->nlink--; - de->nlink = 0; - WARN_ON(de->subdir); - if (atomic_dec_and_test(&de->count)) - free_proc_entry(de); - break; + if (S_ISDIR(de->mode)) + parent->nlink--; + de->nlink = 0; + if (de->subdir) { + printk(KERN_WARNING "%s: removing non-empty directory " + "'%s/%s', leaking at least '%s'\n", __func__, + de->parent->name, de->name, de->subdir->name); + WARN_ON(1); } - spin_unlock(&proc_subdir_lock); -out: - return; + if (atomic_dec_and_test(&de->count)) + free_proc_entry(de); } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 82b3a1b5a70..6f4e8dc97da 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -25,8 +25,7 @@ struct proc_dir_entry *de_get(struct proc_dir_entry *de) { - if (de) - atomic_inc(&de->count); + atomic_inc(&de->count); return de; } @@ -35,18 +34,16 @@ struct proc_dir_entry *de_get(struct proc_dir_entry *de) */ void de_put(struct proc_dir_entry *de) { - if (de) { - lock_kernel(); - if (!atomic_read(&de->count)) { - printk("de_put: entry %s already free!\n", de->name); - unlock_kernel(); - return; - } - - if (atomic_dec_and_test(&de->count)) - free_proc_entry(de); + lock_kernel(); + if (!atomic_read(&de->count)) { + printk("de_put: entry %s already free!\n", de->name); unlock_kernel(); + return; } + + if (atomic_dec_and_test(&de->count)) + free_proc_entry(de); + unlock_kernel(); } /* @@ -392,7 +389,7 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, { struct inode * inode; - if (de != NULL && !try_module_get(de->owner)) + if (!try_module_get(de->owner)) goto out_mod; inode = iget_locked(sb, ino); @@ -402,30 +399,29 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; PROC_I(inode)->fd = 0; PROC_I(inode)->pde = de; - if (de) { - if (de->mode) { - inode->i_mode = de->mode; - inode->i_uid = de->uid; - inode->i_gid = de->gid; - } - if (de->size) - inode->i_size = de->size; - if (de->nlink) - inode->i_nlink = de->nlink; - if (de->proc_iops) - inode->i_op = de->proc_iops; - if (de->proc_fops) { - if (S_ISREG(inode->i_mode)) { + + if (de->mode) { + inode->i_mode = de->mode; + inode->i_uid = de->uid; + inode->i_gid = de->gid; + } + if (de->size) + inode->i_size = de->size; + if (de->nlink) + inode->i_nlink = de->nlink; + if (de->proc_iops) + inode->i_op = de->proc_iops; + if (de->proc_fops) { + if (S_ISREG(inode->i_mode)) { #ifdef CONFIG_COMPAT - if (!de->proc_fops->compat_ioctl) - inode->i_fop = - &proc_reg_file_ops_no_compat; - else + if (!de->proc_fops->compat_ioctl) + inode->i_fop = + &proc_reg_file_ops_no_compat; + else #endif - inode->i_fop = &proc_reg_file_ops; - } else { - inode->i_fop = de->proc_fops; - } + inode->i_fop = &proc_reg_file_ops; + } else { + inode->i_fop = de->proc_fops; } } unlock_new_inode(inode); @@ -433,8 +429,7 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, return inode; out_ino: - if (de != NULL) - module_put(de->owner); + module_put(de->owner); out_mod: return NULL; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index bc72f5c8c47..28cbca80590 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -11,6 +11,7 @@ #include <linux/proc_fs.h> +extern struct proc_dir_entry proc_root; #ifdef CONFIG_PROC_SYSCTL extern int proc_sys_init(void); #else @@ -46,9 +47,6 @@ extern int nommu_vma_show(struct seq_file *, struct vm_area_struct *); extern int maps_protect; -extern void create_seq_entry(char *name, mode_t mode, - const struct file_operations *f); -extern int proc_exe_link(struct inode *, struct path *); extern int proc_tid_stat(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task); extern int proc_tgid_stat(struct seq_file *m, struct pid_namespace *ns, diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index 941e95114b5..79ecd281d2c 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -137,7 +137,7 @@ static const struct file_operations proc_nommu_vma_list_operations = { static int __init proc_nommu_init(void) { - create_seq_entry("maps", S_IRUGO, &proc_nommu_vma_list_operations); + proc_create("maps", S_IRUGO, NULL, &proc_nommu_vma_list_operations); return 0; } diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 2d563979cb0..74a323d2b85 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -179,6 +179,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, "PageTables: %8lu kB\n" "NFS_Unstable: %8lu kB\n" "Bounce: %8lu kB\n" + "WritebackTmp: %8lu kB\n" "CommitLimit: %8lu kB\n" "Committed_AS: %8lu kB\n" "VmallocTotal: %8lu kB\n" @@ -210,6 +211,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, K(global_page_state(NR_PAGETABLE)), K(global_page_state(NR_UNSTABLE_NFS)), K(global_page_state(NR_BOUNCE)), + K(global_page_state(NR_WRITEBACK_TEMP)), K(allowed), K(committed), (unsigned long)VMALLOC_TOTAL >> 10, @@ -456,6 +458,20 @@ static const struct file_operations proc_slabstats_operations = { #endif #endif +#ifdef CONFIG_MMU +static int vmalloc_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &vmalloc_op); +} + +static const struct file_operations proc_vmalloc_operations = { + .open = vmalloc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; +#endif + static int show_stat(struct seq_file *p, void *v) { int i; @@ -812,14 +828,6 @@ static struct file_operations proc_kpageflags_operations = { struct proc_dir_entry *proc_root_kcore; -void create_seq_entry(char *name, mode_t mode, const struct file_operations *f) -{ - struct proc_dir_entry *entry; - entry = create_proc_entry(name, mode, NULL); - if (entry) - entry->proc_fops = f; -} - void __init proc_misc_init(void) { static struct { @@ -848,63 +856,52 @@ void __init proc_misc_init(void) /* And now for trickier ones */ #ifdef CONFIG_PRINTK - { - struct proc_dir_entry *entry; - entry = create_proc_entry("kmsg", S_IRUSR, &proc_root); - if (entry) - entry->proc_fops = &proc_kmsg_operations; - } + proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); #endif - create_seq_entry("locks", 0, &proc_locks_operations); - create_seq_entry("devices", 0, &proc_devinfo_operations); - create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations); + proc_create("locks", 0, NULL, &proc_locks_operations); + proc_create("devices", 0, NULL, &proc_devinfo_operations); + proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); #ifdef CONFIG_BLOCK - create_seq_entry("partitions", 0, &proc_partitions_operations); + proc_create("partitions", 0, NULL, &proc_partitions_operations); #endif - create_seq_entry("stat", 0, &proc_stat_operations); - create_seq_entry("interrupts", 0, &proc_interrupts_operations); + proc_create("stat", 0, NULL, &proc_stat_operations); + proc_create("interrupts", 0, NULL, &proc_interrupts_operations); #ifdef CONFIG_SLABINFO - create_seq_entry("slabinfo",S_IWUSR|S_IRUGO,&proc_slabinfo_operations); + proc_create("slabinfo",S_IWUSR|S_IRUGO,NULL,&proc_slabinfo_operations); #ifdef CONFIG_DEBUG_SLAB_LEAK - create_seq_entry("slab_allocators", 0 ,&proc_slabstats_operations); + proc_create("slab_allocators", 0, NULL, &proc_slabstats_operations); +#endif #endif +#ifdef CONFIG_MMU + proc_create("vmallocinfo", S_IRUSR, NULL, &proc_vmalloc_operations); #endif - create_seq_entry("buddyinfo",S_IRUGO, &fragmentation_file_operations); - create_seq_entry("pagetypeinfo", S_IRUGO, &pagetypeinfo_file_ops); - create_seq_entry("vmstat",S_IRUGO, &proc_vmstat_file_operations); - create_seq_entry("zoneinfo",S_IRUGO, &proc_zoneinfo_file_operations); + proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations); + proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops); + proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations); + proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations); #ifdef CONFIG_BLOCK - create_seq_entry("diskstats", 0, &proc_diskstats_operations); + proc_create("diskstats", 0, NULL, &proc_diskstats_operations); #endif #ifdef CONFIG_MODULES - create_seq_entry("modules", 0, &proc_modules_operations); + proc_create("modules", 0, NULL, &proc_modules_operations); #endif #ifdef CONFIG_SCHEDSTATS - create_seq_entry("schedstat", 0, &proc_schedstat_operations); + proc_create("schedstat", 0, NULL, &proc_schedstat_operations); #endif #ifdef CONFIG_PROC_KCORE - proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL); - if (proc_root_kcore) { - proc_root_kcore->proc_fops = &proc_kcore_operations; + proc_root_kcore = proc_create("kcore", S_IRUSR, NULL, &proc_kcore_operations); + if (proc_root_kcore) proc_root_kcore->size = (size_t)high_memory - PAGE_OFFSET + PAGE_SIZE; - } #endif #ifdef CONFIG_PROC_PAGE_MONITOR - create_seq_entry("kpagecount", S_IRUSR, &proc_kpagecount_operations); - create_seq_entry("kpageflags", S_IRUSR, &proc_kpageflags_operations); + proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations); + proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations); #endif #ifdef CONFIG_PROC_VMCORE - proc_vmcore = create_proc_entry("vmcore", S_IRUSR, NULL); - if (proc_vmcore) - proc_vmcore->proc_fops = &proc_vmcore_operations; + proc_vmcore = proc_create("vmcore", S_IRUSR, NULL, &proc_vmcore_operations); #endif #ifdef CONFIG_MAGIC_SYSRQ - { - struct proc_dir_entry *entry; - entry = create_proc_entry("sysrq-trigger", S_IWUSR, NULL); - if (entry) - entry->proc_fops = &proc_sysrq_trigger_operations; - } + proc_create("sysrq-trigger", S_IWUSR, NULL, &proc_sysrq_trigger_operations); #endif } diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 13cd7835d0d..83f357b30d7 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -159,17 +159,6 @@ struct net *get_proc_net(const struct inode *inode) } EXPORT_SYMBOL_GPL(get_proc_net); -struct proc_dir_entry *proc_net_mkdir(struct net *net, const char *name, - struct proc_dir_entry *parent) -{ - struct proc_dir_entry *pde; - pde = proc_mkdir_mode(name, S_IRUGO | S_IXUGO, parent); - if (pde != NULL) - pde->data = net; - return pde; -} -EXPORT_SYMBOL_GPL(proc_net_mkdir); - static __net_init int proc_net_ns_init(struct net *net) { struct proc_dir_entry *netd, *net_statd; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 614c34b6d1c..5acc001d49f 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -165,8 +165,8 @@ out: return err; } -static ssize_t proc_sys_read(struct file *filp, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t proc_sys_call_handler(struct file *filp, void __user *buf, + size_t count, loff_t *ppos, int write) { struct dentry *dentry = filp->f_dentry; struct ctl_table_header *head; @@ -190,12 +190,12 @@ static ssize_t proc_sys_read(struct file *filp, char __user *buf, * and won't be until we finish. */ error = -EPERM; - if (sysctl_perm(table, MAY_READ)) + if (sysctl_perm(head->root, table, write ? MAY_WRITE : MAY_READ)) goto out; /* careful: calling conventions are nasty here */ res = count; - error = table->proc_handler(table, 0, filp, buf, &res, ppos); + error = table->proc_handler(table, write, filp, buf, &res, ppos); if (!error) error = res; out: @@ -204,44 +204,16 @@ out: return error; } -static ssize_t proc_sys_write(struct file *filp, const char __user *buf, +static ssize_t proc_sys_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { - struct dentry *dentry = filp->f_dentry; - struct ctl_table_header *head; - struct ctl_table *table; - ssize_t error; - size_t res; - - table = do_proc_sys_lookup(dentry->d_parent, &dentry->d_name, &head); - /* Has the sysctl entry disappeared on us? */ - error = -ENOENT; - if (!table) - goto out; - - /* Has the sysctl entry been replaced by a directory? */ - error = -EISDIR; - if (!table->proc_handler) - goto out; - - /* - * At this point we know that the sysctl was not unregistered - * and won't be until we finish. - */ - error = -EPERM; - if (sysctl_perm(table, MAY_WRITE)) - goto out; - - /* careful: calling conventions are nasty here */ - res = count; - error = table->proc_handler(table, 1, filp, (char __user *)buf, - &res, ppos); - if (!error) - error = res; -out: - sysctl_head_finish(head); + return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 0); +} - return error; +static ssize_t proc_sys_write(struct file *filp, const char __user *buf, + size_t count, loff_t *ppos) +{ + return proc_sys_call_handler(filp, (void __user *)buf, count, ppos, 1); } @@ -416,7 +388,7 @@ static int proc_sys_permission(struct inode *inode, int mask, struct nameidata * goto out; /* Use the permissions on the sysctl table entry */ - error = sysctl_perm(table, mask); + error = sysctl_perm(head->root, table, mask); out: sysctl_head_finish(head); return error; diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 49816e00b51..21f490f5d65 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -5,7 +5,7 @@ */ #include <asm/uaccess.h> - +#include <linux/module.h> #include <linux/init.h> #include <linux/errno.h> #include <linux/time.h> @@ -136,39 +136,54 @@ static const struct file_operations proc_tty_drivers_operations = { .release = seq_release, }; -/* - * This is the handler for /proc/tty/ldiscs - */ -static int tty_ldiscs_read_proc(char *page, char **start, off_t off, - int count, int *eof, void *data) +static void * tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos) { - int i; - int len = 0; - off_t begin = 0; + return (*pos < NR_LDISCS) ? pos : NULL; +} + +static void * tty_ldiscs_seq_next(struct seq_file *m, void *v, loff_t *pos) +{ + (*pos)++; + return (*pos < NR_LDISCS) ? pos : NULL; +} + +static void tty_ldiscs_seq_stop(struct seq_file *m, void *v) +{ +} + +static int tty_ldiscs_seq_show(struct seq_file *m, void *v) +{ + int i = *(loff_t *)v; struct tty_ldisc *ld; - for (i=0; i < NR_LDISCS; i++) { - ld = tty_ldisc_get(i); - if (ld == NULL) - continue; - len += sprintf(page+len, "%-10s %2d\n", - ld->name ? ld->name : "???", i); - tty_ldisc_put(i); - if (len+begin > off+count) - break; - if (len+begin < off) { - begin += len; - len = 0; - } - } - if (i >= NR_LDISCS) - *eof = 1; - if (off >= len+begin) + ld = tty_ldisc_get(i); + if (ld == NULL) return 0; - *start = page + (off-begin); - return ((count < begin+len-off) ? count : begin+len-off); + seq_printf(m, "%-10s %2d\n", ld->name ? ld->name : "???", i); + tty_ldisc_put(i); + return 0; +} + +static const struct seq_operations tty_ldiscs_seq_ops = { + .start = tty_ldiscs_seq_start, + .next = tty_ldiscs_seq_next, + .stop = tty_ldiscs_seq_stop, + .show = tty_ldiscs_seq_show, +}; + +static int proc_tty_ldiscs_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &tty_ldiscs_seq_ops); } +static const struct file_operations tty_ldiscs_proc_fops = { + .owner = THIS_MODULE, + .open = proc_tty_ldiscs_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + /* * This function is called by tty_register_driver() to handle * registering the driver's /proc handler into /proc/tty/driver/<foo> @@ -177,16 +192,14 @@ void proc_tty_register_driver(struct tty_driver *driver) { struct proc_dir_entry *ent; - if ((!driver->read_proc && !driver->write_proc) || - !driver->driver_name || + if (!driver->ops->read_proc || !driver->driver_name || driver->proc_entry) return; ent = create_proc_entry(driver->driver_name, 0, proc_tty_driver); if (!ent) return; - ent->read_proc = driver->read_proc; - ent->write_proc = driver->write_proc; + ent->read_proc = driver->ops->read_proc; ent->owner = driver->owner; ent->data = driver; @@ -214,7 +227,6 @@ void proc_tty_unregister_driver(struct tty_driver *driver) */ void __init proc_tty_init(void) { - struct proc_dir_entry *entry; if (!proc_mkdir("tty", NULL)) return; proc_tty_ldisc = proc_mkdir("tty/ldisc", NULL); @@ -224,10 +236,7 @@ void __init proc_tty_init(void) * password lengths and inter-keystroke timings during password * entry. */ - proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR | S_IXUSR, NULL); - - create_proc_read_entry("tty/ldiscs", 0, NULL, tty_ldiscs_read_proc, NULL); - entry = create_proc_entry("tty/drivers", 0, NULL); - if (entry) - entry->proc_fops = &proc_tty_drivers_operations; + proc_tty_driver = proc_mkdir_mode("tty/driver", S_IRUSR|S_IXUSR, NULL); + proc_create("tty/ldiscs", 0, NULL, &tty_ldiscs_proc_fops); + proc_create("tty/drivers", 0, NULL, &proc_tty_drivers_operations); } diff --git a/fs/proc/root.c b/fs/proc/root.c index ef0fb57fc9e..95117538a4f 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -22,8 +22,6 @@ #include "internal.h" -struct proc_dir_entry *proc_bus, *proc_root_fs, *proc_root_driver; - static int proc_test_super(struct super_block *sb, void *data) { return sb->s_fs_info == data; @@ -126,8 +124,8 @@ void __init proc_root_init(void) #ifdef CONFIG_SYSVIPC proc_mkdir("sysvipc", NULL); #endif - proc_root_fs = proc_mkdir("fs", NULL); - proc_root_driver = proc_mkdir("driver", NULL); + proc_mkdir("fs", NULL); + proc_mkdir("driver", NULL); proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */ #if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE) /* just give it a mountpoint */ @@ -137,7 +135,7 @@ void __init proc_root_init(void) #ifdef CONFIG_PROC_DEVICETREE proc_device_tree_init(); #endif - proc_bus = proc_mkdir("bus", NULL); + proc_mkdir("bus", NULL); proc_sys_init(); } @@ -232,9 +230,5 @@ void pid_ns_release_proc(struct pid_namespace *ns) EXPORT_SYMBOL(proc_symlink); EXPORT_SYMBOL(proc_mkdir); EXPORT_SYMBOL(create_proc_entry); -EXPORT_SYMBOL(proc_create); +EXPORT_SYMBOL(proc_create_data); EXPORT_SYMBOL(remove_proc_entry); -EXPORT_SYMBOL(proc_root); -EXPORT_SYMBOL(proc_root_fs); -EXPORT_SYMBOL(proc_bus); -EXPORT_SYMBOL(proc_root_driver); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9dfb5ff2420..e2b8e769f51 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -75,40 +75,6 @@ int task_statm(struct mm_struct *mm, int *shared, int *text, return mm->total_vm; } -int proc_exe_link(struct inode *inode, struct path *path) -{ - struct vm_area_struct * vma; - int result = -ENOENT; - struct task_struct *task = get_proc_task(inode); - struct mm_struct * mm = NULL; - - if (task) { - mm = get_task_mm(task); - put_task_struct(task); - } - if (!mm) - goto out; - down_read(&mm->mmap_sem); - - vma = mm->mmap; - while (vma) { - if ((vma->vm_flags & VM_EXECUTABLE) && vma->vm_file) - break; - vma = vma->vm_next; - } - - if (vma) { - *path = vma->vm_file->f_path; - path_get(&vma->vm_file->f_path); - result = 0; - } - - up_read(&mm->mmap_sem); - mmput(mm); -out: - return result; -} - static void pad_len_spaces(struct seq_file *m, int len) { len = 25 + sizeof(void*) * 6 - len; @@ -338,8 +304,7 @@ const struct file_operations proc_maps_operations = { #define PSS_SHIFT 12 #ifdef CONFIG_PROC_PAGE_MONITOR -struct mem_size_stats -{ +struct mem_size_stats { struct vm_area_struct *vma; unsigned long resident; unsigned long shared_clean; @@ -347,6 +312,7 @@ struct mem_size_stats unsigned long private_clean; unsigned long private_dirty; unsigned long referenced; + unsigned long swap; u64 pss; }; @@ -363,6 +329,12 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) { ptent = *pte; + + if (is_swap_pte(ptent)) { + mss->swap += PAGE_SIZE; + continue; + } + if (!pte_present(ptent)) continue; @@ -421,7 +393,8 @@ static int show_smap(struct seq_file *m, void *v) "Shared_Dirty: %8lu kB\n" "Private_Clean: %8lu kB\n" "Private_Dirty: %8lu kB\n" - "Referenced: %8lu kB\n", + "Referenced: %8lu kB\n" + "Swap: %8lu kB\n", (vma->vm_end - vma->vm_start) >> 10, mss.resident >> 10, (unsigned long)(mss.pss >> (10 + PSS_SHIFT)), @@ -429,7 +402,8 @@ static int show_smap(struct seq_file *m, void *v) mss.shared_dirty >> 10, mss.private_clean >> 10, mss.private_dirty >> 10, - mss.referenced >> 10); + mss.referenced >> 10, + mss.swap >> 10); return ret; } @@ -579,7 +553,7 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end, return err; } -u64 swap_pte_to_pagemap_entry(pte_t pte) +static u64 swap_pte_to_pagemap_entry(pte_t pte) { swp_entry_t e = pte_to_swp_entry(pte); return swp_type(e) | (swp_offset(e) << MAX_SWAPFILES_SHIFT); diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 8011528518b..4b4f9cc2f18 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -1,6 +1,7 @@ #include <linux/mm.h> #include <linux/file.h> +#include <linux/fdtable.h> #include <linux/mount.h> #include <linux/ptrace.h> #include <linux/seq_file.h> @@ -103,40 +104,6 @@ int task_statm(struct mm_struct *mm, int *shared, int *text, return size; } -int proc_exe_link(struct inode *inode, struct path *path) -{ - struct vm_list_struct *vml; - struct vm_area_struct *vma; - struct task_struct *task = get_proc_task(inode); - struct mm_struct *mm = get_task_mm(task); - int result = -ENOENT; - - if (!mm) - goto out; - down_read(&mm->mmap_sem); - - vml = mm->context.vmlist; - vma = NULL; - while (vml) { - if ((vml->vma->vm_flags & VM_EXECUTABLE) && vml->vma->vm_file) { - vma = vml->vma; - break; - } - vml = vml->next; - } - - if (vma) { - *path = vma->vm_file->f_path; - path_get(&vma->vm_file->f_path); - result = 0; - } - - up_read(&mm->mmap_sem); - mmput(mm); -out: - return result; -} - /* * display mapping lines for a particular process's /proc/pid/maps */ diff --git a/fs/quota.c b/fs/quota.c index 84f28dd7211..db1cc9f3c7a 100644 --- a/fs/quota.c +++ b/fs/quota.c @@ -69,7 +69,6 @@ static int generic_quotactl_valid(struct super_block *sb, int type, int cmd, qid switch (cmd) { case Q_GETFMT: case Q_GETINFO: - case Q_QUOTAOFF: case Q_SETINFO: case Q_SETQUOTA: case Q_GETQUOTA: @@ -229,12 +228,12 @@ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, void if (IS_ERR(pathname = getname(addr))) return PTR_ERR(pathname); - ret = sb->s_qcop->quota_on(sb, type, id, pathname); + ret = sb->s_qcop->quota_on(sb, type, id, pathname, 0); putname(pathname); return ret; } case Q_QUOTAOFF: - return sb->s_qcop->quota_off(sb, type); + return sb->s_qcop->quota_off(sb, type, 0); case Q_GETFMT: { __u32 fmt; diff --git a/fs/quota_v1.c b/fs/quota_v1.c index f3841f23306..a6cf9269105 100644 --- a/fs/quota_v1.c +++ b/fs/quota_v1.c @@ -139,6 +139,9 @@ static int v1_read_file_info(struct super_block *sb, int type) goto out; } ret = 0; + /* limits are stored as unsigned 32-bit data */ + dqopt->info[type].dqi_maxblimit = 0xffffffff; + dqopt->info[type].dqi_maxilimit = 0xffffffff; dqopt->info[type].dqi_igrace = dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME; dqopt->info[type].dqi_bgrace = dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME; out: diff --git a/fs/quota_v2.c b/fs/quota_v2.c index c519a583e68..234ada90363 100644 --- a/fs/quota_v2.c +++ b/fs/quota_v2.c @@ -59,6 +59,9 @@ static int v2_read_file_info(struct super_block *sb, int type) sb->s_id); return -1; } + /* limits are stored as unsigned 32-bit data */ + info->dqi_maxblimit = 0xffffffff; + info->dqi_maxilimit = 0xffffffff; info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); info->dqi_flags = le32_to_cpu(dinfo.dqi_flags); @@ -303,7 +306,7 @@ static uint find_free_dqentry(struct dquot *dquot, int *err) printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk); goto out_buf; } - dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries)+1); + le16_add_cpu(&dh->dqdh_entries, 1); memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk)); /* Find free structure in block */ for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++); @@ -445,7 +448,7 @@ static int free_dqentry(struct dquot *dquot, uint blk) goto out_buf; } dh = (struct v2_disk_dqdbheader *)buf; - dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries)-1); + le16_add_cpu(&dh->dqdh_entries, -1); if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */ if ((ret = remove_free_dqentry(sb, type, buf, blk)) < 0 || (ret = put_free_dqblk(sb, type, buf, blk)) < 0) { diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index b41a514b097..9590b902430 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -26,6 +26,9 @@ #include <linux/fs.h> #include <linux/mm.h> +#include <linux/ramfs.h> + +#include "internal.h" const struct address_space_operations ramfs_aops = { .readpage = simple_readpage, diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 8428d5b2711..b13123424e4 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -44,7 +44,7 @@ static const struct inode_operations ramfs_dir_inode_operations; static struct backing_dev_info ramfs_backing_dev_info = { .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK | + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_MAP_DIRECT | BDI_CAP_MAP_COPY | BDI_CAP_READ_MAP | BDI_CAP_WRITE_MAP | BDI_CAP_EXEC_MAP, }; diff --git a/fs/ramfs/internal.h b/fs/ramfs/internal.h index af7cc074a47..6b330639b51 100644 --- a/fs/ramfs/internal.h +++ b/fs/ramfs/internal.h @@ -11,5 +11,4 @@ extern const struct address_space_operations ramfs_aops; -extern const struct file_operations ramfs_file_operations; extern const struct inode_operations ramfs_file_inode_operations; diff --git a/fs/reiserfs/bitmap.c b/fs/reiserfs/bitmap.c index f491ceb5af0..4646caa6045 100644 --- a/fs/reiserfs/bitmap.c +++ b/fs/reiserfs/bitmap.c @@ -479,7 +479,7 @@ static void __discard_prealloc(struct reiserfs_transaction_handle *th, if (ei->i_prealloc_count < 0) reiserfs_warning(th->t_super, "zam-4001:%s: inode has negative prealloc blocks count.", - __FUNCTION__); + __func__); #endif while (ei->i_prealloc_count > 0) { reiserfs_free_prealloc_block(th, inode, ei->i_prealloc_block); @@ -517,7 +517,7 @@ void reiserfs_discard_all_prealloc(struct reiserfs_transaction_handle *th) if (!ei->i_prealloc_count) { reiserfs_warning(th->t_super, "zam-4001:%s: inode is in prealloc list but has no preallocated blocks.", - __FUNCTION__); + __func__); } #endif __discard_prealloc(th, ei); @@ -632,7 +632,7 @@ int reiserfs_parse_alloc_options(struct super_block *s, char *options) } reiserfs_warning(s, "zam-4001: %s : unknown option - %s", - __FUNCTION__, this_char); + __func__, this_char); return 1; } @@ -1254,7 +1254,7 @@ struct buffer_head *reiserfs_read_bitmap_block(struct super_block *sb, bh = sb_bread(sb, block); if (bh == NULL) reiserfs_warning(sb, "sh-2029: %s: bitmap block (#%u) " - "reading failed", __FUNCTION__, block); + "reading failed", __func__, block); else { if (buffer_locked(bh)) { PROC_INFO_INC(sb, scan_bitmap.wait); diff --git a/fs/reiserfs/do_balan.c b/fs/reiserfs/do_balan.c index 7ee4208793b..2f87f5b1463 100644 --- a/fs/reiserfs/do_balan.c +++ b/fs/reiserfs/do_balan.c @@ -1464,29 +1464,29 @@ static int balance_leaf(struct tree_balance *tb, struct item_head *ih, /* item h } else /* item falls wholly into S_new[i] */ { - int ret_val; + int leaf_mi; struct item_head *pasted; #ifdef CONFIG_REISERFS_CHECK - struct item_head *ih = + struct item_head *ih_check = B_N_PITEM_HEAD(tbS0, item_pos); - if (!is_direntry_le_ih(ih) - && (pos_in_item != ih_item_len(ih) + if (!is_direntry_le_ih(ih_check) + && (pos_in_item != ih_item_len(ih_check) || tb->insert_size[0] <= 0)) reiserfs_panic(tb->tb_sb, "PAP-12235: balance_leaf: pos_in_item must be equal to ih_item_len"); #endif /* CONFIG_REISERFS_CHECK */ - ret_val = + leaf_mi = leaf_move_items(LEAF_FROM_S_TO_SNEW, tb, snum[i], sbytes[i], S_new[i]); - RFALSE(ret_val, + RFALSE(leaf_mi, "PAP-12240: unexpected value returned by leaf_move_items (%d)", - ret_val); + leaf_mi); /* paste into item */ bi.tb = tb; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 74363a7aacb..830332021ed 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -12,8 +12,6 @@ #include <linux/smp_lock.h> #include <linux/compat.h> -static int reiserfs_unpack(struct inode *inode, struct file *filp); - /* ** reiserfs_ioctl - handler for ioctl for inode ** supported commands: @@ -159,7 +157,7 @@ int reiserfs_prepare_write(struct file *f, struct page *page, ** Function try to convert tail from direct item into indirect. ** It set up nopack attribute in the REISERFS_I(inode)->nopack */ -static int reiserfs_unpack(struct inode *inode, struct file *filp) +int reiserfs_unpack(struct inode *inode, struct file *filp) { int retval = 0; int index; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 060eb3f598e..e396b2fa474 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1187,7 +1187,7 @@ static int flush_commit_list(struct super_block *s, if (retval) reiserfs_abort(s, retval, "Journal write error in %s", - __FUNCTION__); + __func__); put_fs_excl(); return retval; } @@ -1534,7 +1534,7 @@ static int flush_journal_list(struct super_block *s, reiserfs_warning(s, "clm-2082: Unable to flush buffer %llu in %s", (unsigned long long)saved_bh-> - b_blocknr, __FUNCTION__); + b_blocknr, __func__); } free_cnode: last = cn; @@ -1586,7 +1586,7 @@ static int flush_journal_list(struct super_block *s, if (err) reiserfs_abort(s, -EIO, "Write error while pushing transaction to disk in %s", - __FUNCTION__); + __func__); flush_older_and_return: /* before we can update the journal header block, we _must_ flush all @@ -1616,7 +1616,7 @@ static int flush_journal_list(struct super_block *s, if (err) reiserfs_abort(s, -EIO, "Write error while updating journal header in %s", - __FUNCTION__); + __func__); } remove_all_from_journal_list(s, jl, 0); list_del_init(&jl->j_list); @@ -2574,11 +2574,9 @@ static int release_journal_dev(struct super_block *super, result = 0; - if (journal->j_dev_file != NULL) { - result = filp_close(journal->j_dev_file, NULL); - journal->j_dev_file = NULL; - journal->j_dev_bd = NULL; - } else if (journal->j_dev_bd != NULL) { + if (journal->j_dev_bd != NULL) { + if (journal->j_dev_bd->bd_dev != super->s_dev) + bd_release(journal->j_dev_bd); result = blkdev_put(journal->j_dev_bd); journal->j_dev_bd = NULL; } @@ -2603,7 +2601,6 @@ static int journal_init_dev(struct super_block *super, result = 0; journal->j_dev_bd = NULL; - journal->j_dev_file = NULL; jdev = SB_ONDISK_JOURNAL_DEVICE(super) ? new_decode_dev(SB_ONDISK_JOURNAL_DEVICE(super)) : super->s_dev; @@ -2620,35 +2617,34 @@ static int journal_init_dev(struct super_block *super, "cannot init journal device '%s': %i", __bdevname(jdev, b), result); return result; - } else if (jdev != super->s_dev) + } else if (jdev != super->s_dev) { + result = bd_claim(journal->j_dev_bd, journal); + if (result) { + blkdev_put(journal->j_dev_bd); + return result; + } + set_blocksize(journal->j_dev_bd, super->s_blocksize); + } + return 0; } - journal->j_dev_file = filp_open(jdev_name, 0, 0); - if (!IS_ERR(journal->j_dev_file)) { - struct inode *jdev_inode = journal->j_dev_file->f_mapping->host; - if (!S_ISBLK(jdev_inode->i_mode)) { - reiserfs_warning(super, "journal_init_dev: '%s' is " - "not a block device", jdev_name); - result = -ENOTBLK; - release_journal_dev(super, journal); - } else { - /* ok */ - journal->j_dev_bd = I_BDEV(jdev_inode); - set_blocksize(journal->j_dev_bd, super->s_blocksize); - reiserfs_info(super, - "journal_init_dev: journal device: %s\n", - bdevname(journal->j_dev_bd, b)); - } - } else { - result = PTR_ERR(journal->j_dev_file); - journal->j_dev_file = NULL; + journal->j_dev_bd = open_bdev_excl(jdev_name, 0, journal); + if (IS_ERR(journal->j_dev_bd)) { + result = PTR_ERR(journal->j_dev_bd); + journal->j_dev_bd = NULL; reiserfs_warning(super, "journal_init_dev: Cannot open '%s': %i", jdev_name, result); + return result; } - return result; + + set_blocksize(journal->j_dev_bd, super->s_blocksize); + reiserfs_info(super, + "journal_init_dev: journal device: %s\n", + bdevname(journal->j_dev_bd, b)); + return 0; } /** @@ -4316,5 +4312,5 @@ static void __reiserfs_journal_abort_soft(struct super_block *sb, int errno) void reiserfs_journal_abort(struct super_block *sb, int errno) { - return __reiserfs_journal_abort_soft(sb, errno); + __reiserfs_journal_abort_soft(sb, errno); } diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index 8867533cb72..c1add28dd45 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -301,7 +301,7 @@ static int reiserfs_find_entry(struct inode *dir, const char *name, int namelen, path_to_entry, de); if (retval == IO_ERROR) { reiserfs_warning(dir->i_sb, "zam-7001: io error in %s", - __FUNCTION__); + __func__); return IO_ERROR; } @@ -496,7 +496,7 @@ static int reiserfs_add_entry(struct reiserfs_transaction_handle *th, reiserfs_warning(dir->i_sb, "zam-7002:%s: \"reiserfs_find_entry\" " "has returned unexpected value (%d)", - __FUNCTION__, retval); + __func__, retval); } return -EEXIST; @@ -907,7 +907,7 @@ static int reiserfs_rmdir(struct inode *dir, struct dentry *dentry) if (inode->i_nlink != 2 && inode->i_nlink != 1) reiserfs_warning(inode->i_sb, "%s: empty directory has nlink " - "!= 2 (%d)", __FUNCTION__, inode->i_nlink); + "!= 2 (%d)", __func__, inode->i_nlink); clear_nlink(inode); inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC; @@ -984,7 +984,7 @@ static int reiserfs_unlink(struct inode *dir, struct dentry *dentry) if (!inode->i_nlink) { reiserfs_warning(inode->i_sb, "%s: deleting nonexistent file " - "(%s:%lu), %d", __FUNCTION__, + "(%s:%lu), %d", __func__, reiserfs_bdevname(inode->i_sb), inode->i_ino, inode->i_nlink); inode->i_nlink = 1; diff --git a/fs/reiserfs/objectid.c b/fs/reiserfs/objectid.c index 65feba4deb6..ea0cf8c28a9 100644 --- a/fs/reiserfs/objectid.c +++ b/fs/reiserfs/objectid.c @@ -61,7 +61,7 @@ __u32 reiserfs_get_unused_objectid(struct reiserfs_transaction_handle *th) /* comment needed -Hans */ unused_objectid = le32_to_cpu(map[1]); if (unused_objectid == U32_MAX) { - reiserfs_warning(s, "%s: no more object ids", __FUNCTION__); + reiserfs_warning(s, "%s: no more object ids", __func__); reiserfs_restore_prepared_buffer(s, SB_BUFFER_WITH_SB(s)); return 0; } @@ -114,7 +114,7 @@ void reiserfs_release_objectid(struct reiserfs_transaction_handle *th, if (objectid_to_release == le32_to_cpu(map[i])) { /* This incrementation unallocates the objectid. */ //map[i]++; - map[i] = cpu_to_le32(le32_to_cpu(map[i]) + 1); + le32_add_cpu(&map[i], 1); /* Did we unallocate the last member of an odd sequence, and can shrink oids? */ if (map[i] == map[i + 1]) { @@ -138,8 +138,7 @@ void reiserfs_release_objectid(struct reiserfs_transaction_handle *th, /* size of objectid map is not changed */ if (objectid_to_release + 1 == le32_to_cpu(map[i + 1])) { //objectid_map[i+1]--; - map[i + 1] = - cpu_to_le32(le32_to_cpu(map[i + 1]) - 1); + le32_add_cpu(&map[i + 1], -1); return; } diff --git a/fs/reiserfs/procfs.c b/fs/reiserfs/procfs.c index 8f86c52b30d..b9dbeeca704 100644 --- a/fs/reiserfs/procfs.c +++ b/fs/reiserfs/procfs.c @@ -467,6 +467,7 @@ static const struct file_operations r_file_operations = { .read = seq_read, .llseek = seq_lseek, .release = seq_release, + .owner = THIS_MODULE, }; static struct proc_dir_entry *proc_info_root = NULL; @@ -475,12 +476,8 @@ static const char proc_info_root_name[] = "fs/reiserfs"; static void add_file(struct super_block *sb, char *name, int (*func) (struct seq_file *, struct super_block *)) { - struct proc_dir_entry *de; - de = create_proc_entry(name, 0, REISERFS_SB(sb)->procdir); - if (de) { - de->data = func; - de->proc_fops = &r_file_operations; - } + proc_create_data(name, 0, REISERFS_SB(sb)->procdir, + &r_file_operations, func); } int reiserfs_proc_info_init(struct super_block *sb) diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index d2db2417b2b..abbc64dcc8d 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -1419,8 +1419,7 @@ int reiserfs_delete_object(struct reiserfs_transaction_handle *th, inode_generation = &REISERFS_SB(th->t_super)->s_rs->s_inode_generation; - *inode_generation = - cpu_to_le32(le32_to_cpu(*inode_generation) + 1); + le32_add_cpu(inode_generation, 1); } /* USE_INODE_GENERATION_COUNTER */ #endif diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 393cc22c171..ed424d708e6 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -304,7 +304,7 @@ static int finish_unfinished(struct super_block *s) /* Turn quotas off */ for (i = 0; i < MAXQUOTAS; i++) { if (sb_dqopt(s)->files[i]) - vfs_quota_off_mount(s, i); + vfs_quota_off(s, i, 0); } if (ms_active_set) /* Restore the flag back */ @@ -634,7 +634,7 @@ static int reiserfs_acquire_dquot(struct dquot *); static int reiserfs_release_dquot(struct dquot *); static int reiserfs_mark_dquot_dirty(struct dquot *); static int reiserfs_write_info(struct super_block *, int); -static int reiserfs_quota_on(struct super_block *, int, int, char *); +static int reiserfs_quota_on(struct super_block *, int, int, char *, int); static struct dquot_operations reiserfs_quota_operations = { .initialize = reiserfs_dquot_initialize, @@ -1890,8 +1890,14 @@ static int reiserfs_dquot_drop(struct inode *inode) ret = journal_begin(&th, inode->i_sb, 2 * REISERFS_QUOTA_DEL_BLOCKS(inode->i_sb)); - if (ret) + if (ret) { + /* + * We call dquot_drop() anyway to at least release references + * to quota structures so that umount does not hang. + */ + dquot_drop(inode); goto out; + } ret = dquot_drop(inode); err = journal_end(&th, inode->i_sb, @@ -2015,13 +2021,17 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, - char *path) + char *path, int remount) { int err; struct nameidata nd; + struct inode *inode; if (!(REISERFS_SB(sb)->s_mount_opt & (1 << REISERFS_QUOTA))) return -EINVAL; + /* No more checks needed? Path and format_id are bogus anyway... */ + if (remount) + return vfs_quota_on(sb, type, format_id, path, 1); err = path_lookup(path, LOOKUP_FOLLOW, &nd); if (err) return err; @@ -2030,18 +2040,24 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, path_put(&nd.path); return -EXDEV; } + inode = nd.path.dentry->d_inode; /* We must not pack tails for quota files on reiserfs for quota IO to work */ - if (!(REISERFS_I(nd.path.dentry->d_inode)->i_flags & i_nopack_mask)) { - reiserfs_warning(sb, - "reiserfs: Quota file must have tail packing disabled."); - path_put(&nd.path); - return -EINVAL; + if (!(REISERFS_I(inode)->i_flags & i_nopack_mask)) { + err = reiserfs_unpack(inode, NULL); + if (err) { + reiserfs_warning(sb, + "reiserfs: Unpacking tail of quota file failed" + " (%d). Cannot turn on quotas.", err); + path_put(&nd.path); + return -EINVAL; + } + mark_inode_dirty(inode); } /* Not journalling quota? No more tests needed... */ if (!REISERFS_SB(sb)->s_qf_names[USRQUOTA] && !REISERFS_SB(sb)->s_qf_names[GRPQUOTA]) { path_put(&nd.path); - return vfs_quota_on(sb, type, format_id, path); + return vfs_quota_on(sb, type, format_id, path, 0); } /* Quotafile not of fs root? */ if (nd.path.dentry->d_parent->d_inode != sb->s_root->d_inode) @@ -2049,7 +2065,7 @@ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, "reiserfs: Quota file not on filesystem root. " "Journalled quota will not work."); path_put(&nd.path); - return vfs_quota_on(sb, type, format_id, path); + return vfs_quota_on(sb, type, format_id, path, 0); } /* Read data from quotafile - avoid pagecache and such because we cannot afford diff --git a/fs/select.c b/fs/select.c index 00f58c5c7e0..8dda969614a 100644 --- a/fs/select.c +++ b/fs/select.c @@ -21,6 +21,7 @@ #include <linux/poll.h> #include <linux/personality.h> /* for STICKY_TIMEOUTS */ #include <linux/file.h> +#include <linux/fdtable.h> #include <linux/fs.h> #include <linux/rcupdate.h> @@ -298,7 +299,7 @@ int do_select(int n, fd_set_bits *fds, s64 *timeout) #define MAX_SELECT_SECONDS \ ((unsigned long) (MAX_SCHEDULE_TIMEOUT / HZ)-1) -static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, +int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s64 *timeout) { fd_set_bits fds; @@ -425,7 +426,7 @@ sticky: return ret; } -#ifdef TIF_RESTORE_SIGMASK +#ifdef HAVE_SET_RESTORE_SIGMASK asmlinkage long sys_pselect7(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, struct timespec __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize) @@ -498,7 +499,7 @@ sticky: if (sigmask) { memcpy(¤t->saved_sigmask, &sigsaved, sizeof(sigsaved)); - set_thread_flag(TIF_RESTORE_SIGMASK); + set_restore_sigmask(); } } else if (sigmask) sigprocmask(SIG_SETMASK, &sigsaved, NULL); @@ -528,7 +529,7 @@ asmlinkage long sys_pselect6(int n, fd_set __user *inp, fd_set __user *outp, return sys_pselect7(n, inp, outp, exp, tsp, up, sigsetsize); } -#endif /* TIF_RESTORE_SIGMASK */ +#endif /* HAVE_SET_RESTORE_SIGMASK */ struct poll_list { struct poll_list *next; @@ -759,7 +760,7 @@ asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds, return ret; } -#ifdef TIF_RESTORE_SIGMASK +#ifdef HAVE_SET_RESTORE_SIGMASK asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, struct timespec __user *tsp, const sigset_t __user *sigmask, size_t sigsetsize) @@ -805,7 +806,7 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, if (sigmask) { memcpy(¤t->saved_sigmask, &sigsaved, sizeof(sigsaved)); - set_thread_flag(TIF_RESTORE_SIGMASK); + set_restore_sigmask(); } ret = -ERESTARTNOHAND; } else if (sigmask) @@ -839,4 +840,4 @@ asmlinkage long sys_ppoll(struct pollfd __user *ufds, unsigned int nfds, return ret; } -#endif /* TIF_RESTORE_SIGMASK */ +#endif /* HAVE_SET_RESTORE_SIGMASK */ diff --git a/fs/seq_file.c b/fs/seq_file.c index cdfd996ca6e..3f54dbd6c49 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -350,28 +350,40 @@ int seq_printf(struct seq_file *m, const char *f, ...) } EXPORT_SYMBOL(seq_printf); +static char *mangle_path(char *s, char *p, char *esc) +{ + while (s <= p) { + char c = *p++; + if (!c) { + return s; + } else if (!strchr(esc, c)) { + *s++ = c; + } else if (s + 4 > p) { + break; + } else { + *s++ = '\\'; + *s++ = '0' + ((c & 0300) >> 6); + *s++ = '0' + ((c & 070) >> 3); + *s++ = '0' + (c & 07); + } + } + return NULL; +} + +/* + * return the absolute path of 'dentry' residing in mount 'mnt'. + */ int seq_path(struct seq_file *m, struct path *path, char *esc) { if (m->count < m->size) { char *s = m->buf + m->count; char *p = d_path(path, s, m->size - m->count); if (!IS_ERR(p)) { - while (s <= p) { - char c = *p++; - if (!c) { - p = m->buf + m->count; - m->count = s - m->buf; - return s - p; - } else if (!strchr(esc, c)) { - *s++ = c; - } else if (s + 4 > p) { - break; - } else { - *s++ = '\\'; - *s++ = '0' + ((c & 0300) >> 6); - *s++ = '0' + ((c & 070) >> 3); - *s++ = '0' + (c & 07); - } + s = mangle_path(s, p, esc); + if (s) { + p = m->buf + m->count; + m->count = s - m->buf; + return s - p; } } } @@ -380,6 +392,57 @@ int seq_path(struct seq_file *m, struct path *path, char *esc) } EXPORT_SYMBOL(seq_path); +/* + * Same as seq_path, but relative to supplied root. + * + * root may be changed, see __d_path(). + */ +int seq_path_root(struct seq_file *m, struct path *path, struct path *root, + char *esc) +{ + int err = -ENAMETOOLONG; + if (m->count < m->size) { + char *s = m->buf + m->count; + char *p; + + spin_lock(&dcache_lock); + p = __d_path(path, root, s, m->size - m->count); + spin_unlock(&dcache_lock); + err = PTR_ERR(p); + if (!IS_ERR(p)) { + s = mangle_path(s, p, esc); + if (s) { + p = m->buf + m->count; + m->count = s - m->buf; + return 0; + } + } + } + m->count = m->size; + return err; +} + +/* + * returns the path of the 'dentry' from the root of its filesystem. + */ +int seq_dentry(struct seq_file *m, struct dentry *dentry, char *esc) +{ + if (m->count < m->size) { + char *s = m->buf + m->count; + char *p = dentry_path(dentry, s, m->size - m->count); + if (!IS_ERR(p)) { + s = mangle_path(s, p, esc); + if (s) { + p = m->buf + m->count; + m->count = s - m->buf; + return s - p; + } + } + } + m->count = m->size; + return -1; +} + static void *single_start(struct seq_file *p, loff_t *pos) { return NULL + (*pos == 0); diff --git a/fs/signalfd.c b/fs/signalfd.c index 8ead0db3593..619725644c7 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -207,11 +207,8 @@ static const struct file_operations signalfd_fops = { asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask) { - int error; sigset_t sigmask; struct signalfd_ctx *ctx; - struct file *file; - struct inode *inode; if (sizemask != sizeof(sigset_t) || copy_from_user(&sigmask, user_mask, sizeof(sigmask))) @@ -230,12 +227,11 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas * When we call this, the initialization must be complete, since * anon_inode_getfd() will install the fd. */ - error = anon_inode_getfd(&ufd, &inode, &file, "[signalfd]", - &signalfd_fops, ctx); - if (error) - goto err_fdalloc; + ufd = anon_inode_getfd("[signalfd]", &signalfd_fops, ctx); + if (ufd < 0) + kfree(ctx); } else { - file = fget(ufd); + struct file *file = fget(ufd); if (!file) return -EBADF; ctx = file->private_data; @@ -252,9 +248,4 @@ asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemas } return ufd; - -err_fdalloc: - kfree(ctx); - return error; } - diff --git a/fs/smbfs/smb_debug.h b/fs/smbfs/smb_debug.h index 734972b9269..fc4b1a5dd75 100644 --- a/fs/smbfs/smb_debug.h +++ b/fs/smbfs/smb_debug.h @@ -11,14 +11,14 @@ * these are normally enabled. */ #ifdef SMBFS_PARANOIA -# define PARANOIA(f, a...) printk(KERN_NOTICE "%s: " f, __FUNCTION__ , ## a) +# define PARANOIA(f, a...) printk(KERN_NOTICE "%s: " f, __func__ , ## a) #else # define PARANOIA(f, a...) do { ; } while(0) #endif /* lots of debug messages */ #ifdef SMBFS_DEBUG_VERBOSE -# define VERBOSE(f, a...) printk(KERN_DEBUG "%s: " f, __FUNCTION__ , ## a) +# define VERBOSE(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a) #else # define VERBOSE(f, a...) do { ; } while(0) #endif @@ -28,7 +28,7 @@ * too common name. */ #ifdef SMBFS_DEBUG -#define DEBUG1(f, a...) printk(KERN_DEBUG "%s: " f, __FUNCTION__ , ## a) +#define DEBUG1(f, a...) printk(KERN_DEBUG "%s: " f, __func__ , ## a) #else #define DEBUG1(f, a...) do { ; } while(0) #endif diff --git a/fs/splice.c b/fs/splice.c index eeb1a86a701..633f58ebfb7 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1075,7 +1075,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, ret = splice_direct_to_actor(in, &sd, direct_splice_actor); if (ret > 0) - *ppos += ret; + *ppos = sd.pos; return ret; } diff --git a/fs/super.c b/fs/super.c index 4798350b2bc..453877c5697 100644 --- a/fs/super.c +++ b/fs/super.c @@ -117,7 +117,7 @@ static inline void destroy_super(struct super_block *s) * Drop a superblock's refcount. Returns non-zero if the superblock was * destroyed. The caller must hold sb_lock. */ -int __put_super(struct super_block *sb) +static int __put_super(struct super_block *sb) { int ret = 0; @@ -179,7 +179,7 @@ void deactivate_super(struct super_block *s) if (atomic_dec_and_lock(&s->s_active, &sb_lock)) { s->s_count -= S_BIAS-1; spin_unlock(&sb_lock); - DQUOT_OFF(s); + DQUOT_OFF(s, 0); down_write(&s->s_umount); fs->kill_sb(s); put_filesystem(fs); @@ -608,6 +608,7 @@ retry: int do_remount_sb(struct super_block *sb, int flags, void *data, int force) { int retval; + int remount_rw; #ifdef CONFIG_BLOCK if (!(flags & MS_RDONLY) && bdev_read_only(sb->s_bdev)) @@ -625,8 +626,11 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) mark_files_ro(sb); else if (!fs_may_remount_ro(sb)) return -EBUSY; - DQUOT_OFF(sb); + retval = DQUOT_OFF(sb, 1); + if (retval < 0 && retval != -ENOSYS) + return -EBUSY; } + remount_rw = !(flags & MS_RDONLY) && (sb->s_flags & MS_RDONLY); if (sb->s_op->remount_fs) { lock_super(sb); @@ -636,6 +640,8 @@ int do_remount_sb(struct super_block *sb, int flags, void *data, int force) return retval; } sb->s_flags = (sb->s_flags & ~MS_RMT_MASK) | (flags & MS_RMT_MASK); + if (remount_rw) + DQUOT_ON_REMOUNT(sb); return 0; } diff --git a/fs/sync.c b/fs/sync.c index 7cd005ea763..228e17b5e9e 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -64,7 +64,7 @@ int file_fsync(struct file *filp, struct dentry *dentry, int datasync) /* sync the superblock to buffers */ sb = inode->i_sb; lock_super(sb); - if (sb->s_op->write_super) + if (sb->s_dirt && sb->s_op->write_super) sb->s_op->write_super(sb); unlock_super(sb); diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index ade9a7e6a75..e7735f643cd 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -135,7 +135,7 @@ sysfs_read_file(struct file *file, char __user *buf, size_t count, loff_t *ppos) goto out; } pr_debug("%s: count = %zd, ppos = %lld, buf = %s\n", - __FUNCTION__, count, *ppos, buffer->page); + __func__, count, *ppos, buffer->page); retval = simple_read_from_buffer(buf, count, ppos, buffer->page, buffer->count); out: @@ -477,11 +477,10 @@ const struct file_operations sysfs_file_operations = { .poll = sysfs_poll, }; - -int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, - int type) +int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, + const struct attribute *attr, int type, mode_t amode) { - umode_t mode = (attr->mode & S_IALLUGO) | S_IFREG; + umode_t mode = (amode & S_IALLUGO) | S_IFREG; struct sysfs_addrm_cxt acxt; struct sysfs_dirent *sd; int rc; @@ -502,6 +501,13 @@ int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, } +int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, + int type) +{ + return sysfs_add_file_mode(dir_sd, attr, type, attr->mode); +} + + /** * sysfs_create_file - create an attribute file for an object. * @kobj: object we're creating for. diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 47790491503..eeba38417b1 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -23,35 +23,50 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, int i; for (i = 0, attr = grp->attrs; *attr; i++, attr++) - if (!grp->is_visible || - grp->is_visible(kobj, *attr, i)) - sysfs_hash_and_remove(dir_sd, (*attr)->name); + sysfs_hash_and_remove(dir_sd, (*attr)->name); } static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj, - const struct attribute_group *grp) + const struct attribute_group *grp, int update) { struct attribute *const* attr; int error = 0, i; - for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) - if (!grp->is_visible || - grp->is_visible(kobj, *attr, i)) - error |= - sysfs_add_file(dir_sd, *attr, SYSFS_KOBJ_ATTR); + for (i = 0, attr = grp->attrs; *attr && !error; i++, attr++) { + mode_t mode = 0; + + /* in update mode, we're changing the permissions or + * visibility. Do this by first removing then + * re-adding (if required) the file */ + if (update) + sysfs_hash_and_remove(dir_sd, (*attr)->name); + if (grp->is_visible) { + mode = grp->is_visible(kobj, *attr, i); + if (!mode) + continue; + } + error = sysfs_add_file_mode(dir_sd, *attr, SYSFS_KOBJ_ATTR, + (*attr)->mode | mode); + if (unlikely(error)) + break; + } if (error) remove_files(dir_sd, kobj, grp); return error; } -int sysfs_create_group(struct kobject * kobj, - const struct attribute_group * grp) +static int internal_create_group(struct kobject *kobj, int update, + const struct attribute_group *grp) { struct sysfs_dirent *sd; int error; - BUG_ON(!kobj || !kobj->sd); + BUG_ON(!kobj || (!update && !kobj->sd)); + + /* Updates may happen before the object has been instantiated */ + if (unlikely(update && !kobj->sd)) + return -EINVAL; if (grp->name) { error = sysfs_create_subdir(kobj, grp->name, &sd); @@ -60,7 +75,7 @@ int sysfs_create_group(struct kobject * kobj, } else sd = kobj->sd; sysfs_get(sd); - error = create_files(sd, kobj, grp); + error = create_files(sd, kobj, grp, update); if (error) { if (grp->name) sysfs_remove_subdir(sd); @@ -69,6 +84,47 @@ int sysfs_create_group(struct kobject * kobj, return error; } +/** + * sysfs_create_group - given a directory kobject, create an attribute group + * @kobj: The kobject to create the group on + * @grp: The attribute group to create + * + * This function creates a group for the first time. It will explicitly + * warn and error if any of the attribute files being created already exist. + * + * Returns 0 on success or error. + */ +int sysfs_create_group(struct kobject *kobj, + const struct attribute_group *grp) +{ + return internal_create_group(kobj, 0, grp); +} + +/** + * sysfs_update_group - given a directory kobject, create an attribute group + * @kobj: The kobject to create the group on + * @grp: The attribute group to create + * + * This function updates an attribute group. Unlike + * sysfs_create_group(), it will explicitly not warn or error if any + * of the attribute files being created already exist. Furthermore, + * if the visibility of the files has changed through the is_visible() + * callback, it will update the permissions and add or remove the + * relevant files. + * + * The primary use for this function is to call it after making a change + * that affects group visibility. + * + * Returns 0 on success or error. + */ +int sysfs_update_group(struct kobject *kobj, + const struct attribute_group *grp) +{ + return internal_create_group(kobj, 1, grp); +} + + + void sysfs_remove_group(struct kobject * kobj, const struct attribute_group * grp) { @@ -95,4 +151,5 @@ void sysfs_remove_group(struct kobject * kobj, EXPORT_SYMBOL_GPL(sysfs_create_group); +EXPORT_SYMBOL_GPL(sysfs_update_group); EXPORT_SYMBOL_GPL(sysfs_remove_group); diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index d9262f74f94..eb53c632f85 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -30,7 +30,7 @@ static const struct address_space_operations sysfs_aops = { static struct backing_dev_info sysfs_backing_dev_info = { .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, }; static const struct inode_operations sysfs_inode_operations ={ @@ -59,6 +59,8 @@ int sysfs_setattr(struct dentry * dentry, struct iattr * iattr) if (error) return error; + iattr->ia_valid &= ~ATTR_SIZE; /* ignore size changes */ + error = inode_setattr(inode, iattr); if (error) return error; diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c index 74168266cd5..14f0023984d 100644 --- a/fs/sysfs/mount.c +++ b/fs/sysfs/mount.c @@ -61,7 +61,7 @@ static int sysfs_fill_super(struct super_block *sb, void *data, int silent) /* instantiate and link root dentry */ root = d_alloc_root(inode); if (!root) { - pr_debug("%s: could not get root dentry!\n",__FUNCTION__); + pr_debug("%s: could not get root dentry!\n",__func__); iput(inode); return -ENOMEM; } diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index ff17f8da9b4..ce4e15f8aae 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -154,6 +154,8 @@ extern const struct file_operations sysfs_file_operations; int sysfs_add_file(struct sysfs_dirent *dir_sd, const struct attribute *attr, int type); +int sysfs_add_file_mode(struct sysfs_dirent *dir_sd, + const struct attribute *attr, int type, mode_t amode); /* * bin.c */ diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h index 42d51d1c05c..38ebe3f85b3 100644 --- a/fs/sysv/sysv.h +++ b/fs/sysv/sysv.h @@ -217,9 +217,9 @@ static inline __fs32 fs32_add(struct sysv_sb_info *sbi, __fs32 *n, int d) if (sbi->s_bytesex == BYTESEX_PDP) *(__u32*)n = PDP_swab(PDP_swab(*(__u32*)n)+d); else if (sbi->s_bytesex == BYTESEX_LE) - *(__le32*)n = cpu_to_le32(le32_to_cpu(*(__le32*)n)+d); + le32_add_cpu((__le32 *)n, d); else - *(__be32*)n = cpu_to_be32(be32_to_cpu(*(__be32*)n)+d); + be32_add_cpu((__be32 *)n, d); return *n; } @@ -242,9 +242,9 @@ static inline __fs16 cpu_to_fs16(struct sysv_sb_info *sbi, __u16 n) static inline __fs16 fs16_add(struct sysv_sb_info *sbi, __fs16 *n, int d) { if (sbi->s_bytesex != BYTESEX_BE) - *(__le16*)n = cpu_to_le16(le16_to_cpu(*(__le16 *)n)+d); + le16_add_cpu((__le16 *)n, d); else - *(__be16*)n = cpu_to_be16(be16_to_cpu(*(__be16 *)n)+d); + be16_add_cpu((__be16 *)n, d); return *n; } diff --git a/fs/timerfd.c b/fs/timerfd.c index 10c80b59ec4..d87d354ec42 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -20,6 +20,7 @@ #include <linux/hrtimer.h> #include <linux/anon_inodes.h> #include <linux/timerfd.h> +#include <linux/syscalls.h> struct timerfd_ctx { struct hrtimer tmr; @@ -180,10 +181,8 @@ static struct file *timerfd_fget(int fd) asmlinkage long sys_timerfd_create(int clockid, int flags) { - int error, ufd; + int ufd; struct timerfd_ctx *ctx; - struct file *file; - struct inode *inode; if (flags) return -EINVAL; @@ -199,12 +198,9 @@ asmlinkage long sys_timerfd_create(int clockid, int flags) ctx->clockid = clockid; hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); - error = anon_inode_getfd(&ufd, &inode, &file, "[timerfd]", - &timerfd_fops, ctx); - if (error) { + ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx); + if (ufd < 0) kfree(ctx); - return error; - } return ufd; } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index ba5537d4bc1..2b34c8ca6c8 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -890,7 +890,7 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) { kernel_lb_addr eloc; - uint32_t elen; + uint32_t bsize; block = udf_new_block(inode->i_sb, inode, iinfo->i_location.partitionReferenceNum, @@ -903,9 +903,9 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, eloc.logicalBlockNum = block; eloc.partitionReferenceNum = iinfo->i_location.partitionReferenceNum; - elen = inode->i_sb->s_blocksize; - iinfo->i_lenExtents = elen; - udf_add_aext(inode, &epos, eloc, elen, 0); + bsize = inode->i_sb->s_blocksize; + iinfo->i_lenExtents = bsize; + udf_add_aext(inode, &epos, eloc, bsize, 0); brelse(epos.bh); block = udf_get_pblock(inode->i_sb, block, diff --git a/fs/udf/super.c b/fs/udf/super.c index b564fc140fe..9fb18a340fc 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -240,7 +240,7 @@ static int udf_sb_alloc_partition_maps(struct super_block *sb, u32 count) sbi->s_partmaps = kcalloc(count, sizeof(struct udf_part_map), GFP_KERNEL); if (!sbi->s_partmaps) { - udf_error(sb, __FUNCTION__, + udf_error(sb, __func__, "Unable to allocate space for %d partition maps", count); sbi->s_partitions = 0; @@ -1086,7 +1086,7 @@ static struct udf_bitmap *udf_sb_alloc_bitmap(struct super_block *sb, u32 index) bitmap = vmalloc(size); /* TODO: get rid of vmalloc */ if (bitmap == NULL) { - udf_error(sb, __FUNCTION__, + udf_error(sb, __func__, "Unable to allocate space for bitmap " "and %d buffer_head pointers", nr_groups); return NULL; diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 1e7598fb978..0d9ada17373 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -277,7 +277,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, if (!page)/* it was truncated */ continue; if (IS_ERR(page)) {/* or EIO */ - ufs_error(inode->i_sb, __FUNCTION__, + ufs_error(inode->i_sb, __func__, "read of page %llu failed\n", (unsigned long long)index); continue; @@ -308,7 +308,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, ll_rw_block(READ, 1, &bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - ufs_error(inode->i_sb, __FUNCTION__, + ufs_error(inode->i_sb, __func__, "read of block failed\n"); break; } diff --git a/fs/ufs/dir.c b/fs/ufs/dir.c index ef563fc8d72..df0bef18742 100644 --- a/fs/ufs/dir.c +++ b/fs/ufs/dir.c @@ -179,7 +179,7 @@ bad_entry: goto fail; Eend: p = (struct ufs_dir_entry *)(kaddr + offs); - ufs_error(sb, __FUNCTION__, + ufs_error(sb, __func__, "entry in directory #%lu spans the page boundary" "offset=%lu", dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs); @@ -284,7 +284,7 @@ struct ufs_dir_entry *ufs_find_entry(struct inode *dir, struct dentry *dentry, kaddr += ufs_last_byte(dir, n) - reclen; while ((char *) de <= kaddr) { if (de->d_reclen == 0) { - ufs_error(dir->i_sb, __FUNCTION__, + ufs_error(dir->i_sb, __func__, "zero-length directory entry"); ufs_put_page(page); goto out; @@ -356,7 +356,7 @@ int ufs_add_link(struct dentry *dentry, struct inode *inode) goto got_it; } if (de->d_reclen == 0) { - ufs_error(dir->i_sb, __FUNCTION__, + ufs_error(dir->i_sb, __func__, "zero-length directory entry"); err = -EIO; goto out_unlock; @@ -456,7 +456,7 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) struct page *page = ufs_get_page(inode, n); if (IS_ERR(page)) { - ufs_error(sb, __FUNCTION__, + ufs_error(sb, __func__, "bad page in #%lu", inode->i_ino); filp->f_pos += PAGE_CACHE_SIZE - offset; @@ -475,7 +475,7 @@ ufs_readdir(struct file *filp, void *dirent, filldir_t filldir) limit = kaddr + ufs_last_byte(inode, n) - UFS_DIR_REC_LEN(1); for ( ;(char*)de <= limit; de = ufs_next_entry(sb, de)) { if (de->d_reclen == 0) { - ufs_error(sb, __FUNCTION__, + ufs_error(sb, __func__, "zero-length directory entry"); ufs_put_page(page); return -EIO; @@ -536,7 +536,7 @@ int ufs_delete_entry(struct inode *inode, struct ufs_dir_entry *dir, while ((char*)de < (char*)dir) { if (de->d_reclen == 0) { - ufs_error(inode->i_sb, __FUNCTION__, + ufs_error(inode->i_sb, __func__, "zero-length directory entry"); err = -EIO; goto out; @@ -633,7 +633,7 @@ int ufs_empty_dir(struct inode * inode) while ((char *)de <= kaddr) { if (de->d_reclen == 0) { - ufs_error(inode->i_sb, __FUNCTION__, + ufs_error(inode->i_sb, __func__, "zero-length directory entry: " "kaddr=%p, de=%p\n", kaddr, de); goto not_empty; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 5446b888fc8..39f87789856 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -929,7 +929,7 @@ void ufs_delete_inode (struct inode * inode) old_i_size = inode->i_size; inode->i_size = 0; if (inode->i_blocks && ufs_truncate(inode, old_i_size)) - ufs_warning(inode->i_sb, __FUNCTION__, "ufs_truncate failed\n"); + ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); ufs_free_inode (inode); unlock_kernel(); return; diff --git a/fs/ufs/swab.h b/fs/ufs/swab.h index 1683d2bee61..8d974c4fd18 100644 --- a/fs/ufs/swab.h +++ b/fs/ufs/swab.h @@ -40,25 +40,7 @@ cpu_to_fs64(struct super_block *sbp, u64 n) return (__force __fs64)cpu_to_be64(n); } -static __inline u32 -fs64_add(struct super_block *sbp, u32 *n, int d) -{ - if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) - return *n = cpu_to_le64(le64_to_cpu(*n)+d); - else - return *n = cpu_to_be64(be64_to_cpu(*n)+d); -} - -static __inline u32 -fs64_sub(struct super_block *sbp, u32 *n, int d) -{ - if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) - return *n = cpu_to_le64(le64_to_cpu(*n)-d); - else - return *n = cpu_to_be64(be64_to_cpu(*n)-d); -} - -static __inline u32 +static inline u32 fs32_to_cpu(struct super_block *sbp, __fs32 n) { if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) @@ -80,18 +62,18 @@ static inline void fs32_add(struct super_block *sbp, __fs32 *n, int d) { if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) - *(__le32 *)n = cpu_to_le32(le32_to_cpu(*(__le32 *)n)+d); + le32_add_cpu((__le32 *)n, d); else - *(__be32 *)n = cpu_to_be32(be32_to_cpu(*(__be32 *)n)+d); + be32_add_cpu((__be32 *)n, d); } static inline void fs32_sub(struct super_block *sbp, __fs32 *n, int d) { if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) - *(__le32 *)n = cpu_to_le32(le32_to_cpu(*(__le32 *)n)-d); + le32_add_cpu((__le32 *)n, -d); else - *(__be32 *)n = cpu_to_be32(be32_to_cpu(*(__be32 *)n)-d); + be32_add_cpu((__be32 *)n, -d); } static inline u16 @@ -116,18 +98,18 @@ static inline void fs16_add(struct super_block *sbp, __fs16 *n, int d) { if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) - *(__le16 *)n = cpu_to_le16(le16_to_cpu(*(__le16 *)n)+d); + le16_add_cpu((__le16 *)n, d); else - *(__be16 *)n = cpu_to_be16(be16_to_cpu(*(__be16 *)n)+d); + be16_add_cpu((__be16 *)n, d); } static inline void fs16_sub(struct super_block *sbp, __fs16 *n, int d) { if (UFS_SB(sbp)->s_bytesex == BYTESEX_LE) - *(__le16 *)n = cpu_to_le16(le16_to_cpu(*(__le16 *)n)-d); + le16_add_cpu((__le16 *)n, -d); else - *(__be16 *)n = cpu_to_be16(be16_to_cpu(*(__be16 *)n)-d); + be16_add_cpu((__be16 *)n, -d); } #endif /* _UFS_SWAB_H */ diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index fcb9231bb9e..244a1aaa940 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -66,7 +66,7 @@ struct ufs_inode_info { #ifdef CONFIG_UFS_DEBUG # define UFSD(f, a...) { \ printk ("UFSD (%s, %d): %s:", \ - __FILE__, __LINE__, __FUNCTION__); \ + __FILE__, __LINE__, __func__); \ printk (f, ## a); \ } #else diff --git a/fs/utimes.c b/fs/utimes.c index a2bef77dc9c..af059d5cb48 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -40,9 +40,14 @@ asmlinkage long sys_utime(char __user *filename, struct utimbuf __user *times) #endif +static bool nsec_special(long nsec) +{ + return nsec == UTIME_OMIT || nsec == UTIME_NOW; +} + static bool nsec_valid(long nsec) { - if (nsec == UTIME_OMIT || nsec == UTIME_NOW) + if (nsec_special(nsec)) return true; return nsec >= 0 && nsec <= 999999999; @@ -119,7 +124,15 @@ long do_utimes(int dfd, char __user *filename, struct timespec *times, int flags newattrs.ia_mtime.tv_nsec = times[1].tv_nsec; newattrs.ia_valid |= ATTR_MTIME_SET; } - } else { + } + + /* + * If times is NULL or both times are either UTIME_OMIT or + * UTIME_NOW, then need to check permissions, because + * inode_change_ok() won't do it. + */ + if (!times || (nsec_special(times[0].tv_nsec) && + nsec_special(times[1].tv_nsec))) { error = -EACCES; if (IS_IMMUTABLE(inode)) goto mnt_drop_write_and_out; diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c index cd450bea9f1..a3522727ea5 100644 --- a/fs/vfat/namei.c +++ b/fs/vfat/namei.c @@ -176,15 +176,10 @@ static inline int vfat_is_used_badchars(const wchar_t *s, int len) for (i = 0; i < len; i++) if (vfat_bad_char(s[i])) return -EINVAL; - return 0; -} -static int vfat_valid_longname(const unsigned char *name, unsigned int len) -{ - if (name[len - 1] == ' ') + if (s[i - 1] == ' ') /* last character cannot be space */ return -EINVAL; - if (len >= 256) - return -ENAMETOOLONG; + return 0; } @@ -477,7 +472,7 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, if (utf8) { int name_len = strlen(name); - *outlen = utf8_mbstowcs((wchar_t *)outname, name, PAGE_SIZE); + *outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX); /* * We stripped '.'s before and set len appropriately, @@ -485,11 +480,14 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, */ *outlen -= (name_len - len); + if (*outlen > 255) + return -ENAMETOOLONG; + op = &outname[*outlen * sizeof(wchar_t)]; } else { if (nls) { for (i = 0, ip = name, op = outname, *outlen = 0; - i < len && *outlen <= 260; + i < len && *outlen <= 255; *outlen += 1) { if (escape && (*ip == ':')) { @@ -525,18 +523,20 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname, op += 2; } } + if (i < len) + return -ENAMETOOLONG; } else { for (i = 0, ip = name, op = outname, *outlen = 0; - i < len && *outlen <= 260; + i < len && *outlen <= 255; i++, *outlen += 1) { *op++ = *ip++; *op++ = 0; } + if (i < len) + return -ENAMETOOLONG; } } - if (*outlen > 260) - return -ENAMETOOLONG; *longlen = *outlen; if (*outlen % 13) { @@ -565,7 +565,6 @@ static int vfat_build_slots(struct inode *dir, const unsigned char *name, struct fat_mount_options *opts = &sbi->options; struct msdos_dir_slot *ps; struct msdos_dir_entry *de; - unsigned long page; unsigned char cksum, lcase; unsigned char msdos_name[MSDOS_NAME]; wchar_t *uname; @@ -574,15 +573,11 @@ static int vfat_build_slots(struct inode *dir, const unsigned char *name, loff_t offset; *nr_slots = 0; - err = vfat_valid_longname(name, len); - if (err) - return err; - page = __get_free_page(GFP_KERNEL); - if (!page) + uname = __getname(); + if (!uname) return -ENOMEM; - uname = (wchar_t *)page; err = xlate_to_uni(name, len, (unsigned char *)uname, &ulen, &usize, opts->unicode_xlate, opts->utf8, sbi->nls_io); if (err) @@ -634,7 +629,7 @@ shortname: de->starthi = cpu_to_le16(cluster >> 16); de->size = 0; out_free: - free_page(page); + __putname(uname); return err; } @@ -991,7 +986,7 @@ error_inode: if (corrupt < 0) { fat_fs_panic(new_dir->i_sb, "%s: Filesystem corrupted (i_pos %lld)", - __FUNCTION__, sinfo.i_pos); + __func__, sinfo.i_pos); } goto out; } @@ -1003,7 +998,7 @@ static const struct inode_operations vfat_dir_inode_operations = { .mkdir = vfat_mkdir, .rmdir = vfat_rmdir, .rename = vfat_rename, - .setattr = fat_notify_change, + .setattr = fat_setattr, .getattr = fat_getattr, }; diff --git a/fs/xattr.c b/fs/xattr.c index f7062da505d..4706a8b1f49 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -67,7 +67,7 @@ xattr_permission(struct inode *inode, const char *name, int mask) } int -vfs_setxattr(struct dentry *dentry, char *name, void *value, +vfs_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { struct inode *inode = dentry->d_inode; @@ -131,7 +131,7 @@ out_noalloc: EXPORT_SYMBOL_GPL(xattr_getsecurity); ssize_t -vfs_getxattr(struct dentry *dentry, char *name, void *value, size_t size) +vfs_getxattr(struct dentry *dentry, const char *name, void *value, size_t size) { struct inode *inode = dentry->d_inode; int error; @@ -187,7 +187,7 @@ vfs_listxattr(struct dentry *d, char *list, size_t size) EXPORT_SYMBOL_GPL(vfs_listxattr); int -vfs_removexattr(struct dentry *dentry, char *name) +vfs_removexattr(struct dentry *dentry, const char *name) { struct inode *inode = dentry->d_inode; int error; @@ -218,7 +218,7 @@ EXPORT_SYMBOL_GPL(vfs_removexattr); * Extended attribute SET operations */ static long -setxattr(struct dentry *d, char __user *name, void __user *value, +setxattr(struct dentry *d, const char __user *name, const void __user *value, size_t size, int flags) { int error; @@ -252,8 +252,8 @@ setxattr(struct dentry *d, char __user *name, void __user *value, } asmlinkage long -sys_setxattr(char __user *path, char __user *name, void __user *value, - size_t size, int flags) +sys_setxattr(const char __user *path, const char __user *name, + const void __user *value, size_t size, int flags) { struct nameidata nd; int error; @@ -271,8 +271,8 @@ sys_setxattr(char __user *path, char __user *name, void __user *value, } asmlinkage long -sys_lsetxattr(char __user *path, char __user *name, void __user *value, - size_t size, int flags) +sys_lsetxattr(const char __user *path, const char __user *name, + const void __user *value, size_t size, int flags) { struct nameidata nd; int error; @@ -290,7 +290,7 @@ sys_lsetxattr(char __user *path, char __user *name, void __user *value, } asmlinkage long -sys_fsetxattr(int fd, char __user *name, void __user *value, +sys_fsetxattr(int fd, const char __user *name, const void __user *value, size_t size, int flags) { struct file *f; @@ -307,7 +307,6 @@ sys_fsetxattr(int fd, char __user *name, void __user *value, error = setxattr(dentry, name, value, size, flags); mnt_drop_write(f->f_path.mnt); } -out_fput: fput(f); return error; } @@ -316,7 +315,8 @@ out_fput: * Extended attribute GET operations */ static ssize_t -getxattr(struct dentry *d, char __user *name, void __user *value, size_t size) +getxattr(struct dentry *d, const char __user *name, void __user *value, + size_t size) { ssize_t error; void *kvalue = NULL; @@ -350,8 +350,8 @@ getxattr(struct dentry *d, char __user *name, void __user *value, size_t size) } asmlinkage ssize_t -sys_getxattr(char __user *path, char __user *name, void __user *value, - size_t size) +sys_getxattr(const char __user *path, const char __user *name, + void __user *value, size_t size) { struct nameidata nd; ssize_t error; @@ -365,7 +365,7 @@ sys_getxattr(char __user *path, char __user *name, void __user *value, } asmlinkage ssize_t -sys_lgetxattr(char __user *path, char __user *name, void __user *value, +sys_lgetxattr(const char __user *path, const char __user *name, void __user *value, size_t size) { struct nameidata nd; @@ -380,7 +380,7 @@ sys_lgetxattr(char __user *path, char __user *name, void __user *value, } asmlinkage ssize_t -sys_fgetxattr(int fd, char __user *name, void __user *value, size_t size) +sys_fgetxattr(int fd, const char __user *name, void __user *value, size_t size) { struct file *f; ssize_t error = -EBADF; @@ -425,7 +425,7 @@ listxattr(struct dentry *d, char __user *list, size_t size) } asmlinkage ssize_t -sys_listxattr(char __user *path, char __user *list, size_t size) +sys_listxattr(const char __user *path, char __user *list, size_t size) { struct nameidata nd; ssize_t error; @@ -439,7 +439,7 @@ sys_listxattr(char __user *path, char __user *list, size_t size) } asmlinkage ssize_t -sys_llistxattr(char __user *path, char __user *list, size_t size) +sys_llistxattr(const char __user *path, char __user *list, size_t size) { struct nameidata nd; ssize_t error; @@ -471,7 +471,7 @@ sys_flistxattr(int fd, char __user *list, size_t size) * Extended attribute REMOVE operations */ static long -removexattr(struct dentry *d, char __user *name) +removexattr(struct dentry *d, const char __user *name) { int error; char kname[XATTR_NAME_MAX + 1]; @@ -486,7 +486,7 @@ removexattr(struct dentry *d, char __user *name) } asmlinkage long -sys_removexattr(char __user *path, char __user *name) +sys_removexattr(const char __user *path, const char __user *name) { struct nameidata nd; int error; @@ -504,7 +504,7 @@ sys_removexattr(char __user *path, char __user *name) } asmlinkage long -sys_lremovexattr(char __user *path, char __user *name) +sys_lremovexattr(const char __user *path, const char __user *name) { struct nameidata nd; int error; @@ -522,7 +522,7 @@ sys_lremovexattr(char __user *path, char __user *name) } asmlinkage long -sys_fremovexattr(int fd, char __user *name) +sys_fremovexattr(int fd, const char __user *name) { struct file *f; struct dentry *dentry; diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index 524021ff543..3f53dd101f9 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -64,3 +64,16 @@ config XFS_RT See the xfs man page in section 5 for additional information. If unsure, say N. + +config XFS_DEBUG + bool "XFS Debugging support (EXPERIMENTAL)" + depends on XFS_FS && EXPERIMENTAL + help + Say Y here to get an XFS build with many debugging features, + including ASSERT checks, function wrappers around macros, + and extra sanity-checking functions in various code paths. + + Note that the resulting code will be HUGE and SLOW, and probably + not useful unless you are debugging a particular problem. + + Say N unless you are an XFS developer, or you play one on TV. diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h index c110bb00266..ff6a19873e5 100644 --- a/fs/xfs/linux-2.6/mrlock.h +++ b/fs/xfs/linux-2.6/mrlock.h @@ -20,29 +20,24 @@ #include <linux/rwsem.h> -enum { MR_NONE, MR_ACCESS, MR_UPDATE }; - typedef struct { struct rw_semaphore mr_lock; +#ifdef DEBUG int mr_writer; +#endif } mrlock_t; +#ifdef DEBUG #define mrinit(mrp, name) \ do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0) +#else +#define mrinit(mrp, name) \ + do { init_rwsem(&(mrp)->mr_lock); } while (0) +#endif + #define mrlock_init(mrp, t,n,s) mrinit(mrp, n) #define mrfree(mrp) do { } while (0) -static inline void mraccess(mrlock_t *mrp) -{ - down_read(&mrp->mr_lock); -} - -static inline void mrupdate(mrlock_t *mrp) -{ - down_write(&mrp->mr_lock); - mrp->mr_writer = 1; -} - static inline void mraccess_nested(mrlock_t *mrp, int subclass) { down_read_nested(&mrp->mr_lock, subclass); @@ -51,10 +46,11 @@ static inline void mraccess_nested(mrlock_t *mrp, int subclass) static inline void mrupdate_nested(mrlock_t *mrp, int subclass) { down_write_nested(&mrp->mr_lock, subclass); +#ifdef DEBUG mrp->mr_writer = 1; +#endif } - static inline int mrtryaccess(mrlock_t *mrp) { return down_read_trylock(&mrp->mr_lock); @@ -64,39 +60,31 @@ static inline int mrtryupdate(mrlock_t *mrp) { if (!down_write_trylock(&mrp->mr_lock)) return 0; +#ifdef DEBUG mrp->mr_writer = 1; +#endif return 1; } -static inline void mrunlock(mrlock_t *mrp) +static inline void mrunlock_excl(mrlock_t *mrp) { - if (mrp->mr_writer) { - mrp->mr_writer = 0; - up_write(&mrp->mr_lock); - } else { - up_read(&mrp->mr_lock); - } +#ifdef DEBUG + mrp->mr_writer = 0; +#endif + up_write(&mrp->mr_lock); } -static inline void mrdemote(mrlock_t *mrp) +static inline void mrunlock_shared(mrlock_t *mrp) { - mrp->mr_writer = 0; - downgrade_write(&mrp->mr_lock); + up_read(&mrp->mr_lock); } -#ifdef DEBUG -/* - * Debug-only routine, without some platform-specific asm code, we can - * now only answer requests regarding whether we hold the lock for write - * (reader state is outside our visibility, we only track writer state). - * Note: means !ismrlocked would give false positives, so don't do that. - */ -static inline int ismrlocked(mrlock_t *mrp, int type) +static inline void mrdemote(mrlock_t *mrp) { - if (mrp && type == MR_UPDATE) - return mrp->mr_writer; - return 1; -} +#ifdef DEBUG + mrp->mr_writer = 0; #endif + downgrade_write(&mrp->mr_lock); +} #endif /* __XFS_SUPPORT_MRLOCK_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 52f6846101d..5105015a75a 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -886,7 +886,7 @@ int xfs_buf_lock_value( xfs_buf_t *bp) { - return atomic_read(&bp->b_sema.count); + return bp->b_sema.count; } #endif diff --git a/fs/xfs/linux-2.6/xfs_export.c b/fs/xfs/linux-2.6/xfs_export.c index 265f0168ab7..c672b3238b1 100644 --- a/fs/xfs/linux-2.6/xfs_export.c +++ b/fs/xfs/linux-2.6/xfs_export.c @@ -133,7 +133,7 @@ xfs_nfs_get_inode( if (!ip) return ERR_PTR(-EIO); - if (!ip->i_d.di_mode || ip->i_d.di_gen != generation) { + if (ip->i_d.di_gen != generation) { xfs_iput_new(ip, XFS_ILOCK_SHARED); return ERR_PTR(-ENOENT); } diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 05905246434..65e78c13d4a 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -43,9 +43,6 @@ #include <linux/smp_lock.h> static struct vm_operations_struct xfs_file_vm_ops; -#ifdef CONFIG_XFS_DMAPI -static struct vm_operations_struct xfs_dmapi_file_vm_ops; -#endif STATIC_INLINE ssize_t __xfs_file_read( @@ -202,22 +199,6 @@ xfs_file_fsync( (xfs_off_t)0, (xfs_off_t)-1); } -#ifdef CONFIG_XFS_DMAPI -STATIC int -xfs_vm_fault( - struct vm_area_struct *vma, - struct vm_fault *vmf) -{ - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; - bhv_vnode_t *vp = vn_from_inode(inode); - - ASSERT_ALWAYS(vp->v_vfsp->vfs_flag & VFS_DMI); - if (XFS_SEND_MMAP(XFS_VFSTOM(vp->v_vfsp), vma, 0)) - return VM_FAULT_SIGBUS; - return filemap_fault(vma, vmf); -} -#endif /* CONFIG_XFS_DMAPI */ - /* * Unfortunately we can't just use the clean and simple readdir implementation * below, because nfs might call back into ->lookup from the filldir callback @@ -386,11 +367,6 @@ xfs_file_mmap( vma->vm_ops = &xfs_file_vm_ops; vma->vm_flags |= VM_CAN_NONLINEAR; -#ifdef CONFIG_XFS_DMAPI - if (XFS_M(filp->f_path.dentry->d_inode->i_sb)->m_flags & XFS_MOUNT_DMAPI) - vma->vm_ops = &xfs_dmapi_file_vm_ops; -#endif /* CONFIG_XFS_DMAPI */ - file_accessed(filp); return 0; } @@ -437,47 +413,6 @@ xfs_file_ioctl_invis( return error; } -#ifdef CONFIG_XFS_DMAPI -#ifdef HAVE_VMOP_MPROTECT -STATIC int -xfs_vm_mprotect( - struct vm_area_struct *vma, - unsigned int newflags) -{ - struct inode *inode = vma->vm_file->f_path.dentry->d_inode; - struct xfs_mount *mp = XFS_M(inode->i_sb); - int error = 0; - - if (mp->m_flags & XFS_MOUNT_DMAPI) { - if ((vma->vm_flags & VM_MAYSHARE) && - (newflags & VM_WRITE) && !(vma->vm_flags & VM_WRITE)) - error = XFS_SEND_MMAP(mp, vma, VM_WRITE); - } - return error; -} -#endif /* HAVE_VMOP_MPROTECT */ -#endif /* CONFIG_XFS_DMAPI */ - -#ifdef HAVE_FOP_OPEN_EXEC -/* If the user is attempting to execute a file that is offline then - * we have to trigger a DMAPI READ event before the file is marked as busy - * otherwise the invisible I/O will not be able to write to the file to bring - * it back online. - */ -STATIC int -xfs_file_open_exec( - struct inode *inode) -{ - struct xfs_mount *mp = XFS_M(inode->i_sb); - struct xfs_inode *ip = XFS_I(inode); - - if (unlikely(mp->m_flags & XFS_MOUNT_DMAPI) && - DM_EVENT_ENABLED(ip, DM_EVENT_READ)) - return -XFS_SEND_DATA(mp, DM_EVENT_READ, ip, 0, 0, 0, NULL); - return 0; -} -#endif /* HAVE_FOP_OPEN_EXEC */ - /* * mmap()d file has taken write protection fault and is being made * writable. We can set the page state up correctly for a writable @@ -546,13 +481,3 @@ static struct vm_operations_struct xfs_file_vm_ops = { .fault = filemap_fault, .page_mkwrite = xfs_vm_page_mkwrite, }; - -#ifdef CONFIG_XFS_DMAPI -static struct vm_operations_struct xfs_dmapi_file_vm_ops = { - .fault = xfs_vm_fault, - .page_mkwrite = xfs_vm_page_mkwrite, -#ifdef HAVE_VMOP_MPROTECT - .mprotect = xfs_vm_mprotect, -#endif -}; -#endif /* CONFIG_XFS_DMAPI */ diff --git a/fs/xfs/linux-2.6/xfs_ioctl.c b/fs/xfs/linux-2.6/xfs_ioctl.c index 4ddb86b73c6..a42ba9d7115 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl.c +++ b/fs/xfs/linux-2.6/xfs_ioctl.c @@ -238,7 +238,7 @@ xfs_vget_fsop_handlereq( return error; if (ip == NULL) return XFS_ERROR(EIO); - if (ip->i_d.di_mode == 0 || ip->i_d.di_gen != igen) { + if (ip->i_d.di_gen != igen) { xfs_iput_new(ip, XFS_ILOCK_SHARED); return XFS_ERROR(ENOENT); } @@ -505,14 +505,14 @@ xfs_attrmulti_attr_get( { char *kbuf; int error = EFAULT; - + if (*len > XATTR_SIZE_MAX) return EINVAL; kbuf = kmalloc(*len, GFP_KERNEL); if (!kbuf) return ENOMEM; - error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags, NULL); + error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags); if (error) goto out_kfree; @@ -546,7 +546,7 @@ xfs_attrmulti_attr_set( if (copy_from_user(kbuf, ubuf, len)) goto out_kfree; - + error = xfs_attr_set(XFS_I(inode), name, kbuf, len, flags); out_kfree: diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index a1237dad643..2bf287ef548 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -511,7 +511,8 @@ xfs_vn_rename( xfs_dentry_to_name(&nname, ndentry); error = xfs_rename(XFS_I(odir), &oname, XFS_I(odentry->d_inode), - XFS_I(ndir), &nname); + XFS_I(ndir), &nname, new_inode ? + XFS_I(new_inode) : NULL); if (likely(!error)) { if (new_inode) xfs_validate_fields(new_inode); diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index e5143323e71..4edc46915b5 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -75,6 +75,7 @@ #include <linux/delay.h> #include <linux/log2.h> #include <linux/spinlock.h> +#include <linux/random.h> #include <asm/page.h> #include <asm/div64.h> @@ -99,7 +100,6 @@ /* * Feature macros (disable/enable) */ -#define HAVE_SPLICE /* a splice(2) exists in 2.6, but not in 2.4 */ #ifdef CONFIG_SMP #define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ #else diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 1ebd8004469..5e3b57516ec 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -394,7 +394,7 @@ xfs_zero_last_block( int error = 0; xfs_bmbt_irec_t imap; - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); zero_offset = XFS_B_FSB_OFFSET(mp, isize); if (zero_offset == 0) { @@ -425,14 +425,14 @@ xfs_zero_last_block( * out sync. We need to drop the ilock while we do this so we * don't deadlock when the buffer cache calls back to us. */ - xfs_iunlock(ip, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD); + xfs_iunlock(ip, XFS_ILOCK_EXCL); zero_len = mp->m_sb.sb_blocksize - zero_offset; if (isize + zero_len > offset) zero_len = offset - isize; error = xfs_iozero(ip, isize, zero_len); - xfs_ilock(ip, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + xfs_ilock(ip, XFS_ILOCK_EXCL); ASSERT(error >= 0); return error; } @@ -465,8 +465,7 @@ xfs_zero_eof( int error = 0; xfs_bmbt_irec_t imap; - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); ASSERT(offset > isize); /* @@ -475,8 +474,7 @@ xfs_zero_eof( */ error = xfs_zero_last_block(ip, offset, isize); if (error) { - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); return error; } @@ -507,8 +505,7 @@ xfs_zero_eof( error = xfs_bmapi(NULL, ip, start_zero_fsb, zero_count_fsb, 0, NULL, 0, &imap, &nimaps, NULL, NULL); if (error) { - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); return error; } ASSERT(nimaps > 0); @@ -532,7 +529,7 @@ xfs_zero_eof( * Drop the inode lock while we're doing the I/O. * We'll still have the iolock to protect us. */ - xfs_iunlock(ip, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + xfs_iunlock(ip, XFS_ILOCK_EXCL); zero_off = XFS_FSB_TO_B(mp, start_zero_fsb); zero_len = XFS_FSB_TO_B(mp, imap.br_blockcount); @@ -548,13 +545,13 @@ xfs_zero_eof( start_zero_fsb = imap.br_startoff + imap.br_blockcount; ASSERT(start_zero_fsb <= (end_zero_fsb + 1)); - xfs_ilock(ip, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + xfs_ilock(ip, XFS_ILOCK_EXCL); } return 0; out_lock: - xfs_ilock(ip, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); + xfs_ilock(ip, XFS_ILOCK_EXCL); ASSERT(error >= 0); return error; } diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index e1d498b4ba7..e6be37dbd0e 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -50,7 +50,6 @@ struct xfs_iomap; #define XFS_INVAL_CACHED 18 #define XFS_DIORD_ENTER 19 #define XFS_DIOWR_ENTER 20 -#define XFS_SENDFILE_ENTER 21 #define XFS_WRITEPAGE_ENTER 22 #define XFS_RELEASEPAGE_ENTER 23 #define XFS_INVALIDPAGE_ENTER 24 diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 865eb708aa9..742b2c7852c 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -1181,7 +1181,7 @@ xfs_fs_statfs( statp->f_fsid.val[0] = (u32)id; statp->f_fsid.val[1] = (u32)(id >> 32); - xfs_icsb_sync_counters_flags(mp, XFS_ICSB_LAZY_COUNT); + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); spin_lock(&mp->m_sb_lock); statp->f_bsize = sbp->sb_blocksize; diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 8b4d63ce869..9d73cb5c0fc 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -25,12 +25,6 @@ struct attrlist_cursor_kern; typedef struct inode bhv_vnode_t; -#define VN_ISLNK(vp) S_ISLNK((vp)->i_mode) -#define VN_ISREG(vp) S_ISREG((vp)->i_mode) -#define VN_ISDIR(vp) S_ISDIR((vp)->i_mode) -#define VN_ISCHR(vp) S_ISCHR((vp)->i_mode) -#define VN_ISBLK(vp) S_ISBLK((vp)->i_mode) - /* * Vnode to Linux inode mapping. */ @@ -151,24 +145,6 @@ typedef struct bhv_vattr { XFS_AT_TYPE|XFS_AT_BLKSIZE|XFS_AT_NBLOCKS|XFS_AT_VCODE|\ XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS|XFS_AT_GENCOUNT) -/* - * Modes. - */ -#define VSUID S_ISUID /* set user id on execution */ -#define VSGID S_ISGID /* set group id on execution */ -#define VSVTX S_ISVTX /* save swapped text even after use */ -#define VREAD S_IRUSR /* read, write, execute permissions */ -#define VWRITE S_IWUSR -#define VEXEC S_IXUSR - -#define MODEMASK S_IALLUGO /* mode bits plus permission bits */ - -/* - * Check whether mandatory file locking is enabled. - */ -#define MANDLOCK(vp, mode) \ - (VN_ISREG(vp) && ((mode) & (VSGID|(VEXEC>>3))) == VSGID) - extern void vn_init(void); extern int vn_revalidate(bhv_vnode_t *); diff --git a/fs/xfs/quota/xfs_dquot.c b/fs/xfs/quota/xfs_dquot.c index 631ebb31b29..85df3288efd 100644 --- a/fs/xfs/quota/xfs_dquot.c +++ b/fs/xfs/quota/xfs_dquot.c @@ -933,7 +933,7 @@ xfs_qm_dqget( type == XFS_DQ_PROJ || type == XFS_DQ_GROUP); if (ip) { - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (type == XFS_DQ_USER) ASSERT(ip->i_udquot == NULL); else @@ -1088,7 +1088,7 @@ xfs_qm_dqget( xfs_qm_mplist_unlock(mp); XFS_DQ_HASH_UNLOCK(h); dqret: - ASSERT((ip == NULL) || XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT((ip == NULL) || xfs_isilocked(ip, XFS_ILOCK_EXCL)); xfs_dqtrace_entry(dqp, "DQGET DONE"); *O_dqpp = dqp; return (0); diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 40ea5640956..d31cce1165c 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -670,7 +670,7 @@ xfs_qm_dqattach_one( xfs_dquot_t *dqp; int error; - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); error = 0; /* * See if we already have it in the inode itself. IO_idqpp is @@ -874,7 +874,7 @@ xfs_qm_dqattach( return 0; ASSERT((flags & XFS_QMOPT_ILOCKED) == 0 || - XFS_ISLOCKED_INODE_EXCL(ip)); + xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (! (flags & XFS_QMOPT_ILOCKED)) xfs_ilock(ip, XFS_ILOCK_EXCL); @@ -888,7 +888,8 @@ xfs_qm_dqattach( goto done; nquotas++; } - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); if (XFS_IS_OQUOTA_ON(mp)) { error = XFS_IS_GQUOTA_ON(mp) ? xfs_qm_dqattach_one(ip, ip->i_d.di_gid, XFS_DQ_GROUP, @@ -913,7 +914,7 @@ xfs_qm_dqattach( * This WON'T, in general, result in a thrash. */ if (nquotas == 2) { - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_udquot); ASSERT(ip->i_gdquot); @@ -956,7 +957,7 @@ xfs_qm_dqattach( #ifdef QUOTADEBUG else - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); #endif return error; } @@ -1291,7 +1292,7 @@ xfs_qm_dqget_noattach( xfs_mount_t *mp; xfs_dquot_t *udqp, *gdqp; - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); mp = ip->i_mount; udqp = NULL; gdqp = NULL; @@ -1392,7 +1393,7 @@ xfs_qm_qino_alloc( * Keep an extra reference to this quota inode. This inode is * locked exclusively and joined to the transaction already. */ - ASSERT(XFS_ISLOCKED_INODE_EXCL(*ip)); + ASSERT(xfs_isilocked(*ip, XFS_ILOCK_EXCL)); VN_HOLD(XFS_ITOV((*ip))); /* @@ -1737,12 +1738,6 @@ xfs_qm_dqusage_adjust( return error; } - if (ip->i_d.di_mode == 0) { - xfs_iput_new(ip, XFS_ILOCK_EXCL); - *res = BULKSTAT_RV_NOTHING; - return XFS_ERROR(ENOENT); - } - /* * Obtain the locked dquots. In case of an error (eg. allocation * fails for ENOSPC), we return the negative of the error number @@ -2563,7 +2558,7 @@ xfs_qm_vop_chown( uint bfield = XFS_IS_REALTIME_INODE(ip) ? XFS_TRANS_DQ_RTBCOUNT : XFS_TRANS_DQ_BCOUNT; - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); /* old dquot */ @@ -2607,7 +2602,7 @@ xfs_qm_vop_chown_reserve( uint delblks, blkflags, prjflags = 0; xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; - ASSERT(XFS_ISLOCKED_INODE(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); mp = ip->i_mount; ASSERT(XFS_IS_QUOTA_RUNNING(mp)); @@ -2717,7 +2712,7 @@ xfs_qm_vop_dqattach_and_dqmod_newinode( if (!XFS_IS_QUOTA_ON(tp->t_mountp)) return; - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(tp->t_mountp)); if (udqp) { diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 8342823dbdc..768a3b27d2b 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -1366,12 +1366,6 @@ xfs_qm_internalqcheck_adjust( return (error); } - if (ip->i_d.di_mode == 0) { - xfs_iput_new(ip, lock_flags); - *res = BULKSTAT_RV_NOTHING; - return XFS_ERROR(ENOENT); - } - /* * This inode can have blocks after eof which can get released * when we send it to inactive. Since we don't check the dquot diff --git a/fs/xfs/quota/xfs_quota_priv.h b/fs/xfs/quota/xfs_quota_priv.h index a8b85e2be9d..5e4a40b1c56 100644 --- a/fs/xfs/quota/xfs_quota_priv.h +++ b/fs/xfs/quota/xfs_quota_priv.h @@ -27,11 +27,6 @@ /* Number of dquots that fit in to a dquot block */ #define XFS_QM_DQPERBLK(mp) ((mp)->m_quotainfo->qi_dqperchunk) -#define XFS_ISLOCKED_INODE(ip) (ismrlocked(&(ip)->i_lock, \ - MR_UPDATE | MR_ACCESS) != 0) -#define XFS_ISLOCKED_INODE_EXCL(ip) (ismrlocked(&(ip)->i_lock, \ - MR_UPDATE) != 0) - #define XFS_DQ_IS_ADDEDTO_TRX(t, d) ((d)->q_transp == (t)) #define XFS_QI_MPLRECLAIMS(mp) ((mp)->m_quotainfo->qi_dqreclaims) diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index f441f836ca8..99611381e74 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -834,7 +834,7 @@ xfs_trans_reserve_quota_nblks( ASSERT(ip->i_ino != mp->m_sb.sb_uquotino); ASSERT(ip->i_ino != mp->m_sb.sb_gquotino); - ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == XFS_TRANS_DQ_RES_RTBLKS || diff --git a/fs/xfs/support/debug.h b/fs/xfs/support/debug.h index 855da040864..75845f95081 100644 --- a/fs/xfs/support/debug.h +++ b/fs/xfs/support/debug.h @@ -49,8 +49,6 @@ extern void assfail(char *expr, char *f, int l); #else /* DEBUG */ -#include <linux/random.h> - #define ASSERT(expr) \ (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) diff --git a/fs/xfs/xfs.h b/fs/xfs/xfs.h index 765aaf65e2d..540e4c98982 100644 --- a/fs/xfs/xfs.h +++ b/fs/xfs/xfs.h @@ -22,7 +22,7 @@ #define STATIC #define DEBUG 1 #define XFS_BUF_LOCK_TRACKING 1 -#define QUOTADEBUG 1 +/* #define QUOTADEBUG 1 */ #endif #ifdef CONFIG_XFS_TRACE diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 8e130b9720a..ebee3a4f703 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -72,7 +72,7 @@ xfs_acl_vhasacl_default( { int error; - if (!VN_ISDIR(vp)) + if (!S_ISDIR(vp->i_mode)) return 0; xfs_acl_get_attr(vp, NULL, _ACL_TYPE_DEFAULT, ATTR_KERNOVAL, &error); return (error == 0); @@ -238,15 +238,8 @@ xfs_acl_vget( error = EINVAL; goto out; } - if (kind == _ACL_TYPE_ACCESS) { - bhv_vattr_t va; - - va.va_mask = XFS_AT_MODE; - error = xfs_getattr(xfs_vtoi(vp), &va, 0); - if (error) - goto out; - xfs_acl_sync_mode(va.va_mode, xfs_acl); - } + if (kind == _ACL_TYPE_ACCESS) + xfs_acl_sync_mode(xfs_vtoi(vp)->i_d.di_mode, xfs_acl); error = -posix_acl_xfs_to_xattr(xfs_acl, ext_acl, size); } out: @@ -341,14 +334,15 @@ xfs_acl_iaccess( { xfs_acl_t *acl; int rval; + struct xfs_name acl_name = {SGI_ACL_FILE, SGI_ACL_FILE_SIZE}; if (!(_ACL_ALLOC(acl))) return -1; /* If the file has no ACL return -1. */ rval = sizeof(xfs_acl_t); - if (xfs_attr_fetch(ip, SGI_ACL_FILE, SGI_ACL_FILE_SIZE, - (char *)acl, &rval, ATTR_ROOT | ATTR_KERNACCESS, cr)) { + if (xfs_attr_fetch(ip, &acl_name, (char *)acl, &rval, + ATTR_ROOT | ATTR_KERNACCESS)) { _ACL_FREE(acl); return -1; } @@ -373,23 +367,15 @@ xfs_acl_allow_set( bhv_vnode_t *vp, int kind) { - xfs_inode_t *ip = xfs_vtoi(vp); - bhv_vattr_t va; - int error; - if (vp->i_flags & (S_IMMUTABLE|S_APPEND)) return EPERM; - if (kind == _ACL_TYPE_DEFAULT && !VN_ISDIR(vp)) + if (kind == _ACL_TYPE_DEFAULT && !S_ISDIR(vp->i_mode)) return ENOTDIR; if (vp->i_sb->s_flags & MS_RDONLY) return EROFS; - va.va_mask = XFS_AT_UID; - error = xfs_getattr(ip, &va, 0); - if (error) - return error; - if (va.va_uid != current->fsuid && !capable(CAP_FOWNER)) + if (xfs_vtoi(vp)->i_d.di_uid != current->fsuid && !capable(CAP_FOWNER)) return EPERM; - return error; + return 0; } /* @@ -594,7 +580,7 @@ xfs_acl_get_attr( *error = xfs_attr_get(xfs_vtoi(vp), kind == _ACL_TYPE_ACCESS ? SGI_ACL_FILE : SGI_ACL_DEFAULT, - (char *)aclp, &len, flags, sys_cred); + (char *)aclp, &len, flags); if (*error || (flags & ATTR_KERNOVAL)) return; xfs_acl_get_endian(aclp); @@ -643,7 +629,6 @@ xfs_acl_vtoacl( xfs_acl_t *access_acl, xfs_acl_t *default_acl) { - bhv_vattr_t va; int error = 0; if (access_acl) { @@ -652,16 +637,10 @@ xfs_acl_vtoacl( * be obtained for some reason, invalidate the access ACL. */ xfs_acl_get_attr(vp, access_acl, _ACL_TYPE_ACCESS, 0, &error); - if (!error) { - /* Got the ACL, need the mode... */ - va.va_mask = XFS_AT_MODE; - error = xfs_getattr(xfs_vtoi(vp), &va, 0); - } - if (error) access_acl->acl_cnt = XFS_ACL_NOT_PRESENT; else /* We have a good ACL and the file mode, synchronize. */ - xfs_acl_sync_mode(va.va_mode, access_acl); + xfs_acl_sync_mode(xfs_vtoi(vp)->i_d.di_mode, access_acl); } if (default_acl) { @@ -719,7 +698,7 @@ xfs_acl_inherit( * If the new file is a directory, its default ACL is a copy of * the containing directory's default ACL. */ - if (VN_ISDIR(vp)) + if (S_ISDIR(vp->i_mode)) xfs_acl_set_attr(vp, pdaclp, _ACL_TYPE_DEFAULT, &error); if (!error && !basicperms) xfs_acl_set_attr(vp, cacl, _ACL_TYPE_ACCESS, &error); @@ -744,7 +723,7 @@ xfs_acl_setmode( bhv_vattr_t va; xfs_acl_entry_t *ap; xfs_acl_entry_t *gap = NULL; - int i, error, nomask = 1; + int i, nomask = 1; *basicperms = 1; @@ -756,11 +735,7 @@ xfs_acl_setmode( * mode. The m:: bits take precedence over the g:: bits. */ va.va_mask = XFS_AT_MODE; - error = xfs_getattr(xfs_vtoi(vp), &va, 0); - if (error) - return error; - - va.va_mask = XFS_AT_MODE; + va.va_mode = xfs_vtoi(vp)->i_d.di_mode; va.va_mode &= ~(S_IRWXU|S_IRWXG|S_IRWXO); ap = acl->acl_entry; for (i = 0; i < acl->acl_cnt; ++i) { diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 36d781ee5fc..df151a85918 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -101,14 +101,28 @@ STATIC int xfs_attr_rmtval_remove(xfs_da_args_t *args); ktrace_t *xfs_attr_trace_buf; #endif +STATIC int +xfs_attr_name_to_xname( + struct xfs_name *xname, + const char *aname) +{ + if (!aname) + return EINVAL; + xname->name = aname; + xname->len = strlen(aname); + if (xname->len >= MAXNAMELEN) + return EFAULT; /* match IRIX behaviour */ + + return 0; +} /*======================================================================== * Overall external interface routines. *========================================================================*/ int -xfs_attr_fetch(xfs_inode_t *ip, const char *name, int namelen, - char *value, int *valuelenp, int flags, struct cred *cred) +xfs_attr_fetch(xfs_inode_t *ip, struct xfs_name *name, + char *value, int *valuelenp, int flags) { xfs_da_args_t args; int error; @@ -122,8 +136,8 @@ xfs_attr_fetch(xfs_inode_t *ip, const char *name, int namelen, * Fill in the arg structure for this request. */ memset((char *)&args, 0, sizeof(args)); - args.name = name; - args.namelen = namelen; + args.name = name->name; + args.namelen = name->len; args.value = value; args.valuelen = *valuelenp; args.flags = flags; @@ -162,31 +176,29 @@ xfs_attr_get( const char *name, char *value, int *valuelenp, - int flags, - cred_t *cred) + int flags) { - int error, namelen; + int error; + struct xfs_name xname; XFS_STATS_INC(xs_attr_get); - if (!name) - return(EINVAL); - namelen = strlen(name); - if (namelen >= MAXNAMELEN) - return(EFAULT); /* match IRIX behaviour */ - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) return(EIO); + error = xfs_attr_name_to_xname(&xname, name); + if (error) + return error; + xfs_ilock(ip, XFS_ILOCK_SHARED); - error = xfs_attr_fetch(ip, name, namelen, value, valuelenp, flags, cred); + error = xfs_attr_fetch(ip, &xname, value, valuelenp, flags); xfs_iunlock(ip, XFS_ILOCK_SHARED); return(error); } -int -xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, - char *value, int valuelen, int flags) +STATIC int +xfs_attr_set_int(xfs_inode_t *dp, struct xfs_name *name, + char *value, int valuelen, int flags) { xfs_da_args_t args; xfs_fsblock_t firstblock; @@ -209,7 +221,7 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, */ if (XFS_IFORK_Q(dp) == 0) { int sf_size = sizeof(xfs_attr_sf_hdr_t) + - XFS_ATTR_SF_ENTSIZE_BYNAME(namelen, valuelen); + XFS_ATTR_SF_ENTSIZE_BYNAME(name->len, valuelen); if ((error = xfs_bmap_add_attrfork(dp, sf_size, rsvd))) return(error); @@ -219,8 +231,8 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, * Fill in the arg structure for this request. */ memset((char *)&args, 0, sizeof(args)); - args.name = name; - args.namelen = namelen; + args.name = name->name; + args.namelen = name->len; args.value = value; args.valuelen = valuelen; args.flags = flags; @@ -236,7 +248,7 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, * Determine space new attribute will use, and if it would be * "local" or "remote" (note: local != inline). */ - size = xfs_attr_leaf_newentsize(namelen, valuelen, + size = xfs_attr_leaf_newentsize(name->len, valuelen, mp->m_sb.sb_blocksize, &local); nblks = XFS_DAENTER_SPACE_RES(mp, XFS_ATTR_FORK); @@ -429,26 +441,27 @@ xfs_attr_set( int valuelen, int flags) { - int namelen; - - namelen = strlen(name); - if (namelen >= MAXNAMELEN) - return EFAULT; /* match IRIX behaviour */ + int error; + struct xfs_name xname; XFS_STATS_INC(xs_attr_set); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return (EIO); - return xfs_attr_set_int(dp, name, namelen, value, valuelen, flags); + error = xfs_attr_name_to_xname(&xname, name); + if (error) + return error; + + return xfs_attr_set_int(dp, &xname, value, valuelen, flags); } /* * Generic handler routine to remove a name from an attribute list. * Transitions attribute list from Btree to shortform as necessary. */ -int -xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags) +STATIC int +xfs_attr_remove_int(xfs_inode_t *dp, struct xfs_name *name, int flags) { xfs_da_args_t args; xfs_fsblock_t firstblock; @@ -460,8 +473,8 @@ xfs_attr_remove_int(xfs_inode_t *dp, const char *name, int namelen, int flags) * Fill in the arg structure for this request. */ memset((char *)&args, 0, sizeof(args)); - args.name = name; - args.namelen = namelen; + args.name = name->name; + args.namelen = name->len; args.flags = flags; args.hashval = xfs_da_hashname(args.name, args.namelen); args.dp = dp; @@ -575,17 +588,18 @@ xfs_attr_remove( const char *name, int flags) { - int namelen; - - namelen = strlen(name); - if (namelen >= MAXNAMELEN) - return EFAULT; /* match IRIX behaviour */ + int error; + struct xfs_name xname; XFS_STATS_INC(xs_attr_remove); if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return (EIO); + error = xfs_attr_name_to_xname(&xname, name); + if (error) + return error; + xfs_ilock(dp, XFS_ILOCK_SHARED); if (XFS_IFORK_Q(dp) == 0 || (dp->i_d.di_aformat == XFS_DINODE_FMT_EXTENTS && @@ -595,10 +609,10 @@ xfs_attr_remove( } xfs_iunlock(dp, XFS_ILOCK_SHARED); - return xfs_attr_remove_int(dp, name, namelen, flags); + return xfs_attr_remove_int(dp, &xname, flags); } -int /* error */ +STATIC int xfs_attr_list_int(xfs_attr_list_context_t *context) { int error; @@ -2522,8 +2536,7 @@ attr_generic_get( { int error, asize = size; - error = xfs_attr_get(xfs_vtoi(vp), name, data, - &asize, xflags, NULL); + error = xfs_attr_get(xfs_vtoi(vp), name, data, &asize, xflags); if (!error) return asize; return -error; diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index 786eba3121c..6cfc9384fe3 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -158,14 +158,10 @@ struct xfs_da_args; /* * Overall external interface routines. */ -int xfs_attr_set_int(struct xfs_inode *, const char *, int, char *, int, int); -int xfs_attr_remove_int(struct xfs_inode *, const char *, int, int); -int xfs_attr_list_int(struct xfs_attr_list_context *); int xfs_attr_inactive(struct xfs_inode *dp); int xfs_attr_shortform_getvalue(struct xfs_da_args *); -int xfs_attr_fetch(struct xfs_inode *, const char *, int, - char *, int *, int, struct cred *); +int xfs_attr_fetch(struct xfs_inode *, struct xfs_name *, char *, int *, int); int xfs_attr_rmtval_get(struct xfs_da_args *args); #endif /* __XFS_ATTR_H__ */ diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index eb198c01c35..53c259f5a5a 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -4074,7 +4074,6 @@ xfs_bmap_add_attrfork( error2: xfs_bmap_cancel(&flist); error1: - ASSERT(ismrlocked(&ip->i_lock,MR_UPDATE)); xfs_iunlock(ip, XFS_ILOCK_EXCL); error0: xfs_trans_cancel(tp, XFS_TRANS_RELEASE_LOG_RES|XFS_TRANS_ABORT); diff --git a/fs/xfs/xfs_dfrag.c b/fs/xfs/xfs_dfrag.c index 3f53fad356a..5f3647cb988 100644 --- a/fs/xfs/xfs_dfrag.c +++ b/fs/xfs/xfs_dfrag.c @@ -162,7 +162,7 @@ xfs_swap_extents( ips[1] = ip; } - xfs_lock_inodes(ips, 2, 0, lock_flags); + xfs_lock_inodes(ips, 2, lock_flags); locked = 1; /* Verify that both files have the same format */ @@ -265,7 +265,7 @@ xfs_swap_extents( locked = 0; goto error0; } - xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL); + xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL); /* * Count the number of extended attribute blocks diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index d3a0f538d6a..381ebda4f7b 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -462,7 +462,7 @@ xfs_fs_counts( xfs_mount_t *mp, xfs_fsop_counts_t *cnt) { - xfs_icsb_sync_counters_flags(mp, XFS_ICSB_LAZY_COUNT); + xfs_icsb_sync_counters(mp, XFS_ICSB_LAZY_COUNT); spin_lock(&mp->m_sb_lock); cnt->freedata = mp->m_sb.sb_fdblocks - XFS_ALLOC_SET_ASIDE(mp); cnt->freertx = mp->m_sb.sb_frextents; @@ -524,7 +524,7 @@ xfs_reserve_blocks( */ retry: spin_lock(&mp->m_sb_lock); - xfs_icsb_sync_counters_flags(mp, XFS_ICSB_SB_LOCKED); + xfs_icsb_sync_counters_locked(mp, 0); /* * If our previous reservation was larger than the current value, @@ -552,11 +552,8 @@ retry: mp->m_resblks += free; mp->m_resblks_avail += free; fdblks_delta = -free; - mp->m_sb.sb_fdblocks = XFS_ALLOC_SET_ASIDE(mp); } else { fdblks_delta = -delta; - mp->m_sb.sb_fdblocks = - lcounter + XFS_ALLOC_SET_ASIDE(mp); mp->m_resblks = request; mp->m_resblks_avail += delta; } @@ -587,7 +584,6 @@ out: if (error == ENOSPC) goto retry; } - return 0; } diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index a64dfbd565a..aad8c5da38a 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -147,6 +147,7 @@ xfs_ialloc_ag_alloc( int version; /* inode version number to use */ int isaligned = 0; /* inode allocation at stripe unit */ /* boundary */ + unsigned int gen; args.tp = tp; args.mp = tp->t_mountp; @@ -290,6 +291,14 @@ xfs_ialloc_ag_alloc( else version = XFS_DINODE_VERSION_1; + /* + * Seed the new inode cluster with a random generation number. This + * prevents short-term reuse of generation numbers if a chunk is + * freed and then immediately reallocated. We use random numbers + * rather than a linear progression to prevent the next generation + * number from being easily guessable. + */ + gen = random32(); for (j = 0; j < nbufs; j++) { /* * Get the block. @@ -309,6 +318,7 @@ xfs_ialloc_ag_alloc( free = XFS_MAKE_IPTR(args.mp, fbuf, i); free->di_core.di_magic = cpu_to_be16(XFS_DINODE_MAGIC); free->di_core.di_version = version; + free->di_core.di_gen = cpu_to_be32(gen); free->di_next_unlinked = cpu_to_be32(NULLAGINO); xfs_ialloc_log_di(tp, fbuf, i, XFS_DI_CORE_BITS | XFS_DI_NEXT_UNLINKED); diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index e657c512846..b07604b94d9 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -593,8 +593,9 @@ xfs_iunlock_map_shared( * XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL */ void -xfs_ilock(xfs_inode_t *ip, - uint lock_flags) +xfs_ilock( + xfs_inode_t *ip, + uint lock_flags) { /* * You can't set both SHARED and EXCL for the same lock, @@ -607,16 +608,16 @@ xfs_ilock(xfs_inode_t *ip, (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - if (lock_flags & XFS_IOLOCK_EXCL) { + if (lock_flags & XFS_IOLOCK_EXCL) mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - } else if (lock_flags & XFS_IOLOCK_SHARED) { + else if (lock_flags & XFS_IOLOCK_SHARED) mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - } - if (lock_flags & XFS_ILOCK_EXCL) { + + if (lock_flags & XFS_ILOCK_EXCL) mrupdate_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); - } else if (lock_flags & XFS_ILOCK_SHARED) { + else if (lock_flags & XFS_ILOCK_SHARED) mraccess_nested(&ip->i_lock, XFS_ILOCK_DEP(lock_flags)); - } + xfs_ilock_trace(ip, 1, lock_flags, (inst_t *)__return_address); } @@ -631,15 +632,12 @@ xfs_ilock(xfs_inode_t *ip, * lock_flags -- this parameter indicates the inode's locks to be * to be locked. See the comment for xfs_ilock() for a list * of valid values. - * */ int -xfs_ilock_nowait(xfs_inode_t *ip, - uint lock_flags) +xfs_ilock_nowait( + xfs_inode_t *ip, + uint lock_flags) { - int iolocked; - int ilocked; - /* * You can't set both SHARED and EXCL for the same lock, * and only XFS_IOLOCK_SHARED, XFS_IOLOCK_EXCL, XFS_ILOCK_SHARED, @@ -651,37 +649,30 @@ xfs_ilock_nowait(xfs_inode_t *ip, (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_DEP_MASK)) == 0); - iolocked = 0; if (lock_flags & XFS_IOLOCK_EXCL) { - iolocked = mrtryupdate(&ip->i_iolock); - if (!iolocked) { - return 0; - } + if (!mrtryupdate(&ip->i_iolock)) + goto out; } else if (lock_flags & XFS_IOLOCK_SHARED) { - iolocked = mrtryaccess(&ip->i_iolock); - if (!iolocked) { - return 0; - } + if (!mrtryaccess(&ip->i_iolock)) + goto out; } if (lock_flags & XFS_ILOCK_EXCL) { - ilocked = mrtryupdate(&ip->i_lock); - if (!ilocked) { - if (iolocked) { - mrunlock(&ip->i_iolock); - } - return 0; - } + if (!mrtryupdate(&ip->i_lock)) + goto out_undo_iolock; } else if (lock_flags & XFS_ILOCK_SHARED) { - ilocked = mrtryaccess(&ip->i_lock); - if (!ilocked) { - if (iolocked) { - mrunlock(&ip->i_iolock); - } - return 0; - } + if (!mrtryaccess(&ip->i_lock)) + goto out_undo_iolock; } xfs_ilock_trace(ip, 2, lock_flags, (inst_t *)__return_address); return 1; + + out_undo_iolock: + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); + out: + return 0; } /* @@ -697,8 +688,9 @@ xfs_ilock_nowait(xfs_inode_t *ip, * */ void -xfs_iunlock(xfs_inode_t *ip, - uint lock_flags) +xfs_iunlock( + xfs_inode_t *ip, + uint lock_flags) { /* * You can't set both SHARED and EXCL for the same lock, @@ -713,31 +705,25 @@ xfs_iunlock(xfs_inode_t *ip, XFS_LOCK_DEP_MASK)) == 0); ASSERT(lock_flags != 0); - if (lock_flags & (XFS_IOLOCK_SHARED | XFS_IOLOCK_EXCL)) { - ASSERT(!(lock_flags & XFS_IOLOCK_SHARED) || - (ismrlocked(&ip->i_iolock, MR_ACCESS))); - ASSERT(!(lock_flags & XFS_IOLOCK_EXCL) || - (ismrlocked(&ip->i_iolock, MR_UPDATE))); - mrunlock(&ip->i_iolock); - } + if (lock_flags & XFS_IOLOCK_EXCL) + mrunlock_excl(&ip->i_iolock); + else if (lock_flags & XFS_IOLOCK_SHARED) + mrunlock_shared(&ip->i_iolock); - if (lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) { - ASSERT(!(lock_flags & XFS_ILOCK_SHARED) || - (ismrlocked(&ip->i_lock, MR_ACCESS))); - ASSERT(!(lock_flags & XFS_ILOCK_EXCL) || - (ismrlocked(&ip->i_lock, MR_UPDATE))); - mrunlock(&ip->i_lock); + if (lock_flags & XFS_ILOCK_EXCL) + mrunlock_excl(&ip->i_lock); + else if (lock_flags & XFS_ILOCK_SHARED) + mrunlock_shared(&ip->i_lock); + if ((lock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)) && + !(lock_flags & XFS_IUNLOCK_NONOTIFY) && ip->i_itemp) { /* * Let the AIL know that this item has been unlocked in case * it is in the AIL and anyone is waiting on it. Don't do * this if the caller has asked us not to. */ - if (!(lock_flags & XFS_IUNLOCK_NONOTIFY) && - ip->i_itemp != NULL) { - xfs_trans_unlocked_item(ip->i_mount, - (xfs_log_item_t*)(ip->i_itemp)); - } + xfs_trans_unlocked_item(ip->i_mount, + (xfs_log_item_t*)(ip->i_itemp)); } xfs_ilock_trace(ip, 3, lock_flags, (inst_t *)__return_address); } @@ -747,21 +733,47 @@ xfs_iunlock(xfs_inode_t *ip, * if it is being demoted. */ void -xfs_ilock_demote(xfs_inode_t *ip, - uint lock_flags) +xfs_ilock_demote( + xfs_inode_t *ip, + uint lock_flags) { ASSERT(lock_flags & (XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_IOLOCK_EXCL|XFS_ILOCK_EXCL)) == 0); - if (lock_flags & XFS_ILOCK_EXCL) { - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + if (lock_flags & XFS_ILOCK_EXCL) mrdemote(&ip->i_lock); - } - if (lock_flags & XFS_IOLOCK_EXCL) { - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE)); + if (lock_flags & XFS_IOLOCK_EXCL) mrdemote(&ip->i_iolock); +} + +#ifdef DEBUG +/* + * Debug-only routine, without additional rw_semaphore APIs, we can + * now only answer requests regarding whether we hold the lock for write + * (reader state is outside our visibility, we only track writer state). + * + * Note: this means !xfs_isilocked would give false positives, so don't do that. + */ +int +xfs_isilocked( + xfs_inode_t *ip, + uint lock_flags) +{ + if ((lock_flags & (XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)) == + XFS_ILOCK_EXCL) { + if (!ip->i_lock.mr_writer) + return 0; } + + if ((lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) == + XFS_IOLOCK_EXCL) { + if (!ip->i_iolock.mr_writer) + return 0; + } + + return 1; } +#endif /* * The following three routines simply manage the i_flock diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ca12acb9039..cf0bb9c1d62 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1291,7 +1291,7 @@ xfs_file_last_byte( xfs_fileoff_t size_last_block; int error; - ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE | MR_ACCESS)); + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)); mp = ip->i_mount; /* @@ -1402,7 +1402,7 @@ xfs_itruncate_start( bhv_vnode_t *vp; int error = 0; - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0); + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT((new_size == 0) || (new_size <= ip->i_size)); ASSERT((flags == XFS_ITRUNC_DEFINITE) || (flags == XFS_ITRUNC_MAYBE)); @@ -1528,8 +1528,7 @@ xfs_itruncate_finish( xfs_bmap_free_t free_list; int error; - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE) != 0); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); ASSERT((new_size == 0) || (new_size <= ip->i_size)); ASSERT(*tp != NULL); ASSERT((*tp)->t_flags & XFS_TRANS_PERM_LOG_RES); @@ -1780,8 +1779,7 @@ xfs_igrow_start( xfs_fsize_t new_size, cred_t *credp) { - ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); - ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); ASSERT(new_size > ip->i_size); /* @@ -1809,8 +1807,7 @@ xfs_igrow_finish( xfs_fsize_t new_size, int change_flag) { - ASSERT(ismrlocked(&(ip->i_lock), MR_UPDATE) != 0); - ASSERT(ismrlocked(&(ip->i_iolock), MR_UPDATE) != 0); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_IOLOCK_EXCL)); ASSERT(ip->i_transp == tp); ASSERT(new_size > ip->i_size); @@ -2287,7 +2284,7 @@ xfs_ifree( xfs_dinode_t *dip; xfs_buf_t *ibp; - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_transp == tp); ASSERT(ip->i_d.di_nlink == 0); ASSERT(ip->i_d.di_nextents == 0); @@ -2746,7 +2743,7 @@ void xfs_ipin( xfs_inode_t *ip) { - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); atomic_inc(&ip->i_pincount); } @@ -2779,7 +2776,7 @@ __xfs_iunpin_wait( { xfs_inode_log_item_t *iip = ip->i_itemp; - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); if (atomic_read(&ip->i_pincount) == 0) return; @@ -2829,7 +2826,7 @@ xfs_iextents_copy( xfs_fsblock_t start_block; ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(ifp->if_bytes > 0); nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); @@ -3132,7 +3129,7 @@ xfs_iflush( XFS_STATS_INC(xs_iflush_count); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(issemalocked(&(ip->i_flock))); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > ip->i_df.if_ext_max); @@ -3297,7 +3294,7 @@ xfs_iflush_int( int first; #endif - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE|MR_ACCESS)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(issemalocked(&(ip->i_flock))); ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE || ip->i_d.di_nextents > ip->i_df.if_ext_max); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 93c37697a72..0a999fee4f0 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -386,20 +386,9 @@ xfs_iflags_test_and_clear(xfs_inode_t *ip, unsigned short flags) #define XFS_ILOCK_EXCL (1<<2) #define XFS_ILOCK_SHARED (1<<3) #define XFS_IUNLOCK_NONOTIFY (1<<4) -/* #define XFS_IOLOCK_NESTED (1<<5) */ -#define XFS_EXTENT_TOKEN_RD (1<<6) -#define XFS_SIZE_TOKEN_RD (1<<7) -#define XFS_EXTSIZE_RD (XFS_EXTENT_TOKEN_RD|XFS_SIZE_TOKEN_RD) -#define XFS_WILLLEND (1<<8) /* Always acquire tokens for lending */ -#define XFS_EXTENT_TOKEN_WR (XFS_EXTENT_TOKEN_RD | XFS_WILLLEND) -#define XFS_SIZE_TOKEN_WR (XFS_SIZE_TOKEN_RD | XFS_WILLLEND) -#define XFS_EXTSIZE_WR (XFS_EXTSIZE_RD | XFS_WILLLEND) -/* TODO:XFS_SIZE_TOKEN_WANT (1<<9) */ #define XFS_LOCK_MASK (XFS_IOLOCK_EXCL | XFS_IOLOCK_SHARED \ - | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED \ - | XFS_EXTENT_TOKEN_RD | XFS_SIZE_TOKEN_RD \ - | XFS_WILLLEND) + | XFS_ILOCK_EXCL | XFS_ILOCK_SHARED) /* * Flags for lockdep annotations. @@ -483,6 +472,7 @@ void xfs_ilock(xfs_inode_t *, uint); int xfs_ilock_nowait(xfs_inode_t *, uint); void xfs_iunlock(xfs_inode_t *, uint); void xfs_ilock_demote(xfs_inode_t *, uint); +int xfs_isilocked(xfs_inode_t *, uint); void xfs_iflock(xfs_inode_t *); int xfs_iflock_nowait(xfs_inode_t *); uint xfs_ilock_map_shared(xfs_inode_t *); @@ -534,7 +524,7 @@ int xfs_iflush(xfs_inode_t *, uint); void xfs_iflush_all(struct xfs_mount *); void xfs_ichgtime(xfs_inode_t *, int); xfs_fsize_t xfs_file_last_byte(xfs_inode_t *); -void xfs_lock_inodes(xfs_inode_t **, int, int, uint); +void xfs_lock_inodes(xfs_inode_t **, int, uint); void xfs_synchronize_atime(xfs_inode_t *); void xfs_mark_inode_dirty_sync(xfs_inode_t *); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 93b5db453ea..167b33f1577 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -547,7 +547,7 @@ STATIC void xfs_inode_item_pin( xfs_inode_log_item_t *iip) { - ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE)); + ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); xfs_ipin(iip->ili_inode); } @@ -664,13 +664,13 @@ xfs_inode_item_unlock( ASSERT(iip != NULL); ASSERT(iip->ili_inode->i_itemp != NULL); - ASSERT(ismrlocked(&(iip->ili_inode->i_lock), MR_UPDATE)); + ASSERT(xfs_isilocked(iip->ili_inode, XFS_ILOCK_EXCL)); ASSERT((!(iip->ili_inode->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL)) || - ismrlocked(&(iip->ili_inode->i_iolock), MR_UPDATE)); + xfs_isilocked(iip->ili_inode, XFS_IOLOCK_EXCL)); ASSERT((!(iip->ili_inode->i_itemp->ili_flags & XFS_ILI_IOLOCKED_SHARED)) || - ismrlocked(&(iip->ili_inode->i_iolock), MR_ACCESS)); + xfs_isilocked(iip->ili_inode, XFS_IOLOCK_SHARED)); /* * Clear the transaction pointer in the inode. */ @@ -769,7 +769,7 @@ xfs_inode_item_pushbuf( ip = iip->ili_inode; - ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); /* * The ili_pushbuf_flag keeps others from @@ -857,7 +857,7 @@ xfs_inode_item_push( ip = iip->ili_inode; - ASSERT(ismrlocked(&(ip->i_lock), MR_ACCESS)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); ASSERT(issemalocked(&(ip->i_flock))); /* * Since we were able to lock the inode's flush lock and diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index fb3cf119141..7edcde691d1 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -196,14 +196,14 @@ xfs_iomap( break; case BMAPI_WRITE: xfs_iomap_enter_trace(XFS_IOMAP_WRITE_ENTER, ip, offset, count); - lockmode = XFS_ILOCK_EXCL|XFS_EXTSIZE_WR; + lockmode = XFS_ILOCK_EXCL; if (flags & BMAPI_IGNSTATE) bmapi_flags |= XFS_BMAPI_IGSTATE|XFS_BMAPI_ENTIRE; xfs_ilock(ip, lockmode); break; case BMAPI_ALLOCATE: xfs_iomap_enter_trace(XFS_IOMAP_ALLOC_ENTER, ip, offset, count); - lockmode = XFS_ILOCK_SHARED|XFS_EXTSIZE_RD; + lockmode = XFS_ILOCK_SHARED; bmapi_flags = XFS_BMAPI_ENTIRE; /* Attempt non-blocking lock */ @@ -523,8 +523,7 @@ xfs_iomap_write_direct( goto error_out; } - if (unlikely(!imap.br_startblock && - !(XFS_IS_REALTIME_INODE(ip)))) { + if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) { error = xfs_cmn_err_fsblock_zero(ip, &imap); goto error_out; } @@ -624,7 +623,7 @@ xfs_iomap_write_delay( int prealloc, fsynced = 0; int error; - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE) != 0); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); /* * Make sure that the dquots are there. This doesn't hold @@ -686,8 +685,7 @@ retry: goto retry; } - if (unlikely(!imap[0].br_startblock && - !(XFS_IS_REALTIME_INODE(ip)))) + if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip))) return xfs_cmn_err_fsblock_zero(ip, &imap[0]); *ret_imap = imap[0]; @@ -838,9 +836,9 @@ xfs_iomap_write_allocate( * See if we were able to allocate an extent that * covers at least part of the callers request */ - if (unlikely(!imap.br_startblock && - XFS_IS_REALTIME_INODE(ip))) + if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) return xfs_cmn_err_fsblock_zero(ip, &imap); + if ((offset_fsb >= imap.br_startoff) && (offset_fsb < (imap.br_startoff + imap.br_blockcount))) { @@ -934,8 +932,7 @@ xfs_iomap_write_unwritten( if (error) return XFS_ERROR(error); - if (unlikely(!imap.br_startblock && - !(XFS_IS_REALTIME_INODE(ip)))) + if (!(imap.br_startblock || XFS_IS_REALTIME_INODE(ip))) return xfs_cmn_err_fsblock_zero(ip, &imap); if ((numblks_fsb = imap.br_blockcount) == 0) { diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index eb85bdedad0..419de15aeb4 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -71,11 +71,6 @@ xfs_bulkstat_one_iget( ASSERT(ip != NULL); ASSERT(ip->i_blkno != (xfs_daddr_t)0); - if (ip->i_d.di_mode == 0) { - *stat = BULKSTAT_RV_NOTHING; - error = XFS_ERROR(ENOENT); - goto out_iput; - } vp = XFS_ITOV(ip); dic = &ip->i_d; @@ -124,7 +119,6 @@ xfs_bulkstat_one_iget( break; } - out_iput: xfs_iput(ip, XFS_ILOCK_SHARED); return error; } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 2fec452afbc..da3988453b7 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -54,8 +54,9 @@ STATIC void xfs_unmountfs_wait(xfs_mount_t *); #ifdef HAVE_PERCPU_SB STATIC void xfs_icsb_destroy_counters(xfs_mount_t *); STATIC void xfs_icsb_balance_counter(xfs_mount_t *, xfs_sb_field_t, - int, int); -STATIC void xfs_icsb_sync_counters(xfs_mount_t *); + int); +STATIC void xfs_icsb_balance_counter_locked(xfs_mount_t *, xfs_sb_field_t, + int); STATIC int xfs_icsb_modify_counters(xfs_mount_t *, xfs_sb_field_t, int64_t, int); STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); @@ -63,8 +64,8 @@ STATIC void xfs_icsb_disable_counter(xfs_mount_t *, xfs_sb_field_t); #else #define xfs_icsb_destroy_counters(mp) do { } while (0) -#define xfs_icsb_balance_counter(mp, a, b, c) do { } while (0) -#define xfs_icsb_sync_counters(mp) do { } while (0) +#define xfs_icsb_balance_counter(mp, a, b) do { } while (0) +#define xfs_icsb_balance_counter_locked(mp, a, b) do { } while (0) #define xfs_icsb_modify_counters(mp, a, b, c) do { } while (0) #endif @@ -1400,7 +1401,7 @@ xfs_log_sbcount( if (!xfs_fs_writable(mp)) return 0; - xfs_icsb_sync_counters(mp); + xfs_icsb_sync_counters(mp, 0); /* * we don't need to do this if we are updating the superblock @@ -2026,9 +2027,9 @@ xfs_icsb_cpu_notify( case CPU_ONLINE: case CPU_ONLINE_FROZEN: xfs_icsb_lock(mp); - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); xfs_icsb_unlock(mp); break; case CPU_DEAD: @@ -2048,12 +2049,9 @@ xfs_icsb_cpu_notify( memset(cntp, 0, sizeof(xfs_icsb_cnts_t)); - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, - XFS_ICSB_SB_LOCKED, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, - XFS_ICSB_SB_LOCKED, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, - XFS_ICSB_SB_LOCKED, 0); + xfs_icsb_balance_counter_locked(mp, XFS_SBS_ICOUNT, 0); + xfs_icsb_balance_counter_locked(mp, XFS_SBS_IFREE, 0); + xfs_icsb_balance_counter_locked(mp, XFS_SBS_FDBLOCKS, 0); spin_unlock(&mp->m_sb_lock); xfs_icsb_unlock(mp); break; @@ -2105,9 +2103,9 @@ xfs_icsb_reinit_counters( * initial balance kicks us off correctly */ mp->m_icsb_counters = -1; - xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0, 0); - xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_ICOUNT, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_IFREE, 0); + xfs_icsb_balance_counter(mp, XFS_SBS_FDBLOCKS, 0); xfs_icsb_unlock(mp); } @@ -2223,7 +2221,7 @@ xfs_icsb_disable_counter( if (!test_and_set_bit(field, &mp->m_icsb_counters)) { /* drain back to superblock */ - xfs_icsb_count(mp, &cnt, XFS_ICSB_SB_LOCKED|XFS_ICSB_LAZY_COUNT); + xfs_icsb_count(mp, &cnt, XFS_ICSB_LAZY_COUNT); switch(field) { case XFS_SBS_ICOUNT: mp->m_sb.sb_icount = cnt.icsb_icount; @@ -2278,38 +2276,33 @@ xfs_icsb_enable_counter( } void -xfs_icsb_sync_counters_flags( +xfs_icsb_sync_counters_locked( xfs_mount_t *mp, int flags) { xfs_icsb_cnts_t cnt; - /* Pass 1: lock all counters */ - if ((flags & XFS_ICSB_SB_LOCKED) == 0) - spin_lock(&mp->m_sb_lock); - xfs_icsb_count(mp, &cnt, flags); - /* Step 3: update mp->m_sb fields */ if (!xfs_icsb_counter_disabled(mp, XFS_SBS_ICOUNT)) mp->m_sb.sb_icount = cnt.icsb_icount; if (!xfs_icsb_counter_disabled(mp, XFS_SBS_IFREE)) mp->m_sb.sb_ifree = cnt.icsb_ifree; if (!xfs_icsb_counter_disabled(mp, XFS_SBS_FDBLOCKS)) mp->m_sb.sb_fdblocks = cnt.icsb_fdblocks; - - if ((flags & XFS_ICSB_SB_LOCKED) == 0) - spin_unlock(&mp->m_sb_lock); } /* * Accurate update of per-cpu counters to incore superblock */ -STATIC void +void xfs_icsb_sync_counters( - xfs_mount_t *mp) + xfs_mount_t *mp, + int flags) { - xfs_icsb_sync_counters_flags(mp, 0); + spin_lock(&mp->m_sb_lock); + xfs_icsb_sync_counters_locked(mp, flags); + spin_unlock(&mp->m_sb_lock); } /* @@ -2332,19 +2325,15 @@ xfs_icsb_sync_counters( #define XFS_ICSB_FDBLK_CNTR_REENABLE(mp) \ (uint64_t)(512 + XFS_ALLOC_SET_ASIDE(mp)) STATIC void -xfs_icsb_balance_counter( +xfs_icsb_balance_counter_locked( xfs_mount_t *mp, xfs_sb_field_t field, - int flags, int min_per_cpu) { uint64_t count, resid; int weight = num_online_cpus(); uint64_t min = (uint64_t)min_per_cpu; - if (!(flags & XFS_ICSB_SB_LOCKED)) - spin_lock(&mp->m_sb_lock); - /* disable counter and sync counter */ xfs_icsb_disable_counter(mp, field); @@ -2354,19 +2343,19 @@ xfs_icsb_balance_counter( count = mp->m_sb.sb_icount; resid = do_div(count, weight); if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) - goto out; + return; break; case XFS_SBS_IFREE: count = mp->m_sb.sb_ifree; resid = do_div(count, weight); if (count < max(min, XFS_ICSB_INO_CNTR_REENABLE)) - goto out; + return; break; case XFS_SBS_FDBLOCKS: count = mp->m_sb.sb_fdblocks; resid = do_div(count, weight); if (count < max(min, XFS_ICSB_FDBLK_CNTR_REENABLE(mp))) - goto out; + return; break; default: BUG(); @@ -2375,9 +2364,17 @@ xfs_icsb_balance_counter( } xfs_icsb_enable_counter(mp, field, count, resid); -out: - if (!(flags & XFS_ICSB_SB_LOCKED)) - spin_unlock(&mp->m_sb_lock); +} + +STATIC void +xfs_icsb_balance_counter( + xfs_mount_t *mp, + xfs_sb_field_t fields, + int min_per_cpu) +{ + spin_lock(&mp->m_sb_lock); + xfs_icsb_balance_counter_locked(mp, fields, min_per_cpu); + spin_unlock(&mp->m_sb_lock); } STATIC int @@ -2484,7 +2481,7 @@ slow_path: * we are done. */ if (ret != ENOSPC) - xfs_icsb_balance_counter(mp, field, 0, 0); + xfs_icsb_balance_counter(mp, field, 0); xfs_icsb_unlock(mp); return ret; @@ -2508,7 +2505,7 @@ balance_counter: * will either succeed through the fast path or slow path without * another balance operation being required. */ - xfs_icsb_balance_counter(mp, field, 0, delta); + xfs_icsb_balance_counter(mp, field, delta); xfs_icsb_unlock(mp); goto again; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 1ed575110ff..63e0693a358 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -206,17 +206,18 @@ typedef struct xfs_icsb_cnts { #define XFS_ICSB_FLAG_LOCK (1 << 0) /* counter lock bit */ -#define XFS_ICSB_SB_LOCKED (1 << 0) /* sb already locked */ #define XFS_ICSB_LAZY_COUNT (1 << 1) /* accuracy not needed */ extern int xfs_icsb_init_counters(struct xfs_mount *); extern void xfs_icsb_reinit_counters(struct xfs_mount *); -extern void xfs_icsb_sync_counters_flags(struct xfs_mount *, int); +extern void xfs_icsb_sync_counters(struct xfs_mount *, int); +extern void xfs_icsb_sync_counters_locked(struct xfs_mount *, int); #else #define xfs_icsb_init_counters(mp) (0) #define xfs_icsb_reinit_counters(mp) do { } while (0) -#define xfs_icsb_sync_counters_flags(mp, flags) do { } while (0) +#define xfs_icsb_sync_counters(mp, flags) do { } while (0) +#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0) #endif typedef struct xfs_ail { diff --git a/fs/xfs/xfs_rename.c b/fs/xfs/xfs_rename.c index ee371890d85..d8063e1ad29 100644 --- a/fs/xfs/xfs_rename.c +++ b/fs/xfs/xfs_rename.c @@ -55,85 +55,32 @@ xfs_rename_unlock4( xfs_iunlock(i_tab[0], lock_mode); for (i = 1; i < 4; i++) { - if (i_tab[i] == NULL) { + if (i_tab[i] == NULL) break; - } + /* * Watch out for duplicate entries in the table. */ - if (i_tab[i] != i_tab[i-1]) { + if (i_tab[i] != i_tab[i-1]) xfs_iunlock(i_tab[i], lock_mode); - } } } -#ifdef DEBUG -int xfs_rename_skip, xfs_rename_nskip; -#endif - /* - * The following routine will acquire the locks required for a rename - * operation. The code understands the semantics of renames and will - * validate that name1 exists under dp1 & that name2 may or may not - * exist under dp2. - * - * We are renaming dp1/name1 to dp2/name2. - * - * Return ENOENT if dp1 does not exist, other lookup errors, or 0 for success. + * Enter all inodes for a rename transaction into a sorted array. */ -STATIC int -xfs_lock_for_rename( +STATIC void +xfs_sort_for_rename( xfs_inode_t *dp1, /* in: old (source) directory inode */ xfs_inode_t *dp2, /* in: new (target) directory inode */ xfs_inode_t *ip1, /* in: inode of old entry */ - struct xfs_name *name2, /* in: new entry name */ - xfs_inode_t **ipp2, /* out: inode of new entry, if it + xfs_inode_t *ip2, /* in: inode of new entry, if it already exists, NULL otherwise. */ xfs_inode_t **i_tab,/* out: array of inode returned, sorted */ int *num_inodes) /* out: number of inodes in array */ { - xfs_inode_t *ip2 = NULL; xfs_inode_t *temp; - xfs_ino_t inum1, inum2; - int error; int i, j; - uint lock_mode; - int diff_dirs = (dp1 != dp2); - - /* - * First, find out the current inums of the entries so that we - * can determine the initial locking order. We'll have to - * sanity check stuff after all the locks have been acquired - * to see if we still have the right inodes, directories, etc. - */ - lock_mode = xfs_ilock_map_shared(dp1); - IHOLD(ip1); - xfs_itrace_ref(ip1); - - inum1 = ip1->i_ino; - - /* - * Unlock dp1 and lock dp2 if they are different. - */ - if (diff_dirs) { - xfs_iunlock_map_shared(dp1, lock_mode); - lock_mode = xfs_ilock_map_shared(dp2); - } - - error = xfs_dir_lookup_int(dp2, lock_mode, name2, &inum2, &ip2); - if (error == ENOENT) { /* target does not need to exist. */ - inum2 = 0; - } else if (error) { - /* - * If dp2 and dp1 are the same, the next line unlocks dp1. - * Got it? - */ - xfs_iunlock_map_shared(dp2, lock_mode); - IRELE (ip1); - return error; - } else { - xfs_itrace_ref(ip2); - } /* * i_tab contains a list of pointers to inodes. We initialize @@ -145,21 +92,20 @@ xfs_lock_for_rename( i_tab[0] = dp1; i_tab[1] = dp2; i_tab[2] = ip1; - if (inum2 == 0) { - *num_inodes = 3; - i_tab[3] = NULL; - } else { + if (ip2) { *num_inodes = 4; i_tab[3] = ip2; + } else { + *num_inodes = 3; + i_tab[3] = NULL; } - *ipp2 = i_tab[3]; /* * Sort the elements via bubble sort. (Remember, there are at * most 4 elements to sort, so this is adequate.) */ - for (i=0; i < *num_inodes; i++) { - for (j=1; j < *num_inodes; j++) { + for (i = 0; i < *num_inodes; i++) { + for (j = 1; j < *num_inodes; j++) { if (i_tab[j]->i_ino < i_tab[j-1]->i_ino) { temp = i_tab[j]; i_tab[j] = i_tab[j-1]; @@ -167,30 +113,6 @@ xfs_lock_for_rename( } } } - - /* - * We have dp2 locked. If it isn't first, unlock it. - * If it is first, tell xfs_lock_inodes so it can skip it - * when locking. if dp1 == dp2, xfs_lock_inodes will skip both - * since they are equal. xfs_lock_inodes needs all these inodes - * so that it can unlock and retry if there might be a dead-lock - * potential with the log. - */ - - if (i_tab[0] == dp2 && lock_mode == XFS_ILOCK_SHARED) { -#ifdef DEBUG - xfs_rename_skip++; -#endif - xfs_lock_inodes(i_tab, *num_inodes, 1, XFS_ILOCK_SHARED); - } else { -#ifdef DEBUG - xfs_rename_nskip++; -#endif - xfs_iunlock_map_shared(dp2, lock_mode); - xfs_lock_inodes(i_tab, *num_inodes, 0, XFS_ILOCK_SHARED); - } - - return 0; } /* @@ -202,10 +124,10 @@ xfs_rename( struct xfs_name *src_name, xfs_inode_t *src_ip, xfs_inode_t *target_dp, - struct xfs_name *target_name) + struct xfs_name *target_name, + xfs_inode_t *target_ip) { - xfs_trans_t *tp; - xfs_inode_t *target_ip; + xfs_trans_t *tp = NULL; xfs_mount_t *mp = src_dp->i_mount; int new_parent; /* moving to a new dir */ int src_is_directory; /* src_name is a directory */ @@ -215,9 +137,7 @@ xfs_rename( int cancel_flags; int committed; xfs_inode_t *inodes[4]; - int target_ip_dropped = 0; /* dropped target_ip link? */ int spaceres; - int target_link_zero = 0; int num_inodes; xfs_itrace_entry(src_dp); @@ -230,64 +150,27 @@ xfs_rename( target_dp, DM_RIGHT_NULL, src_name->name, target_name->name, 0, 0, 0); - if (error) { + if (error) return error; - } } /* Return through std_return after this point. */ - /* - * Lock all the participating inodes. Depending upon whether - * the target_name exists in the target directory, and - * whether the target directory is the same as the source - * directory, we can lock from 2 to 4 inodes. - * xfs_lock_for_rename() will return ENOENT if src_name - * does not exist in the source directory. - */ - tp = NULL; - error = xfs_lock_for_rename(src_dp, target_dp, src_ip, target_name, - &target_ip, inodes, &num_inodes); - if (error) { - /* - * We have nothing locked, no inode references, and - * no transaction, so just get out. - */ - goto std_return; - } - - ASSERT(src_ip != NULL); + new_parent = (src_dp != target_dp); + src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); - if ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR) { + if (src_is_directory) { /* * Check for link count overflow on target_dp */ - if (target_ip == NULL && (src_dp != target_dp) && + if (target_ip == NULL && new_parent && target_dp->i_d.di_nlink >= XFS_MAXLINK) { error = XFS_ERROR(EMLINK); - xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); - goto rele_return; + goto std_return; } } - /* - * If we are using project inheritance, we only allow renames - * into our tree when the project IDs are the same; else the - * tree quota mechanism would be circumvented. - */ - if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && - (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { - error = XFS_ERROR(EXDEV); - xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); - goto rele_return; - } - - new_parent = (src_dp != target_dp); - src_is_directory = ((src_ip->i_d.di_mode & S_IFMT) == S_IFDIR); - - /* - * Drop the locks on our inodes so that we can start the transaction. - */ - xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); + xfs_sort_for_rename(src_dp, target_dp, src_ip, target_ip, + inodes, &num_inodes); XFS_BMAP_INIT(&free_list, &first_block); tp = xfs_trans_alloc(mp, XFS_TRANS_RENAME); @@ -302,7 +185,7 @@ xfs_rename( } if (error) { xfs_trans_cancel(tp, 0); - goto rele_return; + goto std_return; } /* @@ -310,13 +193,29 @@ xfs_rename( */ if ((error = XFS_QM_DQVOPRENAME(mp, inodes))) { xfs_trans_cancel(tp, cancel_flags); - goto rele_return; + goto std_return; } /* - * Reacquire the inode locks we dropped above. + * Lock all the participating inodes. Depending upon whether + * the target_name exists in the target directory, and + * whether the target directory is the same as the source + * directory, we can lock from 2 to 4 inodes. + */ + xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); + + /* + * If we are using project inheritance, we only allow renames + * into our tree when the project IDs are the same; else the + * tree quota mechanism would be circumvented. */ - xfs_lock_inodes(inodes, num_inodes, 0, XFS_ILOCK_EXCL); + if (unlikely((target_dp->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) && + (target_dp->i_d.di_projid != src_ip->i_d.di_projid))) { + error = XFS_ERROR(EXDEV); + xfs_rename_unlock4(inodes, XFS_ILOCK_SHARED); + xfs_trans_cancel(tp, cancel_flags); + goto std_return; + } /* * Join all the inodes to the transaction. From this point on, @@ -328,17 +227,17 @@ xfs_rename( */ IHOLD(src_dp); xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); + if (new_parent) { IHOLD(target_dp); xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); } - if ((src_ip != src_dp) && (src_ip != target_dp)) { - xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); - } - if ((target_ip != NULL) && - (target_ip != src_ip) && - (target_ip != src_dp) && - (target_ip != target_dp)) { + + IHOLD(src_ip); + xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); + + if (target_ip) { + IHOLD(target_ip); xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); } @@ -412,7 +311,6 @@ xfs_rename( error = xfs_droplink(tp, target_ip); if (error) goto abort_return; - target_ip_dropped = 1; if (src_is_directory) { /* @@ -422,10 +320,6 @@ xfs_rename( if (error) goto abort_return; } - - /* Do this test while we still hold the locks */ - target_link_zero = (target_ip)->i_d.di_nlink==0; - } /* target_ip != NULL */ /* @@ -492,15 +386,6 @@ xfs_rename( } /* - * If there was a target inode, take an extra reference on - * it here so that it doesn't go to xfs_inactive() from - * within the commit. - */ - if (target_ip != NULL) { - IHOLD(target_ip); - } - - /* * If this is a synchronous mount, make sure that the * rename transaction goes to disk before returning to * the user. @@ -509,30 +394,11 @@ xfs_rename( xfs_trans_set_sync(tp); } - /* - * Take refs. for vop_link_removed calls below. No need to worry - * about directory refs. because the caller holds them. - * - * Do holds before the xfs_bmap_finish since it might rele them down - * to zero. - */ - - if (target_ip_dropped) - IHOLD(target_ip); - IHOLD(src_ip); - error = xfs_bmap_finish(&tp, &free_list, &committed); if (error) { xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); - if (target_ip != NULL) { - IRELE(target_ip); - } - if (target_ip_dropped) { - IRELE(target_ip); - } - IRELE(src_ip); goto std_return; } @@ -541,15 +407,6 @@ xfs_rename( * the vnode references. */ error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - if (target_ip != NULL) - IRELE(target_ip); - /* - * Let interposed file systems know about removed links. - */ - if (target_ip_dropped) - IRELE(target_ip); - - IRELE(src_ip); /* Fall through to std_return with error = 0 or errno from * xfs_trans_commit */ @@ -571,11 +428,4 @@ std_return: xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp, cancel_flags); goto std_return; - - rele_return: - IRELE(src_ip); - if (target_ip != NULL) { - IRELE(target_ip); - } - goto std_return; } diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index b8db1d5cde5..4c70bf5e998 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -111,13 +111,13 @@ xfs_trans_iget( */ ASSERT(ip->i_itemp != NULL); ASSERT(lock_flags & XFS_ILOCK_EXCL); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) || - ismrlocked(&ip->i_iolock, MR_UPDATE)); + xfs_isilocked(ip, XFS_IOLOCK_EXCL)); ASSERT((!(lock_flags & XFS_IOLOCK_EXCL)) || (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_EXCL)); ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) || - ismrlocked(&ip->i_iolock, (MR_UPDATE | MR_ACCESS))); + xfs_isilocked(ip, XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)); ASSERT((!(lock_flags & XFS_IOLOCK_SHARED)) || (ip->i_itemp->ili_flags & XFS_ILI_IOLOCKED_ANY)); @@ -185,7 +185,7 @@ xfs_trans_ijoin( xfs_inode_log_item_t *iip; ASSERT(ip->i_transp == NULL); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(lock_flags & XFS_ILOCK_EXCL); if (ip->i_itemp == NULL) xfs_inode_item_init(ip, ip->i_mount); @@ -232,7 +232,7 @@ xfs_trans_ihold( { ASSERT(ip->i_transp == tp); ASSERT(ip->i_itemp != NULL); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ip->i_itemp->ili_flags |= XFS_ILI_HOLD; } @@ -257,7 +257,7 @@ xfs_trans_log_inode( ASSERT(ip->i_transp == tp); ASSERT(ip->i_itemp != NULL); - ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); lidp = xfs_trans_find_item(tp, (xfs_log_item_t*)(ip->i_itemp)); ASSERT(lidp != NULL); diff --git a/fs/xfs/xfs_utils.c b/fs/xfs/xfs_utils.c index 2b8dc7e4077..98e5f110ba5 100644 --- a/fs/xfs/xfs_utils.c +++ b/fs/xfs/xfs_utils.c @@ -41,49 +41,6 @@ #include "xfs_utils.h" -int -xfs_dir_lookup_int( - xfs_inode_t *dp, - uint lock_mode, - struct xfs_name *name, - xfs_ino_t *inum, - xfs_inode_t **ipp) -{ - int error; - - xfs_itrace_entry(dp); - - error = xfs_dir_lookup(NULL, dp, name, inum); - if (!error) { - /* - * Unlock the directory. We do this because we can't - * hold the directory lock while doing the vn_get() - * in xfs_iget(). Doing so could cause us to hold - * a lock while waiting for the inode to finish - * being inactive while it's waiting for a log - * reservation in the inactive routine. - */ - xfs_iunlock(dp, lock_mode); - error = xfs_iget(dp->i_mount, NULL, *inum, 0, 0, ipp, 0); - xfs_ilock(dp, lock_mode); - - if (error) { - *ipp = NULL; - } else if ((*ipp)->i_d.di_mode == 0) { - /* - * The inode has been freed. Something is - * wrong so just get out of here. - */ - xfs_iunlock(dp, lock_mode); - xfs_iput_new(*ipp, 0); - *ipp = NULL; - xfs_ilock(dp, lock_mode); - error = XFS_ERROR(ENOENT); - } - } - return error; -} - /* * Allocates a new inode from disk and return a pointer to the * incore copy. This routine will internally commit the current @@ -310,7 +267,7 @@ xfs_bump_ino_vers2( { xfs_mount_t *mp; - ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ASSERT(ip->i_d.di_version == XFS_DINODE_VERSION_1); ip->i_d.di_version = XFS_DINODE_VERSION_2; diff --git a/fs/xfs/xfs_utils.h b/fs/xfs/xfs_utils.h index 175b126d2ca..f316cb85d8e 100644 --- a/fs/xfs/xfs_utils.h +++ b/fs/xfs/xfs_utils.h @@ -21,8 +21,6 @@ #define IRELE(ip) VN_RELE(XFS_ITOV(ip)) #define IHOLD(ip) VN_HOLD(XFS_ITOV(ip)) -extern int xfs_dir_lookup_int(xfs_inode_t *, uint, struct xfs_name *, - xfs_ino_t *, xfs_inode_t **); extern int xfs_truncate_file(xfs_mount_t *, xfs_inode_t *); extern int xfs_dir_ialloc(xfs_trans_t **, xfs_inode_t *, mode_t, xfs_nlink_t, xfs_dev_t, cred_t *, prid_t, int, diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index fc48158fe47..30bacd8bb0e 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -186,6 +186,7 @@ xfs_cleanup(void) kmem_zone_destroy(xfs_efi_zone); kmem_zone_destroy(xfs_ifork_zone); kmem_zone_destroy(xfs_ili_zone); + kmem_zone_destroy(xfs_log_ticket_zone); } /* diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 6650601c64f..70702a60b4b 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -76,132 +76,6 @@ xfs_open( } /* - * xfs_getattr - */ -int -xfs_getattr( - xfs_inode_t *ip, - bhv_vattr_t *vap, - int flags) -{ - bhv_vnode_t *vp = XFS_ITOV(ip); - xfs_mount_t *mp = ip->i_mount; - - xfs_itrace_entry(ip); - - if (XFS_FORCED_SHUTDOWN(mp)) - return XFS_ERROR(EIO); - - if (!(flags & ATTR_LAZY)) - xfs_ilock(ip, XFS_ILOCK_SHARED); - - vap->va_size = XFS_ISIZE(ip); - if (vap->va_mask == XFS_AT_SIZE) - goto all_done; - - vap->va_nblocks = - XFS_FSB_TO_BB(mp, ip->i_d.di_nblocks + ip->i_delayed_blks); - vap->va_nodeid = ip->i_ino; -#if XFS_BIG_INUMS - vap->va_nodeid += mp->m_inoadd; -#endif - vap->va_nlink = ip->i_d.di_nlink; - - /* - * Quick exit for non-stat callers - */ - if ((vap->va_mask & - ~(XFS_AT_SIZE|XFS_AT_FSID|XFS_AT_NODEID| - XFS_AT_NLINK|XFS_AT_BLKSIZE)) == 0) - goto all_done; - - /* - * Copy from in-core inode. - */ - vap->va_mode = ip->i_d.di_mode; - vap->va_uid = ip->i_d.di_uid; - vap->va_gid = ip->i_d.di_gid; - vap->va_projid = ip->i_d.di_projid; - - /* - * Check vnode type block/char vs. everything else. - */ - switch (ip->i_d.di_mode & S_IFMT) { - case S_IFBLK: - case S_IFCHR: - vap->va_rdev = ip->i_df.if_u2.if_rdev; - vap->va_blocksize = BLKDEV_IOSIZE; - break; - default: - vap->va_rdev = 0; - - if (!(XFS_IS_REALTIME_INODE(ip))) { - vap->va_blocksize = xfs_preferred_iosize(mp); - } else { - - /* - * If the file blocks are being allocated from a - * realtime partition, then return the inode's - * realtime extent size or the realtime volume's - * extent size. - */ - vap->va_blocksize = - xfs_get_extsz_hint(ip) << mp->m_sb.sb_blocklog; - } - break; - } - - vn_atime_to_timespec(vp, &vap->va_atime); - vap->va_mtime.tv_sec = ip->i_d.di_mtime.t_sec; - vap->va_mtime.tv_nsec = ip->i_d.di_mtime.t_nsec; - vap->va_ctime.tv_sec = ip->i_d.di_ctime.t_sec; - vap->va_ctime.tv_nsec = ip->i_d.di_ctime.t_nsec; - - /* - * Exit for stat callers. See if any of the rest of the fields - * to be filled in are needed. - */ - if ((vap->va_mask & - (XFS_AT_XFLAGS|XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS| - XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0) - goto all_done; - - /* - * Convert di_flags to xflags. - */ - vap->va_xflags = xfs_ip2xflags(ip); - - /* - * Exit for inode revalidate. See if any of the rest of - * the fields to be filled in are needed. - */ - if ((vap->va_mask & - (XFS_AT_EXTSIZE|XFS_AT_NEXTENTS|XFS_AT_ANEXTENTS| - XFS_AT_GENCOUNT|XFS_AT_VCODE)) == 0) - goto all_done; - - vap->va_extsize = ip->i_d.di_extsize << mp->m_sb.sb_blocklog; - vap->va_nextents = - (ip->i_df.if_flags & XFS_IFEXTENTS) ? - ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) : - ip->i_d.di_nextents; - if (ip->i_afp) - vap->va_anextents = - (ip->i_afp->if_flags & XFS_IFEXTENTS) ? - ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) : - ip->i_d.di_anextents; - else - vap->va_anextents = 0; - vap->va_gen = ip->i_d.di_gen; - - all_done: - if (!(flags & ATTR_LAZY)) - xfs_iunlock(ip, XFS_ILOCK_SHARED); - return 0; -} - - -/* * xfs_setattr */ int @@ -211,7 +85,6 @@ xfs_setattr( int flags, cred_t *credp) { - bhv_vnode_t *vp = XFS_ITOV(ip); xfs_mount_t *mp = ip->i_mount; xfs_trans_t *tp; int mask; @@ -222,7 +95,6 @@ xfs_setattr( gid_t gid=0, igid=0; int timeflags = 0; xfs_prid_t projid=0, iprojid=0; - int mandlock_before, mandlock_after; struct xfs_dquot *udqp, *gdqp, *olddquot1, *olddquot2; int file_owner; int need_iolock = 1; @@ -383,7 +255,7 @@ xfs_setattr( m |= S_ISGID; #if 0 /* Linux allows this, Irix doesn't. */ - if ((vap->va_mode & S_ISVTX) && !VN_ISDIR(vp)) + if ((vap->va_mode & S_ISVTX) && !S_ISDIR(ip->i_d.di_mode)) m |= S_ISVTX; #endif if (m && !capable(CAP_FSETID)) @@ -461,10 +333,10 @@ xfs_setattr( goto error_return; } - if (VN_ISDIR(vp)) { + if (S_ISDIR(ip->i_d.di_mode)) { code = XFS_ERROR(EISDIR); goto error_return; - } else if (!VN_ISREG(vp)) { + } else if (!S_ISREG(ip->i_d.di_mode)) { code = XFS_ERROR(EINVAL); goto error_return; } @@ -626,9 +498,6 @@ xfs_setattr( xfs_trans_ihold(tp, ip); } - /* determine whether mandatory locking mode changes */ - mandlock_before = MANDLOCK(vp, ip->i_d.di_mode); - /* * Truncate file. Must have write permission and not be a directory. */ @@ -858,13 +727,6 @@ xfs_setattr( code = xfs_trans_commit(tp, commit_flags); } - /* - * If the (regular) file's mandatory locking mode changed, then - * notify the vnode. We do this under the inode lock to prevent - * racing calls to vop_vnode_change. - */ - mandlock_after = MANDLOCK(vp, ip->i_d.di_mode); - xfs_iunlock(ip, lock_flags); /* @@ -1443,7 +1305,7 @@ xfs_inactive_attrs( int error; xfs_mount_t *mp; - ASSERT(ismrlocked(&ip->i_iolock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); tp = *tpp; mp = ip->i_mount; ASSERT(ip->i_d.di_forkoff != 0); @@ -1491,7 +1353,7 @@ xfs_release( xfs_mount_t *mp = ip->i_mount; int error; - if (!VN_ISREG(vp) || (ip->i_d.di_mode == 0)) + if (!S_ISREG(ip->i_d.di_mode) || (ip->i_d.di_mode == 0)) return 0; /* If this is a read-only mount, don't do this (would generate I/O) */ @@ -1774,8 +1636,7 @@ xfs_lookup( struct xfs_name *name, xfs_inode_t **ipp) { - xfs_inode_t *ip; - xfs_ino_t e_inum; + xfs_ino_t inum; int error; uint lock_mode; @@ -1785,12 +1646,21 @@ xfs_lookup( return XFS_ERROR(EIO); lock_mode = xfs_ilock_map_shared(dp); - error = xfs_dir_lookup_int(dp, lock_mode, name, &e_inum, &ip); - if (!error) { - *ipp = ip; - xfs_itrace_ref(ip); - } + error = xfs_dir_lookup(NULL, dp, name, &inum); xfs_iunlock_map_shared(dp, lock_mode); + + if (error) + goto out; + + error = xfs_iget(dp->i_mount, NULL, inum, 0, 0, ipp, 0); + if (error) + goto out; + + xfs_itrace_ref(*ipp); + return 0; + + out: + *ipp = NULL; return error; } @@ -1906,7 +1776,7 @@ xfs_create( * It is locked (and joined to the transaction). */ - ASSERT(ismrlocked (&ip->i_lock, MR_UPDATE)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); /* * Now we join the directory inode to the transaction. We do not do it @@ -2112,7 +1982,7 @@ again: ips[0] = ip; ips[1] = dp; - xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL); + xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL); } /* else e_inum == dp->i_ino */ /* This can happen if we're asked to lock /x/.. @@ -2160,7 +2030,6 @@ void xfs_lock_inodes( xfs_inode_t **ips, int inodes, - int first_locked, uint lock_mode) { int attempts = 0, i, j, try_lock; @@ -2168,13 +2037,8 @@ xfs_lock_inodes( ASSERT(ips && (inodes >= 2)); /* we need at least two */ - if (first_locked) { - try_lock = 1; - i = 1; - } else { - try_lock = 0; - i = 0; - } + try_lock = 0; + i = 0; again: for (; i < inodes; i++) { @@ -2298,29 +2162,14 @@ xfs_remove( return error; } - /* - * We need to get a reference to ip before we get our log - * reservation. The reason for this is that we cannot call - * xfs_iget for an inode for which we do not have a reference - * once we've acquired a log reservation. This is because the - * inode we are trying to get might be in xfs_inactive going - * for a log reservation. Since we'll have to wait for the - * inactive code to complete before returning from xfs_iget, - * we need to make sure that we don't have log space reserved - * when we call xfs_iget. Instead we get an unlocked reference - * to the inode before getting our log reservation. - */ - IHOLD(ip); - xfs_itrace_entry(ip); xfs_itrace_ref(ip); error = XFS_QM_DQATTACH(mp, dp, 0); - if (!error && dp != ip) + if (!error) error = XFS_QM_DQATTACH(mp, ip, 0); if (error) { REMOVE_DEBUG_TRACE(__LINE__); - IRELE(ip); goto std_return; } @@ -2347,7 +2196,6 @@ xfs_remove( ASSERT(error != ENOSPC); REMOVE_DEBUG_TRACE(__LINE__); xfs_trans_cancel(tp, 0); - IRELE(ip); return error; } @@ -2355,7 +2203,6 @@ xfs_remove( if (error) { REMOVE_DEBUG_TRACE(__LINE__); xfs_trans_cancel(tp, cancel_flags); - IRELE(ip); goto std_return; } @@ -2363,23 +2210,18 @@ xfs_remove( * At this point, we've gotten both the directory and the entry * inodes locked. */ + IHOLD(ip); xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - if (dp != ip) { - /* - * Increment vnode ref count only in this case since - * there's an extra vnode reference in the case where - * dp == ip. - */ - IHOLD(dp); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - } + + IHOLD(dp); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* * Entry must exist since we did a lookup in xfs_lock_dir_and_entry. */ XFS_BMAP_INIT(&free_list, &first_block); error = xfs_dir_removename(tp, dp, name, ip->i_ino, - &first_block, &free_list, 0); + &first_block, &free_list, resblks); if (error) { ASSERT(error != ENOENT); REMOVE_DEBUG_TRACE(__LINE__); @@ -2402,12 +2244,6 @@ xfs_remove( link_zero = (ip)->i_d.di_nlink==0; /* - * Take an extra ref on the inode so that it doesn't - * go to xfs_inactive() from within the commit. - */ - IHOLD(ip); - - /* * If this is a synchronous mount, make sure that the * remove transaction goes to disk before returning to * the user. @@ -2423,10 +2259,8 @@ xfs_remove( } error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); - if (error) { - IRELE(ip); + if (error) goto std_return; - } /* * If we are using filestreams, kill the stream association. @@ -2438,7 +2272,6 @@ xfs_remove( xfs_filestream_deassociate(ip); xfs_itrace_exit(ip); - IRELE(ip); /* Fall through to std_return with error = 0 */ std_return: @@ -2467,8 +2300,6 @@ xfs_remove( cancel_flags |= XFS_TRANS_ABORT; xfs_trans_cancel(tp, cancel_flags); - IRELE(ip); - goto std_return; } @@ -2536,7 +2367,7 @@ xfs_link( ips[1] = sip; } - xfs_lock_inodes(ips, 2, 0, XFS_ILOCK_EXCL); + xfs_lock_inodes(ips, 2, XFS_ILOCK_EXCL); /* * Increment vnode ref counts since xfs_trans_commit & @@ -2840,7 +2671,6 @@ xfs_rmdir( struct xfs_name *name, xfs_inode_t *cdp) { - bhv_vnode_t *dir_vp = XFS_ITOV(dp); xfs_mount_t *mp = dp->i_mount; xfs_trans_t *tp; int error; @@ -2866,27 +2696,12 @@ xfs_rmdir( } /* - * We need to get a reference to cdp before we get our log - * reservation. The reason for this is that we cannot call - * xfs_iget for an inode for which we do not have a reference - * once we've acquired a log reservation. This is because the - * inode we are trying to get might be in xfs_inactive going - * for a log reservation. Since we'll have to wait for the - * inactive code to complete before returning from xfs_iget, - * we need to make sure that we don't have log space reserved - * when we call xfs_iget. Instead we get an unlocked reference - * to the inode before getting our log reservation. - */ - IHOLD(cdp); - - /* * Get the dquots for the inodes. */ error = XFS_QM_DQATTACH(mp, dp, 0); - if (!error && dp != cdp) + if (!error) error = XFS_QM_DQATTACH(mp, cdp, 0); if (error) { - IRELE(cdp); REMOVE_DEBUG_TRACE(__LINE__); goto std_return; } @@ -2913,7 +2728,6 @@ xfs_rmdir( if (error) { ASSERT(error != ENOSPC); cancel_flags = 0; - IRELE(cdp); goto error_return; } XFS_BMAP_INIT(&free_list, &first_block); @@ -2927,21 +2741,13 @@ xfs_rmdir( error = xfs_lock_dir_and_entry(dp, cdp); if (error) { xfs_trans_cancel(tp, cancel_flags); - IRELE(cdp); goto std_return; } + IHOLD(dp); xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); - if (dp != cdp) { - /* - * Only increment the parent directory vnode count if - * we didn't bump it in looking up cdp. The only time - * we don't bump it is when we're looking up ".". - */ - VN_HOLD(dir_vp); - } - xfs_itrace_ref(cdp); + IHOLD(cdp); xfs_trans_ijoin(tp, cdp, XFS_ILOCK_EXCL); ASSERT(cdp->i_d.di_nlink >= 2); @@ -2995,12 +2801,6 @@ xfs_rmdir( last_cdp_link = (cdp)->i_d.di_nlink==0; /* - * Take an extra ref on the child vnode so that it - * does not go to xfs_inactive() from within the commit. - */ - IHOLD(cdp); - - /* * If this is a synchronous mount, make sure that the * rmdir transaction goes to disk before returning to * the user. @@ -3014,19 +2814,15 @@ xfs_rmdir( xfs_bmap_cancel(&free_list); xfs_trans_cancel(tp, (XFS_TRANS_RELEASE_LOG_RES | XFS_TRANS_ABORT)); - IRELE(cdp); goto std_return; } error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES); if (error) { - IRELE(cdp); goto std_return; } - IRELE(cdp); - /* Fall through to std_return with error = 0 or the errno * from xfs_trans_commit. */ std_return: diff --git a/fs/xfs/xfs_vnodeops.h b/fs/xfs/xfs_vnodeops.h index 24c53923dc2..8abe8f186e2 100644 --- a/fs/xfs/xfs_vnodeops.h +++ b/fs/xfs/xfs_vnodeops.h @@ -15,7 +15,6 @@ struct xfs_iomap; int xfs_open(struct xfs_inode *ip); -int xfs_getattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags); int xfs_setattr(struct xfs_inode *ip, struct bhv_vattr *vap, int flags, struct cred *credp); int xfs_readlink(struct xfs_inode *ip, char *link); @@ -48,9 +47,9 @@ int xfs_change_file_space(struct xfs_inode *ip, int cmd, struct cred *credp, int attr_flags); int xfs_rename(struct xfs_inode *src_dp, struct xfs_name *src_name, struct xfs_inode *src_ip, struct xfs_inode *target_dp, - struct xfs_name *target_name); + struct xfs_name *target_name, struct xfs_inode *target_ip); int xfs_attr_get(struct xfs_inode *ip, const char *name, char *value, - int *valuelenp, int flags, cred_t *cred); + int *valuelenp, int flags); int xfs_attr_set(struct xfs_inode *dp, const char *name, char *value, int valuelen, int flags); int xfs_attr_remove(struct xfs_inode *dp, const char *name, int flags); @@ -61,9 +60,6 @@ int xfs_ioctl(struct xfs_inode *ip, struct file *filp, ssize_t xfs_read(struct xfs_inode *ip, struct kiocb *iocb, const struct iovec *iovp, unsigned int segs, loff_t *offset, int ioflags); -ssize_t xfs_sendfile(struct xfs_inode *ip, struct file *filp, - loff_t *offset, int ioflags, size_t count, - read_actor_t actor, void *target); ssize_t xfs_splice_read(struct xfs_inode *ip, struct file *infilp, loff_t *ppos, struct pipe_inode_info *pipe, size_t count, int flags, int ioflags); |