diff options
Diffstat (limited to 'fs')
67 files changed, 7974 insertions, 1318 deletions
diff --git a/fs/Kconfig b/fs/Kconfig index f54a157a029..9e9d70c02a0 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -220,17 +220,16 @@ config JBD tristate help This is a generic journalling layer for block devices. It is - currently used by the ext3 and OCFS2 file systems, but it could - also be used to add journal support to other file systems or block + currently used by the ext3 file system, but it could also be + used to add journal support to other file systems or block devices such as RAID or LVM. - If you are using the ext3 or OCFS2 file systems, you need to - say Y here. If you are not using ext3 OCFS2 then you will probably - want to say N. + If you are using the ext3 file system, you need to say Y here. + If you are not using ext3 then you will probably want to say N. To compile this device as a module, choose M here: the module will be - called jbd. If you are compiling ext3 or OCFS2 into the kernel, - you cannot compile this code as a module. + called jbd. If you are compiling ext3 into the kernel, you + cannot compile this code as a module. config JBD_DEBUG bool "JBD (ext3) debugging support" @@ -254,15 +253,16 @@ config JBD2 help This is a generic journaling layer for block devices that support both 32-bit and 64-bit block numbers. It is currently used by - the ext4 filesystem, but it could also be used to add + the ext4 and OCFS2 filesystems, but it could also be used to add journal support to other file systems or block devices such as RAID or LVM. - If you are using ext4, you need to say Y here. If you are not - using ext4 then you will probably want to say N. + If you are using ext4 or OCFS2, you need to say Y here. + If you are not using ext4 or OCFS2 then you will + probably want to say N. To compile this device as a module, choose M here. The module will be - called jbd2. If you are compiling ext4 into the kernel, + called jbd2. If you are compiling ext4 or OCFS2 into the kernel, you cannot compile this code as a module. config JBD2_DEBUG @@ -433,6 +433,14 @@ config FS_POSIX_ACL bool default n +config FILE_LOCKING + bool "Enable POSIX file locking API" if EMBEDDED + default y + help + This option enables standard file locking support, required + for filesystems like NFS and for the flock() system + call. Disabling this option saves about 11k. + source "fs/xfs/Kconfig" source "fs/gfs2/Kconfig" @@ -440,7 +448,7 @@ config OCFS2_FS tristate "OCFS2 file system support" depends on NET && SYSFS select CONFIGFS_FS - select JBD + select JBD2 select CRC32 help OCFS2 is a general purpose extent based shared disk cluster file @@ -511,6 +519,16 @@ config OCFS2_DEBUG_FS this option for debugging only as it is likely to decrease performance of the filesystem. +config OCFS2_COMPAT_JBD + bool "Use JBD for compatibility" + depends on OCFS2_FS + default n + select JBD + help + The ocfs2 filesystem now uses JBD2 for its journalling. JBD2 + is backwards compatible with JBD. It is safe to say N here. + However, if you really want to use the original JBD, say Y here. + endif # BLOCK config DNOTIFY @@ -1779,6 +1797,28 @@ config SUNRPC_XPRT_RDMA If unsure, say N. +config SUNRPC_REGISTER_V4 + bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)" + depends on SUNRPC && EXPERIMENTAL + default n + help + Sun added support for registering RPC services at an IPv6 + address by creating two new versions of the rpcbind protocol + (RFC 1833). + + This option enables support in the kernel RPC server for + registering kernel RPC services via version 4 of the rpcbind + protocol. If you enable this option, you must run a portmapper + daemon that supports rpcbind protocol version 4. + + Serving NFS over IPv6 from knfsd (the kernel's NFS server) + requires that you enable this option and use a portmapper that + supports rpcbind version 4. + + If unsure, say N to get traditional behavior (register kernel + RPC services using only rpcbind version 2). Distributions + using the legacy Linux portmapper daemon must say N here. + config RPCSEC_GSS_KRB5 tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)" depends on SUNRPC && EXPERIMENTAL diff --git a/fs/Makefile b/fs/Makefile index de404b00eb0..b6f27dc26b7 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -7,7 +7,7 @@ obj-y := open.o read_write.o file_table.o super.o \ char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \ - ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ + ioctl.o readdir.o select.o fifo.o dcache.o inode.o \ attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ seq_file.o xattr.o libfs.o fs-writeback.o \ pnode.o drop_caches.o splice.o sync.o utimes.o \ @@ -27,6 +27,7 @@ obj-$(CONFIG_ANON_INODES) += anon_inodes.o obj-$(CONFIG_SIGNALFD) += signalfd.o obj-$(CONFIG_TIMERFD) += timerfd.o obj-$(CONFIG_EVENTFD) += eventfd.o +obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o nfsd-$(CONFIG_NFSD) := nfsctl.o diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile index 7725a0a9a55..97f6073ab33 100644 --- a/fs/lockd/Makefile +++ b/fs/lockd/Makefile @@ -5,6 +5,6 @@ obj-$(CONFIG_LOCKD) += lockd.o lockd-objs-y := clntlock.o clntproc.o host.o svc.o svclock.o svcshare.o \ - svcproc.o svcsubs.o mon.o xdr.o + svcproc.o svcsubs.o mon.o xdr.o grace.o lockd-objs-$(CONFIG_LOCKD_V4) += xdr4.o svc4proc.o lockd-objs := $(lockd-objs-y) diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 0b45fd3a4bf..8307dd64bf4 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -54,14 +54,13 @@ struct nlm_host *nlmclnt_init(const struct nlmclnt_initdata *nlm_init) u32 nlm_version = (nlm_init->nfs_version == 2) ? 1 : 4; int status; - status = lockd_up(nlm_init->protocol); + status = lockd_up(); if (status < 0) return ERR_PTR(status); - host = nlmclnt_lookup_host((struct sockaddr_in *)nlm_init->address, + host = nlmclnt_lookup_host(nlm_init->address, nlm_init->addrlen, nlm_init->protocol, nlm_version, - nlm_init->hostname, - strlen(nlm_init->hostname)); + nlm_init->hostname); if (host == NULL) { lockd_down(); return ERR_PTR(-ENOLCK); @@ -142,7 +141,7 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) /* * The server lockd has called us back to tell us the lock was granted */ -__be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock) +__be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) { const struct file_lock *fl = &lock->fl; const struct nfs_fh *fh = &lock->fh; @@ -166,7 +165,7 @@ __be32 nlmclnt_grant(const struct sockaddr_in *addr, const struct nlm_lock *lock */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; - if (!nlm_cmp_addr(&block->b_host->h_addr, addr)) + if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) continue; if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; @@ -216,7 +215,7 @@ reclaimer(void *ptr) /* This one ensures that our parent doesn't terminate while the * reclaim is in progress */ lock_kernel(); - lockd_up(0); /* note: this cannot fail as lockd is already running */ + lockd_up(); /* note: this cannot fail as lockd is already running */ dprintk("lockd: reclaiming locks for host %s\n", host->h_name); diff --git a/fs/lockd/grace.c b/fs/lockd/grace.c new file mode 100644 index 00000000000..183cc1f0af1 --- /dev/null +++ b/fs/lockd/grace.c @@ -0,0 +1,59 @@ +/* + * Common code for control of lockd and nfsv4 grace periods. + */ + +#include <linux/module.h> +#include <linux/lockd/bind.h> + +static LIST_HEAD(grace_list); +static DEFINE_SPINLOCK(grace_lock); + +/** + * locks_start_grace + * @lm: who this grace period is for + * + * A grace period is a period during which locks should not be given + * out. Currently grace periods are only enforced by the two lock + * managers (lockd and nfsd), using the locks_in_grace() function to + * check when they are in a grace period. + * + * This function is called to start a grace period. + */ +void locks_start_grace(struct lock_manager *lm) +{ + spin_lock(&grace_lock); + list_add(&lm->list, &grace_list); + spin_unlock(&grace_lock); +} +EXPORT_SYMBOL_GPL(locks_start_grace); + +/** + * locks_end_grace + * @lm: who this grace period is for + * + * Call this function to state that the given lock manager is ready to + * resume regular locking. The grace period will not end until all lock + * managers that called locks_start_grace() also call locks_end_grace(). + * Note that callers count on it being safe to call this more than once, + * and the second call should be a no-op. + */ +void locks_end_grace(struct lock_manager *lm) +{ + spin_lock(&grace_lock); + list_del_init(&lm->list); + spin_unlock(&grace_lock); +} +EXPORT_SYMBOL_GPL(locks_end_grace); + +/** + * locks_in_grace + * + * Lock managers call this function to determine when it is OK for them + * to answer ordinary lock requests, and when they should accept only + * lock reclaims. + */ +int locks_in_grace(void) +{ + return !list_empty(&grace_list); +} +EXPORT_SYMBOL_GPL(locks_in_grace); diff --git a/fs/lockd/host.c b/fs/lockd/host.c index a17664c7eac..9fd8889097b 100644 --- a/fs/lockd/host.c +++ b/fs/lockd/host.c @@ -11,16 +11,17 @@ #include <linux/types.h> #include <linux/slab.h> #include <linux/in.h> +#include <linux/in6.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/svc.h> #include <linux/lockd/lockd.h> #include <linux/lockd/sm_inter.h> #include <linux/mutex.h> +#include <net/ipv6.h> #define NLMDBG_FACILITY NLMDBG_HOSTCACHE #define NLM_HOST_NRHASH 32 -#define NLM_ADDRHASH(addr) (ntohl(addr) & (NLM_HOST_NRHASH-1)) #define NLM_HOST_REBIND (60 * HZ) #define NLM_HOST_EXPIRE (300 * HZ) #define NLM_HOST_COLLECT (120 * HZ) @@ -30,42 +31,115 @@ static unsigned long next_gc; static int nrhosts; static DEFINE_MUTEX(nlm_host_mutex); - static void nlm_gc_hosts(void); -static struct nsm_handle * __nsm_find(const struct sockaddr_in *, - const char *, unsigned int, int); -static struct nsm_handle * nsm_find(const struct sockaddr_in *sin, - const char *hostname, - unsigned int hostname_len); +static struct nsm_handle *nsm_find(const struct sockaddr *sap, + const size_t salen, + const char *hostname, + const size_t hostname_len, + const int create); + +struct nlm_lookup_host_info { + const int server; /* search for server|client */ + const struct sockaddr *sap; /* address to search for */ + const size_t salen; /* it's length */ + const unsigned short protocol; /* transport to search for*/ + const u32 version; /* NLM version to search for */ + const char *hostname; /* remote's hostname */ + const size_t hostname_len; /* it's length */ + const struct sockaddr *src_sap; /* our address (optional) */ + const size_t src_len; /* it's length */ +}; + +/* + * Hash function must work well on big- and little-endian platforms + */ +static unsigned int __nlm_hash32(const __be32 n) +{ + unsigned int hash = (__force u32)n ^ ((__force u32)n >> 16); + return hash ^ (hash >> 8); +} + +static unsigned int __nlm_hash_addr4(const struct sockaddr *sap) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + return __nlm_hash32(sin->sin_addr.s_addr); +} + +static unsigned int __nlm_hash_addr6(const struct sockaddr *sap) +{ + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + const struct in6_addr addr = sin6->sin6_addr; + return __nlm_hash32(addr.s6_addr32[0]) ^ + __nlm_hash32(addr.s6_addr32[1]) ^ + __nlm_hash32(addr.s6_addr32[2]) ^ + __nlm_hash32(addr.s6_addr32[3]); +} + +static unsigned int nlm_hash_address(const struct sockaddr *sap) +{ + unsigned int hash; + + switch (sap->sa_family) { + case AF_INET: + hash = __nlm_hash_addr4(sap); + break; + case AF_INET6: + hash = __nlm_hash_addr6(sap); + break; + default: + hash = 0; + } + return hash & (NLM_HOST_NRHASH - 1); +} + +static void nlm_clear_port(struct sockaddr *sap) +{ + switch (sap->sa_family) { + case AF_INET: + ((struct sockaddr_in *)sap)->sin_port = 0; + break; + case AF_INET6: + ((struct sockaddr_in6 *)sap)->sin6_port = 0; + break; + } +} + +static void nlm_display_address(const struct sockaddr *sap, + char *buf, const size_t len) +{ + const struct sockaddr_in *sin = (struct sockaddr_in *)sap; + const struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *)sap; + + switch (sap->sa_family) { + case AF_UNSPEC: + snprintf(buf, len, "unspecified"); + break; + case AF_INET: + snprintf(buf, len, NIPQUAD_FMT, NIPQUAD(sin->sin_addr.s_addr)); + break; + case AF_INET6: + if (ipv6_addr_v4mapped(&sin6->sin6_addr)) + snprintf(buf, len, NIPQUAD_FMT, + NIPQUAD(sin6->sin6_addr.s6_addr32[3])); + else + snprintf(buf, len, NIP6_FMT, NIP6(sin6->sin6_addr)); + break; + default: + snprintf(buf, len, "unsupported address family"); + break; + } +} /* * Common host lookup routine for server & client */ -static struct nlm_host *nlm_lookup_host(int server, - const struct sockaddr_in *sin, - int proto, u32 version, - const char *hostname, - unsigned int hostname_len, - const struct sockaddr_in *ssin) +static struct nlm_host *nlm_lookup_host(struct nlm_lookup_host_info *ni) { struct hlist_head *chain; struct hlist_node *pos; struct nlm_host *host; struct nsm_handle *nsm = NULL; - int hash; - - dprintk("lockd: nlm_lookup_host("NIPQUAD_FMT"->"NIPQUAD_FMT - ", p=%d, v=%u, my role=%s, name=%.*s)\n", - NIPQUAD(ssin->sin_addr.s_addr), - NIPQUAD(sin->sin_addr.s_addr), proto, version, - server? "server" : "client", - hostname_len, - hostname? hostname : "<none>"); - - hash = NLM_ADDRHASH(sin->sin_addr.s_addr); - - /* Lock hash table */ mutex_lock(&nlm_host_mutex); if (time_after_eq(jiffies, next_gc)) @@ -78,22 +152,22 @@ static struct nlm_host *nlm_lookup_host(int server, * different NLM rpc_clients into one single nlm_host object. * This would allow us to have one nlm_host per address. */ - chain = &nlm_hosts[hash]; + chain = &nlm_hosts[nlm_hash_address(ni->sap)]; hlist_for_each_entry(host, pos, chain, h_hash) { - if (!nlm_cmp_addr(&host->h_addr, sin)) + if (!nlm_cmp_addr(nlm_addr(host), ni->sap)) continue; /* See if we have an NSM handle for this client */ if (!nsm) nsm = host->h_nsmhandle; - if (host->h_proto != proto) + if (host->h_proto != ni->protocol) continue; - if (host->h_version != version) + if (host->h_version != ni->version) continue; - if (host->h_server != server) + if (host->h_server != ni->server) continue; - if (!nlm_cmp_addr(&host->h_saddr, ssin)) + if (!nlm_cmp_addr(nlm_srcaddr(host), ni->src_sap)) continue; /* Move to head of hash chain. */ @@ -101,30 +175,41 @@ static struct nlm_host *nlm_lookup_host(int server, hlist_add_head(&host->h_hash, chain); nlm_get_host(host); + dprintk("lockd: nlm_lookup_host found host %s (%s)\n", + host->h_name, host->h_addrbuf); goto out; } - if (nsm) - atomic_inc(&nsm->sm_count); - - host = NULL; - /* Sadly, the host isn't in our hash table yet. See if - * we have an NSM handle for it. If not, create one. + /* + * The host wasn't in our hash table. If we don't + * have an NSM handle for it yet, create one. */ - if (!nsm && !(nsm = nsm_find(sin, hostname, hostname_len))) - goto out; + if (nsm) + atomic_inc(&nsm->sm_count); + else { + host = NULL; + nsm = nsm_find(ni->sap, ni->salen, + ni->hostname, ni->hostname_len, 1); + if (!nsm) { + dprintk("lockd: nlm_lookup_host failed; " + "no nsm handle\n"); + goto out; + } + } host = kzalloc(sizeof(*host), GFP_KERNEL); if (!host) { nsm_release(nsm); + dprintk("lockd: nlm_lookup_host failed; no memory\n"); goto out; } host->h_name = nsm->sm_name; - host->h_addr = *sin; - host->h_addr.sin_port = 0; /* ouch! */ - host->h_saddr = *ssin; - host->h_version = version; - host->h_proto = proto; + memcpy(nlm_addr(host), ni->sap, ni->salen); + host->h_addrlen = ni->salen; + nlm_clear_port(nlm_addr(host)); + memcpy(nlm_srcaddr(host), ni->src_sap, ni->src_len); + host->h_version = ni->version; + host->h_proto = ni->protocol; host->h_rpcclnt = NULL; mutex_init(&host->h_mutex); host->h_nextrebind = jiffies + NLM_HOST_REBIND; @@ -135,7 +220,7 @@ static struct nlm_host *nlm_lookup_host(int server, host->h_state = 0; /* pseudo NSM state */ host->h_nsmstate = 0; /* real NSM state */ host->h_nsmhandle = nsm; - host->h_server = server; + host->h_server = ni->server; hlist_add_head(&host->h_hash, chain); INIT_LIST_HEAD(&host->h_lockowners); spin_lock_init(&host->h_lock); @@ -143,6 +228,15 @@ static struct nlm_host *nlm_lookup_host(int server, INIT_LIST_HEAD(&host->h_reclaim); nrhosts++; + + nlm_display_address((struct sockaddr *)&host->h_addr, + host->h_addrbuf, sizeof(host->h_addrbuf)); + nlm_display_address((struct sockaddr *)&host->h_srcaddr, + host->h_srcaddrbuf, sizeof(host->h_srcaddrbuf)); + + dprintk("lockd: nlm_lookup_host created host %s\n", + host->h_name); + out: mutex_unlock(&nlm_host_mutex); return host; @@ -170,33 +264,103 @@ nlm_destroy_host(struct nlm_host *host) kfree(host); } -/* - * Find an NLM server handle in the cache. If there is none, create it. +/** + * nlmclnt_lookup_host - Find an NLM host handle matching a remote server + * @sap: network address of server + * @salen: length of server address + * @protocol: transport protocol to use + * @version: NLM protocol version + * @hostname: '\0'-terminated hostname of server + * + * Returns an nlm_host structure that matches the passed-in + * [server address, transport protocol, NLM version, server hostname]. + * If one doesn't already exist in the host cache, a new handle is + * created and returned. */ -struct nlm_host *nlmclnt_lookup_host(const struct sockaddr_in *sin, - int proto, u32 version, - const char *hostname, - unsigned int hostname_len) +struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, + const size_t salen, + const unsigned short protocol, + const u32 version, const char *hostname) { - struct sockaddr_in ssin = {0}; - - return nlm_lookup_host(0, sin, proto, version, - hostname, hostname_len, &ssin); + const struct sockaddr source = { + .sa_family = AF_UNSPEC, + }; + struct nlm_lookup_host_info ni = { + .server = 0, + .sap = sap, + .salen = salen, + .protocol = protocol, + .version = version, + .hostname = hostname, + .hostname_len = strlen(hostname), + .src_sap = &source, + .src_len = sizeof(source), + }; + + dprintk("lockd: %s(host='%s', vers=%u, proto=%s)\n", __func__, + (hostname ? hostname : "<none>"), version, + (protocol == IPPROTO_UDP ? "udp" : "tcp")); + + return nlm_lookup_host(&ni); } -/* - * Find an NLM client handle in the cache. If there is none, create it. +/** + * nlmsvc_lookup_host - Find an NLM host handle matching a remote client + * @rqstp: incoming NLM request + * @hostname: name of client host + * @hostname_len: length of client hostname + * + * Returns an nlm_host structure that matches the [client address, + * transport protocol, NLM version, client hostname] of the passed-in + * NLM request. If one doesn't already exist in the host cache, a + * new handle is created and returned. + * + * Before possibly creating a new nlm_host, construct a sockaddr + * for a specific source address in case the local system has + * multiple network addresses. The family of the address in + * rq_daddr is guaranteed to be the same as the family of the + * address in rq_addr, so it's safe to use the same family for + * the source address. */ -struct nlm_host * -nlmsvc_lookup_host(struct svc_rqst *rqstp, - const char *hostname, unsigned int hostname_len) +struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, + const char *hostname, + const size_t hostname_len) { - struct sockaddr_in ssin = {0}; + struct sockaddr_in sin = { + .sin_family = AF_INET, + }; + struct sockaddr_in6 sin6 = { + .sin6_family = AF_INET6, + }; + struct nlm_lookup_host_info ni = { + .server = 1, + .sap = svc_addr(rqstp), + .salen = rqstp->rq_addrlen, + .protocol = rqstp->rq_prot, + .version = rqstp->rq_vers, + .hostname = hostname, + .hostname_len = hostname_len, + .src_len = rqstp->rq_addrlen, + }; + + dprintk("lockd: %s(host='%*s', vers=%u, proto=%s)\n", __func__, + (int)hostname_len, hostname, rqstp->rq_vers, + (rqstp->rq_prot == IPPROTO_UDP ? "udp" : "tcp")); + + switch (ni.sap->sa_family) { + case AF_INET: + sin.sin_addr.s_addr = rqstp->rq_daddr.addr.s_addr; + ni.src_sap = (struct sockaddr *)&sin; + break; + case AF_INET6: + ipv6_addr_copy(&sin6.sin6_addr, &rqstp->rq_daddr.addr6); + ni.src_sap = (struct sockaddr *)&sin6; + break; + default: + return NULL; + } - ssin.sin_addr = rqstp->rq_daddr.addr; - return nlm_lookup_host(1, svc_addr_in(rqstp), - rqstp->rq_prot, rqstp->rq_vers, - hostname, hostname_len, &ssin); + return nlm_lookup_host(&ni); } /* @@ -207,9 +371,8 @@ nlm_bind_host(struct nlm_host *host) { struct rpc_clnt *clnt; - dprintk("lockd: nlm_bind_host("NIPQUAD_FMT"->"NIPQUAD_FMT")\n", - NIPQUAD(host->h_saddr.sin_addr), - NIPQUAD(host->h_addr.sin_addr)); + dprintk("lockd: nlm_bind_host %s (%s), my addr=%s\n", + host->h_name, host->h_addrbuf, host->h_srcaddrbuf); /* Lock host handle */ mutex_lock(&host->h_mutex); @@ -221,7 +384,7 @@ nlm_bind_host(struct nlm_host *host) if (time_after_eq(jiffies, host->h_nextrebind)) { rpc_force_rebind(clnt); host->h_nextrebind = jiffies + NLM_HOST_REBIND; - dprintk("lockd: next rebind in %ld jiffies\n", + dprintk("lockd: next rebind in %lu jiffies\n", host->h_nextrebind - jiffies); } } else { @@ -234,9 +397,9 @@ nlm_bind_host(struct nlm_host *host) }; struct rpc_create_args args = { .protocol = host->h_proto, - .address = (struct sockaddr *)&host->h_addr, - .addrsize = sizeof(host->h_addr), - .saddress = (struct sockaddr *)&host->h_saddr, + .address = nlm_addr(host), + .addrsize = host->h_addrlen, + .saddress = nlm_srcaddr(host), .timeout = &timeparms, .servername = host->h_name, .program = &nlm_program, @@ -324,12 +487,16 @@ void nlm_host_rebooted(const struct sockaddr_in *sin, struct nsm_handle *nsm; struct nlm_host *host; - dprintk("lockd: nlm_host_rebooted(%s, %u.%u.%u.%u)\n", - hostname, NIPQUAD(sin->sin_addr)); - - /* Find the NSM handle for this peer */ - if (!(nsm = __nsm_find(sin, hostname, hostname_len, 0))) + nsm = nsm_find((struct sockaddr *)sin, sizeof(*sin), + hostname, hostname_len, 0); + if (nsm == NULL) { + dprintk("lockd: never saw rebooted peer '%.*s' before\n", + hostname_len, hostname); return; + } + + dprintk("lockd: nlm_host_rebooted(%.*s, %s)\n", + hostname_len, hostname, nsm->sm_addrbuf); /* When reclaiming locks on this peer, make sure that * we set up a new notification */ @@ -461,22 +628,23 @@ nlm_gc_hosts(void) static LIST_HEAD(nsm_handles); static DEFINE_SPINLOCK(nsm_lock); -static struct nsm_handle * -__nsm_find(const struct sockaddr_in *sin, - const char *hostname, unsigned int hostname_len, - int create) +static struct nsm_handle *nsm_find(const struct sockaddr *sap, + const size_t salen, + const char *hostname, + const size_t hostname_len, + const int create) { struct nsm_handle *nsm = NULL; struct nsm_handle *pos; - if (!sin) + if (!sap) return NULL; if (hostname && memchr(hostname, '/', hostname_len) != NULL) { if (printk_ratelimit()) { printk(KERN_WARNING "Invalid hostname \"%.*s\" " "in NFS lock request\n", - hostname_len, hostname); + (int)hostname_len, hostname); } return NULL; } @@ -489,7 +657,7 @@ retry: if (strlen(pos->sm_name) != hostname_len || memcmp(pos->sm_name, hostname, hostname_len)) continue; - } else if (!nlm_cmp_addr(&pos->sm_addr, sin)) + } else if (!nlm_cmp_addr(nsm_addr(pos), sap)) continue; atomic_inc(&pos->sm_count); kfree(nsm); @@ -509,10 +677,13 @@ retry: if (nsm == NULL) return NULL; - nsm->sm_addr = *sin; + memcpy(nsm_addr(nsm), sap, salen); + nsm->sm_addrlen = salen; nsm->sm_name = (char *) (nsm + 1); memcpy(nsm->sm_name, hostname, hostname_len); nsm->sm_name[hostname_len] = '\0'; + nlm_display_address((struct sockaddr *)&nsm->sm_addr, + nsm->sm_addrbuf, sizeof(nsm->sm_addrbuf)); atomic_set(&nsm->sm_count, 1); goto retry; @@ -521,13 +692,6 @@ found: return nsm; } -static struct nsm_handle * -nsm_find(const struct sockaddr_in *sin, const char *hostname, - unsigned int hostname_len) -{ - return __nsm_find(sin, hostname, hostname_len, 1); -} - /* * Release an NSM handle */ diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index e4d563543b1..4e7e958e8f6 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -51,7 +51,7 @@ nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res) memset(&args, 0, sizeof(args)); args.mon_name = nsm->sm_name; - args.addr = nsm->sm_addr.sin_addr.s_addr; + args.addr = nsm_addr_in(nsm)->sin_addr.s_addr; args.prog = NLM_PROGRAM; args.vers = 3; args.proc = NLMPROC_NSM_NOTIFY; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 5bd9bf0fa9d..c631a83931c 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -51,7 +51,6 @@ static DEFINE_MUTEX(nlmsvc_mutex); static unsigned int nlmsvc_users; static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; -int nlmsvc_grace_period; unsigned long nlmsvc_timeout; /* @@ -85,27 +84,23 @@ static unsigned long get_lockd_grace_period(void) return nlm_timeout * 5 * HZ; } -unsigned long get_nfs_grace_period(void) -{ - unsigned long lockdgrace = get_lockd_grace_period(); - unsigned long nfsdgrace = 0; - - if (nlmsvc_ops) - nfsdgrace = nlmsvc_ops->get_grace_period(); - - return max(lockdgrace, nfsdgrace); -} -EXPORT_SYMBOL(get_nfs_grace_period); +static struct lock_manager lockd_manager = { +}; -static unsigned long set_grace_period(void) +static void grace_ender(struct work_struct *not_used) { - nlmsvc_grace_period = 1; - return get_nfs_grace_period() + jiffies; + locks_end_grace(&lockd_manager); } -static inline void clear_grace_period(void) +static DECLARE_DELAYED_WORK(grace_period_end, grace_ender); + +static void set_grace_period(void) { - nlmsvc_grace_period = 0; + unsigned long grace_period = get_lockd_grace_period(); + + locks_start_grace(&lockd_manager); + cancel_delayed_work_sync(&grace_period_end); + schedule_delayed_work(&grace_period_end, grace_period); } /* @@ -116,7 +111,6 @@ lockd(void *vrqstp) { int err = 0, preverr = 0; struct svc_rqst *rqstp = vrqstp; - unsigned long grace_period_expire; /* try_to_freeze() is called from svc_recv() */ set_freezable(); @@ -139,7 +133,7 @@ lockd(void *vrqstp) nlm_timeout = LOCKD_DFLT_TIMEO; nlmsvc_timeout = nlm_timeout * HZ; - grace_period_expire = set_grace_period(); + set_grace_period(); /* * The main request loop. We don't terminate until the last @@ -153,21 +147,12 @@ lockd(void *vrqstp) flush_signals(current); if (nlmsvc_ops) { nlmsvc_invalidate_all(); - grace_period_expire = set_grace_period(); + set_grace_period(); } continue; } - /* - * Retry any blocked locks that have been notified by - * the VFS. Don't do this during grace period. - * (Theoretically, there shouldn't even be blocked locks - * during grace period). - */ - if (!nlmsvc_grace_period) { - timeout = nlmsvc_retry_blocked(); - } else if (time_before(grace_period_expire, jiffies)) - clear_grace_period(); + timeout = nlmsvc_retry_blocked(); /* * Find a socket with data available and call its @@ -195,6 +180,7 @@ lockd(void *vrqstp) svc_process(rqstp); } flush_signals(current); + cancel_delayed_work_sync(&grace_period_end); if (nlmsvc_ops) nlmsvc_invalidate_all(); nlm_shutdown_hosts(); @@ -203,25 +189,28 @@ lockd(void *vrqstp) } /* - * Make any sockets that are needed but not present. - * If nlm_udpport or nlm_tcpport were set as module - * options, make those sockets unconditionally + * Ensure there are active UDP and TCP listeners for lockd. + * + * Even if we have only TCP NFS mounts and/or TCP NFSDs, some + * local services (such as rpc.statd) still require UDP, and + * some NFS servers do not yet support NLM over TCP. + * + * Returns zero if all listeners are available; otherwise a + * negative errno value is returned. */ -static int make_socks(struct svc_serv *serv, int proto) +static int make_socks(struct svc_serv *serv) { static int warned; struct svc_xprt *xprt; int err = 0; - if (proto == IPPROTO_UDP || nlm_udpport) { - xprt = svc_find_xprt(serv, "udp", 0, 0); - if (!xprt) - err = svc_create_xprt(serv, "udp", nlm_udpport, - SVC_SOCK_DEFAULTS); - else - svc_xprt_put(xprt); - } - if (err >= 0 && (proto == IPPROTO_TCP || nlm_tcpport)) { + xprt = svc_find_xprt(serv, "udp", 0, 0); + if (!xprt) + err = svc_create_xprt(serv, "udp", nlm_udpport, + SVC_SOCK_DEFAULTS); + else + svc_xprt_put(xprt); + if (err >= 0) { xprt = svc_find_xprt(serv, "tcp", 0, 0); if (!xprt) err = svc_create_xprt(serv, "tcp", nlm_tcpport, @@ -241,8 +230,7 @@ static int make_socks(struct svc_serv *serv, int proto) /* * Bring up the lockd process if it's not already up. */ -int -lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ +int lockd_up(void) { struct svc_serv *serv; int error = 0; @@ -251,11 +239,8 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ /* * Check whether we're already up and running. */ - if (nlmsvc_rqst) { - if (proto) - error = make_socks(nlmsvc_rqst->rq_server, proto); + if (nlmsvc_rqst) goto out; - } /* * Sanity check: if there's no pid, @@ -266,13 +251,14 @@ lockd_up(int proto) /* Maybe add a 'family' option when IPv6 is supported ?? */ "lockd_up: no pid, %d users??\n", nlmsvc_users); error = -ENOMEM; - serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL); + serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, AF_INET, NULL); if (!serv) { printk(KERN_WARNING "lockd_up: create service failed\n"); goto out; } - if ((error = make_socks(serv, proto)) < 0) + error = make_socks(serv); + if (error < 0) goto destroy_and_out; /* diff --git a/fs/lockd/svc4proc.c b/fs/lockd/svc4proc.c index 4a714f64515..014f6ce4817 100644 --- a/fs/lockd/svc4proc.c +++ b/fs/lockd/svc4proc.c @@ -88,12 +88,6 @@ nlm4svc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, dprintk("lockd: TEST4 called\n"); resp->cookie = argp->cookie; - /* Don't accept test requests during grace period */ - if (nlmsvc_grace_period) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; @@ -122,12 +116,6 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; - /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlm4svc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; @@ -146,7 +134,8 @@ nlm4svc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, /* Now try to lock the file */ resp->status = nlmsvc_lock(rqstp, file, host, &argp->lock, - argp->block, &argp->cookie); + argp->block, &argp->cookie, + argp->reclaim); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; else @@ -169,7 +158,7 @@ nlm4svc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -202,7 +191,7 @@ nlm4svc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -231,7 +220,7 @@ nlm4svc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock); + resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } @@ -341,7 +330,7 @@ nlm4svc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { + if (locks_in_grace() && !argp->reclaim) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -374,7 +363,7 @@ nlm4svc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -432,11 +421,9 @@ nlm4svc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, { struct sockaddr_in saddr; - memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr)); - dprintk("lockd: SM_NOTIFY called\n"); - if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) - || ntohs(saddr.sin_port) >= 1024) { + + if (!nlm_privileged_requester(rqstp)) { char buf[RPC_MAX_ADDRBUFLEN]; printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", svc_print_addr(rqstp, buf, sizeof(buf))); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index cf0d5c2c318..6063a8e4b9f 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -360,7 +360,7 @@ nlmsvc_defer_lock_rqst(struct svc_rqst *rqstp, struct nlm_block *block) __be32 nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_host *host, struct nlm_lock *lock, int wait, - struct nlm_cookie *cookie) + struct nlm_cookie *cookie, int reclaim) { struct nlm_block *block = NULL; int error; @@ -406,6 +406,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } + if (locks_in_grace() && !reclaim) { + ret = nlm_lck_denied_grace_period; + goto out; + } + if (reclaim && !locks_in_grace()) { + ret = nlm_lck_denied_grace_period; + goto out; + } + if (!wait) lock->fl.fl_flags &= ~FL_SLEEP; error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL); @@ -502,6 +511,10 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } + if (locks_in_grace()) { + ret = nlm_lck_denied_grace_period; + goto out; + } error = vfs_test_lock(file->f_file, &lock->fl); if (error == FILE_LOCK_DEFERRED) { ret = nlmsvc_defer_lock_rqst(rqstp, block); @@ -582,6 +595,9 @@ nlmsvc_cancel_blocked(struct nlm_file *file, struct nlm_lock *lock) (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); + if (locks_in_grace()) + return nlm_lck_denied_grace_period; + mutex_lock(&file->f_mutex); block = nlmsvc_lookup_block(file, lock); mutex_unlock(&file->f_mutex); diff --git a/fs/lockd/svcproc.c b/fs/lockd/svcproc.c index 76262c1986f..548b0bb2b84 100644 --- a/fs/lockd/svcproc.c +++ b/fs/lockd/svcproc.c @@ -117,12 +117,6 @@ nlmsvc_proc_test(struct svc_rqst *rqstp, struct nlm_args *argp, dprintk("lockd: TEST called\n"); resp->cookie = argp->cookie; - /* Don't accept test requests during grace period */ - if (nlmsvc_grace_period) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; @@ -152,12 +146,6 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; - /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { - resp->status = nlm_lck_denied_grace_period; - return rc; - } - /* Obtain client and file */ if ((resp->status = nlmsvc_retrieve_args(rqstp, argp, &host, &file))) return resp->status == nlm_drop_reply ? rpc_drop_reply :rpc_success; @@ -176,7 +164,8 @@ nlmsvc_proc_lock(struct svc_rqst *rqstp, struct nlm_args *argp, /* Now try to lock the file */ resp->status = cast_status(nlmsvc_lock(rqstp, file, host, &argp->lock, - argp->block, &argp->cookie)); + argp->block, &argp->cookie, + argp->reclaim)); if (resp->status == nlm_drop_reply) rc = rpc_drop_reply; else @@ -199,7 +188,7 @@ nlmsvc_proc_cancel(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -232,7 +221,7 @@ nlmsvc_proc_unlock(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -261,7 +250,7 @@ nlmsvc_proc_granted(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; dprintk("lockd: GRANTED called\n"); - resp->status = nlmclnt_grant(svc_addr_in(rqstp), &argp->lock); + resp->status = nlmclnt_grant(svc_addr(rqstp), &argp->lock); dprintk("lockd: GRANTED status %d\n", ntohl(resp->status)); return rpc_success; } @@ -373,7 +362,7 @@ nlmsvc_proc_share(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept new lock requests during grace period */ - if (nlmsvc_grace_period && !argp->reclaim) { + if (locks_in_grace() && !argp->reclaim) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -406,7 +395,7 @@ nlmsvc_proc_unshare(struct svc_rqst *rqstp, struct nlm_args *argp, resp->cookie = argp->cookie; /* Don't accept requests during grace period */ - if (nlmsvc_grace_period) { + if (locks_in_grace()) { resp->status = nlm_lck_denied_grace_period; return rpc_success; } @@ -464,11 +453,9 @@ nlmsvc_proc_sm_notify(struct svc_rqst *rqstp, struct nlm_reboot *argp, { struct sockaddr_in saddr; - memcpy(&saddr, svc_addr_in(rqstp), sizeof(saddr)); - dprintk("lockd: SM_NOTIFY called\n"); - if (saddr.sin_addr.s_addr != htonl(INADDR_LOOPBACK) - || ntohs(saddr.sin_port) >= 1024) { + + if (!nlm_privileged_requester(rqstp)) { char buf[RPC_MAX_ADDRBUFLEN]; printk(KERN_WARNING "lockd: rejected NSM callback from %s\n", svc_print_addr(rqstp, buf, sizeof(buf))); diff --git a/fs/lockd/svcsubs.c b/fs/lockd/svcsubs.c index 198b4e55b37..34c2766e27c 100644 --- a/fs/lockd/svcsubs.c +++ b/fs/lockd/svcsubs.c @@ -418,7 +418,7 @@ EXPORT_SYMBOL_GPL(nlmsvc_unlock_all_by_sb); static int nlmsvc_match_ip(void *datap, struct nlm_host *host) { - return nlm_cmp_addr(&host->h_saddr, datap); + return nlm_cmp_addr(nlm_srcaddr(host), datap); } /** diff --git a/fs/lockd/xdr.c b/fs/lockd/xdr.c index 3e459e18cc3..1f226290c67 100644 --- a/fs/lockd/xdr.c +++ b/fs/lockd/xdr.c @@ -351,8 +351,6 @@ nlmsvc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp) argp->state = ntohl(*p++); /* Preserve the address in network byte order */ argp->addr = *p++; - argp->vers = *p++; - argp->proto = *p++; return xdr_argsize_check(rqstp, p); } diff --git a/fs/lockd/xdr4.c b/fs/lockd/xdr4.c index 43ff9397e6c..50c493a8ad8 100644 --- a/fs/lockd/xdr4.c +++ b/fs/lockd/xdr4.c @@ -358,8 +358,6 @@ nlm4svc_decode_reboot(struct svc_rqst *rqstp, __be32 *p, struct nlm_reboot *argp argp->state = ntohl(*p++); /* Preserve the address in network byte order */ argp->addr = *p++; - argp->vers = *p++; - argp->proto = *p++; return xdr_argsize_check(rqstp, p); } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index f447f4b4476..6a09760c596 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -105,7 +105,8 @@ int nfs_callback_up(void) mutex_lock(&nfs_callback_mutex); if (nfs_callback_info.users++ || nfs_callback_info.task != NULL) goto out; - serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL); + serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, + AF_INET, NULL); ret = -ENOMEM; if (!serv) goto out_err; diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index 15c6faeec77..b2786a5f9af 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -70,7 +70,6 @@ nlm_fclose(struct file *filp) static struct nlmsvc_binding nfsd_nlm_ops = { .fopen = nlm_fopen, /* open file for locking */ .fclose = nlm_fclose, /* close file */ - .get_grace_period = get_nfs4_grace_period, }; void diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 4d617ea28cf..9dbd2eb9128 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -63,7 +63,8 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + nfserr = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); if (nfserr) RETURN_STATUS(nfserr); @@ -530,7 +531,7 @@ nfsd3_proc_fsstat(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: FSSTAT(3) %s\n", SVCFH_fmt(&argp->fh)); - nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats); + nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0); fh_put(&argp->fh); RETURN_STATUS(nfserr); } @@ -558,7 +559,8 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, resp->f_maxfilesize = ~(u32) 0; resp->f_properties = NFS3_FSF_DEFAULT; - nfserr = fh_verify(rqstp, &argp->fh, 0, NFSD_MAY_NOP); + nfserr = fh_verify(rqstp, &argp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); /* Check special features of the file system. May request * different read/write sizes for file systems known to have diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 702fa577aa6..094747a1227 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -225,7 +225,8 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec) RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len); WRITE32(OP_CB_RECALL); - WRITEMEM(&cb_rec->cbr_stateid, sizeof(stateid_t)); + WRITE32(cb_rec->cbr_stateid.si_generation); + WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t)); WRITE32(cb_rec->cbr_trunc); WRITE32(len); WRITEMEM(cb_rec->cbr_fhval, len); @@ -379,6 +380,7 @@ static int do_probe_callback(void *data) .addrsize = sizeof(addr), .timeout = &timeparms, .program = &cb_program, + .prognumber = cb->cb_prog, .version = nfs_cb_version[1]->number, .authflavor = RPC_AUTH_UNIX, /* XXX: need AUTH_GSS... */ .flags = (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET), @@ -396,9 +398,6 @@ static int do_probe_callback(void *data) addr.sin_port = htons(cb->cb_port); addr.sin_addr.s_addr = htonl(cb->cb_addr); - /* Initialize rpc_stat */ - memset(args.program->stats, 0, sizeof(struct rpc_stat)); - /* Create RPC client */ client = rpc_create(&args); if (IS_ERR(client)) { diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index e5b51ffafc6..669461e291a 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -201,10 +201,10 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, /* Openowner is now set, so sequence id will get bumped. Now we need * these checks before we do any creates: */ status = nfserr_grace; - if (nfs4_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) + if (locks_in_grace() && open->op_claim_type != NFS4_OPEN_CLAIM_PREVIOUS) goto out; status = nfserr_no_grace; - if (!nfs4_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) + if (!locks_in_grace() && open->op_claim_type == NFS4_OPEN_CLAIM_PREVIOUS) goto out; switch (open->op_claim_type) { @@ -575,7 +575,7 @@ nfsd4_remove(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { __be32 status; - if (nfs4_in_grace()) + if (locks_in_grace()) return nfserr_grace; status = nfsd_unlink(rqstp, &cstate->current_fh, 0, remove->rm_name, remove->rm_namelen); @@ -596,7 +596,7 @@ nfsd4_rename(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!cstate->save_fh.fh_dentry) return status; - if (nfs4_in_grace() && !(cstate->save_fh.fh_export->ex_flags + if (locks_in_grace() && !(cstate->save_fh.fh_export->ex_flags & NFSEXP_NOSUBTREECHECK)) return nfserr_grace; status = nfsd_rename(rqstp, &cstate->save_fh, rename->rn_sname, diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 1578d7a2667..0cc7ff5d5ab 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -61,7 +61,6 @@ static time_t lease_time = 90; /* default lease time */ static time_t user_lease_time = 90; static time_t boot_time; -static int in_grace = 1; static u32 current_ownerid = 1; static u32 current_fileid = 1; static u32 current_delegid = 1; @@ -1640,7 +1639,7 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta case NFS4_OPEN_CLAIM_NULL: /* Let's not give out any delegations till everyone's * had the chance to reclaim theirs.... */ - if (nfs4_in_grace()) + if (locks_in_grace()) goto out; if (!atomic_read(&cb->cb_set) || !sop->so_confirmed) goto out; @@ -1816,12 +1815,15 @@ out: return status; } +struct lock_manager nfsd4_manager = { +}; + static void -end_grace(void) +nfsd4_end_grace(void) { dprintk("NFSD: end of grace period\n"); nfsd4_recdir_purge_old(); - in_grace = 0; + locks_end_grace(&nfsd4_manager); } static time_t @@ -1838,8 +1840,8 @@ nfs4_laundromat(void) nfs4_lock_state(); dprintk("NFSD: laundromat service - starting\n"); - if (in_grace) - end_grace(); + if (locks_in_grace()) + nfsd4_end_grace(); list_for_each_safe(pos, next, &client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); if (time_after((unsigned long)clp->cl_time, (unsigned long)cutoff)) { @@ -1974,7 +1976,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) return nfserr_bad_stateid; else if (ONE_STATEID(stateid) && (flags & RD_STATE)) return nfs_ok; - else if (nfs4_in_grace()) { + else if (locks_in_grace()) { /* Answer in remaining cases depends on existance of * conflicting state; so we must wait out the grace period. */ return nfserr_grace; @@ -1993,7 +1995,7 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) static inline int io_during_grace_disallowed(struct inode *inode, int flags) { - return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE)) + return locks_in_grace() && (flags & (RD_STATE | WR_STATE)) && mandatory_lock(inode); } @@ -2693,10 +2695,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, filp = lock_stp->st_vfs_file; status = nfserr_grace; - if (nfs4_in_grace() && !lock->lk_reclaim) + if (locks_in_grace() && !lock->lk_reclaim) goto out; status = nfserr_no_grace; - if (!nfs4_in_grace() && lock->lk_reclaim) + if (!locks_in_grace() && lock->lk_reclaim) goto out; locks_init_lock(&file_lock); @@ -2779,7 +2781,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, int error; __be32 status; - if (nfs4_in_grace()) + if (locks_in_grace()) return nfserr_grace; if (check_lock_length(lockt->lt_offset, lockt->lt_length)) @@ -3192,9 +3194,9 @@ __nfs4_state_start(void) unsigned long grace_time; boot_time = get_seconds(); - grace_time = get_nfs_grace_period(); + grace_time = get_nfs4_grace_period(); lease_time = user_lease_time; - in_grace = 1; + locks_start_grace(&nfsd4_manager); printk(KERN_INFO "NFSD: starting %ld-second grace period\n", grace_time/HZ); laundry_wq = create_singlethread_workqueue("nfsd4"); @@ -3213,12 +3215,6 @@ nfs4_state_start(void) return; } -int -nfs4_in_grace(void) -{ - return in_grace; -} - time_t nfs4_lease_time(void) { diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 14ba4d9b285..afcdf4b7684 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -413,6 +413,18 @@ out_nfserr: } static __be32 +nfsd4_decode_stateid(struct nfsd4_compoundargs *argp, stateid_t *sid) +{ + DECODE_HEAD; + + READ_BUF(sizeof(stateid_t)); + READ32(sid->si_generation); + COPYMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_access(struct nfsd4_compoundargs *argp, struct nfsd4_access *access) { DECODE_HEAD; @@ -429,10 +441,9 @@ nfsd4_decode_close(struct nfsd4_compoundargs *argp, struct nfsd4_close *close) DECODE_HEAD; close->cl_stateowner = NULL; - READ_BUF(4 + sizeof(stateid_t)); + READ_BUF(4); READ32(close->cl_seqid); - READ32(close->cl_stateid.si_generation); - COPYMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t)); + return nfsd4_decode_stateid(argp, &close->cl_stateid); DECODE_TAIL; } @@ -493,13 +504,7 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create static inline __be32 nfsd4_decode_delegreturn(struct nfsd4_compoundargs *argp, struct nfsd4_delegreturn *dr) { - DECODE_HEAD; - - READ_BUF(sizeof(stateid_t)); - READ32(dr->dr_stateid.si_generation); - COPYMEM(&dr->dr_stateid.si_opaque, sizeof(stateid_opaque_t)); - - DECODE_TAIL; + return nfsd4_decode_stateid(argp, &dr->dr_stateid); } static inline __be32 @@ -542,20 +547,22 @@ nfsd4_decode_lock(struct nfsd4_compoundargs *argp, struct nfsd4_lock *lock) READ32(lock->lk_is_new); if (lock->lk_is_new) { - READ_BUF(36); + READ_BUF(4); READ32(lock->lk_new_open_seqid); - READ32(lock->lk_new_open_stateid.si_generation); - - COPYMEM(&lock->lk_new_open_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &lock->lk_new_open_stateid); + if (status) + return status; + READ_BUF(8 + sizeof(clientid_t)); READ32(lock->lk_new_lock_seqid); COPYMEM(&lock->lk_new_clientid, sizeof(clientid_t)); READ32(lock->lk_new_owner.len); READ_BUF(lock->lk_new_owner.len); READMEM(lock->lk_new_owner.data, lock->lk_new_owner.len); } else { - READ_BUF(20); - READ32(lock->lk_old_lock_stateid.si_generation); - COPYMEM(&lock->lk_old_lock_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &lock->lk_old_lock_stateid); + if (status) + return status; + READ_BUF(4); READ32(lock->lk_old_lock_seqid); } @@ -587,13 +594,15 @@ nfsd4_decode_locku(struct nfsd4_compoundargs *argp, struct nfsd4_locku *locku) DECODE_HEAD; locku->lu_stateowner = NULL; - READ_BUF(24 + sizeof(stateid_t)); + READ_BUF(8); READ32(locku->lu_type); if ((locku->lu_type < NFS4_READ_LT) || (locku->lu_type > NFS4_WRITEW_LT)) goto xdr_error; READ32(locku->lu_seqid); - READ32(locku->lu_stateid.si_generation); - COPYMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &locku->lu_stateid); + if (status) + return status; + READ_BUF(16); READ64(locku->lu_offset); READ64(locku->lu_length); @@ -678,8 +687,10 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) READ32(open->op_delegate_type); break; case NFS4_OPEN_CLAIM_DELEGATE_CUR: - READ_BUF(sizeof(stateid_t) + 4); - COPYMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + status = nfsd4_decode_stateid(argp, &open->op_delegate_stateid); + if (status) + return status; + READ_BUF(4); READ32(open->op_fname.len); READ_BUF(open->op_fname.len); SAVEMEM(open->op_fname.data, open->op_fname.len); @@ -699,9 +710,10 @@ nfsd4_decode_open_confirm(struct nfsd4_compoundargs *argp, struct nfsd4_open_con DECODE_HEAD; open_conf->oc_stateowner = NULL; - READ_BUF(4 + sizeof(stateid_t)); - READ32(open_conf->oc_req_stateid.si_generation); - COPYMEM(&open_conf->oc_req_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &open_conf->oc_req_stateid); + if (status) + return status; + READ_BUF(4); READ32(open_conf->oc_seqid); DECODE_TAIL; @@ -713,9 +725,10 @@ nfsd4_decode_open_downgrade(struct nfsd4_compoundargs *argp, struct nfsd4_open_d DECODE_HEAD; open_down->od_stateowner = NULL; - READ_BUF(12 + sizeof(stateid_t)); - READ32(open_down->od_stateid.si_generation); - COPYMEM(&open_down->od_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &open_down->od_stateid); + if (status) + return status; + READ_BUF(12); READ32(open_down->od_seqid); READ32(open_down->od_share_access); READ32(open_down->od_share_deny); @@ -743,9 +756,10 @@ nfsd4_decode_read(struct nfsd4_compoundargs *argp, struct nfsd4_read *read) { DECODE_HEAD; - READ_BUF(sizeof(stateid_t) + 12); - READ32(read->rd_stateid.si_generation); - COPYMEM(&read->rd_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &read->rd_stateid); + if (status) + return status; + READ_BUF(12); READ64(read->rd_offset); READ32(read->rd_length); @@ -834,15 +848,13 @@ nfsd4_decode_secinfo(struct nfsd4_compoundargs *argp, static __be32 nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *setattr) { - DECODE_HEAD; - - READ_BUF(sizeof(stateid_t)); - READ32(setattr->sa_stateid.si_generation); - COPYMEM(&setattr->sa_stateid.si_opaque, sizeof(stateid_opaque_t)); - if ((status = nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, &setattr->sa_acl))) - goto out; + __be32 status; - DECODE_TAIL; + status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); + if (status) + return status; + return nfsd4_decode_fattr(argp, setattr->sa_bmval, + &setattr->sa_iattr, &setattr->sa_acl); } static __be32 @@ -927,9 +939,10 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) int len; DECODE_HEAD; - READ_BUF(sizeof(stateid_opaque_t) + 20); - READ32(write->wr_stateid.si_generation); - COPYMEM(&write->wr_stateid.si_opaque, sizeof(stateid_opaque_t)); + status = nfsd4_decode_stateid(argp, &write->wr_stateid); + if (status) + return status; + READ_BUF(16); READ64(write->wr_offset); READ32(write->wr_stable_how); if (write->wr_stable_how > 2) @@ -1183,7 +1196,6 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) * Header routine to setup seqid operation replay cache */ #define ENCODE_SEQID_OP_HEAD \ - __be32 *p; \ __be32 *save; \ \ save = resp->p; @@ -1950,6 +1962,17 @@ fail: return -EINVAL; } +static void +nfsd4_encode_stateid(struct nfsd4_compoundres *resp, stateid_t *sid) +{ + ENCODE_HEAD; + + RESERVE_SPACE(sizeof(stateid_t)); + WRITE32(sid->si_generation); + WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); + ADJUST_ARGS(); +} + static __be32 nfsd4_encode_access(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_access *access) { @@ -1969,12 +1992,9 @@ nfsd4_encode_close(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_c { ENCODE_SEQID_OP_HEAD; - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(close->cl_stateid.si_generation); - WRITEMEM(&close->cl_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } + if (!nfserr) + nfsd4_encode_stateid(resp, &close->cl_stateid); + ENCODE_SEQID_OP_TAIL(close->cl_stateowner); return nfserr; } @@ -2074,12 +2094,9 @@ nfsd4_encode_lock(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_lo { ENCODE_SEQID_OP_HEAD; - if (!nfserr) { - RESERVE_SPACE(4 + sizeof(stateid_t)); - WRITE32(lock->lk_resp_stateid.si_generation); - WRITEMEM(&lock->lk_resp_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } else if (nfserr == nfserr_denied) + if (!nfserr) + nfsd4_encode_stateid(resp, &lock->lk_resp_stateid); + else if (nfserr == nfserr_denied) nfsd4_encode_lock_denied(resp, &lock->lk_denied); ENCODE_SEQID_OP_TAIL(lock->lk_replay_owner); @@ -2099,13 +2116,9 @@ nfsd4_encode_locku(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_l { ENCODE_SEQID_OP_HEAD; - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(locku->lu_stateid.si_generation); - WRITEMEM(&locku->lu_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } - + if (!nfserr) + nfsd4_encode_stateid(resp, &locku->lu_stateid); + ENCODE_SEQID_OP_TAIL(locku->lu_stateowner); return nfserr; } @@ -2128,14 +2141,14 @@ nfsd4_encode_link(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_li static __be32 nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open *open) { + ENCODE_HEAD; ENCODE_SEQID_OP_HEAD; if (nfserr) goto out; - RESERVE_SPACE(36 + sizeof(stateid_t)); - WRITE32(open->op_stateid.si_generation); - WRITEMEM(&open->op_stateid.si_opaque, sizeof(stateid_opaque_t)); + nfsd4_encode_stateid(resp, &open->op_stateid); + RESERVE_SPACE(40); WRITECINFO(open->op_cinfo); WRITE32(open->op_rflags); WRITE32(2); @@ -2148,8 +2161,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op case NFS4_OPEN_DELEGATE_NONE: break; case NFS4_OPEN_DELEGATE_READ: - RESERVE_SPACE(20 + sizeof(stateid_t)); - WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + nfsd4_encode_stateid(resp, &open->op_delegate_stateid); + RESERVE_SPACE(20); WRITE32(open->op_recall); /* @@ -2162,8 +2175,8 @@ nfsd4_encode_open(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_op ADJUST_ARGS(); break; case NFS4_OPEN_DELEGATE_WRITE: - RESERVE_SPACE(32 + sizeof(stateid_t)); - WRITEMEM(&open->op_delegate_stateid, sizeof(stateid_t)); + nfsd4_encode_stateid(resp, &open->op_delegate_stateid); + RESERVE_SPACE(32); WRITE32(0); /* @@ -2195,13 +2208,9 @@ static __be32 nfsd4_encode_open_confirm(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_confirm *oc) { ENCODE_SEQID_OP_HEAD; - - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(oc->oc_resp_stateid.si_generation); - WRITEMEM(&oc->oc_resp_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } + + if (!nfserr) + nfsd4_encode_stateid(resp, &oc->oc_resp_stateid); ENCODE_SEQID_OP_TAIL(oc->oc_stateowner); return nfserr; @@ -2211,13 +2220,9 @@ static __be32 nfsd4_encode_open_downgrade(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_open_downgrade *od) { ENCODE_SEQID_OP_HEAD; - - if (!nfserr) { - RESERVE_SPACE(sizeof(stateid_t)); - WRITE32(od->od_stateid.si_generation); - WRITEMEM(&od->od_stateid.si_opaque, sizeof(stateid_opaque_t)); - ADJUST_ARGS(); - } + + if (!nfserr) + nfsd4_encode_stateid(resp, &od->od_stateid); ENCODE_SEQID_OP_TAIL(od->od_stateowner); return nfserr; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index c53e65f8f3a..97543df5824 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -614,10 +614,9 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size) return -EINVAL; err = nfsd_create_serv(); if (!err) { - int proto = 0; - err = svc_addsock(nfsd_serv, fd, buf, &proto); + err = svc_addsock(nfsd_serv, fd, buf); if (err >= 0) { - err = lockd_up(proto); + err = lockd_up(); if (err < 0) svc_sock_names(buf+strlen(buf)+1, nfsd_serv, buf); } diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index ea37c96f044..cd25d91895a 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -302,17 +302,27 @@ fh_verify(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, int access) if (error) goto out; - if (!(access & NFSD_MAY_LOCK)) { - /* - * pseudoflavor restrictions are not enforced on NLM, - * which clients virtually always use auth_sys for, - * even while using RPCSEC_GSS for NFS. - */ - error = check_nfsd_access(exp, rqstp); - if (error) - goto out; - } + /* + * pseudoflavor restrictions are not enforced on NLM, + * which clients virtually always use auth_sys for, + * even while using RPCSEC_GSS for NFS. + */ + if (access & NFSD_MAY_LOCK) + goto skip_pseudoflavor_check; + /* + * Clients may expect to be able to use auth_sys during mount, + * even if they use gss for everything else; see section 2.3.2 + * of rfc 2623. + */ + if (access & NFSD_MAY_BYPASS_GSS_ON_ROOT + && exp->ex_path.dentry == dentry) + goto skip_pseudoflavor_check; + + error = check_nfsd_access(exp, rqstp); + if (error) + goto out; +skip_pseudoflavor_check: /* Finally, check access permissions. */ error = nfsd_permission(rqstp, exp, dentry, access); diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 0766f95d236..5cffeca7ace 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -65,7 +65,8 @@ nfsd_proc_getattr(struct svc_rqst *rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); fh_copy(&resp->fh, &argp->fh); - nfserr = fh_verify(rqstp, &resp->fh, 0, NFSD_MAY_NOP); + nfserr = fh_verify(rqstp, &resp->fh, 0, + NFSD_MAY_NOP | NFSD_MAY_BYPASS_GSS_ON_ROOT); return nfsd_return_attrs(nfserr, resp); } @@ -521,7 +522,8 @@ nfsd_proc_statfs(struct svc_rqst * rqstp, struct nfsd_fhandle *argp, dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh)); - nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats); + nfserr = nfsd_statfs(rqstp, &argp->fh, &resp->stats, + NFSD_MAY_BYPASS_GSS_ON_ROOT); fh_put(&argp->fh); return nfserr; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 80292ff5e92..59eeb46f82c 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -229,6 +229,7 @@ int nfsd_create_serv(void) atomic_set(&nfsd_busy, 0); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, + AF_INET, nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) err = -ENOMEM; @@ -243,25 +244,20 @@ static int nfsd_init_socks(int port) if (!list_empty(&nfsd_serv->sv_permsocks)) return 0; - error = lockd_up(IPPROTO_UDP); - if (error >= 0) { - error = svc_create_xprt(nfsd_serv, "udp", port, + error = svc_create_xprt(nfsd_serv, "udp", port, SVC_SOCK_DEFAULTS); - if (error < 0) - lockd_down(); - } if (error < 0) return error; - error = lockd_up(IPPROTO_TCP); - if (error >= 0) { - error = svc_create_xprt(nfsd_serv, "tcp", port, + error = svc_create_xprt(nfsd_serv, "tcp", port, SVC_SOCK_DEFAULTS); - if (error < 0) - lockd_down(); - } if (error < 0) return error; + + error = lockd_up(); + if (error < 0) + return error; + return 0; } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 18060bed526..aa1d0d6489a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -83,7 +83,6 @@ struct raparm_hbucket { spinlock_t pb_lock; } ____cacheline_aligned_in_smp; -static struct raparms * raparml; #define RAPARM_HASH_BITS 4 #define RAPARM_HASH_SIZE (1<<RAPARM_HASH_BITS) #define RAPARM_HASH_MASK (RAPARM_HASH_SIZE-1) @@ -1866,9 +1865,9 @@ out: * N.B. After this call fhp needs an fh_put */ __be32 -nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat) +nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, int access) { - __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP); + __be32 err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access); if (!err && vfs_statfs(fhp->fh_dentry,stat)) err = nfserr_io; return err; @@ -1966,11 +1965,20 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, void nfsd_racache_shutdown(void) { - if (!raparml) - return; + struct raparms *raparm, *last_raparm; + unsigned int i; + dprintk("nfsd: freeing readahead buffers.\n"); - kfree(raparml); - raparml = NULL; + + for (i = 0; i < RAPARM_HASH_SIZE; i++) { + raparm = raparm_hash[i].pb_head; + while(raparm) { + last_raparm = raparm; + raparm = raparm->p_next; + kfree(last_raparm); + } + raparm_hash[i].pb_head = NULL; + } } /* * Initialize readahead param cache @@ -1981,35 +1989,38 @@ nfsd_racache_init(int cache_size) int i; int j = 0; int nperbucket; + struct raparms **raparm = NULL; - if (raparml) + if (raparm_hash[0].pb_head) return 0; - if (cache_size < 2*RAPARM_HASH_SIZE) - cache_size = 2*RAPARM_HASH_SIZE; - raparml = kcalloc(cache_size, sizeof(struct raparms), GFP_KERNEL); - - if (!raparml) { - printk(KERN_WARNING - "nfsd: Could not allocate memory read-ahead cache.\n"); - return -ENOMEM; - } + nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); + if (nperbucket < 2) + nperbucket = 2; + cache_size = nperbucket * RAPARM_HASH_SIZE; dprintk("nfsd: allocating %d readahead buffers.\n", cache_size); - for (i = 0 ; i < RAPARM_HASH_SIZE ; i++) { - raparm_hash[i].pb_head = NULL; + + for (i = 0; i < RAPARM_HASH_SIZE; i++) { spin_lock_init(&raparm_hash[i].pb_lock); - } - nperbucket = DIV_ROUND_UP(cache_size, RAPARM_HASH_SIZE); - for (i = 0; i < cache_size - 1; i++) { - if (i % nperbucket == 0) - raparm_hash[j++].pb_head = raparml + i; - if (i % nperbucket < nperbucket-1) - raparml[i].p_next = raparml + i + 1; + + raparm = &raparm_hash[i].pb_head; + for (j = 0; j < nperbucket; j++) { + *raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL); + if (!*raparm) + goto out_nomem; + raparm = &(*raparm)->p_next; + } + *raparm = NULL; } nfsdstats.ra_size = cache_size; return 0; + +out_nomem: + dprintk("nfsd: kmalloc failed, freeing readahead buffers\n"); + nfsd_racache_shutdown(); + return -ENOMEM; } #if defined(CONFIG_NFSD_V2_ACL) || defined(CONFIG_NFSD_V3_ACL) diff --git a/fs/ocfs2/Makefile b/fs/ocfs2/Makefile index f6956de56fd..589dcdfdfe3 100644 --- a/fs/ocfs2/Makefile +++ b/fs/ocfs2/Makefile @@ -34,7 +34,8 @@ ocfs2-objs := \ symlink.o \ sysfile.o \ uptodate.o \ - ver.o + ver.o \ + xattr.o ocfs2_stackglue-objs := stackglue.o ocfs2_stack_o2cb-objs := stack_o2cb.o diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 29ff57ec5d1..0cc2deb9394 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -49,6 +49,340 @@ #include "buffer_head_io.h" + +/* + * Operations for a specific extent tree type. + * + * To implement an on-disk btree (extent tree) type in ocfs2, add + * an ocfs2_extent_tree_operations structure and the matching + * ocfs2_init_<thingy>_extent_tree() function. That's pretty much it + * for the allocation portion of the extent tree. + */ +struct ocfs2_extent_tree_operations { + /* + * last_eb_blk is the block number of the right most leaf extent + * block. Most on-disk structures containing an extent tree store + * this value for fast access. The ->eo_set_last_eb_blk() and + * ->eo_get_last_eb_blk() operations access this value. They are + * both required. + */ + void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et, + u64 blkno); + u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et); + + /* + * The on-disk structure usually keeps track of how many total + * clusters are stored in this extent tree. This function updates + * that value. new_clusters is the delta, and must be + * added to the total. Required. + */ + void (*eo_update_clusters)(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 new_clusters); + + /* + * If ->eo_insert_check() exists, it is called before rec is + * inserted into the extent tree. It is optional. + */ + int (*eo_insert_check)(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); + int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et); + + /* + * -------------------------------------------------------------- + * The remaining are internal to ocfs2_extent_tree and don't have + * accessor functions + */ + + /* + * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el. + * It is required. + */ + void (*eo_fill_root_el)(struct ocfs2_extent_tree *et); + + /* + * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if + * it exists. If it does not, et->et_max_leaf_clusters is set + * to 0 (unlimited). Optional. + */ + void (*eo_fill_max_leaf_clusters)(struct inode *inode, + struct ocfs2_extent_tree *et); +}; + + +/* + * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check + * in the methods. + */ +static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et); +static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno); +static void ocfs2_dinode_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters); +static int ocfs2_dinode_insert_check(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec); +static int ocfs2_dinode_sanity_check(struct inode *inode, + struct ocfs2_extent_tree *et); +static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); +static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { + .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, + .eo_update_clusters = ocfs2_dinode_update_clusters, + .eo_insert_check = ocfs2_dinode_insert_check, + .eo_sanity_check = ocfs2_dinode_sanity_check, + .eo_fill_root_el = ocfs2_dinode_fill_root_el, +}; + +static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + di->i_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + return le64_to_cpu(di->i_last_eb_blk); +} + +static void ocfs2_dinode_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_dinode *di = et->et_object; + + le32_add_cpu(&di->i_clusters, clusters); + spin_lock(&OCFS2_I(inode)->ip_lock); + OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); + spin_unlock(&OCFS2_I(inode)->ip_lock); +} + +static int ocfs2_dinode_insert_check(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); + mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && + (OCFS2_I(inode)->ip_clusters != rec->e_cpos), + "Device %s, asking for sparse allocation: inode %llu, " + "cpos %u, clusters %u\n", + osb->dev_str, + (unsigned long long)OCFS2_I(inode)->ip_blkno, + rec->e_cpos, + OCFS2_I(inode)->ip_clusters); + + return 0; +} + +static int ocfs2_dinode_sanity_check(struct inode *inode, + struct ocfs2_extent_tree *et) +{ + int ret = 0; + struct ocfs2_dinode *di; + + BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); + + di = et->et_object; + if (!OCFS2_IS_VALID_DINODE(di)) { + ret = -EIO; + ocfs2_error(inode->i_sb, + "Inode %llu has invalid path root", + (unsigned long long)OCFS2_I(inode)->ip_blkno); + } + + return ret; +} + +static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_dinode *di = et->et_object; + + et->et_root_el = &di->id2.i_list; +} + + +static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_value_root *xv = et->et_object; + + et->et_root_el = &xv->xr_list; +} + +static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_xattr_value_root *xv = + (struct ocfs2_xattr_value_root *)et->et_object; + + xv->xr_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_value_root *xv = + (struct ocfs2_xattr_value_root *) et->et_object; + + return le64_to_cpu(xv->xr_last_eb_blk); +} + +static void ocfs2_xattr_value_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_xattr_value_root *xv = + (struct ocfs2_xattr_value_root *)et->et_object; + + le32_add_cpu(&xv->xr_clusters, clusters); +} + +static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { + .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, + .eo_update_clusters = ocfs2_xattr_value_update_clusters, + .eo_fill_root_el = ocfs2_xattr_value_fill_root_el, +}; + +static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_block *xb = et->et_object; + + et->et_root_el = &xb->xb_attrs.xb_root.xt_list; +} + +static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode, + struct ocfs2_extent_tree *et) +{ + et->et_max_leaf_clusters = + ocfs2_clusters_for_bytes(inode->i_sb, + OCFS2_MAX_XATTR_TREE_LEAF_SIZE); +} + +static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 blkno) +{ + struct ocfs2_xattr_block *xb = et->et_object; + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + + xt->xt_last_eb_blk = cpu_to_le64(blkno); +} + +static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + struct ocfs2_xattr_block *xb = et->et_object; + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + + return le64_to_cpu(xt->xt_last_eb_blk); +} + +static void ocfs2_xattr_tree_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters) +{ + struct ocfs2_xattr_block *xb = et->et_object; + + le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); +} + +static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { + .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, + .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, + .eo_update_clusters = ocfs2_xattr_tree_update_clusters, + .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el, + .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, +}; + +static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh, + void *obj, + struct ocfs2_extent_tree_operations *ops) +{ + et->et_ops = ops; + et->et_root_bh = bh; + if (!obj) + obj = (void *)bh->b_data; + et->et_object = obj; + + et->et_ops->eo_fill_root_el(et); + if (!et->et_ops->eo_fill_max_leaf_clusters) + et->et_max_leaf_clusters = 0; + else + et->et_ops->eo_fill_max_leaf_clusters(inode, et); +} + +void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops); +} + +void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh) +{ + __ocfs2_init_extent_tree(et, inode, bh, NULL, + &ocfs2_xattr_tree_et_ops); +} + +void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh, + struct ocfs2_xattr_value_root *xv) +{ + __ocfs2_init_extent_tree(et, inode, bh, xv, + &ocfs2_xattr_value_et_ops); +} + +static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, + u64 new_last_eb_blk) +{ + et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk); +} + +static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et) +{ + return et->et_ops->eo_get_last_eb_blk(et); +} + +static inline void ocfs2_et_update_clusters(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters) +{ + et->et_ops->eo_update_clusters(inode, et, clusters); +} + +static inline int ocfs2_et_insert_check(struct inode *inode, + struct ocfs2_extent_tree *et, + struct ocfs2_extent_rec *rec) +{ + int ret = 0; + + if (et->et_ops->eo_insert_check) + ret = et->et_ops->eo_insert_check(inode, et, rec); + return ret; +} + +static inline int ocfs2_et_sanity_check(struct inode *inode, + struct ocfs2_extent_tree *et) +{ + int ret = 0; + + if (et->et_ops->eo_sanity_check) + ret = et->et_ops->eo_sanity_check(inode, et); + return ret; +} + static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, struct ocfs2_extent_block *eb); @@ -205,17 +539,6 @@ static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, } /* - * Allocate and initialize a new path based on a disk inode tree. - */ -static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh) -{ - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - struct ocfs2_extent_list *el = &di->id2.i_list; - - return ocfs2_new_path(di_bh, el); -} - -/* * Convenience function to journal all components in a path. */ static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, @@ -368,39 +691,35 @@ struct ocfs2_merge_ctxt { */ int ocfs2_num_free_extents(struct ocfs2_super *osb, struct inode *inode, - struct ocfs2_dinode *fe) + struct ocfs2_extent_tree *et) { int retval; - struct ocfs2_extent_list *el; + struct ocfs2_extent_list *el = NULL; struct ocfs2_extent_block *eb; struct buffer_head *eb_bh = NULL; + u64 last_eb_blk = 0; mlog_entry_void(); - if (!OCFS2_IS_VALID_DINODE(fe)) { - OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); - retval = -EIO; - goto bail; - } + el = et->et_root_el; + last_eb_blk = ocfs2_et_get_last_eb_blk(et); - if (fe->i_last_eb_blk) { - retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), - &eb_bh, OCFS2_BH_CACHED, inode); + if (last_eb_blk) { + retval = ocfs2_read_block(inode, last_eb_blk, + &eb_bh); if (retval < 0) { mlog_errno(retval); goto bail; } eb = (struct ocfs2_extent_block *) eb_bh->b_data; el = &eb->h_list; - } else - el = &fe->id2.i_list; + } BUG_ON(el->l_tree_depth != 0); retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); bail: - if (eb_bh) - brelse(eb_bh); + brelse(eb_bh); mlog_exit(retval); return retval; @@ -486,8 +805,7 @@ static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, bail: if (status < 0) { for(i = 0; i < wanted; i++) { - if (bhs[i]) - brelse(bhs[i]); + brelse(bhs[i]); bhs[i] = NULL; } } @@ -531,7 +849,7 @@ static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) static int ocfs2_add_branch(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, struct buffer_head *eb_bh, struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) @@ -540,7 +858,6 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, u64 next_blkno, new_last_eb_blk; struct buffer_head *bh; struct buffer_head **new_eb_bhs = NULL; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *eb_el; struct ocfs2_extent_list *el; @@ -550,13 +867,11 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, BUG_ON(!last_eb_bh || !*last_eb_bh); - fe = (struct ocfs2_dinode *) fe_bh->b_data; - if (eb_bh) { eb = (struct ocfs2_extent_block *) eb_bh->b_data; el = &eb->h_list; } else - el = &fe->id2.i_list; + el = et->et_root_el; /* we never add a branch to a leaf. */ BUG_ON(!el->l_tree_depth); @@ -646,7 +961,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, mlog_errno(status); goto bail; } - status = ocfs2_journal_access(handle, inode, fe_bh, + status = ocfs2_journal_access(handle, inode, et->et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -662,7 +977,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, } /* Link the new branch into the rest of the tree (el will - * either be on the fe, or the extent block passed in. */ + * either be on the root_bh, or the extent block passed in. */ i = le16_to_cpu(el->l_next_free_rec); el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); @@ -671,7 +986,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, /* fe needs a new last extent block pointer, as does the * next_leaf on the previously last-extent-block. */ - fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); + ocfs2_et_set_last_eb_blk(et, new_last_eb_blk); eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); @@ -679,7 +994,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, status = ocfs2_journal_dirty(handle, *last_eb_bh); if (status < 0) mlog_errno(status); - status = ocfs2_journal_dirty(handle, fe_bh); + status = ocfs2_journal_dirty(handle, et->et_root_bh); if (status < 0) mlog_errno(status); if (eb_bh) { @@ -700,8 +1015,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb, bail: if (new_eb_bhs) { for (i = 0; i < new_blocks; i++) - if (new_eb_bhs[i]) - brelse(new_eb_bhs[i]); + brelse(new_eb_bhs[i]); kfree(new_eb_bhs); } @@ -717,16 +1031,15 @@ bail: static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, struct ocfs2_alloc_context *meta_ac, struct buffer_head **ret_new_eb_bh) { int status, i; u32 new_clusters; struct buffer_head *new_eb_bh = NULL; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; - struct ocfs2_extent_list *fe_el; + struct ocfs2_extent_list *root_el; struct ocfs2_extent_list *eb_el; mlog_entry_void(); @@ -746,8 +1059,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, } eb_el = &eb->h_list; - fe = (struct ocfs2_dinode *) fe_bh->b_data; - fe_el = &fe->id2.i_list; + root_el = et->et_root_el; status = ocfs2_journal_access(handle, inode, new_eb_bh, OCFS2_JOURNAL_ACCESS_CREATE); @@ -756,11 +1068,11 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, goto bail; } - /* copy the fe data into the new extent block */ - eb_el->l_tree_depth = fe_el->l_tree_depth; - eb_el->l_next_free_rec = fe_el->l_next_free_rec; - for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) - eb_el->l_recs[i] = fe_el->l_recs[i]; + /* copy the root extent list data into the new extent block */ + eb_el->l_tree_depth = root_el->l_tree_depth; + eb_el->l_next_free_rec = root_el->l_next_free_rec; + for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) + eb_el->l_recs[i] = root_el->l_recs[i]; status = ocfs2_journal_dirty(handle, new_eb_bh); if (status < 0) { @@ -768,7 +1080,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, goto bail; } - status = ocfs2_journal_access(handle, inode, fe_bh, + status = ocfs2_journal_access(handle, inode, et->et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); @@ -777,21 +1089,21 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, new_clusters = ocfs2_sum_rightmost_rec(eb_el); - /* update fe now */ - le16_add_cpu(&fe_el->l_tree_depth, 1); - fe_el->l_recs[0].e_cpos = 0; - fe_el->l_recs[0].e_blkno = eb->h_blkno; - fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); - for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) - memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); - fe_el->l_next_free_rec = cpu_to_le16(1); + /* update root_bh now */ + le16_add_cpu(&root_el->l_tree_depth, 1); + root_el->l_recs[0].e_cpos = 0; + root_el->l_recs[0].e_blkno = eb->h_blkno; + root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); + for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) + memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); + root_el->l_next_free_rec = cpu_to_le16(1); /* If this is our 1st tree depth shift, then last_eb_blk * becomes the allocated extent block */ - if (fe_el->l_tree_depth == cpu_to_le16(1)) - fe->i_last_eb_blk = eb->h_blkno; + if (root_el->l_tree_depth == cpu_to_le16(1)) + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); - status = ocfs2_journal_dirty(handle, fe_bh); + status = ocfs2_journal_dirty(handle, et->et_root_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -801,8 +1113,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, new_eb_bh = NULL; status = 0; bail: - if (new_eb_bh) - brelse(new_eb_bh); + brelse(new_eb_bh); mlog_exit(status); return status; @@ -817,22 +1128,21 @@ bail: * 1) a lowest extent block is found, then we pass it back in * *lowest_eb_bh and return '0' * - * 2) the search fails to find anything, but the dinode has room. We + * 2) the search fails to find anything, but the root_el has room. We * pass NULL back in *lowest_eb_bh, but still return '0' * - * 3) the search fails to find anything AND the dinode is full, in + * 3) the search fails to find anything AND the root_el is full, in * which case we return > 0 * * return status < 0 indicates an error. */ static int ocfs2_find_branch_target(struct ocfs2_super *osb, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, struct buffer_head **target_bh) { int status = 0, i; u64 blkno; - struct ocfs2_dinode *fe; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; struct buffer_head *bh = NULL; @@ -842,8 +1152,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, *target_bh = NULL; - fe = (struct ocfs2_dinode *) fe_bh->b_data; - el = &fe->id2.i_list; + el = et->et_root_el; while(le16_to_cpu(el->l_tree_depth) > 1) { if (le16_to_cpu(el->l_next_free_rec) == 0) { @@ -864,13 +1173,10 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, goto bail; } - if (bh) { - brelse(bh); - bh = NULL; - } + brelse(bh); + bh = NULL; - status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, - inode); + status = ocfs2_read_block(inode, blkno, &bh); if (status < 0) { mlog_errno(status); goto bail; @@ -886,8 +1192,7 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, if (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count)) { - if (lowest_bh) - brelse(lowest_bh); + brelse(lowest_bh); lowest_bh = bh; get_bh(lowest_bh); } @@ -895,14 +1200,13 @@ static int ocfs2_find_branch_target(struct ocfs2_super *osb, /* If we didn't find one and the fe doesn't have any room, * then return '1' */ - if (!lowest_bh - && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count)) + el = et->et_root_el; + if (!lowest_bh && (el->l_next_free_rec == el->l_count)) status = 1; *target_bh = lowest_bh; bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; @@ -919,19 +1223,19 @@ bail: * *last_eb_bh will be updated by ocfs2_add_branch(). */ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, - struct buffer_head *di_bh, int *final_depth, + struct ocfs2_extent_tree *et, int *final_depth, struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) { int ret, shift; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - int depth = le16_to_cpu(di->id2.i_list.l_tree_depth); + struct ocfs2_extent_list *el = et->et_root_el; + int depth = le16_to_cpu(el->l_tree_depth); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct buffer_head *bh = NULL; BUG_ON(meta_ac == NULL); - shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh); + shift = ocfs2_find_branch_target(osb, inode, et, &bh); if (shift < 0) { ret = shift; mlog_errno(ret); @@ -948,7 +1252,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, /* ocfs2_shift_tree_depth will return us a buffer with * the new extent block (so we can pass that to * ocfs2_add_branch). */ - ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh, + ret = ocfs2_shift_tree_depth(osb, handle, inode, et, meta_ac, &bh); if (ret < 0) { mlog_errno(ret); @@ -975,7 +1279,7 @@ static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, /* call ocfs2_add_branch to add the final part of the tree with * the new data. */ mlog(0, "add branch. bh = %p\n", bh); - ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh, + ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh, meta_ac); if (ret < 0) { mlog_errno(ret); @@ -1236,8 +1540,7 @@ static int __ocfs2_find_path(struct inode *inode, brelse(bh); bh = NULL; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, - &bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, blkno, &bh); if (ret) { mlog_errno(ret); goto out; @@ -2058,11 +2361,11 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, struct ocfs2_path *right_path, int subtree_index, struct ocfs2_cached_dealloc_ctxt *dealloc, - int *deleted) + int *deleted, + struct ocfs2_extent_tree *et) { int ret, i, del_right_subtree = 0, right_has_empty = 0; - struct buffer_head *root_bh, *di_bh = path_root_bh(right_path); - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path); struct ocfs2_extent_list *right_leaf_el, *left_leaf_el; struct ocfs2_extent_block *eb; @@ -2114,7 +2417,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, * We have to update i_last_eb_blk during the meta * data delete. */ - ret = ocfs2_journal_access(handle, inode, di_bh, + ret = ocfs2_journal_access(handle, inode, et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -2189,7 +2492,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, ocfs2_update_edge_lengths(inode, handle, left_path); eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; - di->i_last_eb_blk = eb->h_blkno; + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); /* * Removal of the extent in the left leaf was skipped @@ -2199,7 +2502,7 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, if (right_has_empty) ocfs2_remove_empty_extent(left_leaf_el); - ret = ocfs2_journal_dirty(handle, di_bh); + ret = ocfs2_journal_dirty(handle, et_root_bh); if (ret) mlog_errno(ret); @@ -2322,7 +2625,8 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, int orig_credits, struct ocfs2_path *path, struct ocfs2_cached_dealloc_ctxt *dealloc, - struct ocfs2_path **empty_extent_path) + struct ocfs2_path **empty_extent_path, + struct ocfs2_extent_tree *et) { int ret, subtree_root, deleted; u32 right_cpos; @@ -2395,7 +2699,7 @@ static int __ocfs2_rotate_tree_left(struct inode *inode, ret = ocfs2_rotate_subtree_left(inode, handle, left_path, right_path, subtree_root, - dealloc, &deleted); + dealloc, &deleted, et); if (ret == -EAGAIN) { /* * The rotation has to temporarily stop due to @@ -2438,29 +2742,20 @@ out: } static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, - struct ocfs2_path *path, - struct ocfs2_cached_dealloc_ctxt *dealloc) + struct ocfs2_path *path, + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_extent_tree *et) { int ret, subtree_index; u32 cpos; struct ocfs2_path *left_path = NULL; - struct ocfs2_dinode *di; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; - /* - * XXX: This code assumes that the root is an inode, which is - * true for now but may change as tree code gets generic. - */ - di = (struct ocfs2_dinode *)path_root_bh(path)->b_data; - if (!OCFS2_IS_VALID_DINODE(di)) { - ret = -EIO; - ocfs2_error(inode->i_sb, - "Inode %llu has invalid path root", - (unsigned long long)OCFS2_I(inode)->ip_blkno); - goto out; - } + ret = ocfs2_et_sanity_check(inode, et); + if (ret) + goto out; /* * There's two ways we handle this depending on * whether path is the only existing one. @@ -2517,7 +2812,7 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, ocfs2_update_edge_lengths(inode, handle, left_path); eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; - di->i_last_eb_blk = eb->h_blkno; + ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); } else { /* * 'path' is also the leftmost path which @@ -2528,12 +2823,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, */ ocfs2_unlink_path(inode, handle, dealloc, path, 1); - el = &di->id2.i_list; + el = et->et_root_el; el->l_tree_depth = 0; el->l_next_free_rec = 0; memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); - di->i_last_eb_blk = 0; + ocfs2_et_set_last_eb_blk(et, 0); } ocfs2_journal_dirty(handle, path_root_bh(path)); @@ -2561,7 +2856,8 @@ out: */ static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, struct ocfs2_path *path, - struct ocfs2_cached_dealloc_ctxt *dealloc) + struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_extent_tree *et) { int ret, orig_credits = handle->h_buffer_credits; struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; @@ -2575,7 +2871,7 @@ static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, if (path->p_tree_depth == 0) { rightmost_no_delete: /* - * In-inode extents. This is trivially handled, so do + * Inline extents. This is trivially handled, so do * it up front. */ ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, @@ -2629,7 +2925,7 @@ rightmost_no_delete: */ ret = ocfs2_remove_rightmost_path(inode, handle, path, - dealloc); + dealloc, et); if (ret) mlog_errno(ret); goto out; @@ -2641,7 +2937,7 @@ rightmost_no_delete: */ try_rotate: ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path, - dealloc, &restart_path); + dealloc, &restart_path, et); if (ret && ret != -EAGAIN) { mlog_errno(ret); goto out; @@ -2653,7 +2949,7 @@ try_rotate: ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, tmp_path, dealloc, - &restart_path); + &restart_path, et); if (ret && ret != -EAGAIN) { mlog_errno(ret); goto out; @@ -2939,6 +3235,7 @@ static int ocfs2_merge_rec_left(struct inode *inode, handle_t *handle, struct ocfs2_extent_rec *split_rec, struct ocfs2_cached_dealloc_ctxt *dealloc, + struct ocfs2_extent_tree *et, int index) { int ret, i, subtree_index = 0, has_empty_extent = 0; @@ -3059,7 +3356,8 @@ static int ocfs2_merge_rec_left(struct inode *inode, le16_to_cpu(el->l_next_free_rec) == 1) { ret = ocfs2_remove_rightmost_path(inode, handle, - right_path, dealloc); + right_path, + dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -3086,7 +3384,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, int split_index, struct ocfs2_extent_rec *split_rec, struct ocfs2_cached_dealloc_ctxt *dealloc, - struct ocfs2_merge_ctxt *ctxt) + struct ocfs2_merge_ctxt *ctxt, + struct ocfs2_extent_tree *et) { int ret = 0; @@ -3104,7 +3403,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * illegal. */ ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc); + dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -3147,7 +3446,8 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); /* The merge left us with an empty extent, remove it. */ - ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); + ret = ocfs2_rotate_tree_left(inode, handle, path, + dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -3161,7 +3461,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, */ ret = ocfs2_merge_rec_left(inode, path, handle, rec, - dealloc, + dealloc, et, split_index); if (ret) { @@ -3170,7 +3470,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, } ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc); + dealloc, et); /* * Error from this last rotate is not critical, so * print but don't bubble it up. @@ -3190,7 +3490,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, ret = ocfs2_merge_rec_left(inode, path, handle, split_rec, - dealloc, + dealloc, et, split_index); if (ret) { mlog_errno(ret); @@ -3213,7 +3513,7 @@ static int ocfs2_try_to_merge_extent(struct inode *inode, * our leaf. Try to rotate it away. */ ret = ocfs2_rotate_tree_left(inode, handle, path, - dealloc); + dealloc, et); if (ret) mlog_errno(ret); ret = 0; @@ -3347,16 +3647,6 @@ rotate: ocfs2_rotate_leaf(el, insert_rec); } -static inline void ocfs2_update_dinode_clusters(struct inode *inode, - struct ocfs2_dinode *di, - u32 clusters) -{ - le32_add_cpu(&di->i_clusters, clusters); - spin_lock(&OCFS2_I(inode)->ip_lock); - OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); - spin_unlock(&OCFS2_I(inode)->ip_lock); -} - static void ocfs2_adjust_rightmost_records(struct inode *inode, handle_t *handle, struct ocfs2_path *path, @@ -3558,8 +3848,8 @@ static void ocfs2_split_record(struct inode *inode, } /* - * This function only does inserts on an allocation b-tree. For dinode - * lists, ocfs2_insert_at_leaf() is called directly. + * This function only does inserts on an allocation b-tree. For tree + * depth = 0, ocfs2_insert_at_leaf() is called directly. * * right_path is the path we want to do the actual insert * in. left_path should only be passed in if we need to update that @@ -3656,7 +3946,7 @@ out: static int ocfs2_do_insert_extent(struct inode *inode, handle_t *handle, - struct buffer_head *di_bh, + struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *insert_rec, struct ocfs2_insert_type *type) { @@ -3664,13 +3954,11 @@ static int ocfs2_do_insert_extent(struct inode *inode, u32 cpos; struct ocfs2_path *right_path = NULL; struct ocfs2_path *left_path = NULL; - struct ocfs2_dinode *di; struct ocfs2_extent_list *el; - di = (struct ocfs2_dinode *) di_bh->b_data; - el = &di->id2.i_list; + el = et->et_root_el; - ret = ocfs2_journal_access(handle, inode, di_bh, + ret = ocfs2_journal_access(handle, inode, et->et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -3682,7 +3970,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, goto out_update_clusters; } - right_path = ocfs2_new_inode_path(di_bh); + right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); if (!right_path) { ret = -ENOMEM; mlog_errno(ret); @@ -3732,7 +4020,7 @@ static int ocfs2_do_insert_extent(struct inode *inode, * ocfs2_rotate_tree_right() might have extended the * transaction without re-journaling our tree root. */ - ret = ocfs2_journal_access(handle, inode, di_bh, + ret = ocfs2_journal_access(handle, inode, et->et_root_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); @@ -3757,10 +4045,10 @@ static int ocfs2_do_insert_extent(struct inode *inode, out_update_clusters: if (type->ins_split == SPLIT_NONE) - ocfs2_update_dinode_clusters(inode, di, - le16_to_cpu(insert_rec->e_leaf_clusters)); + ocfs2_et_update_clusters(inode, et, + le16_to_cpu(insert_rec->e_leaf_clusters)); - ret = ocfs2_journal_dirty(handle, di_bh); + ret = ocfs2_journal_dirty(handle, et->et_root_bh); if (ret) mlog_errno(ret); @@ -3890,7 +4178,8 @@ out: static void ocfs2_figure_contig_type(struct inode *inode, struct ocfs2_insert_type *insert, struct ocfs2_extent_list *el, - struct ocfs2_extent_rec *insert_rec) + struct ocfs2_extent_rec *insert_rec, + struct ocfs2_extent_tree *et) { int i; enum ocfs2_contig_type contig_type = CONTIG_NONE; @@ -3906,6 +4195,21 @@ static void ocfs2_figure_contig_type(struct inode *inode, } } insert->ins_contig = contig_type; + + if (insert->ins_contig != CONTIG_NONE) { + struct ocfs2_extent_rec *rec = + &el->l_recs[insert->ins_contig_index]; + unsigned int len = le16_to_cpu(rec->e_leaf_clusters) + + le16_to_cpu(insert_rec->e_leaf_clusters); + + /* + * Caller might want us to limit the size of extents, don't + * calculate contiguousness if we might exceed that limit. + */ + if (et->et_max_leaf_clusters && + (len > et->et_max_leaf_clusters)) + insert->ins_contig = CONTIG_NONE; + } } /* @@ -3914,8 +4218,8 @@ static void ocfs2_figure_contig_type(struct inode *inode, * ocfs2_figure_appending_type() will figure out whether we'll have to * insert at the tail of the rightmost leaf. * - * This should also work against the dinode list for tree's with 0 - * depth. If we consider the dinode list to be the rightmost leaf node + * This should also work against the root extent list for tree's with 0 + * depth. If we consider the root extent list to be the rightmost leaf node * then the logic here makes sense. */ static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, @@ -3966,14 +4270,13 @@ set_tail_append: * structure. */ static int ocfs2_figure_insert_type(struct inode *inode, - struct buffer_head *di_bh, + struct ocfs2_extent_tree *et, struct buffer_head **last_eb_bh, struct ocfs2_extent_rec *insert_rec, int *free_records, struct ocfs2_insert_type *insert) { int ret; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; struct ocfs2_path *path = NULL; @@ -3981,7 +4284,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, insert->ins_split = SPLIT_NONE; - el = &di->id2.i_list; + el = et->et_root_el; insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); if (el->l_tree_depth) { @@ -3991,9 +4294,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, * ocfs2_figure_insert_type() and ocfs2_add_branch() * may want it later. */ - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - le64_to_cpu(di->i_last_eb_blk), &bh, - OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh); if (ret) { mlog_exit(ret); goto out; @@ -4014,12 +4315,12 @@ static int ocfs2_figure_insert_type(struct inode *inode, le16_to_cpu(el->l_next_free_rec); if (!insert->ins_tree_depth) { - ocfs2_figure_contig_type(inode, insert, el, insert_rec); + ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); ocfs2_figure_appending_type(insert, el, insert_rec); return 0; } - path = ocfs2_new_inode_path(di_bh); + path = ocfs2_new_path(et->et_root_bh, et->et_root_el); if (!path) { ret = -ENOMEM; mlog_errno(ret); @@ -4048,7 +4349,7 @@ static int ocfs2_figure_insert_type(struct inode *inode, * into two types of appends: simple record append, or a * rotate inside the tail leaf. */ - ocfs2_figure_contig_type(inode, insert, el, insert_rec); + ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); /* * The insert code isn't quite ready to deal with all cases of @@ -4069,7 +4370,8 @@ static int ocfs2_figure_insert_type(struct inode *inode, * the case that we're doing a tail append, so maybe we can * take advantage of that information somehow. */ - if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) { + if (ocfs2_et_get_last_eb_blk(et) == + path_leaf_bh(path)->b_blocknr) { /* * Ok, ocfs2_find_path() returned us the rightmost * tree path. This might be an appending insert. There are @@ -4099,7 +4401,7 @@ out: int ocfs2_insert_extent(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, u32 cpos, u64 start_blk, u32 new_clusters, @@ -4112,26 +4414,21 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, struct ocfs2_insert_type insert = {0, }; struct ocfs2_extent_rec rec; - BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); - mlog(0, "add %u clusters at position %u to inode %llu\n", new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); - mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && - (OCFS2_I(inode)->ip_clusters != cpos), - "Device %s, asking for sparse allocation: inode %llu, " - "cpos %u, clusters %u\n", - osb->dev_str, - (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, - OCFS2_I(inode)->ip_clusters); - memset(&rec, 0, sizeof(rec)); rec.e_cpos = cpu_to_le32(cpos); rec.e_blkno = cpu_to_le64(start_blk); rec.e_leaf_clusters = cpu_to_le16(new_clusters); rec.e_flags = flags; + status = ocfs2_et_insert_check(inode, et, &rec); + if (status) { + mlog_errno(status); + goto bail; + } - status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, + status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec, &free_records, &insert); if (status < 0) { mlog_errno(status); @@ -4145,7 +4442,7 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, free_records, insert.ins_tree_depth); if (insert.ins_contig == CONTIG_NONE && free_records == 0) { - status = ocfs2_grow_tree(inode, handle, fe_bh, + status = ocfs2_grow_tree(inode, handle, et, &insert.ins_tree_depth, &last_eb_bh, meta_ac); if (status) { @@ -4155,17 +4452,124 @@ int ocfs2_insert_extent(struct ocfs2_super *osb, } /* Finally, we can add clusters. This might rotate the tree for us. */ - status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); + status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert); if (status < 0) mlog_errno(status); - else + else if (et->et_ops == &ocfs2_dinode_et_ops) ocfs2_extent_map_insert_rec(inode, &rec); bail: - if (last_eb_bh) - brelse(last_eb_bh); + brelse(last_eb_bh); + + mlog_exit(status); + return status; +} + +/* + * Allcate and add clusters into the extent b-tree. + * The new clusters(clusters_to_add) will be inserted at logical_offset. + * The extent b-tree's root is specified by et, and + * it is not limited to the file storage. Any extent tree can use this + * function if it implements the proper ocfs2_extent_tree. + */ +int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct ocfs2_extent_tree *et, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) +{ + int status = 0; + int free_extents; + enum ocfs2_alloc_restarted reason = RESTART_NONE; + u32 bit_off, num_bits; + u64 block; + u8 flags = 0; + + BUG_ON(!clusters_to_add); + + if (mark_unwritten) + flags = OCFS2_EXT_UNWRITTEN; + + free_extents = ocfs2_num_free_extents(osb, inode, et); + if (free_extents < 0) { + status = free_extents; + mlog_errno(status); + goto leave; + } + + /* there are two cases which could cause us to EAGAIN in the + * we-need-more-metadata case: + * 1) we haven't reserved *any* + * 2) we are so fragmented, we've needed to add metadata too + * many times. */ + if (!free_extents && !meta_ac) { + mlog(0, "we haven't reserved any metadata!\n"); + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } else if ((!free_extents) + && (ocfs2_alloc_context_bits_left(meta_ac) + < ocfs2_extend_meta_needed(et->et_root_el))) { + mlog(0, "filesystem is really fragmented...\n"); + status = -EAGAIN; + reason = RESTART_META; + goto leave; + } + + status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, + clusters_to_add, &bit_off, &num_bits); + if (status < 0) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + BUG_ON(num_bits > clusters_to_add); + /* reserve our write early -- insert_extent may update the inode */ + status = ocfs2_journal_access(handle, inode, et->et_root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + block = ocfs2_clusters_to_blocks(osb->sb, bit_off); + mlog(0, "Allocating %u clusters at block %u for inode %llu\n", + num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); + status = ocfs2_insert_extent(osb, handle, inode, et, + *logical_offset, block, + num_bits, flags, meta_ac); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_dirty(handle, et->et_root_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + clusters_to_add -= num_bits; + *logical_offset += num_bits; + + if (clusters_to_add) { + mlog(0, "need to alloc once more, wanted = %u\n", + clusters_to_add); + status = -EAGAIN; + reason = RESTART_TRANS; + } + +leave: mlog_exit(status); + if (reason_ret) + *reason_ret = reason; return status; } @@ -4192,7 +4596,7 @@ static void ocfs2_make_right_split_rec(struct super_block *sb, static int ocfs2_split_and_insert(struct inode *inode, handle_t *handle, struct ocfs2_path *path, - struct buffer_head *di_bh, + struct ocfs2_extent_tree *et, struct buffer_head **last_eb_bh, int split_index, struct ocfs2_extent_rec *orig_split_rec, @@ -4206,7 +4610,6 @@ static int ocfs2_split_and_insert(struct inode *inode, struct ocfs2_extent_rec split_rec = *orig_split_rec; struct ocfs2_insert_type insert; struct ocfs2_extent_block *eb; - struct ocfs2_dinode *di; leftright: /* @@ -4215,8 +4618,7 @@ leftright: */ rec = path_leaf_el(path)->l_recs[split_index]; - di = (struct ocfs2_dinode *)di_bh->b_data; - rightmost_el = &di->id2.i_list; + rightmost_el = et->et_root_el; depth = le16_to_cpu(rightmost_el->l_tree_depth); if (depth) { @@ -4227,8 +4629,8 @@ leftright: if (le16_to_cpu(rightmost_el->l_next_free_rec) == le16_to_cpu(rightmost_el->l_count)) { - ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, last_eb_bh, - meta_ac); + ret = ocfs2_grow_tree(inode, handle, et, + &depth, last_eb_bh, meta_ac); if (ret) { mlog_errno(ret); goto out; @@ -4265,8 +4667,7 @@ leftright: do_leftright = 1; } - ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, - &insert); + ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); if (ret) { mlog_errno(ret); goto out; @@ -4308,8 +4709,9 @@ out: * of the tree is required. All other cases will degrade into a less * optimal tree layout. * - * last_eb_bh should be the rightmost leaf block for any inode with a - * btree. Since a split may grow the tree or a merge might shrink it, the caller cannot trust the contents of that buffer after this call. + * last_eb_bh should be the rightmost leaf block for any extent + * btree. Since a split may grow the tree or a merge might shrink it, + * the caller cannot trust the contents of that buffer after this call. * * This code is optimized for readability - several passes might be * made over certain portions of the tree. All of those blocks will @@ -4317,7 +4719,7 @@ out: * extra overhead is not expressed in terms of disk reads. */ static int __ocfs2_mark_extent_written(struct inode *inode, - struct buffer_head *di_bh, + struct ocfs2_extent_tree *et, handle_t *handle, struct ocfs2_path *path, int split_index, @@ -4357,11 +4759,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode, */ if (path->p_tree_depth) { struct ocfs2_extent_block *eb; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - le64_to_cpu(di->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); if (ret) { mlog_exit(ret); goto out; @@ -4394,7 +4794,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode, if (ctxt.c_split_covers_rec) el->l_recs[split_index] = *split_rec; else - ret = ocfs2_split_and_insert(inode, handle, path, di_bh, + ret = ocfs2_split_and_insert(inode, handle, path, et, &last_eb_bh, split_index, split_rec, meta_ac); if (ret) @@ -4402,7 +4802,7 @@ static int __ocfs2_mark_extent_written(struct inode *inode, } else { ret = ocfs2_try_to_merge_extent(inode, handle, path, split_index, split_rec, - dealloc, &ctxt); + dealloc, &ctxt, et); if (ret) mlog_errno(ret); } @@ -4420,7 +4820,8 @@ out: * * The caller is responsible for passing down meta_ac if we'll need it. */ -int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_mark_extent_written(struct inode *inode, + struct ocfs2_extent_tree *et, handle_t *handle, u32 cpos, u32 len, u32 phys, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) @@ -4446,10 +4847,14 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, /* * XXX: This should be fixed up so that we just re-insert the * next extent records. + * + * XXX: This is a hack on the extent tree, maybe it should be + * an op? */ - ocfs2_extent_map_trunc(inode, 0); + if (et->et_ops == &ocfs2_dinode_et_ops) + ocfs2_extent_map_trunc(inode, 0); - left_path = ocfs2_new_inode_path(di_bh); + left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); if (!left_path) { ret = -ENOMEM; mlog_errno(ret); @@ -4480,8 +4885,9 @@ int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; - ret = __ocfs2_mark_extent_written(inode, di_bh, handle, left_path, - index, &split_rec, meta_ac, dealloc); + ret = __ocfs2_mark_extent_written(inode, et, handle, left_path, + index, &split_rec, meta_ac, + dealloc); if (ret) mlog_errno(ret); @@ -4490,13 +4896,12 @@ out: return ret; } -static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, +static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, handle_t *handle, struct ocfs2_path *path, int index, u32 new_range, struct ocfs2_alloc_context *meta_ac) { int ret, depth, credits = handle->h_buffer_credits; - struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct buffer_head *last_eb_bh = NULL; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *rightmost_el, *el; @@ -4513,9 +4918,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, depth = path->p_tree_depth; if (depth > 0) { - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - le64_to_cpu(di->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), + &last_eb_bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -4526,7 +4930,8 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, } else rightmost_el = path_leaf_el(path); - credits += path->p_tree_depth + ocfs2_extend_meta_needed(di); + credits += path->p_tree_depth + + ocfs2_extend_meta_needed(et->et_root_el); ret = ocfs2_extend_trans(handle, credits); if (ret) { mlog_errno(ret); @@ -4535,7 +4940,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, if (le16_to_cpu(rightmost_el->l_next_free_rec) == le16_to_cpu(rightmost_el->l_count)) { - ret = ocfs2_grow_tree(inode, handle, di_bh, &depth, &last_eb_bh, + ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh, meta_ac); if (ret) { mlog_errno(ret); @@ -4549,7 +4954,7 @@ static int ocfs2_split_tree(struct inode *inode, struct buffer_head *di_bh, insert.ins_split = SPLIT_RIGHT; insert.ins_tree_depth = depth; - ret = ocfs2_do_insert_extent(inode, handle, di_bh, &split_rec, &insert); + ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); if (ret) mlog_errno(ret); @@ -4561,7 +4966,8 @@ out: static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, struct ocfs2_path *path, int index, struct ocfs2_cached_dealloc_ctxt *dealloc, - u32 cpos, u32 len) + u32 cpos, u32 len, + struct ocfs2_extent_tree *et) { int ret; u32 left_cpos, rec_range, trunc_range; @@ -4573,7 +4979,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, struct ocfs2_extent_block *eb; if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { - ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); + ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -4704,7 +5110,7 @@ static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, ocfs2_journal_dirty(handle, path_leaf_bh(path)); - ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc); + ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); if (ret) { mlog_errno(ret); goto out; @@ -4715,7 +5121,8 @@ out: return ret; } -int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_remove_extent(struct inode *inode, + struct ocfs2_extent_tree *et, u32 cpos, u32 len, handle_t *handle, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc) @@ -4724,11 +5131,11 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, u32 rec_range, trunc_range; struct ocfs2_extent_rec *rec; struct ocfs2_extent_list *el; - struct ocfs2_path *path; + struct ocfs2_path *path = NULL; ocfs2_extent_map_trunc(inode, 0); - path = ocfs2_new_inode_path(di_bh); + path = ocfs2_new_path(et->et_root_bh, et->et_root_el); if (!path) { ret = -ENOMEM; mlog_errno(ret); @@ -4781,13 +5188,13 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, - cpos, len); + cpos, len, et); if (ret) { mlog_errno(ret); goto out; } } else { - ret = ocfs2_split_tree(inode, di_bh, handle, path, index, + ret = ocfs2_split_tree(inode, et, handle, path, index, trunc_range, meta_ac); if (ret) { mlog_errno(ret); @@ -4836,7 +5243,7 @@ int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, } ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, - cpos, len); + cpos, len, et); if (ret) { mlog_errno(ret); goto out; @@ -5179,8 +5586,7 @@ static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, goto bail; } - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, - OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); if (status < 0) { iput(inode); mlog_errno(status); @@ -5255,8 +5661,7 @@ int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, bail: if (tl_inode) iput(tl_inode); - if (tl_bh) - brelse(tl_bh); + brelse(tl_bh); if (status < 0 && (*tl_copy)) { kfree(*tl_copy); @@ -5999,20 +6404,13 @@ bail: return status; } -static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh) +static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) { set_buffer_uptodate(bh); mark_buffer_dirty(bh); return 0; } -static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh) -{ - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - return ocfs2_journal_dirty_data(handle, bh); -} - static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, unsigned int from, unsigned int to, struct page *page, int zero, u64 *phys) @@ -6031,17 +6429,18 @@ static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, * here if they aren't - ocfs2_map_page_blocks() * might've skipped some */ - if (ocfs2_should_order_data(inode)) { - ret = walk_page_buffers(handle, - page_buffers(page), - from, to, &partial, - ocfs2_ordered_zero_func); - if (ret < 0) - mlog_errno(ret); - } else { + ret = walk_page_buffers(handle, page_buffers(page), + from, to, &partial, + ocfs2_zero_func); + if (ret < 0) + mlog_errno(ret); + else if (ocfs2_should_order_data(inode)) { + ret = ocfs2_jbd2_file_inode(handle, inode); +#ifdef CONFIG_OCFS2_COMPAT_JBD ret = walk_page_buffers(handle, page_buffers(page), from, to, &partial, - ocfs2_writeback_zero_func); + ocfs2_journal_dirty_data); +#endif if (ret < 0) mlog_errno(ret); } @@ -6206,20 +6605,29 @@ out: return ret; } -static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di) +static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode, + struct ocfs2_dinode *di) { unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits; + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); - memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2)); + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + memset(&di->id2, 0, blocksize - + offsetof(struct ocfs2_dinode, id2) - + xattrsize); + else + memset(&di->id2, 0, blocksize - + offsetof(struct ocfs2_dinode, id2)); } void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di) { - ocfs2_zero_dinode_id2(inode, di); + ocfs2_zero_dinode_id2_with_xattr(inode, di); di->id2.i_list.l_tree_depth = 0; di->id2.i_list.l_next_free_rec = 0; - di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb)); + di->id2.i_list.l_count = cpu_to_le16( + ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di)); } void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) @@ -6236,9 +6644,10 @@ void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) * We clear the entire i_data structure here so that all * fields can be properly initialized. */ - ocfs2_zero_dinode_id2(inode, di); + ocfs2_zero_dinode_id2_with_xattr(inode, di); - idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb)); + idata->id_count = cpu_to_le16( + ocfs2_max_inline_data_with_xattr(inode->i_sb, di)); } int ocfs2_convert_inline_data_to_extents(struct inode *inode, @@ -6253,6 +6662,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, struct ocfs2_alloc_context *data_ac = NULL; struct page **pages = NULL; loff_t end = osb->s_clustersize; + struct ocfs2_extent_tree et; has_data = i_size_read(inode) ? 1 : 0; @@ -6352,7 +6762,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode, * this proves to be false, we could always re-build * the in-inode data from our pages. */ - ret = ocfs2_insert_extent(osb, handle, inode, di_bh, + ocfs2_init_dinode_extent_tree(&et, inode, di_bh); + ret = ocfs2_insert_extent(osb, handle, inode, &et, 0, block, 1, 0, NULL); if (ret) { mlog_errno(ret); @@ -6395,13 +6806,14 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb, handle_t *handle = NULL; struct inode *tl_inode = osb->osb_tl_inode; struct ocfs2_path *path = NULL; + struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; mlog_entry_void(); new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, i_size_read(inode)); - path = ocfs2_new_inode_path(fe_bh); + path = ocfs2_new_path(fe_bh, &di->id2.i_list); if (!path) { status = -ENOMEM; mlog_errno(status); @@ -6572,8 +6984,8 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb, ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); if (fe->id2.i_list.l_tree_depth) { - status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), - &last_eb_bh, OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk), + &last_eb_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -6686,8 +7098,7 @@ static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) mlog(ML_NOTICE, "Truncate completion has non-empty dealloc context\n"); - if (tc->tc_last_eb_bh) - brelse(tc->tc_last_eb_bh); + brelse(tc->tc_last_eb_bh); kfree(tc); } diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 60cd3d59230..70257c84cfb 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -26,30 +26,102 @@ #ifndef OCFS2_ALLOC_H #define OCFS2_ALLOC_H + +/* + * For xattr tree leaf, we limit the leaf byte size to be 64K. + */ +#define OCFS2_MAX_XATTR_TREE_LEAF_SIZE 65536 + +/* + * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract + * the b-tree operations in ocfs2. Now all the b-tree operations are not + * limited to ocfs2_dinode only. Any data which need to allocate clusters + * to store can use b-tree. And it only needs to implement its ocfs2_extent_tree + * and operation. + * + * ocfs2_extent_tree becomes the first-class object for extent tree + * manipulation. Callers of the alloc.c code need to fill it via one of + * the ocfs2_init_*_extent_tree() operations below. + * + * ocfs2_extent_tree contains info for the root of the b-tree, it must have a + * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree + * functions. + * ocfs2_extent_tree_operations abstract the normal operations we do for + * the root of extent b-tree. + */ +struct ocfs2_extent_tree_operations; +struct ocfs2_extent_tree { + struct ocfs2_extent_tree_operations *et_ops; + struct buffer_head *et_root_bh; + struct ocfs2_extent_list *et_root_el; + void *et_object; + unsigned int et_max_leaf_clusters; +}; + +/* + * ocfs2_init_*_extent_tree() will fill an ocfs2_extent_tree from the + * specified object buffer. + */ +void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh); +void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh); +void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, + struct inode *inode, + struct buffer_head *bh, + struct ocfs2_xattr_value_root *xv); + struct ocfs2_alloc_context; int ocfs2_insert_extent(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, - struct buffer_head *fe_bh, + struct ocfs2_extent_tree *et, u32 cpos, u64 start_blk, u32 new_clusters, u8 flags, struct ocfs2_alloc_context *meta_ac); + +enum ocfs2_alloc_restarted { + RESTART_NONE = 0, + RESTART_TRANS, + RESTART_META +}; +int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct ocfs2_extent_tree *et, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret); struct ocfs2_cached_dealloc_ctxt; -int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_mark_extent_written(struct inode *inode, + struct ocfs2_extent_tree *et, handle_t *handle, u32 cpos, u32 len, u32 phys, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc); -int ocfs2_remove_extent(struct inode *inode, struct buffer_head *di_bh, +int ocfs2_remove_extent(struct inode *inode, + struct ocfs2_extent_tree *et, u32 cpos, u32 len, handle_t *handle, struct ocfs2_alloc_context *meta_ac, struct ocfs2_cached_dealloc_ctxt *dealloc); int ocfs2_num_free_extents(struct ocfs2_super *osb, struct inode *inode, - struct ocfs2_dinode *fe); -/* how many new metadata chunks would an allocation need at maximum? */ -static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe) + struct ocfs2_extent_tree *et); + +/* + * how many new metadata chunks would an allocation need at maximum? + * + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ +static inline int ocfs2_extend_meta_needed(struct ocfs2_extent_list *root_el) { /* * Rather than do all the work of determining how much we need @@ -59,7 +131,7 @@ static inline int ocfs2_extend_meta_needed(struct ocfs2_dinode *fe) * new tree_depth==0 extent_block, and one block at the new * top-of-the tree. */ - return le16_to_cpu(fe->id2.i_list.l_tree_depth) + 2; + return le16_to_cpu(root_el->l_tree_depth) + 2; } void ocfs2_dinode_new_extent_list(struct inode *inode, struct ocfs2_dinode *di); diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index a53da146627..c22543b3342 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -68,9 +68,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, goto bail; } - status = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, - &bh, OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); if (status < 0) { mlog_errno(status); goto bail; @@ -128,8 +126,7 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, err = 0; bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(err); return err; @@ -261,13 +258,11 @@ static int ocfs2_readpage_inline(struct inode *inode, struct page *page) { int ret; struct buffer_head *di_bh = NULL; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); BUG_ON(!PageLocked(page)); BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)); - ret = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &di_bh, - OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -485,11 +480,14 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode, } if (ocfs2_should_order_data(inode)) { + ret = ocfs2_jbd2_file_inode(handle, inode); +#ifdef CONFIG_OCFS2_COMPAT_JBD ret = walk_page_buffers(handle, page_buffers(page), from, to, NULL, ocfs2_journal_dirty_data); - if (ret < 0) +#endif + if (ret < 0) mlog_errno(ret); } out: @@ -669,7 +667,7 @@ static void ocfs2_invalidatepage(struct page *page, unsigned long offset) { journal_t *journal = OCFS2_SB(page->mapping->host->i_sb)->journal->j_journal; - journal_invalidatepage(journal, page, offset); + jbd2_journal_invalidatepage(journal, page, offset); } static int ocfs2_releasepage(struct page *page, gfp_t wait) @@ -678,7 +676,7 @@ static int ocfs2_releasepage(struct page *page, gfp_t wait) if (!page_has_buffers(page)) return 0; - return journal_try_to_free_buffers(journal, page, wait); + return jbd2_journal_try_to_free_buffers(journal, page, wait); } static ssize_t ocfs2_direct_IO(int rw, @@ -1074,11 +1072,15 @@ static void ocfs2_write_failure(struct inode *inode, tmppage = wc->w_pages[i]; if (page_has_buffers(tmppage)) { - if (ocfs2_should_order_data(inode)) + if (ocfs2_should_order_data(inode)) { + ocfs2_jbd2_file_inode(wc->w_handle, inode); +#ifdef CONFIG_OCFS2_COMPAT_JBD walk_page_buffers(wc->w_handle, page_buffers(tmppage), from, to, NULL, ocfs2_journal_dirty_data); +#endif + } block_commit_write(tmppage, from, to); } @@ -1242,6 +1244,7 @@ static int ocfs2_write_cluster(struct address_space *mapping, int ret, i, new, should_zero = 0; u64 v_blkno, p_blkno; struct inode *inode = mapping->host; + struct ocfs2_extent_tree et; new = phys == 0 ? 1 : 0; if (new || unwritten) @@ -1255,10 +1258,10 @@ static int ocfs2_write_cluster(struct address_space *mapping, * any additional semaphores or cluster locks. */ tmp_pos = cpos; - ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode, - &tmp_pos, 1, 0, wc->w_di_bh, - wc->w_handle, data_ac, - meta_ac, NULL); + ret = ocfs2_add_inode_data(OCFS2_SB(inode->i_sb), inode, + &tmp_pos, 1, 0, wc->w_di_bh, + wc->w_handle, data_ac, + meta_ac, NULL); /* * This shouldn't happen because we must have already * calculated the correct meta data allocation required. The @@ -1276,7 +1279,8 @@ static int ocfs2_write_cluster(struct address_space *mapping, goto out; } } else if (unwritten) { - ret = ocfs2_mark_extent_written(inode, wc->w_di_bh, + ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); + ret = ocfs2_mark_extent_written(inode, &et, wc->w_handle, cpos, 1, phys, meta_ac, &wc->w_dealloc); if (ret < 0) { @@ -1665,6 +1669,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle; + struct ocfs2_extent_tree et; ret = ocfs2_alloc_write_ctxt(&wc, osb, pos, len, di_bh); if (ret) { @@ -1712,14 +1717,23 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * ocfs2_lock_allocators(). It greatly over-estimates * the work to be done. */ - ret = ocfs2_lock_allocators(inode, di, clusters_to_alloc, - extents_to_split, &data_ac, &meta_ac); + mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u," + " clusters_to_add = %u, extents_to_split = %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (long long)i_size_read(inode), le32_to_cpu(di->i_clusters), + clusters_to_alloc, extents_to_split); + + ocfs2_init_dinode_extent_tree(&et, inode, wc->w_di_bh); + ret = ocfs2_lock_allocators(inode, &et, + clusters_to_alloc, extents_to_split, + &data_ac, &meta_ac); if (ret) { mlog_errno(ret); goto out; } - credits = ocfs2_calc_extend_credits(inode->i_sb, di, + credits = ocfs2_calc_extend_credits(inode->i_sb, + &di->id2.i_list, clusters_to_alloc); } @@ -1905,11 +1919,15 @@ int ocfs2_write_end_nolock(struct address_space *mapping, } if (page_has_buffers(tmppage)) { - if (ocfs2_should_order_data(inode)) + if (ocfs2_should_order_data(inode)) { + ocfs2_jbd2_file_inode(wc->w_handle, inode); +#ifdef CONFIG_OCFS2_COMPAT_JBD walk_page_buffers(wc->w_handle, page_buffers(tmppage), from, to, NULL, ocfs2_journal_dirty_data); +#endif + } block_commit_write(tmppage, from, to); } } diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index f136639f5b4..7e947c67246 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -66,7 +66,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, /* remove from dirty list before I/O. */ clear_buffer_dirty(bh); - get_bh(bh); /* for end_buffer_write_sync() */ + get_bh(bh); /* for end_buffer_write_sync() */ bh->b_end_io = end_buffer_write_sync; submit_bh(WRITE, bh); @@ -88,22 +88,103 @@ out: return ret; } -int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, - struct buffer_head *bhs[], int flags, - struct inode *inode) +int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, + unsigned int nr, struct buffer_head *bhs[]) +{ + int status = 0; + unsigned int i; + struct buffer_head *bh; + + if (!nr) { + mlog(ML_BH_IO, "No buffers will be read!\n"); + goto bail; + } + + for (i = 0 ; i < nr ; i++) { + if (bhs[i] == NULL) { + bhs[i] = sb_getblk(osb->sb, block++); + if (bhs[i] == NULL) { + status = -EIO; + mlog_errno(status); + goto bail; + } + } + bh = bhs[i]; + + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "trying to sync read a jbd " + "managed bh (blocknr = %llu), skipping\n", + (unsigned long long)bh->b_blocknr); + continue; + } + + if (buffer_dirty(bh)) { + /* This should probably be a BUG, or + * at least return an error. */ + mlog(ML_ERROR, + "trying to sync read a dirty " + "buffer! (blocknr = %llu), skipping\n", + (unsigned long long)bh->b_blocknr); + continue; + } + + lock_buffer(bh); + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "block %llu had the JBD bit set " + "while I was in lock_buffer!", + (unsigned long long)bh->b_blocknr); + BUG(); + } + + clear_buffer_uptodate(bh); + get_bh(bh); /* for end_buffer_read_sync() */ + bh->b_end_io = end_buffer_read_sync; + submit_bh(READ, bh); + } + + for (i = nr; i > 0; i--) { + bh = bhs[i - 1]; + + if (buffer_jbd(bh)) { + mlog(ML_ERROR, + "the journal got the buffer while it was " + "locked for io! (blocknr = %llu)\n", + (unsigned long long)bh->b_blocknr); + BUG(); + } + + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) { + /* Status won't be cleared from here on out, + * so we can safely record this and loop back + * to cleanup the other buffers. */ + status = -EIO; + put_bh(bh); + bhs[i - 1] = NULL; + } + } + +bail: + return status; +} + +int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, + struct buffer_head *bhs[], int flags) { int status = 0; - struct super_block *sb; int i, ignore_cache = 0; struct buffer_head *bh; - mlog_entry("(block=(%llu), nr=(%d), flags=%d, inode=%p)\n", - (unsigned long long)block, nr, flags, inode); + mlog_entry("(inode=%p, block=(%llu), nr=(%d), flags=%d)\n", + inode, (unsigned long long)block, nr, flags); + BUG_ON(!inode); BUG_ON((flags & OCFS2_BH_READAHEAD) && - (!inode || !(flags & OCFS2_BH_CACHED))); + (flags & OCFS2_BH_IGNORE_CACHE)); - if (osb == NULL || osb->sb == NULL || bhs == NULL) { + if (bhs == NULL) { status = -EINVAL; mlog_errno(status); goto bail; @@ -122,26 +203,19 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, goto bail; } - sb = osb->sb; - - if (flags & OCFS2_BH_CACHED && !inode) - flags &= ~OCFS2_BH_CACHED; - - if (inode) - mutex_lock(&OCFS2_I(inode)->ip_io_mutex); + mutex_lock(&OCFS2_I(inode)->ip_io_mutex); for (i = 0 ; i < nr ; i++) { if (bhs[i] == NULL) { - bhs[i] = sb_getblk(sb, block++); + bhs[i] = sb_getblk(inode->i_sb, block++); if (bhs[i] == NULL) { - if (inode) - mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); status = -EIO; mlog_errno(status); goto bail; } } bh = bhs[i]; - ignore_cache = 0; + ignore_cache = (flags & OCFS2_BH_IGNORE_CACHE); /* There are three read-ahead cases here which we need to * be concerned with. All three assume a buffer has @@ -167,26 +241,27 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, * before our is-it-in-flight check. */ - if (flags & OCFS2_BH_CACHED && - !ocfs2_buffer_uptodate(inode, bh)) { + if (!ignore_cache && !ocfs2_buffer_uptodate(inode, bh)) { mlog(ML_UPTODATE, "bh (%llu), inode %llu not uptodate\n", (unsigned long long)bh->b_blocknr, (unsigned long long)OCFS2_I(inode)->ip_blkno); + /* We're using ignore_cache here to say + * "go to disk" */ ignore_cache = 1; } /* XXX: Can we ever get this and *not* have the cached * flag set? */ if (buffer_jbd(bh)) { - if (!(flags & OCFS2_BH_CACHED) || ignore_cache) + if (ignore_cache) mlog(ML_BH_IO, "trying to sync read a jbd " "managed bh (blocknr = %llu)\n", (unsigned long long)bh->b_blocknr); continue; } - if (!(flags & OCFS2_BH_CACHED) || ignore_cache) { + if (ignore_cache) { if (buffer_dirty(bh)) { /* This should probably be a BUG, or * at least return an error. */ @@ -221,7 +296,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, * previously read-ahead buffer may have * completed I/O while we were waiting for the * buffer lock. */ - if ((flags & OCFS2_BH_CACHED) + if (!(flags & OCFS2_BH_IGNORE_CACHE) && !(flags & OCFS2_BH_READAHEAD) && ocfs2_buffer_uptodate(inode, bh)) { unlock_buffer(bh); @@ -265,15 +340,14 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr, /* Always set the buffer in the cache, even if it was * a forced read, or read-ahead which hasn't yet * completed. */ - if (inode) - ocfs2_set_buffer_uptodate(inode, bh); + ocfs2_set_buffer_uptodate(inode, bh); } - if (inode) - mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); + mutex_unlock(&OCFS2_I(inode)->ip_io_mutex); mlog(ML_BH_IO, "block=(%llu), nr=(%d), cached=%s, flags=0x%x\n", (unsigned long long)block, nr, - (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes", flags); + ((flags & OCFS2_BH_IGNORE_CACHE) || ignore_cache) ? "no" : "yes", + flags); bail: diff --git a/fs/ocfs2/buffer_head_io.h b/fs/ocfs2/buffer_head_io.h index c2e78614c3e..75e1dcb1ade 100644 --- a/fs/ocfs2/buffer_head_io.h +++ b/fs/ocfs2/buffer_head_io.h @@ -31,31 +31,29 @@ void ocfs2_end_buffer_io_sync(struct buffer_head *bh, int uptodate); -static inline int ocfs2_read_block(struct ocfs2_super *osb, +static inline int ocfs2_read_block(struct inode *inode, u64 off, - struct buffer_head **bh, - int flags, - struct inode *inode); + struct buffer_head **bh); int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh, struct inode *inode); -int ocfs2_read_blocks(struct ocfs2_super *osb, +int ocfs2_read_blocks(struct inode *inode, u64 block, int nr, struct buffer_head *bhs[], - int flags, - struct inode *inode); + int flags); +int ocfs2_read_blocks_sync(struct ocfs2_super *osb, u64 block, + unsigned int nr, struct buffer_head *bhs[]); int ocfs2_write_super_or_backup(struct ocfs2_super *osb, struct buffer_head *bh); -#define OCFS2_BH_CACHED 1 +#define OCFS2_BH_IGNORE_CACHE 1 #define OCFS2_BH_READAHEAD 8 -static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, - struct buffer_head **bh, int flags, - struct inode *inode) +static inline int ocfs2_read_block(struct inode *inode, u64 off, + struct buffer_head **bh) { int status = 0; @@ -65,8 +63,7 @@ static inline int ocfs2_read_block(struct ocfs2_super * osb, u64 off, goto bail; } - status = ocfs2_read_blocks(osb, off, 1, bh, - flags, inode); + status = ocfs2_read_blocks(inode, off, 1, bh, 0); bail: return status; diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 23c732f2752..d8a0cb92cef 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -109,6 +109,7 @@ static struct mlog_attribute mlog_attrs[MLOG_MAX_BITS] = { define_mask(CONN), define_mask(QUORUM), define_mask(EXPORT), + define_mask(XATTR), define_mask(ERROR), define_mask(NOTICE), define_mask(KTHREAD), diff --git a/fs/ocfs2/cluster/masklog.h b/fs/ocfs2/cluster/masklog.h index 597e064bb94..57670c68047 100644 --- a/fs/ocfs2/cluster/masklog.h +++ b/fs/ocfs2/cluster/masklog.h @@ -112,6 +112,7 @@ #define ML_CONN 0x0000000004000000ULL /* net connection management */ #define ML_QUORUM 0x0000000008000000ULL /* net connection quorum */ #define ML_EXPORT 0x0000000010000000ULL /* ocfs2 export operations */ +#define ML_XATTR 0x0000000020000000ULL /* ocfs2 extended attributes */ /* bits that are infrequently given and frequently matched in the high word */ #define ML_ERROR 0x0000000100000000ULL /* sent to KERN_ERR */ #define ML_NOTICE 0x0000000200000000ULL /* setn to KERN_NOTICE */ diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 9cce563fd62..026e6eb8518 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -82,6 +82,49 @@ static int ocfs2_do_extend_dir(struct super_block *sb, struct ocfs2_alloc_context *meta_ac, struct buffer_head **new_bh); +static struct buffer_head *ocfs2_bread(struct inode *inode, + int block, int *err, int reada) +{ + struct buffer_head *bh = NULL; + int tmperr; + u64 p_blkno; + int readflags = 0; + + if (reada) + readflags |= OCFS2_BH_READAHEAD; + + if (((u64)block << inode->i_sb->s_blocksize_bits) >= + i_size_read(inode)) { + BUG_ON(!reada); + return NULL; + } + + down_read(&OCFS2_I(inode)->ip_alloc_sem); + tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, + NULL); + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (tmperr < 0) { + mlog_errno(tmperr); + goto fail; + } + + tmperr = ocfs2_read_blocks(inode, p_blkno, 1, &bh, readflags); + if (tmperr < 0) + goto fail; + + tmperr = 0; + + *err = 0; + return bh; + +fail: + brelse(bh); + bh = NULL; + + *err = -EIO; + return NULL; +} + /* * bh passed here can be an inode block or a dir data block, depending * on the inode inline data flag. @@ -188,8 +231,7 @@ static struct buffer_head *ocfs2_find_entry_id(const char *name, struct ocfs2_dinode *di; struct ocfs2_inline_data *data; - ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno, - &di_bh, OCFS2_BH_CACHED, dir); + ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -260,14 +302,13 @@ restart: } if ((bh = bh_use[ra_ptr++]) == NULL) goto next; - wait_on_buffer(bh); - if (!buffer_uptodate(bh)) { - /* read error, skip block & hope for the best */ + if (ocfs2_read_block(dir, block, &bh)) { + /* read error, skip block & hope for the best. + * ocfs2_read_block() has released the bh. */ ocfs2_error(dir->i_sb, "reading directory %llu, " "offset %lu\n", (unsigned long long)OCFS2_I(dir)->ip_blkno, block); - brelse(bh); goto next; } i = ocfs2_search_dirblock(bh, dir, name, namelen, @@ -417,8 +458,7 @@ static inline int ocfs2_delete_entry_id(handle_t *handle, struct ocfs2_dinode *di; struct ocfs2_inline_data *data; - ret = ocfs2_read_block(OCFS2_SB(dir->i_sb), OCFS2_I(dir)->ip_blkno, - &di_bh, OCFS2_BH_CACHED, dir); + ret = ocfs2_read_block(dir, OCFS2_I(dir)->ip_blkno, &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -596,8 +636,7 @@ static int ocfs2_dir_foreach_blk_id(struct inode *inode, struct ocfs2_inline_data *data; struct ocfs2_dir_entry *de; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, - &di_bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); if (ret) { mlog(ML_ERROR, "Unable to read inode block for dir %llu\n", (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -716,8 +755,7 @@ static int ocfs2_dir_foreach_blk_el(struct inode *inode, for (i = ra_sectors >> (sb->s_blocksize_bits - 9); i > 0; i--) { tmp = ocfs2_bread(inode, ++blk, &err, 1); - if (tmp) - brelse(tmp); + brelse(tmp); } last_ra_blk = blk; ra_sectors = 8; @@ -899,10 +937,8 @@ int ocfs2_find_files_on_disk(const char *name, leave: if (status < 0) { *dirent = NULL; - if (*dirent_bh) { - brelse(*dirent_bh); - *dirent_bh = NULL; - } + brelse(*dirent_bh); + *dirent_bh = NULL; } mlog_exit(status); @@ -951,8 +987,7 @@ int ocfs2_check_dir_for_entry(struct inode *dir, ret = 0; bail: - if (dirent_bh) - brelse(dirent_bh); + brelse(dirent_bh); mlog_exit(ret); return ret; @@ -1127,8 +1162,7 @@ static int ocfs2_fill_new_dir_el(struct ocfs2_super *osb, status = 0; bail: - if (new_bh) - brelse(new_bh); + brelse(new_bh); mlog_exit(status); return status; @@ -1192,6 +1226,9 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, struct buffer_head *dirdata_bh = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; handle_t *handle; + struct ocfs2_extent_tree et; + + ocfs2_init_dinode_extent_tree(&et, dir, di_bh); alloc = ocfs2_clusters_for_bytes(sb, bytes); @@ -1305,8 +1342,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, * This should never fail as our extent list is empty and all * related blocks have been journaled already. */ - ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 0, blkno, len, 0, - NULL); + ret = ocfs2_insert_extent(osb, handle, dir, &et, 0, blkno, len, + 0, NULL); if (ret) { mlog_errno(ret); goto out_commit; @@ -1337,8 +1374,8 @@ static int ocfs2_expand_inline_dir(struct inode *dir, struct buffer_head *di_bh, } blkno = ocfs2_clusters_to_blocks(dir->i_sb, bit_off); - ret = ocfs2_insert_extent(osb, handle, dir, di_bh, 1, blkno, - len, 0, NULL); + ret = ocfs2_insert_extent(osb, handle, dir, &et, 1, + blkno, len, 0, NULL); if (ret) { mlog_errno(ret); goto out_commit; @@ -1383,9 +1420,9 @@ static int ocfs2_do_extend_dir(struct super_block *sb, if (extend) { u32 offset = OCFS2_I(dir)->ip_clusters; - status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset, - 1, 0, parent_fe_bh, handle, - data_ac, meta_ac, NULL); + status = ocfs2_add_inode_data(OCFS2_SB(sb), dir, &offset, + 1, 0, parent_fe_bh, handle, + data_ac, meta_ac, NULL); BUG_ON(status == -EAGAIN); if (status < 0) { mlog_errno(status); @@ -1430,12 +1467,14 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, int credits, num_free_extents, drop_alloc_sem = 0; loff_t dir_i_size; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) parent_fe_bh->b_data; + struct ocfs2_extent_list *el = &fe->id2.i_list; struct ocfs2_alloc_context *data_ac = NULL; struct ocfs2_alloc_context *meta_ac = NULL; handle_t *handle = NULL; struct buffer_head *new_bh = NULL; struct ocfs2_dir_entry * de; struct super_block *sb = osb->sb; + struct ocfs2_extent_tree et; mlog_entry_void(); @@ -1479,7 +1518,8 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, spin_lock(&OCFS2_I(dir)->ip_lock); if (dir_i_size == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters)) { spin_unlock(&OCFS2_I(dir)->ip_lock); - num_free_extents = ocfs2_num_free_extents(osb, dir, fe); + ocfs2_init_dinode_extent_tree(&et, dir, parent_fe_bh); + num_free_extents = ocfs2_num_free_extents(osb, dir, &et); if (num_free_extents < 0) { status = num_free_extents; mlog_errno(status); @@ -1487,7 +1527,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, } if (!num_free_extents) { - status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac); + status = ocfs2_reserve_new_metadata(osb, el, &meta_ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -1502,7 +1542,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb, goto bail; } - credits = ocfs2_calc_extend_credits(sb, fe, 1); + credits = ocfs2_calc_extend_credits(sb, el, 1); } else { spin_unlock(&OCFS2_I(dir)->ip_lock); credits = OCFS2_SIMPLE_DIR_EXTEND_CREDITS; @@ -1568,8 +1608,7 @@ bail: if (meta_ac) ocfs2_free_alloc_context(meta_ac); - if (new_bh) - brelse(new_bh); + brelse(new_bh); mlog_exit(status); return status; @@ -1696,8 +1735,7 @@ static int ocfs2_find_dir_space_el(struct inode *dir, const char *name, status = 0; bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; @@ -1756,7 +1794,6 @@ int ocfs2_prepare_dir_for_insert(struct ocfs2_super *osb, *ret_de_bh = bh; bh = NULL; out: - if (bh) - brelse(bh); + brelse(bh); return ret; } diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index eae3d643a5e..ec684426034 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2024,8 +2024,7 @@ static int ocfs2_inode_lock_update(struct inode *inode, } else { /* Boo, we have to go to disk. */ /* read bh, cast, ocfs2_refresh_inode */ - status = ocfs2_read_block(OCFS2_SB(inode->i_sb), oi->ip_blkno, - bh, OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, oi->ip_blkno, bh); if (status < 0) { mlog_errno(status); goto bail_refresh; @@ -2086,11 +2085,7 @@ static int ocfs2_assign_bh(struct inode *inode, return 0; } - status = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, - ret_bh, - OCFS2_BH_CACHED, - inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, ret_bh); if (status < 0) mlog_errno(status); diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index aed268e80b4..2baedac5823 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -293,8 +293,7 @@ static int ocfs2_last_eb_is_empty(struct inode *inode, struct ocfs2_extent_block *eb; struct ocfs2_extent_list *el; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), last_eb_blk, - &eb_bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, last_eb_blk, &eb_bh); if (ret) { mlog_errno(ret); goto out; @@ -382,9 +381,9 @@ static int ocfs2_figure_hole_clusters(struct inode *inode, if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL) goto no_more_extents; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), + ret = ocfs2_read_block(inode, le64_to_cpu(eb->h_next_leaf_blk), - &next_eb_bh, OCFS2_BH_CACHED, inode); + &next_eb_bh); if (ret) { mlog_errno(ret); goto out; @@ -551,6 +550,66 @@ static void ocfs2_relative_extent_offsets(struct super_block *sb, *num_clusters = le16_to_cpu(rec->e_leaf_clusters) - coff; } +int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + struct ocfs2_extent_list *el) +{ + int ret = 0, i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec; + u32 coff; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "xattr leaf block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + i = ocfs2_search_extent_list(el, v_cluster); + if (i == -1) { + ret = -EROFS; + mlog_errno(ret); + goto out; + } else { + rec = &el->l_recs[i]; + BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos)); + + if (!rec->e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0) in xattr", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + coff = v_cluster - le32_to_cpu(rec->e_cpos); + *p_cluster = ocfs2_blocks_to_clusters(inode->i_sb, + le64_to_cpu(rec->e_blkno)); + *p_cluster = *p_cluster + coff; + if (num_clusters) + *num_clusters = ocfs2_rec_clusters(el, rec) - coff; + } +out: + if (eb_bh) + brelse(eb_bh); + return ret; +} + int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster, u32 *num_clusters, unsigned int *extent_flags) @@ -571,8 +630,7 @@ int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, if (ret == 0) goto out; - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno, - &di_bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &di_bh); if (ret) { mlog_errno(ret); goto out; diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index 1b97490e1ea..1c4aa8b06f3 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -53,4 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 map_start, u64 map_len); +int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, + u32 *p_cluster, u32 *num_clusters, + struct ocfs2_extent_list *el); + #endif /* _EXTENT_MAP_H */ diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index ed38796052d..8d3225a7807 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -55,6 +55,7 @@ #include "mmap.h" #include "suballoc.h" #include "super.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -184,7 +185,7 @@ static int ocfs2_sync_file(struct file *file, goto bail; journal = osb->journal->j_journal; - err = journal_force_commit(journal); + err = jbd2_journal_force_commit(journal); bail: mlog_exit(err); @@ -488,7 +489,7 @@ bail: } /* - * extend allocation only here. + * extend file allocation only here. * we'll update all the disk stuff, and oip->alloc_size * * expect stuff to be locked, a transaction started and enough data / @@ -497,189 +498,25 @@ bail: * Will return -EAGAIN, and a reason if a restart is needed. * If passed in, *reason will always be set, even in error. */ -int ocfs2_do_extend_allocation(struct ocfs2_super *osb, - struct inode *inode, - u32 *logical_offset, - u32 clusters_to_add, - int mark_unwritten, - struct buffer_head *fe_bh, - handle_t *handle, - struct ocfs2_alloc_context *data_ac, - struct ocfs2_alloc_context *meta_ac, - enum ocfs2_alloc_restarted *reason_ret) +int ocfs2_add_inode_data(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct buffer_head *fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret) { - int status = 0; - int free_extents; - struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; - enum ocfs2_alloc_restarted reason = RESTART_NONE; - u32 bit_off, num_bits; - u64 block; - u8 flags = 0; - - BUG_ON(!clusters_to_add); - - if (mark_unwritten) - flags = OCFS2_EXT_UNWRITTEN; - - free_extents = ocfs2_num_free_extents(osb, inode, fe); - if (free_extents < 0) { - status = free_extents; - mlog_errno(status); - goto leave; - } - - /* there are two cases which could cause us to EAGAIN in the - * we-need-more-metadata case: - * 1) we haven't reserved *any* - * 2) we are so fragmented, we've needed to add metadata too - * many times. */ - if (!free_extents && !meta_ac) { - mlog(0, "we haven't reserved any metadata!\n"); - status = -EAGAIN; - reason = RESTART_META; - goto leave; - } else if ((!free_extents) - && (ocfs2_alloc_context_bits_left(meta_ac) - < ocfs2_extend_meta_needed(fe))) { - mlog(0, "filesystem is really fragmented...\n"); - status = -EAGAIN; - reason = RESTART_META; - goto leave; - } - - status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, - clusters_to_add, &bit_off, &num_bits); - if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); - goto leave; - } - - BUG_ON(num_bits > clusters_to_add); - - /* reserve our write early -- insert_extent may update the inode */ - status = ocfs2_journal_access(handle, inode, fe_bh, - OCFS2_JOURNAL_ACCESS_WRITE); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - block = ocfs2_clusters_to_blocks(osb->sb, bit_off); - mlog(0, "Allocating %u clusters at block %u for inode %llu\n", - num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); - status = ocfs2_insert_extent(osb, handle, inode, fe_bh, - *logical_offset, block, num_bits, - flags, meta_ac); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - status = ocfs2_journal_dirty(handle, fe_bh); - if (status < 0) { - mlog_errno(status); - goto leave; - } - - clusters_to_add -= num_bits; - *logical_offset += num_bits; - - if (clusters_to_add) { - mlog(0, "need to alloc once more, clusters = %u, wanted = " - "%u\n", fe->i_clusters, clusters_to_add); - status = -EAGAIN; - reason = RESTART_TRANS; - } - -leave: - mlog_exit(status); - if (reason_ret) - *reason_ret = reason; - return status; -} - -/* - * For a given allocation, determine which allocators will need to be - * accessed, and lock them, reserving the appropriate number of bits. - * - * Sparse file systems call this from ocfs2_write_begin_nolock() - * and ocfs2_allocate_unwritten_extents(). - * - * File systems which don't support holes call this from - * ocfs2_extend_allocation(). - */ -int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, - u32 clusters_to_add, u32 extents_to_split, - struct ocfs2_alloc_context **data_ac, - struct ocfs2_alloc_context **meta_ac) -{ - int ret = 0, num_free_extents; - unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; - struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - - *meta_ac = NULL; - if (data_ac) - *data_ac = NULL; - - BUG_ON(clusters_to_add != 0 && data_ac == NULL); - - mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " - "clusters_to_add = %u, extents_to_split = %u\n", - (unsigned long long)OCFS2_I(inode)->ip_blkno, (long long)i_size_read(inode), - le32_to_cpu(di->i_clusters), clusters_to_add, extents_to_split); - - num_free_extents = ocfs2_num_free_extents(osb, inode, di); - if (num_free_extents < 0) { - ret = num_free_extents; - mlog_errno(ret); - goto out; - } - - /* - * Sparse allocation file systems need to be more conservative - * with reserving room for expansion - the actual allocation - * happens while we've got a journal handle open so re-taking - * a cluster lock (because we ran out of room for another - * extent) will violate ordering rules. - * - * Most of the time we'll only be seeing this 1 cluster at a time - * anyway. - * - * Always lock for any unwritten extents - we might want to - * add blocks during a split. - */ - if (!num_free_extents || - (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { - ret = ocfs2_reserve_new_metadata(osb, di, meta_ac); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto out; - } - } - - if (clusters_to_add == 0) - goto out; - - ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); - if (ret < 0) { - if (ret != -ENOSPC) - mlog_errno(ret); - goto out; - } - -out: - if (ret) { - if (*meta_ac) { - ocfs2_free_alloc_context(*meta_ac); - *meta_ac = NULL; - } + int ret; + struct ocfs2_extent_tree et; - /* - * We cannot have an error and a non null *data_ac. - */ - } + ocfs2_init_dinode_extent_tree(&et, inode, fe_bh); + ret = ocfs2_add_clusters_in_btree(osb, inode, logical_offset, + clusters_to_add, mark_unwritten, + &et, handle, + data_ac, meta_ac, reason_ret); return ret; } @@ -698,6 +535,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, struct ocfs2_alloc_context *meta_ac = NULL; enum ocfs2_alloc_restarted why; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_tree et; mlog_entry("(clusters_to_add = %u)\n", clusters_to_add); @@ -707,8 +545,7 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, */ BUG_ON(mark_unwritten && !ocfs2_sparse_alloc(osb)); - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, - OCFS2_BH_CACHED, inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); if (status < 0) { mlog_errno(status); goto leave; @@ -724,14 +561,21 @@ static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, restart_all: BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters); - status = ocfs2_lock_allocators(inode, fe, clusters_to_add, 0, &data_ac, - &meta_ac); + mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, " + "clusters_to_add = %u\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (long long)i_size_read(inode), le32_to_cpu(fe->i_clusters), + clusters_to_add); + ocfs2_init_dinode_extent_tree(&et, inode, bh); + status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, + &data_ac, &meta_ac); if (status) { mlog_errno(status); goto leave; } - credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add); + credits = ocfs2_calc_extend_credits(osb->sb, &fe->id2.i_list, + clusters_to_add); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { status = PTR_ERR(handle); @@ -753,16 +597,16 @@ restarted_transaction: prev_clusters = OCFS2_I(inode)->ip_clusters; - status = ocfs2_do_extend_allocation(osb, - inode, - &logical_start, - clusters_to_add, - mark_unwritten, - bh, - handle, - data_ac, - meta_ac, - &why); + status = ocfs2_add_inode_data(osb, + inode, + &logical_start, + clusters_to_add, + mark_unwritten, + bh, + handle, + data_ac, + meta_ac, + &why); if ((status < 0) && (status != -EAGAIN)) { if (status != -ENOSPC) mlog_errno(status); @@ -789,7 +633,7 @@ restarted_transaction: mlog(0, "restarting transaction.\n"); /* TODO: This can be more intelligent. */ credits = ocfs2_calc_extend_credits(osb->sb, - fe, + &fe->id2.i_list, clusters_to_add); status = ocfs2_extend_trans(handle, credits); if (status < 0) { @@ -826,10 +670,8 @@ leave: restart_func = 0; goto restart_all; } - if (bh) { - brelse(bh); - bh = NULL; - } + brelse(bh); + bh = NULL; mlog_exit(status); return status; @@ -1096,9 +938,15 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr) goto bail_unlock; } - if (i_size_read(inode) > attr->ia_size) + if (i_size_read(inode) > attr->ia_size) { + if (ocfs2_should_order_data(inode)) { + status = ocfs2_begin_ordered_truncate(inode, + attr->ia_size); + if (status) + goto bail_unlock; + } status = ocfs2_truncate_file(inode, bh, attr->ia_size); - else + } else status = ocfs2_extend_file(inode, bh, attr->ia_size); if (status < 0) { if (status != -ENOSPC) @@ -1140,8 +988,7 @@ bail_unlock_rw: if (size_change) ocfs2_rw_unlock(inode, 1); bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; @@ -1284,8 +1131,7 @@ static int ocfs2_write_remove_suid(struct inode *inode) struct buffer_head *bh = NULL; struct ocfs2_inode_info *oi = OCFS2_I(inode); - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - oi->ip_blkno, &bh, OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, oi->ip_blkno, &bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -1311,9 +1157,8 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode, struct buffer_head *di_bh = NULL; if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { - ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, &di_bh, - OCFS2_BH_CACHED, inode); + ret = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, + &di_bh); if (ret) { mlog_errno(ret); goto out; @@ -1394,8 +1239,11 @@ static int __ocfs2_remove_inode_range(struct inode *inode, handle_t *handle; struct ocfs2_alloc_context *meta_ac = NULL; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_extent_tree et; - ret = ocfs2_lock_allocators(inode, di, 0, 1, NULL, &meta_ac); + ocfs2_init_dinode_extent_tree(&et, inode, di_bh); + + ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); if (ret) { mlog_errno(ret); return ret; @@ -1425,7 +1273,7 @@ static int __ocfs2_remove_inode_range(struct inode *inode, goto out; } - ret = ocfs2_remove_extent(inode, di_bh, cpos, len, handle, meta_ac, + ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, dealloc); if (ret) { mlog_errno(ret); @@ -2040,7 +1888,7 @@ out_dio: */ if (old_size != i_size_read(inode) || old_clusters != OCFS2_I(inode)->ip_clusters) { - ret = journal_force_commit(osb->journal->j_journal); + ret = jbd2_journal_force_commit(osb->journal->j_journal); if (ret < 0) written = ret; } @@ -2227,6 +2075,10 @@ const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, .fallocate = ocfs2_fallocate, .fiemap = ocfs2_fiemap, }; @@ -2237,6 +2089,10 @@ const struct inode_operations ocfs2_special_file_iops = { .permission = ocfs2_permission, }; +/* + * Other than ->lock, keep ocfs2_fops and ocfs2_dops in sync with + * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! + */ const struct file_operations ocfs2_fops = { .llseek = generic_file_llseek, .read = do_sync_read, @@ -2251,6 +2107,7 @@ const struct file_operations ocfs2_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, #endif + .lock = ocfs2_lock, .flock = ocfs2_flock, .splice_read = ocfs2_file_splice_read, .splice_write = ocfs2_file_splice_write, @@ -2267,5 +2124,51 @@ const struct file_operations ocfs2_dops = { #ifdef CONFIG_COMPAT .compat_ioctl = ocfs2_compat_ioctl, #endif + .lock = ocfs2_lock, + .flock = ocfs2_flock, +}; + +/* + * POSIX-lockless variants of our file_operations. + * + * These will be used if the underlying cluster stack does not support + * posix file locking, if the user passes the "localflocks" mount + * option, or if we have a local-only fs. + * + * ocfs2_flock is in here because all stacks handle UNIX file locks, + * so we still want it in the case of no stack support for + * plocks. Internally, it will do the right thing when asked to ignore + * the cluster. + */ +const struct file_operations ocfs2_fops_no_plocks = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .write = do_sync_write, + .mmap = ocfs2_mmap, + .fsync = ocfs2_sync_file, + .release = ocfs2_file_release, + .open = ocfs2_file_open, + .aio_read = ocfs2_file_aio_read, + .aio_write = ocfs2_file_aio_write, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif + .flock = ocfs2_flock, + .splice_read = ocfs2_file_splice_read, + .splice_write = ocfs2_file_splice_write, +}; + +const struct file_operations ocfs2_dops_no_plocks = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = ocfs2_readdir, + .fsync = ocfs2_sync_file, + .release = ocfs2_dir_release, + .open = ocfs2_dir_open, + .unlocked_ioctl = ocfs2_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ocfs2_compat_ioctl, +#endif .flock = ocfs2_flock, }; diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 1e27b4d017e..e92382cbca5 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -28,9 +28,12 @@ extern const struct file_operations ocfs2_fops; extern const struct file_operations ocfs2_dops; +extern const struct file_operations ocfs2_fops_no_plocks; +extern const struct file_operations ocfs2_dops_no_plocks; extern const struct inode_operations ocfs2_file_iops; extern const struct inode_operations ocfs2_special_file_iops; struct ocfs2_alloc_context; +enum ocfs2_alloc_restarted; struct ocfs2_file_private { struct file *fp_file; @@ -38,27 +41,18 @@ struct ocfs2_file_private { struct ocfs2_lock_res fp_flock; }; -enum ocfs2_alloc_restarted { - RESTART_NONE = 0, - RESTART_TRANS, - RESTART_META -}; -int ocfs2_do_extend_allocation(struct ocfs2_super *osb, - struct inode *inode, - u32 *logical_offset, - u32 clusters_to_add, - int mark_unwritten, - struct buffer_head *fe_bh, - handle_t *handle, - struct ocfs2_alloc_context *data_ac, - struct ocfs2_alloc_context *meta_ac, - enum ocfs2_alloc_restarted *reason_ret); +int ocfs2_add_inode_data(struct ocfs2_super *osb, + struct inode *inode, + u32 *logical_offset, + u32 clusters_to_add, + int mark_unwritten, + struct buffer_head *fe_bh, + handle_t *handle, + struct ocfs2_alloc_context *data_ac, + struct ocfs2_alloc_context *meta_ac, + enum ocfs2_alloc_restarted *reason_ret); int ocfs2_extend_no_holes(struct inode *inode, u64 new_i_size, u64 zero_to); -int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di, - u32 clusters_to_add, u32 extents_to_split, - struct ocfs2_alloc_context **data_ac, - struct ocfs2_alloc_context **meta_ac); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 7e9e4c79aec..4903688f72a 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -49,6 +49,7 @@ #include "symlink.h" #include "sysfile.h" #include "uptodate.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -219,6 +220,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, struct super_block *sb; struct ocfs2_super *osb; int status = -EINVAL; + int use_plocks = 1; mlog_entry("(0x%p, size:%llu)\n", inode, (unsigned long long)le64_to_cpu(fe->i_size)); @@ -226,6 +228,10 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, sb = inode->i_sb; osb = OCFS2_SB(sb); + if ((osb->s_mount_opt & OCFS2_MOUNT_LOCALFLOCKS) || + ocfs2_mount_local(osb) || !ocfs2_stack_supports_plocks()) + use_plocks = 0; + /* this means that read_inode cannot create a superblock inode * today. change if needed. */ if (!OCFS2_IS_VALID_DINODE(fe) || @@ -295,13 +301,19 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe, switch (inode->i_mode & S_IFMT) { case S_IFREG: - inode->i_fop = &ocfs2_fops; + if (use_plocks) + inode->i_fop = &ocfs2_fops; + else + inode->i_fop = &ocfs2_fops_no_plocks; inode->i_op = &ocfs2_file_iops; i_size_write(inode, le64_to_cpu(fe->i_size)); break; case S_IFDIR: inode->i_op = &ocfs2_dir_iops; - inode->i_fop = &ocfs2_dops; + if (use_plocks) + inode->i_fop = &ocfs2_dops; + else + inode->i_fop = &ocfs2_dops_no_plocks; i_size_write(inode, le64_to_cpu(fe->i_size)); break; case S_IFLNK: @@ -448,8 +460,11 @@ static int ocfs2_read_locked_inode(struct inode *inode, } } - status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0, - can_lock ? inode : NULL); + if (can_lock) + status = ocfs2_read_blocks(inode, args->fi_blkno, 1, &bh, + OCFS2_BH_IGNORE_CACHE); + else + status = ocfs2_read_blocks_sync(osb, args->fi_blkno, 1, &bh); if (status < 0) { mlog_errno(status); goto bail; @@ -522,6 +537,9 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb, * data and fast symlinks. */ if (fe->i_clusters) { + if (ocfs2_should_order_data(inode)) + ocfs2_begin_ordered_truncate(inode, 0); + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); @@ -730,6 +748,13 @@ static int ocfs2_wipe_inode(struct inode *inode, goto bail_unlock_dir; } + /*Free extended attribute resources associated with this inode.*/ + status = ocfs2_xattr_remove(inode, di_bh); + if (status < 0) { + mlog_errno(status); + goto bail_unlock_dir; + } + status = ocfs2_remove_inode(inode, di_bh, orphan_dir_inode, orphan_dir_bh); if (status < 0) @@ -1081,6 +1106,8 @@ void ocfs2_clear_inode(struct inode *inode) oi->ip_last_trans = 0; oi->ip_dir_start_lookup = 0; oi->ip_blkno = 0ULL; + jbd2_journal_release_jbd_inode(OCFS2_SB(inode->i_sb)->journal->j_journal, + &oi->ip_jinode); bail: mlog_exit_void(); @@ -1107,58 +1134,6 @@ void ocfs2_drop_inode(struct inode *inode) } /* - * TODO: this should probably be merged into ocfs2_get_block - * - * However, you now need to pay attention to the cont_prepare_write() - * stuff in ocfs2_get_block (that is, ocfs2_get_block pretty much - * expects never to extend). - */ -struct buffer_head *ocfs2_bread(struct inode *inode, - int block, int *err, int reada) -{ - struct buffer_head *bh = NULL; - int tmperr; - u64 p_blkno; - int readflags = OCFS2_BH_CACHED; - - if (reada) - readflags |= OCFS2_BH_READAHEAD; - - if (((u64)block << inode->i_sb->s_blocksize_bits) >= - i_size_read(inode)) { - BUG_ON(!reada); - return NULL; - } - - down_read(&OCFS2_I(inode)->ip_alloc_sem); - tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, - NULL); - up_read(&OCFS2_I(inode)->ip_alloc_sem); - if (tmperr < 0) { - mlog_errno(tmperr); - goto fail; - } - - tmperr = ocfs2_read_block(OCFS2_SB(inode->i_sb), p_blkno, &bh, - readflags, inode); - if (tmperr < 0) - goto fail; - - tmperr = 0; - - *err = 0; - return bh; - -fail: - if (bh) { - brelse(bh); - bh = NULL; - } - *err = -EIO; - return NULL; -} - -/* * This is called from our getattr. */ int ocfs2_inode_revalidate(struct dentry *dentry) diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 390a85596aa..2f37af9bcc4 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -40,6 +40,9 @@ struct ocfs2_inode_info /* protects allocation changes on this inode. */ struct rw_semaphore ip_alloc_sem; + /* protects extended attribute changes on this inode */ + struct rw_semaphore ip_xattr_sem; + /* These fields are protected by ip_lock */ spinlock_t ip_lock; u32 ip_open_count; @@ -68,6 +71,7 @@ struct ocfs2_inode_info struct ocfs2_extent_map ip_extent_map; struct inode vfs_inode; + struct jbd2_inode ip_jinode; }; /* @@ -113,8 +117,6 @@ extern struct kmem_cache *ocfs2_inode_cache; extern const struct address_space_operations ocfs2_aops; -struct buffer_head *ocfs2_bread(struct inode *inode, int block, - int *err, int reada); void ocfs2_clear_inode(struct inode *inode); void ocfs2_delete_inode(struct inode *inode); void ocfs2_drop_inode(struct inode *inode); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 7b142f0ce99..9fcd36dcc9a 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -102,8 +102,7 @@ bail_unlock: bail: mutex_unlock(&inode->i_mutex); - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index c47bc2a809c..81e40677eec 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -215,9 +215,9 @@ static int ocfs2_commit_cache(struct ocfs2_super *osb) goto finally; } - journal_lock_updates(journal->j_journal); - status = journal_flush(journal->j_journal); - journal_unlock_updates(journal->j_journal); + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); if (status < 0) { up_write(&journal->j_trans_barrier); mlog_errno(status); @@ -264,7 +264,7 @@ handle_t *ocfs2_start_trans(struct ocfs2_super *osb, int max_buffs) down_read(&osb->journal->j_trans_barrier); - handle = journal_start(journal, max_buffs); + handle = jbd2_journal_start(journal, max_buffs); if (IS_ERR(handle)) { up_read(&osb->journal->j_trans_barrier); @@ -290,7 +290,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb, BUG_ON(!handle); - ret = journal_stop(handle); + ret = jbd2_journal_stop(handle); if (ret < 0) mlog_errno(ret); @@ -304,7 +304,7 @@ int ocfs2_commit_trans(struct ocfs2_super *osb, * transaction. extend_trans will either extend the current handle by * nblocks, or commit it and start a new one with nblocks credits. * - * This might call journal_restart() which will commit dirty buffers + * This might call jbd2_journal_restart() which will commit dirty buffers * and then restart the transaction. Before calling * ocfs2_extend_trans(), any changed blocks should have been * dirtied. After calling it, all blocks which need to be changed must @@ -332,7 +332,7 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) #ifdef CONFIG_OCFS2_DEBUG_FS status = 1; #else - status = journal_extend(handle, nblocks); + status = jbd2_journal_extend(handle, nblocks); if (status < 0) { mlog_errno(status); goto bail; @@ -340,8 +340,10 @@ int ocfs2_extend_trans(handle_t *handle, int nblocks) #endif if (status > 0) { - mlog(0, "journal_extend failed, trying journal_restart\n"); - status = journal_restart(handle, nblocks); + mlog(0, + "jbd2_journal_extend failed, trying " + "jbd2_journal_restart\n"); + status = jbd2_journal_restart(handle, nblocks); if (status < 0) { mlog_errno(status); goto bail; @@ -393,11 +395,11 @@ int ocfs2_journal_access(handle_t *handle, switch (type) { case OCFS2_JOURNAL_ACCESS_CREATE: case OCFS2_JOURNAL_ACCESS_WRITE: - status = journal_get_write_access(handle, bh); + status = jbd2_journal_get_write_access(handle, bh); break; case OCFS2_JOURNAL_ACCESS_UNDO: - status = journal_get_undo_access(handle, bh); + status = jbd2_journal_get_undo_access(handle, bh); break; default: @@ -422,7 +424,7 @@ int ocfs2_journal_dirty(handle_t *handle, mlog_entry("(bh->b_blocknr=%llu)\n", (unsigned long long)bh->b_blocknr); - status = journal_dirty_metadata(handle, bh); + status = jbd2_journal_dirty_metadata(handle, bh); if (status < 0) mlog(ML_ERROR, "Could not dirty metadata buffer. " "(bh->b_blocknr=%llu)\n", @@ -432,6 +434,7 @@ int ocfs2_journal_dirty(handle_t *handle, return status; } +#ifdef CONFIG_OCFS2_COMPAT_JBD int ocfs2_journal_dirty_data(handle_t *handle, struct buffer_head *bh) { @@ -443,8 +446,9 @@ int ocfs2_journal_dirty_data(handle_t *handle, return err; } +#endif -#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD_DEFAULT_MAX_COMMIT_AGE) +#define OCFS2_DEFAULT_COMMIT_INTERVAL (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE) void ocfs2_set_journal_params(struct ocfs2_super *osb) { @@ -457,9 +461,9 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb) spin_lock(&journal->j_state_lock); journal->j_commit_interval = commit_interval; if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) - journal->j_flags |= JFS_BARRIER; + journal->j_flags |= JBD2_BARRIER; else - journal->j_flags &= ~JFS_BARRIER; + journal->j_flags &= ~JBD2_BARRIER; spin_unlock(&journal->j_state_lock); } @@ -524,14 +528,14 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) mlog(0, "inode->ip_clusters = %u\n", OCFS2_I(inode)->ip_clusters); /* call the kernels journal init function now */ - j_journal = journal_init_inode(inode); + j_journal = jbd2_journal_init_inode(inode); if (j_journal == NULL) { mlog(ML_ERROR, "Linux journal layer error\n"); status = -EINVAL; goto done; } - mlog(0, "Returned from journal_init_inode\n"); + mlog(0, "Returned from jbd2_journal_init_inode\n"); mlog(0, "j_journal->j_maxlen = %u\n", j_journal->j_maxlen); *dirty = (le32_to_cpu(di->id1.journal1.ij_flags) & @@ -550,8 +554,7 @@ done: if (status < 0) { if (inode_lock) ocfs2_inode_unlock(inode, 1); - if (bh != NULL) - brelse(bh); + brelse(bh); if (inode) { OCFS2_I(inode)->ip_open_count--; iput(inode); @@ -639,7 +642,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) if (journal->j_state != OCFS2_JOURNAL_LOADED) goto done; - /* need to inc inode use count as journal_destroy will iput. */ + /* need to inc inode use count - jbd2_journal_destroy will iput. */ if (!igrab(inode)) BUG(); @@ -668,9 +671,9 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) BUG_ON(atomic_read(&(osb->journal->j_num_trans)) != 0); if (ocfs2_mount_local(osb)) { - journal_lock_updates(journal->j_journal); - status = journal_flush(journal->j_journal); - journal_unlock_updates(journal->j_journal); + jbd2_journal_lock_updates(journal->j_journal); + status = jbd2_journal_flush(journal->j_journal); + jbd2_journal_unlock_updates(journal->j_journal); if (status < 0) mlog_errno(status); } @@ -686,7 +689,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb) } /* Shutdown the kernel journal system */ - journal_destroy(journal->j_journal); + jbd2_journal_destroy(journal->j_journal); OCFS2_I(inode)->ip_open_count--; @@ -711,15 +714,15 @@ static void ocfs2_clear_journal_error(struct super_block *sb, { int olderr; - olderr = journal_errno(journal); + olderr = jbd2_journal_errno(journal); if (olderr) { mlog(ML_ERROR, "File system error %d recorded in " "journal %u.\n", olderr, slot); mlog(ML_ERROR, "File system on device %s needs checking.\n", sb->s_id); - journal_ack_err(journal); - journal_clear_err(journal); + jbd2_journal_ack_err(journal); + jbd2_journal_clear_err(journal); } } @@ -734,7 +737,7 @@ int ocfs2_journal_load(struct ocfs2_journal *journal, int local, int replayed) osb = journal->j_osb; - status = journal_load(journal->j_journal); + status = jbd2_journal_load(journal->j_journal); if (status < 0) { mlog(ML_ERROR, "Failed to load journal!\n"); goto done; @@ -778,7 +781,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full) BUG_ON(!journal); - status = journal_wipe(journal->j_journal, full); + status = jbd2_journal_wipe(journal->j_journal, full); if (status < 0) { mlog_errno(status); goto bail; @@ -847,9 +850,8 @@ static int ocfs2_force_read_journal(struct inode *inode) /* We are reading journal data which should not * be put in the uptodate cache */ - status = ocfs2_read_blocks(OCFS2_SB(inode->i_sb), - p_blkno, p_blocks, bhs, 0, - NULL); + status = ocfs2_read_blocks_sync(OCFS2_SB(inode->i_sb), + p_blkno, p_blocks, bhs); if (status < 0) { mlog_errno(status); goto bail; @@ -865,8 +867,7 @@ static int ocfs2_force_read_journal(struct inode *inode) bail: for(i = 0; i < CONCURRENT_JOURNAL_FILL; i++) - if (bhs[i]) - brelse(bhs[i]); + brelse(bhs[i]); mlog_exit(status); return status; } @@ -1133,7 +1134,8 @@ static int ocfs2_read_journal_inode(struct ocfs2_super *osb, } SET_INODE_JOURNAL(inode); - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, bh, 0, inode); + status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, bh, + OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; @@ -1229,19 +1231,19 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, } mlog(0, "calling journal_init_inode\n"); - journal = journal_init_inode(inode); + journal = jbd2_journal_init_inode(inode); if (journal == NULL) { mlog(ML_ERROR, "Linux journal layer error\n"); status = -EIO; goto done; } - status = journal_load(journal); + status = jbd2_journal_load(journal); if (status < 0) { mlog_errno(status); if (!igrab(inode)) BUG(); - journal_destroy(journal); + jbd2_journal_destroy(journal); goto done; } @@ -1249,9 +1251,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, /* wipe the journal */ mlog(0, "flushing the journal.\n"); - journal_lock_updates(journal); - status = journal_flush(journal); - journal_unlock_updates(journal); + jbd2_journal_lock_updates(journal); + status = jbd2_journal_flush(journal); + jbd2_journal_unlock_updates(journal); if (status < 0) mlog_errno(status); @@ -1272,7 +1274,7 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb, if (!igrab(inode)) BUG(); - journal_destroy(journal); + jbd2_journal_destroy(journal); done: /* drop the lock on this nodes journal */ @@ -1282,8 +1284,7 @@ done: if (inode) iput(inode); - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 2178ebffa05..d4d14e9a3ce 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -27,7 +27,12 @@ #define OCFS2_JOURNAL_H #include <linux/fs.h> -#include <linux/jbd.h> +#ifndef CONFIG_OCFS2_COMPAT_JBD +# include <linux/jbd2.h> +#else +# include <linux/jbd.h> +# include "ocfs2_jbd_compat.h" +#endif enum ocfs2_journal_state { OCFS2_JOURNAL_FREE = 0, @@ -215,8 +220,8 @@ static inline void ocfs2_checkpoint_inode(struct inode *inode) * buffer. Will have to call ocfs2_journal_dirty once * we've actually dirtied it. Type is one of . or . * ocfs2_journal_dirty - Mark a journalled buffer as having dirty data. - * ocfs2_journal_dirty_data - Indicate that a data buffer should go out before - * the current handle commits. + * ocfs2_jbd2_file_inode - Mark an inode so that its data goes out before + * the current handle commits. */ /* You must always start_trans with a number of buffs > 0, but it's @@ -268,8 +273,10 @@ int ocfs2_journal_access(handle_t *handle, */ int ocfs2_journal_dirty(handle_t *handle, struct buffer_head *bh); +#ifdef CONFIG_OCFS2_COMPAT_JBD int ocfs2_journal_dirty_data(handle_t *handle, struct buffer_head *bh); +#endif /* * Credit Macros: @@ -283,6 +290,9 @@ int ocfs2_journal_dirty_data(handle_t *handle, /* simple file updates like chmod, etc. */ #define OCFS2_INODE_UPDATE_CREDITS 1 +/* extended attribute block update */ +#define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 + /* group extend. inode update and last group update. */ #define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) @@ -340,11 +350,23 @@ int ocfs2_journal_dirty_data(handle_t *handle, #define OCFS2_RENAME_CREDITS (3 * OCFS2_INODE_UPDATE_CREDITS + 3 \ + OCFS2_UNLINK_CREDITS) +/* global bitmap dinode, group desc., relinked group, + * suballocator dinode, group desc., relinked group, + * dinode, xattr block */ +#define OCFS2_XATTR_BLOCK_CREATE_CREDITS (OCFS2_SUBALLOC_ALLOC * 2 + \ + + OCFS2_INODE_UPDATE_CREDITS \ + + OCFS2_XATTR_BLOCK_UPDATE_CREDITS) + +/* + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ static inline int ocfs2_calc_extend_credits(struct super_block *sb, - struct ocfs2_dinode *fe, + struct ocfs2_extent_list *root_el, u32 bits_wanted) { - int bitmap_blocks, sysfile_bitmap_blocks, dinode_blocks; + int bitmap_blocks, sysfile_bitmap_blocks, extent_blocks; /* bitmap dinode, group desc. + relinked group. */ bitmap_blocks = OCFS2_SUBALLOC_ALLOC; @@ -355,16 +377,16 @@ static inline int ocfs2_calc_extend_credits(struct super_block *sb, * however many metadata chunks needed * a remaining suballoc * alloc. */ sysfile_bitmap_blocks = 1 + - (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(fe); + (OCFS2_SUBALLOC_ALLOC - 1) * ocfs2_extend_meta_needed(root_el); /* this does not include *new* metadata blocks, which are - * accounted for in sysfile_bitmap_blocks. fe + + * accounted for in sysfile_bitmap_blocks. root_el + * prev. last_eb_blk + blocks along edge of tree. * calc_symlink_credits passes because we just need 1 * credit for the dinode there. */ - dinode_blocks = 1 + 1 + le16_to_cpu(fe->id2.i_list.l_tree_depth); + extent_blocks = 1 + 1 + le16_to_cpu(root_el->l_tree_depth); - return bitmap_blocks + sysfile_bitmap_blocks + dinode_blocks; + return bitmap_blocks + sysfile_bitmap_blocks + extent_blocks; } static inline int ocfs2_calc_symlink_credits(struct super_block *sb) @@ -415,4 +437,16 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb, return credits; } +static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) +{ + return jbd2_journal_file_inode(handle, &OCFS2_I(inode)->ip_jinode); +} + +static inline int ocfs2_begin_ordered_truncate(struct inode *inode, + loff_t new_size) +{ + return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode, + new_size); +} + #endif /* OCFS2_JOURNAL_H */ diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 28e492e4ec8..687b28713c3 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -28,6 +28,7 @@ #include <linux/slab.h> #include <linux/highmem.h> #include <linux/bitops.h> +#include <linux/debugfs.h> #define MLOG_MASK_PREFIX ML_DISK_ALLOC #include <cluster/masklog.h> @@ -47,8 +48,6 @@ #define OCFS2_LOCAL_ALLOC(dinode) (&((dinode)->id2.i_lab)) -static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb); - static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc); static int ocfs2_local_alloc_find_clear_bits(struct ocfs2_super *osb, @@ -75,24 +74,129 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, struct inode *local_alloc_inode); -static inline int ocfs2_local_alloc_window_bits(struct ocfs2_super *osb) +#ifdef CONFIG_OCFS2_FS_STATS + +static int ocfs2_la_debug_open(struct inode *inode, struct file *file) +{ + file->private_data = inode->i_private; + return 0; +} + +#define LA_DEBUG_BUF_SZ PAGE_CACHE_SIZE +#define LA_DEBUG_VER 1 +static ssize_t ocfs2_la_debug_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + static DEFINE_MUTEX(la_debug_mutex); + struct ocfs2_super *osb = file->private_data; + int written, ret; + char *buf = osb->local_alloc_debug_buf; + + mutex_lock(&la_debug_mutex); + memset(buf, 0, LA_DEBUG_BUF_SZ); + + written = snprintf(buf, LA_DEBUG_BUF_SZ, + "0x%x\t0x%llx\t%u\t%u\t0x%x\n", + LA_DEBUG_VER, + (unsigned long long)osb->la_last_gd, + osb->local_alloc_default_bits, + osb->local_alloc_bits, osb->local_alloc_state); + + ret = simple_read_from_buffer(userbuf, count, ppos, buf, written); + + mutex_unlock(&la_debug_mutex); + return ret; +} + +static const struct file_operations ocfs2_la_debug_fops = { + .open = ocfs2_la_debug_open, + .read = ocfs2_la_debug_read, +}; + +static void ocfs2_init_la_debug(struct ocfs2_super *osb) +{ + osb->local_alloc_debug_buf = kmalloc(LA_DEBUG_BUF_SZ, GFP_NOFS); + if (!osb->local_alloc_debug_buf) + return; + + osb->local_alloc_debug = debugfs_create_file("local_alloc_stats", + S_IFREG|S_IRUSR, + osb->osb_debug_root, + osb, + &ocfs2_la_debug_fops); + if (!osb->local_alloc_debug) { + kfree(osb->local_alloc_debug_buf); + osb->local_alloc_debug_buf = NULL; + } +} + +static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb) +{ + if (osb->local_alloc_debug) + debugfs_remove(osb->local_alloc_debug); + + if (osb->local_alloc_debug_buf) + kfree(osb->local_alloc_debug_buf); + + osb->local_alloc_debug_buf = NULL; + osb->local_alloc_debug = NULL; +} +#else /* CONFIG_OCFS2_FS_STATS */ +static void ocfs2_init_la_debug(struct ocfs2_super *osb) +{ + return; +} +static void ocfs2_shutdown_la_debug(struct ocfs2_super *osb) +{ + return; +} +#endif + +static inline int ocfs2_la_state_enabled(struct ocfs2_super *osb) { - BUG_ON(osb->s_clustersize_bits > 20); + return (osb->local_alloc_state == OCFS2_LA_THROTTLED || + osb->local_alloc_state == OCFS2_LA_ENABLED); +} - /* Size local alloc windows by the megabyte */ - return osb->local_alloc_size << (20 - osb->s_clustersize_bits); +void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, + unsigned int num_clusters) +{ + spin_lock(&osb->osb_lock); + if (osb->local_alloc_state == OCFS2_LA_DISABLED || + osb->local_alloc_state == OCFS2_LA_THROTTLED) + if (num_clusters >= osb->local_alloc_default_bits) { + cancel_delayed_work(&osb->la_enable_wq); + osb->local_alloc_state = OCFS2_LA_ENABLED; + } + spin_unlock(&osb->osb_lock); +} + +void ocfs2_la_enable_worker(struct work_struct *work) +{ + struct ocfs2_super *osb = + container_of(work, struct ocfs2_super, + la_enable_wq.work); + spin_lock(&osb->osb_lock); + osb->local_alloc_state = OCFS2_LA_ENABLED; + spin_unlock(&osb->osb_lock); } /* * Tell us whether a given allocation should use the local alloc * file. Otherwise, it has to go to the main bitmap. + * + * This function does semi-dirty reads of local alloc size and state! + * This is ok however, as the values are re-checked once under mutex. */ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) { - int la_bits = ocfs2_local_alloc_window_bits(osb); int ret = 0; + int la_bits; + + spin_lock(&osb->osb_lock); + la_bits = osb->local_alloc_bits; - if (osb->local_alloc_state != OCFS2_LA_ENABLED) + if (!ocfs2_la_state_enabled(osb)) goto bail; /* la_bits should be at least twice the size (in clusters) of @@ -106,6 +210,7 @@ int ocfs2_alloc_should_use_local(struct ocfs2_super *osb, u64 bits) bail: mlog(0, "state=%d, bits=%llu, la_bits=%d, ret=%d\n", osb->local_alloc_state, (unsigned long long)bits, la_bits, ret); + spin_unlock(&osb->osb_lock); return ret; } @@ -120,14 +225,18 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) mlog_entry_void(); - if (osb->local_alloc_size == 0) + ocfs2_init_la_debug(osb); + + if (osb->local_alloc_bits == 0) goto bail; - if (ocfs2_local_alloc_window_bits(osb) >= osb->bitmap_cpg) { + if (osb->local_alloc_bits >= osb->bitmap_cpg) { mlog(ML_NOTICE, "Requested local alloc window %d is larger " "than max possible %u. Using defaults.\n", - ocfs2_local_alloc_window_bits(osb), (osb->bitmap_cpg - 1)); - osb->local_alloc_size = OCFS2_DEFAULT_LOCAL_ALLOC_SIZE; + osb->local_alloc_bits, (osb->bitmap_cpg - 1)); + osb->local_alloc_bits = + ocfs2_megabytes_to_clusters(osb->sb, + OCFS2_DEFAULT_LOCAL_ALLOC_SIZE); } /* read the alloc off disk */ @@ -139,8 +248,8 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) goto bail; } - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, - &alloc_bh, 0, inode); + status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, + &alloc_bh, OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; @@ -185,13 +294,14 @@ int ocfs2_load_local_alloc(struct ocfs2_super *osb) bail: if (status < 0) - if (alloc_bh) - brelse(alloc_bh); + brelse(alloc_bh); if (inode) iput(inode); - mlog(0, "Local alloc window bits = %d\n", - ocfs2_local_alloc_window_bits(osb)); + if (status < 0) + ocfs2_shutdown_la_debug(osb); + + mlog(0, "Local alloc window bits = %d\n", osb->local_alloc_bits); mlog_exit(status); return status; @@ -217,6 +327,11 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) mlog_entry_void(); + cancel_delayed_work(&osb->la_enable_wq); + flush_workqueue(ocfs2_wq); + + ocfs2_shutdown_la_debug(osb); + if (osb->local_alloc_state == OCFS2_LA_UNUSED) goto out; @@ -295,8 +410,7 @@ out_commit: ocfs2_commit_trans(osb, handle); out_unlock: - if (main_bm_bh) - brelse(main_bm_bh); + brelse(main_bm_bh); ocfs2_inode_unlock(main_bm_inode, 1); @@ -345,8 +459,8 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, mutex_lock(&inode->i_mutex); - status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, - &alloc_bh, 0, inode); + status = ocfs2_read_blocks(inode, OCFS2_I(inode)->ip_blkno, 1, + &alloc_bh, OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; @@ -372,8 +486,7 @@ bail: *alloc_copy = NULL; } - if (alloc_bh) - brelse(alloc_bh); + brelse(alloc_bh); if (inode) { mutex_unlock(&inode->i_mutex); @@ -441,8 +554,7 @@ out_unlock: out_mutex: mutex_unlock(&main_bm_inode->i_mutex); - if (main_bm_bh) - brelse(main_bm_bh); + brelse(main_bm_bh); iput(main_bm_inode); @@ -453,8 +565,48 @@ out: return status; } +/* Check to see if the local alloc window is within ac->ac_max_block */ +static int ocfs2_local_alloc_in_range(struct inode *inode, + struct ocfs2_alloc_context *ac, + u32 bits_wanted) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *alloc; + struct ocfs2_local_alloc *la; + int start; + u64 block_off; + + if (!ac->ac_max_block) + return 1; + + alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; + la = OCFS2_LOCAL_ALLOC(alloc); + + start = ocfs2_local_alloc_find_clear_bits(osb, alloc, bits_wanted); + if (start == -1) { + mlog_errno(-ENOSPC); + return 0; + } + + /* + * Converting (bm_off + start + bits_wanted) to blocks gives us + * the blkno just past our actual allocation. This is perfect + * to compare with ac_max_block. + */ + block_off = ocfs2_clusters_to_blocks(inode->i_sb, + le32_to_cpu(la->la_bm_off) + + start + bits_wanted); + mlog(0, "Checking %llu against %llu\n", + (unsigned long long)block_off, + (unsigned long long)ac->ac_max_block); + if (block_off > ac->ac_max_block) + return 0; + + return 1; +} + /* - * make sure we've got at least bitswanted contiguous bits in the + * make sure we've got at least bits_wanted contiguous bits in the * local alloc. You lose them when you drop i_mutex. * * We will add ourselves to the transaction passed in, but may start @@ -485,16 +637,18 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, mutex_lock(&local_alloc_inode->i_mutex); - if (osb->local_alloc_state != OCFS2_LA_ENABLED) { - status = -ENOSPC; - goto bail; - } - - if (bits_wanted > ocfs2_local_alloc_window_bits(osb)) { - mlog(0, "Asking for more than my max window size!\n"); + /* + * We must double check state and allocator bits because + * another process may have changed them while holding i_mutex. + */ + spin_lock(&osb->osb_lock); + if (!ocfs2_la_state_enabled(osb) || + (bits_wanted > osb->local_alloc_bits)) { + spin_unlock(&osb->osb_lock); status = -ENOSPC; goto bail; } + spin_unlock(&osb->osb_lock); alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data; @@ -522,6 +676,36 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, mlog_errno(status); goto bail; } + + /* + * Under certain conditions, the window slide code + * might have reduced the number of bits available or + * disabled the the local alloc entirely. Re-check + * here and return -ENOSPC if necessary. + */ + status = -ENOSPC; + if (!ocfs2_la_state_enabled(osb)) + goto bail; + + free_bits = le32_to_cpu(alloc->id1.bitmap1.i_total) - + le32_to_cpu(alloc->id1.bitmap1.i_used); + if (bits_wanted > free_bits) + goto bail; + } + + if (ac->ac_max_block) + mlog(0, "Calling in_range for max block %llu\n", + (unsigned long long)ac->ac_max_block); + + if (!ocfs2_local_alloc_in_range(local_alloc_inode, ac, + bits_wanted)) { + /* + * The window is outside ac->ac_max_block. + * This errno tells the caller to keep localalloc enabled + * but to get the allocation from the main bitmap. + */ + status = -EFBIG; + goto bail; } ac->ac_inode = local_alloc_inode; @@ -789,6 +973,85 @@ bail: return status; } +enum ocfs2_la_event { + OCFS2_LA_EVENT_SLIDE, /* Normal window slide. */ + OCFS2_LA_EVENT_FRAGMENTED, /* The global bitmap has + * enough bits theoretically + * free, but a contiguous + * allocation could not be + * found. */ + OCFS2_LA_EVENT_ENOSPC, /* Global bitmap doesn't have + * enough bits free to satisfy + * our request. */ +}; +#define OCFS2_LA_ENABLE_INTERVAL (30 * HZ) +/* + * Given an event, calculate the size of our next local alloc window. + * + * This should always be called under i_mutex of the local alloc inode + * so that local alloc disabling doesn't race with processes trying to + * use the allocator. + * + * Returns the state which the local alloc was left in. This value can + * be ignored by some paths. + */ +static int ocfs2_recalc_la_window(struct ocfs2_super *osb, + enum ocfs2_la_event event) +{ + unsigned int bits; + int state; + + spin_lock(&osb->osb_lock); + if (osb->local_alloc_state == OCFS2_LA_DISABLED) { + WARN_ON_ONCE(osb->local_alloc_state == OCFS2_LA_DISABLED); + goto out_unlock; + } + + /* + * ENOSPC and fragmentation are treated similarly for now. + */ + if (event == OCFS2_LA_EVENT_ENOSPC || + event == OCFS2_LA_EVENT_FRAGMENTED) { + /* + * We ran out of contiguous space in the primary + * bitmap. Drastically reduce the number of bits used + * by local alloc until we have to disable it. + */ + bits = osb->local_alloc_bits >> 1; + if (bits > ocfs2_megabytes_to_clusters(osb->sb, 1)) { + /* + * By setting state to THROTTLED, we'll keep + * the number of local alloc bits used down + * until an event occurs which would give us + * reason to assume the bitmap situation might + * have changed. + */ + osb->local_alloc_state = OCFS2_LA_THROTTLED; + osb->local_alloc_bits = bits; + } else { + osb->local_alloc_state = OCFS2_LA_DISABLED; + } + queue_delayed_work(ocfs2_wq, &osb->la_enable_wq, + OCFS2_LA_ENABLE_INTERVAL); + goto out_unlock; + } + + /* + * Don't increase the size of the local alloc window until we + * know we might be able to fulfill the request. Otherwise, we + * risk bouncing around the global bitmap during periods of + * low space. + */ + if (osb->local_alloc_state != OCFS2_LA_THROTTLED) + osb->local_alloc_bits = osb->local_alloc_default_bits; + +out_unlock: + state = osb->local_alloc_state; + spin_unlock(&osb->osb_lock); + + return state; +} + static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, struct ocfs2_alloc_context **ac, struct inode **bitmap_inode, @@ -803,12 +1066,21 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, goto bail; } - (*ac)->ac_bits_wanted = ocfs2_local_alloc_window_bits(osb); +retry_enospc: + (*ac)->ac_bits_wanted = osb->local_alloc_bits; status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); + if (status == -ENOSPC) { + if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == + OCFS2_LA_DISABLED) + goto bail; + + ocfs2_free_ac_resource(*ac); + memset(*ac, 0, sizeof(struct ocfs2_alloc_context)); + goto retry_enospc; + } if (status < 0) { - if (status != -ENOSPC) - mlog_errno(status); + mlog_errno(status); goto bail; } @@ -849,7 +1121,7 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, "one\n"); mlog(0, "Allocating %u clusters for a new window.\n", - ocfs2_local_alloc_window_bits(osb)); + osb->local_alloc_bits); /* Instruct the allocation code to try the most recently used * cluster group. We'll re-record the group used this pass @@ -859,9 +1131,36 @@ static int ocfs2_local_alloc_new_window(struct ocfs2_super *osb, /* we used the generic suballoc reserve function, but we set * everything up nicely, so there's no reason why we can't use * the more specific cluster api to claim bits. */ - status = ocfs2_claim_clusters(osb, handle, ac, - ocfs2_local_alloc_window_bits(osb), + status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits, &cluster_off, &cluster_count); + if (status == -ENOSPC) { +retry_enospc: + /* + * Note: We could also try syncing the journal here to + * allow use of any free bits which the current + * transaction can't give us access to. --Mark + */ + if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_FRAGMENTED) == + OCFS2_LA_DISABLED) + goto bail; + + status = ocfs2_claim_clusters(osb, handle, ac, + osb->local_alloc_bits, + &cluster_off, + &cluster_count); + if (status == -ENOSPC) + goto retry_enospc; + /* + * We only shrunk the *minimum* number of in our + * request - it's entirely possible that the allocator + * might give us more than we asked for. + */ + if (status == 0) { + spin_lock(&osb->osb_lock); + osb->local_alloc_bits = cluster_count; + spin_unlock(&osb->osb_lock); + } + } if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -905,6 +1204,8 @@ static int ocfs2_local_alloc_slide_window(struct ocfs2_super *osb, mlog_entry_void(); + ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_SLIDE); + /* This will lock the main bitmap for us. */ status = ocfs2_local_alloc_reserve_for_window(osb, &ac, @@ -976,8 +1277,7 @@ bail: if (handle) ocfs2_commit_trans(osb, handle); - if (main_bm_bh) - brelse(main_bm_bh); + brelse(main_bm_bh); if (main_bm_inode) iput(main_bm_inode); diff --git a/fs/ocfs2/localalloc.h b/fs/ocfs2/localalloc.h index 3f76631e110..ac5ea9f8665 100644 --- a/fs/ocfs2/localalloc.h +++ b/fs/ocfs2/localalloc.h @@ -52,4 +52,8 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb, u32 *bit_off, u32 *num_bits); +void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb, + unsigned int num_clusters); +void ocfs2_la_enable_worker(struct work_struct *work); + #endif /* OCFS2_LOCALALLOC_H */ diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index 203f8714387..544ac624517 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -24,6 +24,7 @@ */ #include <linux/fs.h> +#include <linux/fcntl.h> #define MLOG_MASK_PREFIX ML_INODE #include <cluster/masklog.h> @@ -32,6 +33,7 @@ #include "dlmglue.h" #include "file.h" +#include "inode.h" #include "locks.h" static int ocfs2_do_flock(struct file *file, struct inode *inode, @@ -123,3 +125,16 @@ int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl) else return ocfs2_do_flock(file, inode, cmd, fl); } + +int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl) +{ + struct inode *inode = file->f_mapping->host; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (!(fl->fl_flags & FL_POSIX)) + return -ENOLCK; + if (__mandatory_lock(inode)) + return -ENOLCK; + + return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); +} diff --git a/fs/ocfs2/locks.h b/fs/ocfs2/locks.h index 9743ef2324e..496d488b271 100644 --- a/fs/ocfs2/locks.h +++ b/fs/ocfs2/locks.h @@ -27,5 +27,6 @@ #define OCFS2_LOCKS_H int ocfs2_flock(struct file *file, int cmd, struct file_lock *fl); +int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl); #endif /* OCFS2_LOCKS_H */ diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index d5d808fe014..485a6aa0ad3 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -60,6 +60,7 @@ #include "symlink.h" #include "sysfile.h" #include "uptodate.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -327,14 +328,9 @@ leave: if (status == -ENOSPC) mlog(0, "Disk is full\n"); - if (new_fe_bh) - brelse(new_fe_bh); - - if (de_bh) - brelse(de_bh); - - if (parent_fe_bh) - brelse(parent_fe_bh); + brelse(new_fe_bh); + brelse(de_bh); + brelse(parent_fe_bh); if ((status < 0) && inode) iput(inode); @@ -647,12 +643,9 @@ out_unlock_inode: out: ocfs2_inode_unlock(dir, 1); - if (de_bh) - brelse(de_bh); - if (fe_bh) - brelse(fe_bh); - if (parent_fe_bh) - brelse(parent_fe_bh); + brelse(de_bh); + brelse(fe_bh); + brelse(parent_fe_bh); mlog_exit(err); @@ -851,17 +844,10 @@ leave: iput(orphan_dir); } - if (fe_bh) - brelse(fe_bh); - - if (dirent_bh) - brelse(dirent_bh); - - if (parent_node_bh) - brelse(parent_node_bh); - - if (orphan_entry_bh) - brelse(orphan_entry_bh); + brelse(fe_bh); + brelse(dirent_bh); + brelse(parent_node_bh); + brelse(orphan_entry_bh); mlog_exit(status); @@ -1372,24 +1358,15 @@ bail: if (new_inode) iput(new_inode); - if (newfe_bh) - brelse(newfe_bh); - if (old_inode_bh) - brelse(old_inode_bh); - if (old_dir_bh) - brelse(old_dir_bh); - if (new_dir_bh) - brelse(new_dir_bh); - if (new_de_bh) - brelse(new_de_bh); - if (old_de_bh) - brelse(old_de_bh); - if (old_inode_de_bh) - brelse(old_inode_de_bh); - if (orphan_entry_bh) - brelse(orphan_entry_bh); - if (insert_entry_bh) - brelse(insert_entry_bh); + brelse(newfe_bh); + brelse(old_inode_bh); + brelse(old_dir_bh); + brelse(new_dir_bh); + brelse(new_de_bh); + brelse(old_de_bh); + brelse(old_inode_de_bh); + brelse(orphan_entry_bh); + brelse(insert_entry_bh); mlog_exit(status); @@ -1492,8 +1469,7 @@ bail: if (bhs) { for(i = 0; i < blocks; i++) - if (bhs[i]) - brelse(bhs[i]); + brelse(bhs[i]); kfree(bhs); } @@ -1598,10 +1574,10 @@ static int ocfs2_symlink(struct inode *dir, u32 offset = 0; inode->i_op = &ocfs2_symlink_inode_operations; - status = ocfs2_do_extend_allocation(osb, inode, &offset, 1, 0, - new_fe_bh, - handle, data_ac, NULL, - NULL); + status = ocfs2_add_inode_data(osb, inode, &offset, 1, 0, + new_fe_bh, + handle, data_ac, NULL, + NULL); if (status < 0) { if (status != -ENOSPC && status != -EINTR) { mlog(ML_ERROR, @@ -1659,12 +1635,9 @@ bail: ocfs2_inode_unlock(dir, 1); - if (new_fe_bh) - brelse(new_fe_bh); - if (parent_fe_bh) - brelse(parent_fe_bh); - if (de_bh) - brelse(de_bh); + brelse(new_fe_bh); + brelse(parent_fe_bh); + brelse(de_bh); if (inode_ac) ocfs2_free_alloc_context(inode_ac); if (data_ac) @@ -1759,8 +1732,7 @@ leave: iput(orphan_dir_inode); } - if (orphan_dir_bh) - brelse(orphan_dir_bh); + brelse(orphan_dir_bh); mlog_exit(status); return status; @@ -1780,10 +1752,9 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); - status = ocfs2_read_block(osb, + status = ocfs2_read_block(orphan_dir_inode, OCFS2_I(orphan_dir_inode)->ip_blkno, - &orphan_dir_bh, OCFS2_BH_CACHED, - orphan_dir_inode); + &orphan_dir_bh); if (status < 0) { mlog_errno(status); goto leave; @@ -1829,8 +1800,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); leave: - if (orphan_dir_bh) - brelse(orphan_dir_bh); + brelse(orphan_dir_bh); mlog_exit(status); return status; @@ -1898,8 +1868,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, } leave: - if (target_de_bh) - brelse(target_de_bh); + brelse(target_de_bh); mlog_exit(status); return status; @@ -1918,4 +1887,8 @@ const struct inode_operations ocfs2_dir_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, .permission = ocfs2_permission, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, }; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 7f625f2b111..a21a465490c 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -34,7 +34,12 @@ #include <linux/workqueue.h> #include <linux/kref.h> #include <linux/mutex.h> -#include <linux/jbd.h> +#ifndef CONFIG_OCFS2_COMPAT_JBD +# include <linux/jbd2.h> +#else +# include <linux/jbd.h> +# include "ocfs2_jbd_compat.h" +#endif /* For union ocfs2_dlm_lksb */ #include "stackglue.h" @@ -171,9 +176,13 @@ struct ocfs2_alloc_stats enum ocfs2_local_alloc_state { - OCFS2_LA_UNUSED = 0, - OCFS2_LA_ENABLED, - OCFS2_LA_DISABLED + OCFS2_LA_UNUSED = 0, /* Local alloc will never be used for + * this mountpoint. */ + OCFS2_LA_ENABLED, /* Local alloc is in use. */ + OCFS2_LA_THROTTLED, /* Local alloc is in use, but number + * of bits has been reduced. */ + OCFS2_LA_DISABLED /* Local alloc has temporarily been + * disabled. */ }; enum ocfs2_mount_options @@ -184,6 +193,8 @@ enum ocfs2_mount_options OCFS2_MOUNT_ERRORS_PANIC = 1 << 3, /* Panic on errors */ OCFS2_MOUNT_DATA_WRITEBACK = 1 << 4, /* No data ordering */ OCFS2_MOUNT_LOCALFLOCKS = 1 << 5, /* No cluster aware user file locks */ + OCFS2_MOUNT_NOUSERXATTR = 1 << 6, /* No user xattr */ + OCFS2_MOUNT_INODE64 = 1 << 7, /* Allow inode numbers > 2^32 */ }; #define OCFS2_OSB_SOFT_RO 0x0001 @@ -214,6 +225,7 @@ struct ocfs2_super u32 bitmap_cpg; u8 *uuid; char *uuid_str; + u32 uuid_hash; u8 *vol_label; u64 first_cluster_group_blkno; u32 fs_generation; @@ -241,6 +253,7 @@ struct ocfs2_super int s_sectsize_bits; int s_clustersize; int s_clustersize_bits; + unsigned int s_xattr_inline_size; atomic_t vol_state; struct mutex recovery_lock; @@ -252,11 +265,27 @@ struct ocfs2_super struct ocfs2_journal *journal; unsigned long osb_commit_interval; - int local_alloc_size; - enum ocfs2_local_alloc_state local_alloc_state; + struct delayed_work la_enable_wq; + + /* + * Must hold local alloc i_mutex and osb->osb_lock to change + * local_alloc_bits. Reads can be done under either lock. + */ + unsigned int local_alloc_bits; + unsigned int local_alloc_default_bits; + + enum ocfs2_local_alloc_state local_alloc_state; /* protected + * by osb_lock */ + struct buffer_head *local_alloc_bh; + u64 la_last_gd; +#ifdef CONFIG_OCFS2_FS_STATS + struct dentry *local_alloc_debug; + char *local_alloc_debug_buf; +#endif + /* Next two fields are for local node slot recovery during * mount. */ int dirty; @@ -340,6 +369,13 @@ static inline int ocfs2_supports_inline_data(struct ocfs2_super *osb) return 0; } +static inline int ocfs2_supports_xattr(struct ocfs2_super *osb) +{ + if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_XATTR) + return 1; + return 0; +} + /* set / clear functions because cluster events can make these happen * in parallel so we want the transitions to be atomic. this also * means that any future flags osb_flags must be protected by spinlock @@ -554,6 +590,14 @@ static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb) return pages_per_cluster; } +static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb, + unsigned int megs) +{ + BUILD_BUG_ON(OCFS2_MAX_CLUSTERSIZE > 1048576); + + return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); +} + static inline void ocfs2_init_inode_steal_slot(struct ocfs2_super *osb) { spin_lock(&osb->osb_lock); diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 4f619850ccf..f24ce3d3f95 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -64,6 +64,7 @@ #define OCFS2_INODE_SIGNATURE "INODE01" #define OCFS2_EXTENT_BLOCK_SIGNATURE "EXBLK01" #define OCFS2_GROUP_DESC_SIGNATURE "GROUP01" +#define OCFS2_XATTR_BLOCK_SIGNATURE "XATTR01" /* Compatibility flags */ #define OCFS2_HAS_COMPAT_FEATURE(sb,mask) \ @@ -90,7 +91,8 @@ | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC \ | OCFS2_FEATURE_INCOMPAT_INLINE_DATA \ | OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP \ - | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK) + | OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK \ + | OCFS2_FEATURE_INCOMPAT_XATTR) #define OCFS2_FEATURE_RO_COMPAT_SUPP OCFS2_FEATURE_RO_COMPAT_UNWRITTEN /* @@ -127,10 +129,6 @@ /* Support for data packed into inode blocks */ #define OCFS2_FEATURE_INCOMPAT_INLINE_DATA 0x0040 -/* Support for the extended slot map */ -#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100 - - /* * Support for alternate, userspace cluster stacks. If set, the superblock * field s_cluster_info contains a tag for the alternate stack in use as @@ -142,6 +140,12 @@ */ #define OCFS2_FEATURE_INCOMPAT_USERSPACE_STACK 0x0080 +/* Support for the extended slot map */ +#define OCFS2_FEATURE_INCOMPAT_EXTENDED_SLOT_MAP 0x100 + +/* Support for extended attributes */ +#define OCFS2_FEATURE_INCOMPAT_XATTR 0x0200 + /* * backup superblock flag is used to indicate that this volume * has backup superblocks. @@ -299,6 +303,12 @@ struct ocfs2_new_group_input { */ #define OCFS2_DEFAULT_LOCAL_ALLOC_SIZE 8 +/* + * Inline extended attribute size (in bytes) + * The value chosen should be aligned to 16 byte boundaries. + */ +#define OCFS2_MIN_XATTR_INLINE_SIZE 256 + struct ocfs2_system_inode_info { char *si_name; int si_iflags; @@ -563,7 +573,7 @@ struct ocfs2_super_block { /*40*/ __le16 s_max_slots; /* Max number of simultaneous mounts before tunefs required */ __le16 s_tunefs_flag; - __le32 s_reserved1; + __le32 s_uuid_hash; /* hash value of uuid */ __le64 s_first_cluster_group; /* Block offset of 1st cluster * group header */ /*50*/ __u8 s_label[OCFS2_MAX_VOL_LABEL_LEN]; /* Label for mounting, etc. */ @@ -571,7 +581,11 @@ struct ocfs2_super_block { /*A0*/ struct ocfs2_cluster_info s_cluster_info; /* Selected userspace stack. Only valid with INCOMPAT flag. */ -/*B8*/ __le64 s_reserved2[17]; /* Fill out superblock */ +/*B8*/ __le16 s_xattr_inline_size; /* extended attribute inline size + for this fs*/ + __le16 s_reserved0; + __le32 s_reserved1; +/*C0*/ __le64 s_reserved2[16]; /* Fill out superblock */ /*140*/ /* @@ -621,7 +635,8 @@ struct ocfs2_dinode { belongs to */ __le16 i_suballoc_bit; /* Bit offset in suballocator block group */ -/*10*/ __le32 i_reserved0; +/*10*/ __le16 i_reserved0; + __le16 i_xattr_inline_size; __le32 i_clusters; /* Cluster count */ __le32 i_uid; /* Owner UID */ __le32 i_gid; /* Owning GID */ @@ -640,11 +655,12 @@ struct ocfs2_dinode { __le32 i_atime_nsec; __le32 i_ctime_nsec; __le32 i_mtime_nsec; - __le32 i_attr; +/*70*/ __le32 i_attr; __le16 i_orphaned_slot; /* Only valid when OCFS2_ORPHANED_FL was set in i_flags */ __le16 i_dyn_features; -/*70*/ __le64 i_reserved2[8]; + __le64 i_xattr_loc; +/*80*/ __le64 i_reserved2[7]; /*B8*/ union { __le64 i_pad1; /* Generic way to refer to this 64bit union */ @@ -715,6 +731,136 @@ struct ocfs2_group_desc /*40*/ __u8 bg_bitmap[0]; }; +/* + * On disk extended attribute structure for OCFS2. + */ + +/* + * ocfs2_xattr_entry indicates one extend attribute. + * + * Note that it can be stored in inode, one block or one xattr bucket. + */ +struct ocfs2_xattr_entry { + __le32 xe_name_hash; /* hash value of xattr prefix+suffix. */ + __le16 xe_name_offset; /* byte offset from the 1st etnry in the local + local xattr storage(inode, xattr block or + xattr bucket). */ + __u8 xe_name_len; /* xattr name len, does't include prefix. */ + __u8 xe_type; /* the low 7 bits indicates the name prefix's + * type and the highest 1 bits indicate whether + * the EA is stored in the local storage. */ + __le64 xe_value_size; /* real xattr value length. */ +}; + +/* + * On disk structure for xattr header. + * + * One ocfs2_xattr_header describes how many ocfs2_xattr_entry records in + * the local xattr storage. + */ +struct ocfs2_xattr_header { + __le16 xh_count; /* contains the count of how + many records are in the + local xattr storage. */ + __le16 xh_free_start; /* current offset for storing + xattr. */ + __le16 xh_name_value_len; /* total length of name/value + length in this bucket. */ + __le16 xh_num_buckets; /* bucket nums in one extent + record, only valid in the + first bucket. */ + __le64 xh_csum; + struct ocfs2_xattr_entry xh_entries[0]; /* xattr entry list. */ +}; + +/* + * On disk structure for xattr value root. + * + * It is used when one extended attribute's size is larger, and we will save it + * in an outside cluster. It will stored in a b-tree like file content. + */ +struct ocfs2_xattr_value_root { +/*00*/ __le32 xr_clusters; /* clusters covered by xattr value. */ + __le32 xr_reserved0; + __le64 xr_last_eb_blk; /* Pointer to last extent block */ +/*10*/ struct ocfs2_extent_list xr_list; /* Extent record list */ +}; + +/* + * On disk structure for xattr tree root. + * + * It is used when there are too many extended attributes for one file. These + * attributes will be organized and stored in an indexed-btree. + */ +struct ocfs2_xattr_tree_root { +/*00*/ __le32 xt_clusters; /* clusters covered by xattr. */ + __le32 xt_reserved0; + __le64 xt_last_eb_blk; /* Pointer to last extent block */ +/*10*/ struct ocfs2_extent_list xt_list; /* Extent record list */ +}; + +#define OCFS2_XATTR_INDEXED 0x1 +#define OCFS2_HASH_SHIFT 5 +#define OCFS2_XATTR_ROUND 3 +#define OCFS2_XATTR_SIZE(size) (((size) + OCFS2_XATTR_ROUND) & \ + ~(OCFS2_XATTR_ROUND)) + +#define OCFS2_XATTR_BUCKET_SIZE 4096 +#define OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET (OCFS2_XATTR_BUCKET_SIZE \ + / OCFS2_MIN_BLOCKSIZE) + +/* + * On disk structure for xattr block. + */ +struct ocfs2_xattr_block { +/*00*/ __u8 xb_signature[8]; /* Signature for verification */ + __le16 xb_suballoc_slot; /* Slot suballocator this + block belongs to. */ + __le16 xb_suballoc_bit; /* Bit offset in suballocator + block group */ + __le32 xb_fs_generation; /* Must match super block */ +/*10*/ __le64 xb_blkno; /* Offset on disk, in blocks */ + __le64 xb_csum; +/*20*/ __le16 xb_flags; /* Indicates whether this block contains + real xattr or a xattr tree. */ + __le16 xb_reserved0; + __le32 xb_reserved1; + __le64 xb_reserved2; +/*30*/ union { + struct ocfs2_xattr_header xb_header; /* xattr header if this + block contains xattr */ + struct ocfs2_xattr_tree_root xb_root;/* xattr tree root if this + block cotains xattr + tree. */ + } xb_attrs; +}; + +#define OCFS2_XATTR_ENTRY_LOCAL 0x80 +#define OCFS2_XATTR_TYPE_MASK 0x7F +static inline void ocfs2_xattr_set_local(struct ocfs2_xattr_entry *xe, + int local) +{ + if (local) + xe->xe_type |= OCFS2_XATTR_ENTRY_LOCAL; + else + xe->xe_type &= ~OCFS2_XATTR_ENTRY_LOCAL; +} + +static inline int ocfs2_xattr_is_local(struct ocfs2_xattr_entry *xe) +{ + return xe->xe_type & OCFS2_XATTR_ENTRY_LOCAL; +} + +static inline void ocfs2_xattr_set_type(struct ocfs2_xattr_entry *xe, int type) +{ + xe->xe_type |= type & OCFS2_XATTR_TYPE_MASK; +} + +static inline int ocfs2_xattr_get_type(struct ocfs2_xattr_entry *xe) +{ + return xe->xe_type & OCFS2_XATTR_TYPE_MASK; +} + #ifdef __KERNEL__ static inline int ocfs2_fast_symlink_chars(struct super_block *sb) { @@ -728,6 +874,20 @@ static inline int ocfs2_max_inline_data(struct super_block *sb) offsetof(struct ocfs2_dinode, id2.i_data.id_data); } +static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb, + struct ocfs2_dinode *di) +{ + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); + + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data) - + xattrsize; + else + return sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_data.id_data); +} + static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) { int size; @@ -738,6 +898,24 @@ static inline int ocfs2_extent_recs_per_inode(struct super_block *sb) return size / sizeof(struct ocfs2_extent_rec); } +static inline int ocfs2_extent_recs_per_inode_with_xattr( + struct super_block *sb, + struct ocfs2_dinode *di) +{ + int size; + unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); + + if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs) - + xattrsize; + else + size = sb->s_blocksize - + offsetof(struct ocfs2_dinode, id2.i_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} + static inline int ocfs2_chain_recs_per_inode(struct super_block *sb) { int size; @@ -801,6 +979,17 @@ static inline u64 ocfs2_backup_super_blkno(struct super_block *sb, int index) return 0; } + +static inline u16 ocfs2_xattr_recs_per_xb(struct super_block *sb) +{ + int size; + + size = sb->s_blocksize - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_root.xt_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} #else static inline int ocfs2_fast_symlink_chars(int blocksize) { @@ -884,6 +1073,17 @@ static inline uint64_t ocfs2_backup_super_blkno(int blocksize, int index) return 0; } + +static inline int ocfs2_xattr_recs_per_xb(int blocksize) +{ + int size; + + size = blocksize - + offsetof(struct ocfs2_xattr_block, + xb_attrs.xb_root.xt_list.l_recs); + + return size / sizeof(struct ocfs2_extent_rec); +} #endif /* __KERNEL__ */ diff --git a/fs/ocfs2/ocfs2_jbd_compat.h b/fs/ocfs2/ocfs2_jbd_compat.h new file mode 100644 index 00000000000..b91c78f8f55 --- /dev/null +++ b/fs/ocfs2/ocfs2_jbd_compat.h @@ -0,0 +1,82 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * ocfs2_jbd_compat.h + * + * Compatibility defines for JBD. + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + */ + +#ifndef OCFS2_JBD_COMPAT_H +#define OCFS2_JBD_COMPAT_H + +#ifndef CONFIG_OCFS2_COMPAT_JBD +# error Should not have been included +#endif + +struct jbd2_inode { + unsigned int dummy; +}; + +#define JBD2_BARRIER JFS_BARRIER +#define JBD2_DEFAULT_MAX_COMMIT_AGE JBD_DEFAULT_MAX_COMMIT_AGE + +#define jbd2_journal_ack_err journal_ack_err +#define jbd2_journal_clear_err journal_clear_err +#define jbd2_journal_destroy journal_destroy +#define jbd2_journal_dirty_metadata journal_dirty_metadata +#define jbd2_journal_errno journal_errno +#define jbd2_journal_extend journal_extend +#define jbd2_journal_flush journal_flush +#define jbd2_journal_force_commit journal_force_commit +#define jbd2_journal_get_write_access journal_get_write_access +#define jbd2_journal_get_undo_access journal_get_undo_access +#define jbd2_journal_init_inode journal_init_inode +#define jbd2_journal_invalidatepage journal_invalidatepage +#define jbd2_journal_load journal_load +#define jbd2_journal_lock_updates journal_lock_updates +#define jbd2_journal_restart journal_restart +#define jbd2_journal_start journal_start +#define jbd2_journal_start_commit journal_start_commit +#define jbd2_journal_stop journal_stop +#define jbd2_journal_try_to_free_buffers journal_try_to_free_buffers +#define jbd2_journal_unlock_updates journal_unlock_updates +#define jbd2_journal_wipe journal_wipe +#define jbd2_log_wait_commit log_wait_commit + +static inline int jbd2_journal_file_inode(handle_t *handle, + struct jbd2_inode *inode) +{ + return 0; +} + +static inline int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, + loff_t new_size) +{ + return 0; +} + +static inline void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, + struct inode *inode) +{ + return; +} + +static inline void jbd2_journal_release_jbd_inode(journal_t *journal, + struct jbd2_inode *jinode) +{ + return; +} + + +#endif /* OCFS2_JBD_COMPAT_H */ diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 8166968e901..ffd48db229a 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -200,7 +200,7 @@ static int update_backups(struct inode * inode, u32 clusters, char *data) if (cluster > clusters) break; - ret = ocfs2_read_block(osb, blkno, &backup, 0, NULL); + ret = ocfs2_read_blocks_sync(osb, blkno, 1, &backup); if (ret < 0) { mlog_errno(ret); break; @@ -236,8 +236,8 @@ static void ocfs2_update_super_and_backups(struct inode *inode, * update the superblock last. * It doesn't matter if the write failed. */ - ret = ocfs2_read_block(osb, OCFS2_SUPER_BLOCK_BLKNO, - &super_bh, 0, NULL); + ret = ocfs2_read_blocks_sync(osb, OCFS2_SUPER_BLOCK_BLKNO, 1, + &super_bh); if (ret < 0) { mlog_errno(ret); goto out; @@ -332,8 +332,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) lgd_blkno = ocfs2_which_cluster_group(main_bm_inode, first_new_cluster - 1); - ret = ocfs2_read_block(osb, lgd_blkno, &group_bh, OCFS2_BH_CACHED, - main_bm_inode); + ret = ocfs2_read_block(main_bm_inode, lgd_blkno, &group_bh); if (ret < 0) { mlog_errno(ret); goto out_unlock; @@ -540,7 +539,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out_unlock; } - ret = ocfs2_read_block(osb, input->group, &group_bh, 0, NULL); + ret = ocfs2_read_blocks_sync(osb, input->group, 1, &group_bh); if (ret < 0) { mlog(ML_ERROR, "Can't read the group descriptor # %llu " "from the device.", (unsigned long long)input->group); diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c index bb5ff8939bf..bdda2d8f850 100644 --- a/fs/ocfs2/slot_map.c +++ b/fs/ocfs2/slot_map.c @@ -150,8 +150,8 @@ int ocfs2_refresh_slot_info(struct ocfs2_super *osb) * be !NULL. Thus, ocfs2_read_blocks() will ignore blocknr. If * this is not true, the read of -1 (UINT64_MAX) will fail. */ - ret = ocfs2_read_blocks(osb, -1, si->si_blocks, si->si_bh, 0, - si->si_inode); + ret = ocfs2_read_blocks(si->si_inode, -1, si->si_blocks, si->si_bh, + OCFS2_BH_IGNORE_CACHE); if (ret == 0) { spin_lock(&osb->osb_lock); ocfs2_update_slot_info(si); @@ -404,7 +404,8 @@ static int ocfs2_map_slot_buffers(struct ocfs2_super *osb, (unsigned long long)blkno); bh = NULL; /* Acquire a fresh bh */ - status = ocfs2_read_block(osb, blkno, &bh, 0, si->si_inode); + status = ocfs2_read_blocks(si->si_inode, blkno, 1, &bh, + OCFS2_BH_IGNORE_CACHE); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 353fc35c674..faec2d87935 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -28,6 +28,7 @@ #include "ocfs2.h" /* For struct ocfs2_lock_res */ #include "stackglue.h" +#include <linux/dlm_plock.h> /* * The control protocol starts with a handshake. Until the handshake @@ -746,6 +747,37 @@ static void user_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) { } +static int user_plock(struct ocfs2_cluster_connection *conn, + u64 ino, + struct file *file, + int cmd, + struct file_lock *fl) +{ + /* + * This more or less just demuxes the plock request into any + * one of three dlm calls. + * + * Internally, fs/dlm will pass these to a misc device, which + * a userspace daemon will read and write to. + * + * For now, cancel requests (which happen internally only), + * are turned into unlocks. Most of this function taken from + * gfs2_lock. + */ + + if (cmd == F_CANCELLK) { + cmd = F_SETLK; + fl->fl_type = F_UNLCK; + } + + if (IS_GETLK(cmd)) + return dlm_posix_get(conn->cc_lockspace, ino, file, fl); + else if (fl->fl_type == F_UNLCK) + return dlm_posix_unlock(conn->cc_lockspace, ino, file, fl); + else + return dlm_posix_lock(conn->cc_lockspace, ino, file, cmd, fl); +} + /* * Compare a requested locking protocol version against the current one. * @@ -839,6 +871,7 @@ static struct ocfs2_stack_operations ocfs2_user_plugin_ops = { .dlm_unlock = user_dlm_unlock, .lock_status = user_dlm_lock_status, .lock_lvb = user_dlm_lvb, + .plock = user_plock, .dump_lksb = user_dlm_dump_lksb, }; diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 07f348b8d72..68b668b0e60 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -288,6 +288,26 @@ void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb) } EXPORT_SYMBOL_GPL(ocfs2_dlm_dump_lksb); +int ocfs2_stack_supports_plocks(void) +{ + return active_stack && active_stack->sp_ops->plock; +} +EXPORT_SYMBOL_GPL(ocfs2_stack_supports_plocks); + +/* + * ocfs2_plock() can only be safely called if + * ocfs2_stack_supports_plocks() returned true + */ +int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, + struct file *file, int cmd, struct file_lock *fl) +{ + WARN_ON_ONCE(active_stack->sp_ops->plock == NULL); + if (active_stack->sp_ops->plock) + return active_stack->sp_ops->plock(conn, ino, file, cmd, fl); + return -EOPNOTSUPP; +} +EXPORT_SYMBOL_GPL(ocfs2_plock); + int ocfs2_cluster_connect(const char *stack_name, const char *group, int grouplen, diff --git a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index db56281dd1b..c571af375ef 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -28,6 +28,10 @@ #include "dlm/dlmapi.h" #include <linux/dlm.h> +/* Needed for plock-related prototypes */ +struct file; +struct file_lock; + /* * dlmconstants.h does not have a LOCAL flag. We hope to remove it * some day, but right now we need it. Let's fake it. This value is larger @@ -187,6 +191,17 @@ struct ocfs2_stack_operations { void *(*lock_lvb)(union ocfs2_dlm_lksb *lksb); /* + * Cluster-aware posix locks + * + * This is NULL for stacks which do not support posix locks. + */ + int (*plock)(struct ocfs2_cluster_connection *conn, + u64 ino, + struct file *file, + int cmd, + struct file_lock *fl); + + /* * This is an optoinal debugging hook. If provided, the * stack can dump debugging information about this lock. */ @@ -240,6 +255,10 @@ int ocfs2_dlm_lock_status(union ocfs2_dlm_lksb *lksb); void *ocfs2_dlm_lvb(union ocfs2_dlm_lksb *lksb); void ocfs2_dlm_dump_lksb(union ocfs2_dlm_lksb *lksb); +int ocfs2_stack_supports_plocks(void); +int ocfs2_plock(struct ocfs2_cluster_connection *conn, u64 ino, + struct file *file, int cmd, struct file_lock *fl); + void ocfs2_stack_glue_set_locking_protocol(struct ocfs2_locking_protocol *proto); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index d2d278fb981..c5ff18b46b5 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -62,15 +62,18 @@ static int ocfs2_block_group_fill(handle_t *handle, struct ocfs2_chain_list *cl); static int ocfs2_block_group_alloc(struct ocfs2_super *osb, struct inode *alloc_inode, - struct buffer_head *bh); + struct buffer_head *bh, + u64 max_block); static int ocfs2_cluster_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, + u64 max_block, u16 *bit_off, u16 *bits_found); static int ocfs2_block_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, + u64 max_block, u16 *bit_off, u16 *bits_found); static int ocfs2_claim_suballoc_bits(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac, @@ -110,8 +113,11 @@ static inline void ocfs2_block_to_cluster_group(struct inode *inode, u64 data_blkno, u64 *bg_blkno, u16 *bg_bit_off); +static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, + u32 bits_wanted, u64 max_block, + struct ocfs2_alloc_context **ac); -static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) +void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) { struct inode *inode = ac->ac_inode; @@ -124,10 +130,8 @@ static void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) iput(inode); ac->ac_inode = NULL; } - if (ac->ac_bh) { - brelse(ac->ac_bh); - ac->ac_bh = NULL; - } + brelse(ac->ac_bh); + ac->ac_bh = NULL; } void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac) @@ -276,7 +280,8 @@ static inline u16 ocfs2_find_smallest_chain(struct ocfs2_chain_list *cl) */ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, struct inode *alloc_inode, - struct buffer_head *bh) + struct buffer_head *bh, + u64 max_block) { int status, credits; struct ocfs2_dinode *fe = (struct ocfs2_dinode *) bh->b_data; @@ -294,9 +299,9 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, mlog_entry_void(); cl = &fe->id2.i_chain; - status = ocfs2_reserve_clusters(osb, - le16_to_cpu(cl->cl_cpg), - &ac); + status = ocfs2_reserve_clusters_with_limit(osb, + le16_to_cpu(cl->cl_cpg), + max_block, &ac); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -394,8 +399,7 @@ bail: if (ac) ocfs2_free_alloc_context(ac); - if (bg_bh) - brelse(bg_bh); + brelse(bg_bh); mlog_exit(status); return status; @@ -469,7 +473,8 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, goto bail; } - status = ocfs2_block_group_alloc(osb, alloc_inode, bh); + status = ocfs2_block_group_alloc(osb, alloc_inode, bh, + ac->ac_max_block); if (status < 0) { if (status != -ENOSPC) mlog_errno(status); @@ -486,16 +491,15 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, get_bh(bh); ac->ac_bh = bh; bail: - if (bh) - brelse(bh); + brelse(bh); mlog_exit(status); return status; } -int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, - struct ocfs2_dinode *fe, - struct ocfs2_alloc_context **ac) +int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, + int blocks, + struct ocfs2_alloc_context **ac) { int status; u32 slot; @@ -507,7 +511,7 @@ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, goto bail; } - (*ac)->ac_bits_wanted = ocfs2_extend_meta_needed(fe); + (*ac)->ac_bits_wanted = blocks; (*ac)->ac_which = OCFS2_AC_USE_META; slot = osb->slot_num; (*ac)->ac_group_search = ocfs2_block_group_search; @@ -532,6 +536,15 @@ bail: return status; } +int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, + struct ocfs2_extent_list *root_el, + struct ocfs2_alloc_context **ac) +{ + return ocfs2_reserve_new_metadata_blocks(osb, + ocfs2_extend_meta_needed(root_el), + ac); +} + static int ocfs2_steal_inode_from_other_nodes(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac) { @@ -582,6 +595,14 @@ int ocfs2_reserve_new_inode(struct ocfs2_super *osb, (*ac)->ac_group_search = ocfs2_block_group_search; /* + * stat(2) can't handle i_ino > 32bits, so we tell the + * lower levels not to allocate us a block group past that + * limit. The 'inode64' mount option avoids this behavior. + */ + if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64)) + (*ac)->ac_max_block = (u32)~0U; + + /* * slot is set when we successfully steal inode from other nodes. * It is reset in 3 places: * 1. when we flush the truncate log @@ -661,9 +682,9 @@ bail: /* Callers don't need to care which bitmap (local alloc or main) to * use so we figure it out for them, but unfortunately this clutters * things a bit. */ -int ocfs2_reserve_clusters(struct ocfs2_super *osb, - u32 bits_wanted, - struct ocfs2_alloc_context **ac) +static int ocfs2_reserve_clusters_with_limit(struct ocfs2_super *osb, + u32 bits_wanted, u64 max_block, + struct ocfs2_alloc_context **ac) { int status; @@ -677,24 +698,20 @@ int ocfs2_reserve_clusters(struct ocfs2_super *osb, } (*ac)->ac_bits_wanted = bits_wanted; + (*ac)->ac_max_block = max_block; status = -ENOSPC; if (ocfs2_alloc_should_use_local(osb, bits_wanted)) { status = ocfs2_reserve_local_alloc_bits(osb, bits_wanted, *ac); - if ((status < 0) && (status != -ENOSPC)) { + if (status == -EFBIG) { + /* The local alloc window is outside ac_max_block. + * use the main bitmap. */ + status = -ENOSPC; + } else if ((status < 0) && (status != -ENOSPC)) { mlog_errno(status); goto bail; - } else if (status == -ENOSPC) { - /* reserve_local_bits will return enospc with - * the local alloc inode still locked, so we - * can change this safely here. */ - mlog(0, "Disabling local alloc\n"); - /* We set to OCFS2_LA_DISABLED so that umount - * can clean up what's left of the local - * allocation */ - osb->local_alloc_state = OCFS2_LA_DISABLED; } } @@ -718,6 +735,13 @@ bail: return status; } +int ocfs2_reserve_clusters(struct ocfs2_super *osb, + u32 bits_wanted, + struct ocfs2_alloc_context **ac) +{ + return ocfs2_reserve_clusters_with_limit(osb, bits_wanted, 0, ac); +} + /* * More or less lifted from ext3. I'll leave their description below: * @@ -1000,11 +1024,14 @@ static inline int ocfs2_block_group_reasonably_empty(struct ocfs2_group_desc *bg static int ocfs2_cluster_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, + u64 max_block, u16 *bit_off, u16 *bits_found) { int search = -ENOSPC; int ret; + u64 blkoff; struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *) group_bh->b_data; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); u16 tmp_off, tmp_found; unsigned int max_bits, gd_cluster_off; @@ -1037,6 +1064,17 @@ static int ocfs2_cluster_group_search(struct inode *inode, if (ret) return ret; + if (max_block) { + blkoff = ocfs2_clusters_to_blocks(inode->i_sb, + gd_cluster_off + + tmp_off + tmp_found); + mlog(0, "Checking %llu against %llu\n", + (unsigned long long)blkoff, + (unsigned long long)max_block); + if (blkoff > max_block) + return -ENOSPC; + } + /* ocfs2_block_group_find_clear_bits() might * return success, but we still want to return * -ENOSPC unless it found the minimum number @@ -1045,6 +1083,12 @@ static int ocfs2_cluster_group_search(struct inode *inode, *bit_off = tmp_off; *bits_found = tmp_found; search = 0; /* success */ + } else if (tmp_found) { + /* + * Don't show bits which we'll be returning + * for allocation to the local alloc bitmap. + */ + ocfs2_local_alloc_seen_free_bits(osb, tmp_found); } } @@ -1054,19 +1098,31 @@ static int ocfs2_cluster_group_search(struct inode *inode, static int ocfs2_block_group_search(struct inode *inode, struct buffer_head *group_bh, u32 bits_wanted, u32 min_bits, + u64 max_block, u16 *bit_off, u16 *bits_found) { int ret = -ENOSPC; + u64 blkoff; struct ocfs2_group_desc *bg = (struct ocfs2_group_desc *) group_bh->b_data; BUG_ON(min_bits != 1); BUG_ON(ocfs2_is_cluster_bitmap(inode)); - if (bg->bg_free_bits_count) + if (bg->bg_free_bits_count) { ret = ocfs2_block_group_find_clear_bits(OCFS2_SB(inode->i_sb), group_bh, bits_wanted, le16_to_cpu(bg->bg_bits), bit_off, bits_found); + if (!ret && max_block) { + blkoff = le64_to_cpu(bg->bg_blkno) + *bit_off + + *bits_found; + mlog(0, "Checking %llu against %llu\n", + (unsigned long long)blkoff, + (unsigned long long)max_block); + if (blkoff > max_block) + ret = -ENOSPC; + } + } return ret; } @@ -1116,8 +1172,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, struct ocfs2_group_desc *gd; struct inode *alloc_inode = ac->ac_inode; - ret = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), gd_blkno, - &group_bh, OCFS2_BH_CACHED, alloc_inode); + ret = ocfs2_read_block(alloc_inode, gd_blkno, &group_bh); if (ret < 0) { mlog_errno(ret); return ret; @@ -1131,7 +1186,7 @@ static int ocfs2_search_one_group(struct ocfs2_alloc_context *ac, } ret = ac->ac_group_search(alloc_inode, group_bh, bits_wanted, min_bits, - bit_off, &found); + ac->ac_max_block, bit_off, &found); if (ret < 0) { if (ret != -ENOSPC) mlog_errno(ret); @@ -1186,9 +1241,9 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, bits_wanted, chain, (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno); - status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), + status = ocfs2_read_block(alloc_inode, le64_to_cpu(cl->cl_recs[chain].c_blkno), - &group_bh, OCFS2_BH_CACHED, alloc_inode); + &group_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -1204,21 +1259,20 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, /* for now, the chain search is a bit simplistic. We just use * the 1st group with any empty bits. */ while ((status = ac->ac_group_search(alloc_inode, group_bh, - bits_wanted, min_bits, bit_off, + bits_wanted, min_bits, + ac->ac_max_block, bit_off, &tmp_bits)) == -ENOSPC) { if (!bg->bg_next_group) break; - if (prev_group_bh) { - brelse(prev_group_bh); - prev_group_bh = NULL; - } + brelse(prev_group_bh); + prev_group_bh = NULL; + next_group = le64_to_cpu(bg->bg_next_group); prev_group_bh = group_bh; group_bh = NULL; - status = ocfs2_read_block(OCFS2_SB(alloc_inode->i_sb), - next_group, &group_bh, - OCFS2_BH_CACHED, alloc_inode); + status = ocfs2_read_block(alloc_inode, + next_group, &group_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -1307,10 +1361,8 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, *bg_blkno = le64_to_cpu(bg->bg_blkno); *bits_left = le16_to_cpu(bg->bg_free_bits_count); bail: - if (group_bh) - brelse(group_bh); - if (prev_group_bh) - brelse(prev_group_bh); + brelse(group_bh); + brelse(prev_group_bh); mlog_exit(status); return status; @@ -1723,7 +1775,6 @@ int ocfs2_free_suballoc_bits(handle_t *handle, { int status = 0; u32 tmp_used; - struct ocfs2_super *osb = OCFS2_SB(alloc_inode->i_sb); struct ocfs2_dinode *fe = (struct ocfs2_dinode *) alloc_bh->b_data; struct ocfs2_chain_list *cl = &fe->id2.i_chain; struct buffer_head *group_bh = NULL; @@ -1742,8 +1793,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle, (unsigned long long)OCFS2_I(alloc_inode)->ip_blkno, count, (unsigned long long)bg_blkno, start_bit); - status = ocfs2_read_block(osb, bg_blkno, &group_bh, OCFS2_BH_CACHED, - alloc_inode); + status = ocfs2_read_block(alloc_inode, bg_blkno, &group_bh); if (status < 0) { mlog_errno(status); goto bail; @@ -1784,8 +1834,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle, } bail: - if (group_bh) - brelse(group_bh); + brelse(group_bh); mlog_exit(status); return status; @@ -1838,9 +1887,15 @@ int ocfs2_free_clusters(handle_t *handle, status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, bg_start_bit, bg_blkno, num_clusters); - if (status < 0) + if (status < 0) { mlog_errno(status); + goto out; + } + ocfs2_local_alloc_seen_free_bits(OCFS2_SB(bitmap_inode->i_sb), + num_clusters); + +out: mlog_exit(status); return status; } @@ -1891,3 +1946,84 @@ static inline void ocfs2_debug_suballoc_inode(struct ocfs2_dinode *fe) (unsigned long long)fe->id2.i_chain.cl_recs[i].c_blkno); } } + +/* + * For a given allocation, determine which allocators will need to be + * accessed, and lock them, reserving the appropriate number of bits. + * + * Sparse file systems call this from ocfs2_write_begin_nolock() + * and ocfs2_allocate_unwritten_extents(). + * + * File systems which don't support holes call this from + * ocfs2_extend_allocation(). + */ +int ocfs2_lock_allocators(struct inode *inode, + struct ocfs2_extent_tree *et, + u32 clusters_to_add, u32 extents_to_split, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac) +{ + int ret = 0, num_free_extents; + unsigned int max_recs_needed = clusters_to_add + 2 * extents_to_split; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + *meta_ac = NULL; + if (data_ac) + *data_ac = NULL; + + BUG_ON(clusters_to_add != 0 && data_ac == NULL); + + num_free_extents = ocfs2_num_free_extents(osb, inode, et); + if (num_free_extents < 0) { + ret = num_free_extents; + mlog_errno(ret); + goto out; + } + + /* + * Sparse allocation file systems need to be more conservative + * with reserving room for expansion - the actual allocation + * happens while we've got a journal handle open so re-taking + * a cluster lock (because we ran out of room for another + * extent) will violate ordering rules. + * + * Most of the time we'll only be seeing this 1 cluster at a time + * anyway. + * + * Always lock for any unwritten extents - we might want to + * add blocks during a split. + */ + if (!num_free_extents || + (ocfs2_sparse_alloc(osb) && num_free_extents < max_recs_needed)) { + ret = ocfs2_reserve_new_metadata(osb, et->et_root_el, meta_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + } + + if (clusters_to_add == 0) + goto out; + + ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto out; + } + +out: + if (ret) { + if (*meta_ac) { + ocfs2_free_alloc_context(*meta_ac); + *meta_ac = NULL; + } + + /* + * We cannot have an error and a non null *data_ac. + */ + } + + return ret; +} diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index 544c600662b..4df159d8f45 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -28,10 +28,11 @@ typedef int (group_search_t)(struct inode *, struct buffer_head *, - u32, - u32, - u16 *, - u16 *); + u32, /* bits_wanted */ + u32, /* min_bits */ + u64, /* max_block */ + u16 *, /* *bit_off */ + u16 *); /* *bits_found */ struct ocfs2_alloc_context { struct inode *ac_inode; /* which bitmap are we allocating from? */ @@ -51,6 +52,8 @@ struct ocfs2_alloc_context { group_search_t *ac_group_search; u64 ac_last_group; + u64 ac_max_block; /* Highest block number to allocate. 0 is + is the same as ~0 - unlimited */ }; void ocfs2_free_alloc_context(struct ocfs2_alloc_context *ac); @@ -59,9 +62,17 @@ static inline int ocfs2_alloc_context_bits_left(struct ocfs2_alloc_context *ac) return ac->ac_bits_wanted - ac->ac_bits_given; } +/* + * Please note that the caller must make sure that root_el is the root + * of extent tree. So for an inode, it should be &fe->id2.i_list. Otherwise + * the result may be wrong. + */ int ocfs2_reserve_new_metadata(struct ocfs2_super *osb, - struct ocfs2_dinode *fe, + struct ocfs2_extent_list *root_el, struct ocfs2_alloc_context **ac); +int ocfs2_reserve_new_metadata_blocks(struct ocfs2_super *osb, + int blocks, + struct ocfs2_alloc_context **ac); int ocfs2_reserve_new_inode(struct ocfs2_super *osb, struct ocfs2_alloc_context **ac); int ocfs2_reserve_clusters(struct ocfs2_super *osb, @@ -147,6 +158,7 @@ static inline int ocfs2_is_cluster_bitmap(struct inode *inode) * apis above. */ int ocfs2_reserve_cluster_bitmap_bits(struct ocfs2_super *osb, struct ocfs2_alloc_context *ac); +void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac); /* given a cluster offset, calculate which block group it belongs to * and return that block offset. */ @@ -156,4 +168,8 @@ u64 ocfs2_which_cluster_group(struct inode *inode, u32 cluster); int ocfs2_check_group_descriptor(struct super_block *sb, struct ocfs2_dinode *di, struct ocfs2_group_desc *gd); +int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_extent_tree *et, + u32 clusters_to_add, u32 extents_to_split, + struct ocfs2_alloc_context **data_ac, + struct ocfs2_alloc_context **meta_ac); #endif /* _CHAINALLOC_H_ */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 70334d85aff..304b63ac78c 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -64,6 +64,7 @@ #include "sysfile.h" #include "uptodate.h" #include "ver.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -154,6 +155,9 @@ enum { Opt_localalloc, Opt_localflocks, Opt_stack, + Opt_user_xattr, + Opt_nouser_xattr, + Opt_inode64, Opt_err, }; @@ -173,6 +177,9 @@ static const match_table_t tokens = { {Opt_localalloc, "localalloc=%d"}, {Opt_localflocks, "localflocks"}, {Opt_stack, "cluster_stack=%s"}, + {Opt_user_xattr, "user_xattr"}, + {Opt_nouser_xattr, "nouser_xattr"}, + {Opt_inode64, "inode64"}, {Opt_err, NULL} }; @@ -205,10 +212,11 @@ static int ocfs2_sync_fs(struct super_block *sb, int wait) ocfs2_schedule_truncate_log_flush(osb, 0); } - if (journal_start_commit(OCFS2_SB(sb)->journal->j_journal, &target)) { + if (jbd2_journal_start_commit(OCFS2_SB(sb)->journal->j_journal, + &target)) { if (wait) - log_wait_commit(OCFS2_SB(sb)->journal->j_journal, - target); + jbd2_log_wait_commit(OCFS2_SB(sb)->journal->j_journal, + target); } return 0; } @@ -325,6 +333,7 @@ static struct inode *ocfs2_alloc_inode(struct super_block *sb) if (!oi) return NULL; + jbd2_journal_init_jbd_inode(&oi->ip_jinode, &oi->vfs_inode); return &oi->vfs_inode; } @@ -406,6 +415,15 @@ static int ocfs2_remount(struct super_block *sb, int *flags, char *data) goto out; } + /* Probably don't want this on remount; it might + * mess with other nodes */ + if (!(osb->s_mount_opt & OCFS2_MOUNT_INODE64) && + (parsed_options.mount_opt & OCFS2_MOUNT_INODE64)) { + ret = -EINVAL; + mlog(ML_ERROR, "Cannot enable inode64 on remount\n"); + goto out; + } + /* We're going to/from readonly mode. */ if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) { /* Lock here so the check of HARD_RO and the potential @@ -637,7 +655,8 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) osb->s_atime_quantum = parsed_options.atime_quantum; osb->preferred_slot = parsed_options.slot; osb->osb_commit_interval = parsed_options.commit_interval; - osb->local_alloc_size = parsed_options.localalloc_opt; + osb->local_alloc_default_bits = ocfs2_megabytes_to_clusters(sb, parsed_options.localalloc_opt); + osb->local_alloc_bits = osb->local_alloc_default_bits; status = ocfs2_verify_userspace_stack(osb, &parsed_options); if (status) @@ -743,8 +762,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) return status; read_super_error: - if (bh != NULL) - brelse(bh); + brelse(bh); if (inode) iput(inode); @@ -847,6 +865,12 @@ static int ocfs2_parse_options(struct super_block *sb, case Opt_data_writeback: mopt->mount_opt |= OCFS2_MOUNT_DATA_WRITEBACK; break; + case Opt_user_xattr: + mopt->mount_opt &= ~OCFS2_MOUNT_NOUSERXATTR; + break; + case Opt_nouser_xattr: + mopt->mount_opt |= OCFS2_MOUNT_NOUSERXATTR; + break; case Opt_atime_quantum: if (match_int(&args[0], &option)) { status = 0; @@ -873,7 +897,7 @@ static int ocfs2_parse_options(struct super_block *sb, if (option < 0) return 0; if (option == 0) - option = JBD_DEFAULT_MAX_COMMIT_AGE; + option = JBD2_DEFAULT_MAX_COMMIT_AGE; mopt->commit_interval = HZ * option; break; case Opt_localalloc: @@ -918,6 +942,9 @@ static int ocfs2_parse_options(struct super_block *sb, OCFS2_STACK_LABEL_LEN); mopt->cluster_stack[OCFS2_STACK_LABEL_LEN] = '\0'; break; + case Opt_inode64: + mopt->mount_opt |= OCFS2_MOUNT_INODE64; + break; default: mlog(ML_ERROR, "Unrecognized mount option \"%s\" " @@ -938,6 +965,7 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) { struct ocfs2_super *osb = OCFS2_SB(mnt->mnt_sb); unsigned long opts = osb->s_mount_opt; + unsigned int local_alloc_megs; if (opts & OCFS2_MOUNT_HB_LOCAL) seq_printf(s, ",_netdev,heartbeat=local"); @@ -970,8 +998,9 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) seq_printf(s, ",commit=%u", (unsigned) (osb->osb_commit_interval / HZ)); - if (osb->local_alloc_size != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE) - seq_printf(s, ",localalloc=%d", osb->local_alloc_size); + local_alloc_megs = osb->local_alloc_bits >> (20 - osb->s_clustersize_bits); + if (local_alloc_megs != OCFS2_DEFAULT_LOCAL_ALLOC_SIZE) + seq_printf(s, ",localalloc=%d", local_alloc_megs); if (opts & OCFS2_MOUNT_LOCALFLOCKS) seq_printf(s, ",localflocks,"); @@ -980,6 +1009,14 @@ static int ocfs2_show_options(struct seq_file *s, struct vfsmount *mnt) seq_printf(s, ",cluster_stack=%.*s", OCFS2_STACK_LABEL_LEN, osb->osb_cluster_stack); + if (opts & OCFS2_MOUNT_NOUSERXATTR) + seq_printf(s, ",nouser_xattr"); + else + seq_printf(s, ",user_xattr"); + + if (opts & OCFS2_MOUNT_INODE64) + seq_printf(s, ",inode64"); + return 0; } @@ -1132,6 +1169,7 @@ static void ocfs2_inode_init_once(void *data) oi->ip_dir_start_lookup = 0; init_rwsem(&oi->ip_alloc_sem); + init_rwsem(&oi->ip_xattr_sem); mutex_init(&oi->ip_io_mutex); oi->ip_blkno = 0ULL; @@ -1375,6 +1413,7 @@ static int ocfs2_initialize_super(struct super_block *sb, sb->s_fs_info = osb; sb->s_op = &ocfs2_sops; sb->s_export_op = &ocfs2_export_ops; + sb->s_xattr = ocfs2_xattr_handlers; sb->s_time_gran = 1; sb->s_flags |= MS_NOATIME; /* this is needed to support O_LARGEFILE */ @@ -1421,8 +1460,12 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->slot_num = OCFS2_INVALID_SLOT; + osb->s_xattr_inline_size = le16_to_cpu( + di->id2.i_super.s_xattr_inline_size); + osb->local_alloc_state = OCFS2_LA_UNUSED; osb->local_alloc_bh = NULL; + INIT_DELAYED_WORK(&osb->la_enable_wq, ocfs2_la_enable_worker); init_waitqueue_head(&osb->osb_mount_event); @@ -1568,6 +1611,7 @@ static int ocfs2_initialize_super(struct super_block *sb, osb->first_cluster_group_blkno = le64_to_cpu(di->id2.i_super.s_first_cluster_group); osb->fs_generation = le32_to_cpu(di->i_fs_generation); + osb->uuid_hash = le32_to_cpu(di->id2.i_super.s_uuid_hash); mlog(0, "vol_label: %s\n", osb->vol_label); mlog(0, "uuid: %s\n", osb->uuid_str); mlog(0, "root_blkno=%llu, system_dir_blkno=%llu\n", diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index ba9dbb51d25..cbd03dfdc7b 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -50,6 +50,7 @@ #include "inode.h" #include "journal.h" #include "symlink.h" +#include "xattr.h" #include "buffer_head_io.h" @@ -83,11 +84,7 @@ static char *ocfs2_fast_symlink_getlink(struct inode *inode, mlog_entry_void(); - status = ocfs2_read_block(OCFS2_SB(inode->i_sb), - OCFS2_I(inode)->ip_blkno, - bh, - OCFS2_BH_CACHED, - inode); + status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, bh); if (status < 0) { mlog_errno(status); link = ERR_PTR(status); @@ -157,8 +154,7 @@ bail: kunmap(page); page_cache_release(page); } - if (bh) - brelse(bh); + brelse(bh); return ERR_PTR(status); } @@ -168,10 +164,18 @@ const struct inode_operations ocfs2_symlink_inode_operations = { .follow_link = ocfs2_follow_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, }; const struct inode_operations ocfs2_fast_symlink_inode_operations = { .readlink = ocfs2_readlink, .follow_link = ocfs2_follow_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .listxattr = ocfs2_listxattr, + .removexattr = generic_removexattr, }; diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c index 4da8851f2b2..187b99ff036 100644 --- a/fs/ocfs2/uptodate.c +++ b/fs/ocfs2/uptodate.c @@ -53,7 +53,11 @@ #include <linux/highmem.h> #include <linux/buffer_head.h> #include <linux/rbtree.h> -#include <linux/jbd.h> +#ifndef CONFIG_OCFS2_COMPAT_JBD +# include <linux/jbd2.h> +#else +# include <linux/jbd.h> +#endif #define MLOG_MASK_PREFIX ML_UPTODATE @@ -511,14 +515,10 @@ static void ocfs2_remove_metadata_tree(struct ocfs2_caching_info *ci, ci->ci_num_cached--; } -/* Called when we remove a chunk of metadata from an inode. We don't - * bother reverting things to an inlined array in the case of a remove - * which moves us back under the limit. */ -void ocfs2_remove_from_cache(struct inode *inode, - struct buffer_head *bh) +static void ocfs2_remove_block_from_cache(struct inode *inode, + sector_t block) { int index; - sector_t block = bh->b_blocknr; struct ocfs2_meta_cache_item *item = NULL; struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_caching_info *ci = &oi->ip_metadata_cache; @@ -544,6 +544,30 @@ void ocfs2_remove_from_cache(struct inode *inode, kmem_cache_free(ocfs2_uptodate_cachep, item); } +/* + * Called when we remove a chunk of metadata from an inode. We don't + * bother reverting things to an inlined array in the case of a remove + * which moves us back under the limit. + */ +void ocfs2_remove_from_cache(struct inode *inode, + struct buffer_head *bh) +{ + sector_t block = bh->b_blocknr; + + ocfs2_remove_block_from_cache(inode, block); +} + +/* Called when we remove xattr clusters from an inode. */ +void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode, + sector_t block, + u32 c_len) +{ + unsigned int i, b_len = ocfs2_clusters_to_blocks(inode->i_sb, 1) * c_len; + + for (i = 0; i < b_len; i++, block++) + ocfs2_remove_block_from_cache(inode, block); +} + int __init init_ocfs2_uptodate_cache(void) { ocfs2_uptodate_cachep = kmem_cache_create("ocfs2_uptodate", diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h index 2e73206059a..531b4b3a0c4 100644 --- a/fs/ocfs2/uptodate.h +++ b/fs/ocfs2/uptodate.h @@ -40,6 +40,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode, struct buffer_head *bh); void ocfs2_remove_from_cache(struct inode *inode, struct buffer_head *bh); +void ocfs2_remove_xattr_clusters_from_cache(struct inode *inode, + sector_t block, + u32 c_len); int ocfs2_buffer_read_ahead(struct inode *inode, struct buffer_head *bh); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c new file mode 100644 index 00000000000..c25780a70df --- /dev/null +++ b/fs/ocfs2/xattr.c @@ -0,0 +1,4834 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * xattr.c + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * CREDITS: + * Lots of code in this file is taken from ext3. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/capability.h> +#include <linux/fs.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/highmem.h> +#include <linux/pagemap.h> +#include <linux/uio.h> +#include <linux/sched.h> +#include <linux/splice.h> +#include <linux/mount.h> +#include <linux/writeback.h> +#include <linux/falloc.h> +#include <linux/sort.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/string.h> + +#define MLOG_MASK_PREFIX ML_XATTR +#include <cluster/masklog.h> + +#include "ocfs2.h" +#include "alloc.h" +#include "dlmglue.h" +#include "file.h" +#include "symlink.h" +#include "sysfile.h" +#include "inode.h" +#include "journal.h" +#include "ocfs2_fs.h" +#include "suballoc.h" +#include "uptodate.h" +#include "buffer_head_io.h" +#include "super.h" +#include "xattr.h" + + +struct ocfs2_xattr_def_value_root { + struct ocfs2_xattr_value_root xv; + struct ocfs2_extent_rec er; +}; + +struct ocfs2_xattr_bucket { + struct buffer_head *bhs[OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET]; + struct ocfs2_xattr_header *xh; +}; + +#define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) +#define OCFS2_XATTR_INLINE_SIZE 80 + +static struct ocfs2_xattr_def_value_root def_xv = { + .xv.xr_list.l_count = cpu_to_le16(1), +}; + +struct xattr_handler *ocfs2_xattr_handlers[] = { + &ocfs2_xattr_user_handler, + &ocfs2_xattr_trusted_handler, + NULL +}; + +static struct xattr_handler *ocfs2_xattr_handler_map[] = { + [OCFS2_XATTR_INDEX_USER] = &ocfs2_xattr_user_handler, + [OCFS2_XATTR_INDEX_TRUSTED] = &ocfs2_xattr_trusted_handler, +}; + +struct ocfs2_xattr_info { + int name_index; + const char *name; + const void *value; + size_t value_len; +}; + +struct ocfs2_xattr_search { + struct buffer_head *inode_bh; + /* + * xattr_bh point to the block buffer head which has extended attribute + * when extended attribute in inode, xattr_bh is equal to inode_bh. + */ + struct buffer_head *xattr_bh; + struct ocfs2_xattr_header *header; + struct ocfs2_xattr_bucket bucket; + void *base; + void *end; + struct ocfs2_xattr_entry *here; + int not_found; +}; + +static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, + struct ocfs2_xattr_header *xh, + int index, + int *block_off, + int *new_offset); + +static int ocfs2_xattr_index_block_find(struct inode *inode, + struct buffer_head *root_bh, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs); + +static int ocfs2_xattr_tree_list_index_block(struct inode *inode, + struct ocfs2_xattr_tree_root *xt, + char *buffer, + size_t buffer_size); + +static int ocfs2_xattr_create_index_block(struct inode *inode, + struct ocfs2_xattr_search *xs); + +static int ocfs2_xattr_set_entry_index_block(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs); + +static int ocfs2_delete_xattr_index_block(struct inode *inode, + struct buffer_head *xb_bh); + +static inline const char *ocfs2_xattr_prefix(int name_index) +{ + struct xattr_handler *handler = NULL; + + if (name_index > 0 && name_index < OCFS2_XATTR_MAX) + handler = ocfs2_xattr_handler_map[name_index]; + + return handler ? handler->prefix : NULL; +} + +static u32 ocfs2_xattr_name_hash(struct inode *inode, + const char *name, + int name_len) +{ + /* Get hash value of uuid from super block */ + u32 hash = OCFS2_SB(inode->i_sb)->uuid_hash; + int i; + + /* hash extended attribute name */ + for (i = 0; i < name_len; i++) { + hash = (hash << OCFS2_HASH_SHIFT) ^ + (hash >> (8*sizeof(hash) - OCFS2_HASH_SHIFT)) ^ + *name++; + } + + return hash; +} + +/* + * ocfs2_xattr_hash_entry() + * + * Compute the hash of an extended attribute. + */ +static void ocfs2_xattr_hash_entry(struct inode *inode, + struct ocfs2_xattr_header *header, + struct ocfs2_xattr_entry *entry) +{ + u32 hash = 0; + char *name = (char *)header + le16_to_cpu(entry->xe_name_offset); + + hash = ocfs2_xattr_name_hash(inode, name, entry->xe_name_len); + entry->xe_name_hash = cpu_to_le32(hash); + + return; +} + +static int ocfs2_xattr_extend_allocation(struct inode *inode, + u32 clusters_to_add, + struct buffer_head *xattr_bh, + struct ocfs2_xattr_value_root *xv) +{ + int status = 0; + int restart_func = 0; + int credits = 0; + handle_t *handle = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + enum ocfs2_alloc_restarted why; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u32 prev_clusters, logical_start = le32_to_cpu(xv->xr_clusters); + struct ocfs2_extent_tree et; + + mlog(0, "(clusters_to_add for xattr= %u)\n", clusters_to_add); + + ocfs2_init_xattr_value_extent_tree(&et, inode, xattr_bh, xv); + +restart_all: + + status = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, + &data_ac, &meta_ac); + if (status) { + mlog_errno(status); + goto leave; + } + + credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el, + clusters_to_add); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + handle = NULL; + mlog_errno(status); + goto leave; + } + +restarted_transaction: + status = ocfs2_journal_access(handle, inode, xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + prev_clusters = le32_to_cpu(xv->xr_clusters); + status = ocfs2_add_clusters_in_btree(osb, + inode, + &logical_start, + clusters_to_add, + 0, + &et, + handle, + data_ac, + meta_ac, + &why); + if ((status < 0) && (status != -EAGAIN)) { + if (status != -ENOSPC) + mlog_errno(status); + goto leave; + } + + status = ocfs2_journal_dirty(handle, xattr_bh); + if (status < 0) { + mlog_errno(status); + goto leave; + } + + clusters_to_add -= le32_to_cpu(xv->xr_clusters) - prev_clusters; + + if (why != RESTART_NONE && clusters_to_add) { + if (why == RESTART_META) { + mlog(0, "restarting function.\n"); + restart_func = 1; + } else { + BUG_ON(why != RESTART_TRANS); + + mlog(0, "restarting transaction.\n"); + /* TODO: This can be more intelligent. */ + credits = ocfs2_calc_extend_credits(osb->sb, + et.et_root_el, + clusters_to_add); + status = ocfs2_extend_trans(handle, credits); + if (status < 0) { + /* handle still has to be committed at + * this point. */ + status = -ENOMEM; + mlog_errno(status); + goto leave; + } + goto restarted_transaction; + } + } + +leave: + if (handle) { + ocfs2_commit_trans(osb, handle); + handle = NULL; + } + if (data_ac) { + ocfs2_free_alloc_context(data_ac); + data_ac = NULL; + } + if (meta_ac) { + ocfs2_free_alloc_context(meta_ac); + meta_ac = NULL; + } + if ((!status) && restart_func) { + restart_func = 0; + goto restart_all; + } + + return status; +} + +static int __ocfs2_remove_xattr_range(struct inode *inode, + struct buffer_head *root_bh, + struct ocfs2_xattr_value_root *xv, + u32 cpos, u32 phys_cpos, u32 len, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + int ret; + u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_extent_tree et; + + ocfs2_init_xattr_value_extent_tree(&et, inode, root_bh, xv); + + ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, + dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + le32_add_cpu(&xv->xr_clusters, -len); + + ret = ocfs2_journal_dirty(handle, root_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + mutex_unlock(&tl_inode->i_mutex); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return ret; +} + +static int ocfs2_xattr_shrink_size(struct inode *inode, + u32 old_clusters, + u32 new_clusters, + struct buffer_head *root_bh, + struct ocfs2_xattr_value_root *xv) +{ + int ret = 0; + u32 trunc_len, cpos, phys_cpos, alloc_size; + u64 block; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_cached_dealloc_ctxt dealloc; + + ocfs2_init_dealloc_ctxt(&dealloc); + + if (old_clusters <= new_clusters) + return 0; + + cpos = new_clusters; + trunc_len = old_clusters - new_clusters; + while (trunc_len) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &phys_cpos, + &alloc_size, &xv->xr_list); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (alloc_size > trunc_len) + alloc_size = trunc_len; + + ret = __ocfs2_remove_xattr_range(inode, root_bh, xv, cpos, + phys_cpos, alloc_size, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + + block = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); + ocfs2_remove_xattr_clusters_from_cache(inode, block, + alloc_size); + cpos += alloc_size; + trunc_len -= alloc_size; + } + +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + + return ret; +} + +static int ocfs2_xattr_value_truncate(struct inode *inode, + struct buffer_head *root_bh, + struct ocfs2_xattr_value_root *xv, + int len) +{ + int ret; + u32 new_clusters = ocfs2_clusters_for_bytes(inode->i_sb, len); + u32 old_clusters = le32_to_cpu(xv->xr_clusters); + + if (new_clusters == old_clusters) + return 0; + + if (new_clusters > old_clusters) + ret = ocfs2_xattr_extend_allocation(inode, + new_clusters - old_clusters, + root_bh, xv); + else + ret = ocfs2_xattr_shrink_size(inode, + old_clusters, new_clusters, + root_bh, xv); + + return ret; +} + +static int ocfs2_xattr_list_entry(char *buffer, size_t size, + size_t *result, const char *prefix, + const char *name, int name_len) +{ + char *p = buffer + *result; + int prefix_len = strlen(prefix); + int total_len = prefix_len + name_len + 1; + + *result += total_len; + + /* we are just looking for how big our buffer needs to be */ + if (!size) + return 0; + + if (*result > size) + return -ERANGE; + + memcpy(p, prefix, prefix_len); + memcpy(p + prefix_len, name, name_len); + p[prefix_len + name_len] = '\0'; + + return 0; +} + +static int ocfs2_xattr_list_entries(struct inode *inode, + struct ocfs2_xattr_header *header, + char *buffer, size_t buffer_size) +{ + size_t result = 0; + int i, type, ret; + const char *prefix, *name; + + for (i = 0 ; i < le16_to_cpu(header->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; + type = ocfs2_xattr_get_type(entry); + prefix = ocfs2_xattr_prefix(type); + + if (prefix) { + name = (const char *)header + + le16_to_cpu(entry->xe_name_offset); + + ret = ocfs2_xattr_list_entry(buffer, buffer_size, + &result, prefix, name, + entry->xe_name_len); + if (ret) + return ret; + } + } + + return result; +} + +static int ocfs2_xattr_ibody_list(struct inode *inode, + struct ocfs2_dinode *di, + char *buffer, + size_t buffer_size) +{ + struct ocfs2_xattr_header *header = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + int ret = 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) + return ret; + + header = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + ret = ocfs2_xattr_list_entries(inode, header, buffer, buffer_size); + + return ret; +} + +static int ocfs2_xattr_block_list(struct inode *inode, + struct ocfs2_dinode *di, + char *buffer, + size_t buffer_size) +{ + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + int ret = 0; + + if (!di->i_xattr_loc) + return ret; + + ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + /*Verify the signature of xattr block*/ + if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, + strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { + ret = -EFAULT; + goto cleanup; + } + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + struct ocfs2_xattr_header *header = &xb->xb_attrs.xb_header; + ret = ocfs2_xattr_list_entries(inode, header, + buffer, buffer_size); + } else { + struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; + ret = ocfs2_xattr_tree_list_index_block(inode, xt, + buffer, buffer_size); + } +cleanup: + brelse(blk_bh); + + return ret; +} + +ssize_t ocfs2_listxattr(struct dentry *dentry, + char *buffer, + size_t size) +{ + int ret = 0, i_ret = 0, b_ret = 0; + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(dentry->d_inode); + + if (!ocfs2_supports_xattr(OCFS2_SB(dentry->d_sb))) + return -EOPNOTSUPP; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + return ret; + + ret = ocfs2_inode_lock(dentry->d_inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_read(&oi->ip_xattr_sem); + i_ret = ocfs2_xattr_ibody_list(dentry->d_inode, di, buffer, size); + if (i_ret < 0) + b_ret = 0; + else { + if (buffer) { + buffer += i_ret; + size -= i_ret; + } + b_ret = ocfs2_xattr_block_list(dentry->d_inode, di, + buffer, size); + if (b_ret < 0) + i_ret = 0; + } + up_read(&oi->ip_xattr_sem); + ocfs2_inode_unlock(dentry->d_inode, 0); + + brelse(di_bh); + + return i_ret + b_ret; +} + +static int ocfs2_xattr_find_entry(int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_xattr_entry *entry; + size_t name_len; + int i, cmp = 1; + + if (name == NULL) + return -EINVAL; + + name_len = strlen(name); + entry = xs->here; + for (i = 0; i < le16_to_cpu(xs->header->xh_count); i++) { + cmp = name_index - ocfs2_xattr_get_type(entry); + if (!cmp) + cmp = name_len - entry->xe_name_len; + if (!cmp) + cmp = memcmp(name, (xs->base + + le16_to_cpu(entry->xe_name_offset)), + name_len); + if (cmp == 0) + break; + entry += 1; + } + xs->here = entry; + + return cmp ? -ENODATA : 0; +} + +static int ocfs2_xattr_get_value_outside(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + void *buffer, + size_t len) +{ + u32 cpos, p_cluster, num_clusters, bpc, clusters; + u64 blkno; + int i, ret = 0; + size_t cplen, blocksize; + struct buffer_head *bh = NULL; + struct ocfs2_extent_list *el; + + el = &xv->xr_list; + clusters = le32_to_cpu(xv->xr_clusters); + bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + blocksize = inode->i_sb->s_blocksize; + + cpos = 0; + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + /* Copy ocfs2_xattr_value */ + for (i = 0; i < num_clusters * bpc; i++, blkno++) { + ret = ocfs2_read_block(inode, blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + cplen = len >= blocksize ? blocksize : len; + memcpy(buffer, bh->b_data, cplen); + len -= cplen; + buffer += cplen; + + brelse(bh); + bh = NULL; + if (len == 0) + break; + } + cpos += num_clusters; + } +out: + return ret; +} + +static int ocfs2_xattr_ibody_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct ocfs2_xattr_value_root *xv; + size_t size; + int ret = 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) + return -ENODATA; + + xs->end = (void *)di + inode->i_sb->s_blocksize; + xs->header = (struct ocfs2_xattr_header *) + (xs->end - le16_to_cpu(di->i_xattr_inline_size)); + xs->base = (void *)xs->header; + xs->here = xs->header->xh_entries; + + ret = ocfs2_xattr_find_entry(name_index, name, xs); + if (ret) + return ret; + size = le64_to_cpu(xs->here->xe_value_size); + if (buffer) { + if (size > buffer_size) + return -ERANGE; + if (ocfs2_xattr_is_local(xs->here)) { + memcpy(buffer, (void *)xs->base + + le16_to_cpu(xs->here->xe_name_offset) + + OCFS2_XATTR_SIZE(xs->here->xe_name_len), size); + } else { + xv = (struct ocfs2_xattr_value_root *) + (xs->base + le16_to_cpu( + xs->here->xe_name_offset) + + OCFS2_XATTR_SIZE(xs->here->xe_name_len)); + ret = ocfs2_xattr_get_value_outside(inode, xv, + buffer, size); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + } + } + + return size; +} + +static int ocfs2_xattr_block_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + struct ocfs2_xattr_value_root *xv; + size_t size; + int ret = -ENODATA, name_offset, name_len, block_off, i; + + if (!di->i_xattr_loc) + return ret; + + memset(&xs->bucket, 0, sizeof(xs->bucket)); + + ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + /*Verify the signature of xattr block*/ + if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, + strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { + ret = -EFAULT; + goto cleanup; + } + + xs->xattr_bh = blk_bh; + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + xs->header = &xb->xb_attrs.xb_header; + xs->base = (void *)xs->header; + xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size; + xs->here = xs->header->xh_entries; + + ret = ocfs2_xattr_find_entry(name_index, name, xs); + } else + ret = ocfs2_xattr_index_block_find(inode, blk_bh, + name_index, + name, xs); + + if (ret) + goto cleanup; + size = le64_to_cpu(xs->here->xe_value_size); + if (buffer) { + ret = -ERANGE; + if (size > buffer_size) + goto cleanup; + + name_offset = le16_to_cpu(xs->here->xe_name_offset); + name_len = OCFS2_XATTR_SIZE(xs->here->xe_name_len); + i = xs->here - xs->header->xh_entries; + + if (le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED) { + ret = ocfs2_xattr_bucket_get_name_value(inode, + xs->bucket.xh, + i, + &block_off, + &name_offset); + xs->base = xs->bucket.bhs[block_off]->b_data; + } + if (ocfs2_xattr_is_local(xs->here)) { + memcpy(buffer, (void *)xs->base + + name_offset + name_len, size); + } else { + xv = (struct ocfs2_xattr_value_root *) + (xs->base + name_offset + name_len); + ret = ocfs2_xattr_get_value_outside(inode, xv, + buffer, size); + if (ret < 0) { + mlog_errno(ret); + goto cleanup; + } + } + } + ret = size; +cleanup: + for (i = 0; i < OCFS2_XATTR_MAX_BLOCKS_PER_BUCKET; i++) + brelse(xs->bucket.bhs[i]); + memset(&xs->bucket, 0, sizeof(xs->bucket)); + + brelse(blk_bh); + return ret; +} + +/* ocfs2_xattr_get() + * + * Copy an extended attribute into the buffer provided. + * Buffer is NULL to compute the size of buffer required. + */ +int ocfs2_xattr_get(struct inode *inode, + int name_index, + const char *name, + void *buffer, + size_t buffer_size) +{ + int ret; + struct ocfs2_dinode *di = NULL; + struct buffer_head *di_bh = NULL; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + ret = -ENODATA; + + ret = ocfs2_inode_lock(inode, &di_bh, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_read(&oi->ip_xattr_sem); + ret = ocfs2_xattr_ibody_get(inode, name_index, name, buffer, + buffer_size, &xis); + if (ret == -ENODATA) + ret = ocfs2_xattr_block_get(inode, name_index, name, buffer, + buffer_size, &xbs); + up_read(&oi->ip_xattr_sem); + ocfs2_inode_unlock(inode, 0); + + brelse(di_bh); + + return ret; +} + +static int __ocfs2_xattr_set_value_outside(struct inode *inode, + struct ocfs2_xattr_value_root *xv, + const void *value, + int value_len) +{ + int ret = 0, i, cp_len, credits; + u16 blocksize = inode->i_sb->s_blocksize; + u32 p_cluster, num_clusters; + u32 cpos = 0, bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 clusters = ocfs2_clusters_for_bytes(inode->i_sb, value_len); + u64 blkno; + struct buffer_head *bh = NULL; + handle_t *handle; + + BUG_ON(clusters > le32_to_cpu(xv->xr_clusters)); + + credits = clusters * bpc; + handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + while (cpos < clusters) { + ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster, + &num_clusters, &xv->xr_list); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + blkno = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster); + + for (i = 0; i < num_clusters * bpc; i++, blkno++) { + ret = ocfs2_read_block(inode, blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_journal_access(handle, + inode, + bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + cp_len = value_len > blocksize ? blocksize : value_len; + memcpy(bh->b_data, value, cp_len); + value_len -= cp_len; + value += cp_len; + if (cp_len < blocksize) + memset(bh->b_data + cp_len, 0, + blocksize - cp_len); + + ret = ocfs2_journal_dirty(handle, bh); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + brelse(bh); + bh = NULL; + + /* + * XXX: do we need to empty all the following + * blocks in this cluster? + */ + if (!value_len) + break; + } + cpos += num_clusters; + } +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + brelse(bh); + + return ret; +} + +static int ocfs2_xattr_cleanup(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + size_t offs) +{ + handle_t *handle = NULL; + int ret = 0; + size_t name_len = strlen(xi->name); + void *val = xs->base + offs; + size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_XATTR_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + /* Decrease xattr count */ + le16_add_cpu(&xs->header->xh_count, -1); + /* Remove the xattr entry and tree root which has already be set*/ + memset((void *)xs->here, 0, sizeof(struct ocfs2_xattr_entry)); + memset(val, 0, size); + + ret = ocfs2_journal_dirty(handle, xs->xattr_bh); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +static int ocfs2_xattr_update_entry(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + size_t offs) +{ + handle_t *handle = NULL; + int ret = 0; + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_XATTR_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + xs->here->xe_name_offset = cpu_to_le16(offs); + xs->here->xe_value_size = cpu_to_le64(xi->value_len); + if (xi->value_len <= OCFS2_XATTR_INLINE_SIZE) + ocfs2_xattr_set_local(xs->here, 1); + else + ocfs2_xattr_set_local(xs->here, 0); + ocfs2_xattr_hash_entry(inode, xs->header, xs->here); + + ret = ocfs2_journal_dirty(handle, xs->xattr_bh); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +/* + * ocfs2_xattr_set_value_outside() + * + * Set large size value in B tree. + */ +static int ocfs2_xattr_set_value_outside(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + size_t offs) +{ + size_t name_len = strlen(xi->name); + void *val = xs->base + offs; + struct ocfs2_xattr_value_root *xv = NULL; + size_t size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + int ret = 0; + + memset(val, 0, size); + memcpy(val, xi->name, name_len); + xv = (struct ocfs2_xattr_value_root *) + (val + OCFS2_XATTR_SIZE(name_len)); + xv->xr_clusters = 0; + xv->xr_last_eb_blk = 0; + xv->xr_list.l_tree_depth = 0; + xv->xr_list.l_count = cpu_to_le16(1); + xv->xr_list.l_next_free_rec = 0; + + ret = ocfs2_xattr_value_truncate(inode, xs->xattr_bh, xv, + xi->value_len); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + ret = __ocfs2_xattr_set_value_outside(inode, xv, xi->value, + xi->value_len); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + ret = ocfs2_xattr_update_entry(inode, xi, xs, offs); + if (ret < 0) + mlog_errno(ret); + + return ret; +} + +/* + * ocfs2_xattr_set_entry_local() + * + * Set, replace or remove extended attribute in local. + */ +static void ocfs2_xattr_set_entry_local(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + struct ocfs2_xattr_entry *last, + size_t min_offs) +{ + size_t name_len = strlen(xi->name); + int i; + + if (xi->value && xs->not_found) { + /* Insert the new xattr entry. */ + le16_add_cpu(&xs->header->xh_count, 1); + ocfs2_xattr_set_type(last, xi->name_index); + ocfs2_xattr_set_local(last, 1); + last->xe_name_len = name_len; + } else { + void *first_val; + void *val; + size_t offs, size; + + first_val = xs->base + min_offs; + offs = le16_to_cpu(xs->here->xe_name_offset); + val = xs->base + offs; + + if (le64_to_cpu(xs->here->xe_value_size) > + OCFS2_XATTR_INLINE_SIZE) + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_ROOT_SIZE; + else + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); + + if (xi->value && size == OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->value_len)) { + /* The old and the new value have the + same size. Just replace the value. */ + ocfs2_xattr_set_local(xs->here, 1); + xs->here->xe_value_size = cpu_to_le64(xi->value_len); + /* Clear value bytes. */ + memset(val + OCFS2_XATTR_SIZE(name_len), + 0, + OCFS2_XATTR_SIZE(xi->value_len)); + memcpy(val + OCFS2_XATTR_SIZE(name_len), + xi->value, + xi->value_len); + return; + } + /* Remove the old name+value. */ + memmove(first_val + size, first_val, val - first_val); + memset(first_val, 0, size); + xs->here->xe_name_hash = 0; + xs->here->xe_name_offset = 0; + ocfs2_xattr_set_local(xs->here, 1); + xs->here->xe_value_size = 0; + + min_offs += size; + + /* Adjust all value offsets. */ + last = xs->header->xh_entries; + for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) { + size_t o = le16_to_cpu(last->xe_name_offset); + + if (o < offs) + last->xe_name_offset = cpu_to_le16(o + size); + last += 1; + } + + if (!xi->value) { + /* Remove the old entry. */ + last -= 1; + memmove(xs->here, xs->here + 1, + (void *)last - (void *)xs->here); + memset(last, 0, sizeof(struct ocfs2_xattr_entry)); + le16_add_cpu(&xs->header->xh_count, -1); + } + } + if (xi->value) { + /* Insert the new name+value. */ + size_t size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->value_len); + void *val = xs->base + min_offs - size; + + xs->here->xe_name_offset = cpu_to_le16(min_offs - size); + memset(val, 0, size); + memcpy(val, xi->name, name_len); + memcpy(val + OCFS2_XATTR_SIZE(name_len), + xi->value, + xi->value_len); + xs->here->xe_value_size = cpu_to_le64(xi->value_len); + ocfs2_xattr_set_local(xs->here, 1); + ocfs2_xattr_hash_entry(inode, xs->header, xs->here); + } + + return; +} + +/* + * ocfs2_xattr_set_entry() + * + * Set extended attribute entry into inode or block. + * + * If extended attribute value size > OCFS2_XATTR_INLINE_SIZE, + * We first insert tree root(ocfs2_xattr_value_root) with set_entry_local(), + * then set value in B tree with set_value_outside(). + */ +static int ocfs2_xattr_set_entry(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + int flag) +{ + struct ocfs2_xattr_entry *last; + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + size_t min_offs = xs->end - xs->base, name_len = strlen(xi->name); + size_t size_l = 0; + handle_t *handle = NULL; + int free, i, ret; + struct ocfs2_xattr_info xi_l = { + .name_index = xi->name_index, + .name = xi->name, + .value = xi->value, + .value_len = xi->value_len, + }; + + /* Compute min_offs, last and free space. */ + last = xs->header->xh_entries; + + for (i = 0 ; i < le16_to_cpu(xs->header->xh_count); i++) { + size_t offs = le16_to_cpu(last->xe_name_offset); + if (offs < min_offs) + min_offs = offs; + last += 1; + } + + free = min_offs - ((void *)last - xs->base) - sizeof(__u32); + if (free < 0) + return -EFAULT; + + if (!xs->not_found) { + size_t size = 0; + if (ocfs2_xattr_is_local(xs->here)) + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); + else + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_ROOT_SIZE; + free += (size + sizeof(struct ocfs2_xattr_entry)); + } + /* Check free space in inode or block */ + if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + if (free < sizeof(struct ocfs2_xattr_entry) + + OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_ROOT_SIZE) { + ret = -ENOSPC; + goto out; + } + size_l = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_ROOT_SIZE; + xi_l.value = (void *)&def_xv; + xi_l.value_len = OCFS2_XATTR_ROOT_SIZE; + } else if (xi->value) { + if (free < sizeof(struct ocfs2_xattr_entry) + + OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->value_len)) { + ret = -ENOSPC; + goto out; + } + } + + if (!xs->not_found) { + /* For existing extended attribute */ + size_t size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(le64_to_cpu(xs->here->xe_value_size)); + size_t offs = le16_to_cpu(xs->here->xe_name_offset); + void *val = xs->base + offs; + + if (ocfs2_xattr_is_local(xs->here) && size == size_l) { + /* Replace existing local xattr with tree root */ + ret = ocfs2_xattr_set_value_outside(inode, xi, xs, + offs); + if (ret < 0) + mlog_errno(ret); + goto out; + } else if (!ocfs2_xattr_is_local(xs->here)) { + /* For existing xattr which has value outside */ + struct ocfs2_xattr_value_root *xv = NULL; + xv = (struct ocfs2_xattr_value_root *)(val + + OCFS2_XATTR_SIZE(name_len)); + + if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + /* + * If new value need set outside also, + * first truncate old value to new value, + * then set new value with set_value_outside(). + */ + ret = ocfs2_xattr_value_truncate(inode, + xs->xattr_bh, + xv, + xi->value_len); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = __ocfs2_xattr_set_value_outside(inode, + xv, + xi->value, + xi->value_len); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_update_entry(inode, + xi, + xs, + offs); + if (ret < 0) + mlog_errno(ret); + goto out; + } else { + /* + * If new value need set in local, + * just trucate old value to zero. + */ + ret = ocfs2_xattr_value_truncate(inode, + xs->xattr_bh, + xv, + 0); + if (ret < 0) + mlog_errno(ret); + } + } + } + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, xs->inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (!(flag & OCFS2_INLINE_XATTR_FL)) { + /* set extended attribute in external block. */ + ret = ocfs2_extend_trans(handle, + OCFS2_INODE_UPDATE_CREDITS + + OCFS2_XATTR_BLOCK_UPDATE_CREDITS); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + /* + * Set value in local, include set tree root in local. + * This is the first step for value size >INLINE_SIZE. + */ + ocfs2_xattr_set_entry_local(inode, &xi_l, xs, last, min_offs); + + if (!(flag & OCFS2_INLINE_XATTR_FL)) { + ret = ocfs2_journal_dirty(handle, xs->xattr_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + } + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) && + (flag & OCFS2_INLINE_XATTR_FL)) { + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + unsigned int xattrsize = osb->s_xattr_inline_size; + + /* + * Adjust extent record count or inline data size + * to reserve space for extended attribute. + */ + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + struct ocfs2_inline_data *idata = &di->id2.i_data; + le16_add_cpu(&idata->id_count, -xattrsize); + } else if (!(ocfs2_inode_is_fast_symlink(inode))) { + struct ocfs2_extent_list *el = &di->id2.i_list; + le16_add_cpu(&el->l_count, -(xattrsize / + sizeof(struct ocfs2_extent_rec))); + } + di->i_xattr_inline_size = cpu_to_le16(xattrsize); + } + /* Update xattr flag */ + spin_lock(&oi->ip_lock); + oi->ip_dyn_features |= flag; + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + /* Update inode ctime */ + inode->i_ctime = CURRENT_TIME; + di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); + + ret = ocfs2_journal_dirty(handle, xs->inode_bh); + if (ret < 0) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); + + if (!ret && xi->value_len > OCFS2_XATTR_INLINE_SIZE) { + /* + * Set value outside in B tree. + * This is the second step for value size > INLINE_SIZE. + */ + size_t offs = le16_to_cpu(xs->here->xe_name_offset); + ret = ocfs2_xattr_set_value_outside(inode, xi, xs, offs); + if (ret < 0) { + int ret2; + + mlog_errno(ret); + /* + * If set value outside failed, we have to clean + * the junk tree root we have already set in local. + */ + ret2 = ocfs2_xattr_cleanup(inode, xi, xs, offs); + if (ret2 < 0) + mlog_errno(ret2); + } + } +out: + return ret; + +} + +static int ocfs2_remove_value_outside(struct inode*inode, + struct buffer_head *bh, + struct ocfs2_xattr_header *header) +{ + int ret = 0, i; + + for (i = 0; i < le16_to_cpu(header->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &header->xh_entries[i]; + + if (!ocfs2_xattr_is_local(entry)) { + struct ocfs2_xattr_value_root *xv; + void *val; + + val = (void *)header + + le16_to_cpu(entry->xe_name_offset); + xv = (struct ocfs2_xattr_value_root *) + (val + OCFS2_XATTR_SIZE(entry->xe_name_len)); + ret = ocfs2_xattr_value_truncate(inode, bh, xv, 0); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + } + } + + return ret; +} + +static int ocfs2_xattr_ibody_remove(struct inode *inode, + struct buffer_head *di_bh) +{ + + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + struct ocfs2_xattr_header *header; + int ret; + + header = (struct ocfs2_xattr_header *) + ((void *)di + inode->i_sb->s_blocksize - + le16_to_cpu(di->i_xattr_inline_size)); + + ret = ocfs2_remove_value_outside(inode, di_bh, header); + + return ret; +} + +static int ocfs2_xattr_block_remove(struct inode *inode, + struct buffer_head *blk_bh) +{ + struct ocfs2_xattr_block *xb; + int ret = 0; + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + struct ocfs2_xattr_header *header = &(xb->xb_attrs.xb_header); + ret = ocfs2_remove_value_outside(inode, blk_bh, header); + } else + ret = ocfs2_delete_xattr_index_block(inode, blk_bh); + + return ret; +} + +static int ocfs2_xattr_free_block(struct inode *inode, + u64 block) +{ + struct inode *xb_alloc_inode; + struct buffer_head *xb_alloc_bh = NULL; + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle; + int ret = 0; + u64 blk, bg_blkno; + u16 bit; + + ret = ocfs2_read_block(inode, block, &blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + /*Verify the signature of xattr block*/ + if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, + strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { + ret = -EFAULT; + goto out; + } + + ret = ocfs2_xattr_block_remove(inode, blk_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + blk = le64_to_cpu(xb->xb_blkno); + bit = le16_to_cpu(xb->xb_suballoc_bit); + bg_blkno = ocfs2_which_suballoc_group(blk, bit); + + xb_alloc_inode = ocfs2_get_system_file_inode(osb, + EXTENT_ALLOC_SYSTEM_INODE, + le16_to_cpu(xb->xb_suballoc_slot)); + if (!xb_alloc_inode) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + mutex_lock(&xb_alloc_inode->i_mutex); + + ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1); + if (ret < 0) { + mlog_errno(ret); + goto out_mutex; + } + + handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_unlock; + } + + ret = ocfs2_free_suballoc_bits(handle, xb_alloc_inode, xb_alloc_bh, + bit, bg_blkno, 1); + if (ret < 0) + mlog_errno(ret); + + ocfs2_commit_trans(osb, handle); +out_unlock: + ocfs2_inode_unlock(xb_alloc_inode, 1); + brelse(xb_alloc_bh); +out_mutex: + mutex_unlock(&xb_alloc_inode->i_mutex); + iput(xb_alloc_inode); +out: + brelse(blk_bh); + return ret; +} + +/* + * ocfs2_xattr_remove() + * + * Free extended attribute resources associated with this inode. + */ +int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; + handle_t *handle; + int ret; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return 0; + + if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) + return 0; + + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_ibody_remove(inode, di_bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + if (di->i_xattr_loc) { + ret = ocfs2_xattr_free_block(inode, + le64_to_cpu(di->i_xattr_loc)); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access(handle, inode, di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + di->i_xattr_loc = 0; + + spin_lock(&oi->ip_lock); + oi->ip_dyn_features &= ~(OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL); + di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); + spin_unlock(&oi->ip_lock); + + ret = ocfs2_journal_dirty(handle, di_bh); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + return ret; +} + +static int ocfs2_xattr_has_space_inline(struct inode *inode, + struct ocfs2_dinode *di) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + unsigned int xattrsize = OCFS2_SB(inode->i_sb)->s_xattr_inline_size; + int free; + + if (xattrsize < OCFS2_MIN_XATTR_INLINE_SIZE) + return 0; + + if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + struct ocfs2_inline_data *idata = &di->id2.i_data; + free = le16_to_cpu(idata->id_count) - le64_to_cpu(di->i_size); + } else if (ocfs2_inode_is_fast_symlink(inode)) { + free = ocfs2_fast_symlink_chars(inode->i_sb) - + le64_to_cpu(di->i_size); + } else { + struct ocfs2_extent_list *el = &di->id2.i_list; + free = (le16_to_cpu(el->l_count) - + le16_to_cpu(el->l_next_free_rec)) * + sizeof(struct ocfs2_extent_rec); + } + if (free >= xattrsize) + return 1; + + return 0; +} + +/* + * ocfs2_xattr_ibody_find() + * + * Find extended attribute in inode block and + * fill search info into struct ocfs2_xattr_search. + */ +static int ocfs2_xattr_ibody_find(struct inode *inode, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + int ret; + int has_space = 0; + + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) + return 0; + + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + down_read(&oi->ip_alloc_sem); + has_space = ocfs2_xattr_has_space_inline(inode, di); + up_read(&oi->ip_alloc_sem); + if (!has_space) + return 0; + } + + xs->xattr_bh = xs->inode_bh; + xs->end = (void *)di + inode->i_sb->s_blocksize; + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) + xs->header = (struct ocfs2_xattr_header *) + (xs->end - le16_to_cpu(di->i_xattr_inline_size)); + else + xs->header = (struct ocfs2_xattr_header *) + (xs->end - OCFS2_SB(inode->i_sb)->s_xattr_inline_size); + xs->base = (void *)xs->header; + xs->here = xs->header->xh_entries; + + /* Find the named attribute. */ + if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL) { + ret = ocfs2_xattr_find_entry(name_index, name, xs); + if (ret && ret != -ENODATA) + return ret; + xs->not_found = ret; + } + + return 0; +} + +/* + * ocfs2_xattr_ibody_set() + * + * Set, replace or remove an extended attribute into inode block. + * + */ +static int ocfs2_xattr_ibody_set(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + int ret; + + if (inode->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE) + return -ENOSPC; + + down_write(&oi->ip_alloc_sem); + if (!(oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL)) { + if (!ocfs2_xattr_has_space_inline(inode, di)) { + ret = -ENOSPC; + goto out; + } + } + + ret = ocfs2_xattr_set_entry(inode, xi, xs, + (OCFS2_INLINE_XATTR_FL | OCFS2_HAS_XATTR_FL)); +out: + up_write(&oi->ip_alloc_sem); + + return ret; +} + +/* + * ocfs2_xattr_block_find() + * + * Find extended attribute in external block and + * fill search info into struct ocfs2_xattr_search. + */ +static int ocfs2_xattr_block_find(struct inode *inode, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct buffer_head *blk_bh = NULL; + struct ocfs2_xattr_block *xb; + int ret = 0; + + if (!di->i_xattr_loc) + return ret; + + ret = ocfs2_read_block(inode, le64_to_cpu(di->i_xattr_loc), &blk_bh); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + /*Verify the signature of xattr block*/ + if (memcmp((void *)blk_bh->b_data, OCFS2_XATTR_BLOCK_SIGNATURE, + strlen(OCFS2_XATTR_BLOCK_SIGNATURE))) { + ret = -EFAULT; + goto cleanup; + } + + xs->xattr_bh = blk_bh; + xb = (struct ocfs2_xattr_block *)blk_bh->b_data; + + if (!(le16_to_cpu(xb->xb_flags) & OCFS2_XATTR_INDEXED)) { + xs->header = &xb->xb_attrs.xb_header; + xs->base = (void *)xs->header; + xs->end = (void *)(blk_bh->b_data) + blk_bh->b_size; + xs->here = xs->header->xh_entries; + + ret = ocfs2_xattr_find_entry(name_index, name, xs); + } else + ret = ocfs2_xattr_index_block_find(inode, blk_bh, + name_index, + name, xs); + + if (ret && ret != -ENODATA) { + xs->xattr_bh = NULL; + goto cleanup; + } + xs->not_found = ret; + return 0; +cleanup: + brelse(blk_bh); + + return ret; +} + +/* + * When all the xattrs are deleted from index btree, the ocfs2_xattr_tree + * will be erased and ocfs2_xattr_block will have its ocfs2_xattr_header + * re-initialized. + */ +static int ocfs2_restore_xattr_block(struct inode *inode, + struct ocfs2_xattr_search *xs) +{ + int ret; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; + struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list; + u16 xb_flags = le16_to_cpu(xb->xb_flags); + + BUG_ON(!(xb_flags & OCFS2_XATTR_INDEXED) || + le16_to_cpu(el->l_next_free_rec) != 0); + + handle = ocfs2_start_trans(osb, OCFS2_XATTR_BLOCK_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + goto out; + } + + ret = ocfs2_journal_access(handle, inode, xs->xattr_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - + offsetof(struct ocfs2_xattr_block, xb_attrs)); + + xb->xb_flags = cpu_to_le16(xb_flags & ~OCFS2_XATTR_INDEXED); + + ocfs2_journal_dirty(handle, xs->xattr_bh); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +/* + * ocfs2_xattr_block_set() + * + * Set, replace or remove an extended attribute into external block. + * + */ +static int ocfs2_xattr_block_set(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + struct buffer_head *new_bh = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_dinode *di = (struct ocfs2_dinode *)xs->inode_bh->b_data; + struct ocfs2_alloc_context *meta_ac = NULL; + handle_t *handle = NULL; + struct ocfs2_xattr_block *xblk = NULL; + u16 suballoc_bit_start; + u32 num_got; + u64 first_blkno; + int ret; + + if (!xs->xattr_bh) { + /* + * Alloc one external block for extended attribute + * outside of inode. + */ + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + handle = ocfs2_start_trans(osb, + OCFS2_XATTR_BLOCK_CREATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + ret = ocfs2_journal_access(handle, inode, xs->inode_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_claim_metadata(osb, handle, meta_ac, 1, + &suballoc_bit_start, &num_got, + &first_blkno); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + new_bh = sb_getblk(inode->i_sb, first_blkno); + ocfs2_set_new_buffer_uptodate(inode, new_bh); + + ret = ocfs2_journal_access(handle, inode, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + /* Initialize ocfs2_xattr_block */ + xs->xattr_bh = new_bh; + xblk = (struct ocfs2_xattr_block *)new_bh->b_data; + memset(xblk, 0, inode->i_sb->s_blocksize); + strcpy((void *)xblk, OCFS2_XATTR_BLOCK_SIGNATURE); + xblk->xb_suballoc_slot = cpu_to_le16(osb->slot_num); + xblk->xb_suballoc_bit = cpu_to_le16(suballoc_bit_start); + xblk->xb_fs_generation = cpu_to_le32(osb->fs_generation); + xblk->xb_blkno = cpu_to_le64(first_blkno); + + xs->header = &xblk->xb_attrs.xb_header; + xs->base = (void *)xs->header; + xs->end = (void *)xblk + inode->i_sb->s_blocksize; + xs->here = xs->header->xh_entries; + + + ret = ocfs2_journal_dirty(handle, new_bh); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + di->i_xattr_loc = cpu_to_le64(first_blkno); + ret = ocfs2_journal_dirty(handle, xs->inode_bh); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(osb, handle); +out: + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + if (ret < 0) + return ret; + } else + xblk = (struct ocfs2_xattr_block *)xs->xattr_bh->b_data; + + if (!(le16_to_cpu(xblk->xb_flags) & OCFS2_XATTR_INDEXED)) { + /* Set extended attribute into external block */ + ret = ocfs2_xattr_set_entry(inode, xi, xs, OCFS2_HAS_XATTR_FL); + if (!ret || ret != -ENOSPC) + goto end; + + ret = ocfs2_xattr_create_index_block(inode, xs); + if (ret) + goto end; + } + + ret = ocfs2_xattr_set_entry_index_block(inode, xi, xs); + if (!ret && xblk->xb_attrs.xb_root.xt_list.l_next_free_rec == 0) + ret = ocfs2_restore_xattr_block(inode, xs); + +end: + + return ret; +} + +/* + * ocfs2_xattr_set() + * + * Set, replace or remove an extended attribute for this inode. + * value is NULL to remove an existing extended attribute, else either + * create or replace an extended attribute. + */ +int ocfs2_xattr_set(struct inode *inode, + int name_index, + const char *name, + const void *value, + size_t value_len, + int flags) +{ + struct buffer_head *di_bh = NULL; + struct ocfs2_dinode *di; + int ret; + u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + struct ocfs2_xattr_info xi = { + .name_index = name_index, + .name = name, + .value = value, + .value_len = value_len, + }; + + struct ocfs2_xattr_search xis = { + .not_found = -ENODATA, + }; + + struct ocfs2_xattr_search xbs = { + .not_found = -ENODATA, + }; + + if (!ocfs2_supports_xattr(OCFS2_SB(inode->i_sb))) + return -EOPNOTSUPP; + + ret = ocfs2_inode_lock(inode, &di_bh, 1); + if (ret < 0) { + mlog_errno(ret); + return ret; + } + xis.inode_bh = xbs.inode_bh = di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; + + down_write(&OCFS2_I(inode)->ip_xattr_sem); + /* + * Scan inode and external block to find the same name + * extended attribute and collect search infomation. + */ + ret = ocfs2_xattr_ibody_find(inode, name_index, name, &xis); + if (ret) + goto cleanup; + if (xis.not_found) { + ret = ocfs2_xattr_block_find(inode, name_index, name, &xbs); + if (ret) + goto cleanup; + } + + if (xis.not_found && xbs.not_found) { + ret = -ENODATA; + if (flags & XATTR_REPLACE) + goto cleanup; + ret = 0; + if (!value) + goto cleanup; + } else { + ret = -EEXIST; + if (flags & XATTR_CREATE) + goto cleanup; + } + + if (!value) { + /* Remove existing extended attribute */ + if (!xis.not_found) + ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); + else if (!xbs.not_found) + ret = ocfs2_xattr_block_set(inode, &xi, &xbs); + } else { + /* We always try to set extended attribute into inode first*/ + ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); + if (!ret && !xbs.not_found) { + /* + * If succeed and that extended attribute existing in + * external block, then we will remove it. + */ + xi.value = NULL; + xi.value_len = 0; + ret = ocfs2_xattr_block_set(inode, &xi, &xbs); + } else if (ret == -ENOSPC) { + if (di->i_xattr_loc && !xbs.xattr_bh) { + ret = ocfs2_xattr_block_find(inode, name_index, + name, &xbs); + if (ret) + goto cleanup; + } + /* + * If no space in inode, we will set extended attribute + * into external block. + */ + ret = ocfs2_xattr_block_set(inode, &xi, &xbs); + if (ret) + goto cleanup; + if (!xis.not_found) { + /* + * If succeed and that extended attribute + * existing in inode, we will remove it. + */ + xi.value = NULL; + xi.value_len = 0; + ret = ocfs2_xattr_ibody_set(inode, &xi, &xis); + } + } + } +cleanup: + up_write(&OCFS2_I(inode)->ip_xattr_sem); + ocfs2_inode_unlock(inode, 1); + brelse(di_bh); + brelse(xbs.xattr_bh); + for (i = 0; i < blk_per_bucket; i++) + brelse(xbs.bucket.bhs[i]); + + return ret; +} + +/* + * Find the xattr extent rec which may contains name_hash. + * e_cpos will be the first name hash of the xattr rec. + * el must be the ocfs2_xattr_header.xb_attrs.xb_root.xt_list. + */ +static int ocfs2_xattr_get_rec(struct inode *inode, + u32 name_hash, + u64 *p_blkno, + u32 *e_cpos, + u32 *num_clusters, + struct ocfs2_extent_list *el) +{ + int ret = 0, i; + struct buffer_head *eb_bh = NULL; + struct ocfs2_extent_block *eb; + struct ocfs2_extent_rec *rec = NULL; + u64 e_blkno = 0; + + if (el->l_tree_depth) { + ret = ocfs2_find_leaf(inode, el, name_hash, &eb_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + eb = (struct ocfs2_extent_block *) eb_bh->b_data; + el = &eb->h_list; + + if (el->l_tree_depth) { + ocfs2_error(inode->i_sb, + "Inode %lu has non zero tree depth in " + "xattr tree block %llu\n", inode->i_ino, + (unsigned long long)eb_bh->b_blocknr); + ret = -EROFS; + goto out; + } + } + + for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) { + rec = &el->l_recs[i]; + + if (le32_to_cpu(rec->e_cpos) <= name_hash) { + e_blkno = le64_to_cpu(rec->e_blkno); + break; + } + } + + if (!e_blkno) { + ocfs2_error(inode->i_sb, "Inode %lu has bad extent " + "record (%u, %u, 0) in xattr", inode->i_ino, + le32_to_cpu(rec->e_cpos), + ocfs2_rec_clusters(el, rec)); + ret = -EROFS; + goto out; + } + + *p_blkno = le64_to_cpu(rec->e_blkno); + *num_clusters = le16_to_cpu(rec->e_leaf_clusters); + if (e_cpos) + *e_cpos = le32_to_cpu(rec->e_cpos); +out: + brelse(eb_bh); + return ret; +} + +typedef int (xattr_bucket_func)(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para); + +static int ocfs2_find_xe_in_bucket(struct inode *inode, + struct buffer_head *header_bh, + int name_index, + const char *name, + u32 name_hash, + u16 *xe_index, + int *found) +{ + int i, ret = 0, cmp = 1, block_off, new_offset; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)header_bh->b_data; + size_t name_len = strlen(name); + struct ocfs2_xattr_entry *xe = NULL; + struct buffer_head *name_bh = NULL; + char *xe_name; + + /* + * We don't use binary search in the bucket because there + * may be multiple entries with the same name hash. + */ + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + + if (name_hash > le32_to_cpu(xe->xe_name_hash)) + continue; + else if (name_hash < le32_to_cpu(xe->xe_name_hash)) + break; + + cmp = name_index - ocfs2_xattr_get_type(xe); + if (!cmp) + cmp = name_len - xe->xe_name_len; + if (cmp) + continue; + + ret = ocfs2_xattr_bucket_get_name_value(inode, + xh, + i, + &block_off, + &new_offset); + if (ret) { + mlog_errno(ret); + break; + } + + ret = ocfs2_read_block(inode, header_bh->b_blocknr + block_off, + &name_bh); + if (ret) { + mlog_errno(ret); + break; + } + xe_name = name_bh->b_data + new_offset; + + cmp = memcmp(name, xe_name, name_len); + brelse(name_bh); + name_bh = NULL; + + if (cmp == 0) { + *xe_index = i; + *found = 1; + ret = 0; + break; + } + } + + return ret; +} + +/* + * Find the specified xattr entry in a series of buckets. + * This series start from p_blkno and last for num_clusters. + * The ocfs2_xattr_header.xh_num_buckets of the first bucket contains + * the num of the valid buckets. + * + * Return the buffer_head this xattr should reside in. And if the xattr's + * hash is in the gap of 2 buckets, return the lower bucket. + */ +static int ocfs2_xattr_bucket_find(struct inode *inode, + int name_index, + const char *name, + u32 name_hash, + u64 p_blkno, + u32 first_hash, + u32 num_clusters, + struct ocfs2_xattr_search *xs) +{ + int ret, found = 0; + struct buffer_head *bh = NULL; + struct buffer_head *lower_bh = NULL; + struct ocfs2_xattr_header *xh = NULL; + struct ocfs2_xattr_entry *xe = NULL; + u16 index = 0; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int low_bucket = 0, bucket, high_bucket; + u32 last_hash; + u64 blkno; + + ret = ocfs2_read_block(inode, p_blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = (struct ocfs2_xattr_header *)bh->b_data; + high_bucket = le16_to_cpu(xh->xh_num_buckets) - 1; + + while (low_bucket <= high_bucket) { + brelse(bh); + bh = NULL; + bucket = (low_bucket + high_bucket) / 2; + + blkno = p_blkno + bucket * blk_per_bucket; + + ret = ocfs2_read_block(inode, blkno, &bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = (struct ocfs2_xattr_header *)bh->b_data; + xe = &xh->xh_entries[0]; + if (name_hash < le32_to_cpu(xe->xe_name_hash)) { + high_bucket = bucket - 1; + continue; + } + + /* + * Check whether the hash of the last entry in our + * bucket is larger than the search one. for an empty + * bucket, the last one is also the first one. + */ + if (xh->xh_count) + xe = &xh->xh_entries[le16_to_cpu(xh->xh_count) - 1]; + + last_hash = le32_to_cpu(xe->xe_name_hash); + + /* record lower_bh which may be the insert place. */ + brelse(lower_bh); + lower_bh = bh; + bh = NULL; + + if (name_hash > le32_to_cpu(xe->xe_name_hash)) { + low_bucket = bucket + 1; + continue; + } + + /* the searched xattr should reside in this bucket if exists. */ + ret = ocfs2_find_xe_in_bucket(inode, lower_bh, + name_index, name, name_hash, + &index, &found); + if (ret) { + mlog_errno(ret); + goto out; + } + break; + } + + /* + * Record the bucket we have found. + * When the xattr's hash value is in the gap of 2 buckets, we will + * always set it to the previous bucket. + */ + if (!lower_bh) { + /* + * We can't find any bucket whose first name_hash is less + * than the find name_hash. + */ + BUG_ON(bh->b_blocknr != p_blkno); + lower_bh = bh; + bh = NULL; + } + xs->bucket.bhs[0] = lower_bh; + xs->bucket.xh = (struct ocfs2_xattr_header *) + xs->bucket.bhs[0]->b_data; + lower_bh = NULL; + + xs->header = xs->bucket.xh; + xs->base = xs->bucket.bhs[0]->b_data; + xs->end = xs->base + inode->i_sb->s_blocksize; + + if (found) { + /* + * If we have found the xattr enty, read all the blocks in + * this bucket. + */ + ret = ocfs2_read_blocks(inode, xs->bucket.bhs[0]->b_blocknr + 1, + blk_per_bucket - 1, &xs->bucket.bhs[1], + OCFS2_BH_CACHED); + if (ret) { + mlog_errno(ret); + goto out; + } + + xs->here = &xs->header->xh_entries[index]; + mlog(0, "find xattr %s in bucket %llu, entry = %u\n", name, + (unsigned long long)xs->bucket.bhs[0]->b_blocknr, index); + } else + ret = -ENODATA; + +out: + brelse(bh); + brelse(lower_bh); + return ret; +} + +static int ocfs2_xattr_index_block_find(struct inode *inode, + struct buffer_head *root_bh, + int name_index, + const char *name, + struct ocfs2_xattr_search *xs) +{ + int ret; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)root_bh->b_data; + struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; + struct ocfs2_extent_list *el = &xb_root->xt_list; + u64 p_blkno = 0; + u32 first_hash, num_clusters = 0; + u32 name_hash = ocfs2_xattr_name_hash(inode, name, strlen(name)); + + if (le16_to_cpu(el->l_next_free_rec) == 0) + return -ENODATA; + + mlog(0, "find xattr %s, hash = %u, index = %d in xattr tree\n", + name, name_hash, name_index); + + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &first_hash, + &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + BUG_ON(p_blkno == 0 || num_clusters == 0 || first_hash > name_hash); + + mlog(0, "find xattr extent rec %u clusters from %llu, the first hash " + "in the rec is %u\n", num_clusters, p_blkno, first_hash); + + ret = ocfs2_xattr_bucket_find(inode, name_index, name, name_hash, + p_blkno, first_hash, num_clusters, xs); + +out: + return ret; +} + +static int ocfs2_iterate_xattr_buckets(struct inode *inode, + u64 blkno, + u32 clusters, + xattr_bucket_func *func, + void *para) +{ + int i, j, ret = 0; + int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u32 bpc = ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)); + u32 num_buckets = clusters * bpc; + struct ocfs2_xattr_bucket bucket; + + memset(&bucket, 0, sizeof(bucket)); + + mlog(0, "iterating xattr buckets in %u clusters starting from %llu\n", + clusters, blkno); + + for (i = 0; i < num_buckets; i++, blkno += blk_per_bucket) { + ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, + bucket.bhs, OCFS2_BH_CACHED); + if (ret) { + mlog_errno(ret); + goto out; + } + + bucket.xh = (struct ocfs2_xattr_header *)bucket.bhs[0]->b_data; + /* + * The real bucket num in this series of blocks is stored + * in the 1st bucket. + */ + if (i == 0) + num_buckets = le16_to_cpu(bucket.xh->xh_num_buckets); + + mlog(0, "iterating xattr bucket %llu, first hash %u\n", blkno, + le32_to_cpu(bucket.xh->xh_entries[0].xe_name_hash)); + if (func) { + ret = func(inode, &bucket, para); + if (ret) { + mlog_errno(ret); + break; + } + } + + for (j = 0; j < blk_per_bucket; j++) + brelse(bucket.bhs[j]); + memset(&bucket, 0, sizeof(bucket)); + } + +out: + for (j = 0; j < blk_per_bucket; j++) + brelse(bucket.bhs[j]); + + return ret; +} + +struct ocfs2_xattr_tree_list { + char *buffer; + size_t buffer_size; + size_t result; +}; + +static int ocfs2_xattr_bucket_get_name_value(struct inode *inode, + struct ocfs2_xattr_header *xh, + int index, + int *block_off, + int *new_offset) +{ + u16 name_offset; + + if (index < 0 || index >= le16_to_cpu(xh->xh_count)) + return -EINVAL; + + name_offset = le16_to_cpu(xh->xh_entries[index].xe_name_offset); + + *block_off = name_offset >> inode->i_sb->s_blocksize_bits; + *new_offset = name_offset % inode->i_sb->s_blocksize; + + return 0; +} + +static int ocfs2_list_xattr_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int ret = 0, type; + struct ocfs2_xattr_tree_list *xl = (struct ocfs2_xattr_tree_list *)para; + int i, block_off, new_offset; + const char *prefix, *name; + + for (i = 0 ; i < le16_to_cpu(bucket->xh->xh_count); i++) { + struct ocfs2_xattr_entry *entry = &bucket->xh->xh_entries[i]; + type = ocfs2_xattr_get_type(entry); + prefix = ocfs2_xattr_prefix(type); + + if (prefix) { + ret = ocfs2_xattr_bucket_get_name_value(inode, + bucket->xh, + i, + &block_off, + &new_offset); + if (ret) + break; + + name = (const char *)bucket->bhs[block_off]->b_data + + new_offset; + ret = ocfs2_xattr_list_entry(xl->buffer, + xl->buffer_size, + &xl->result, + prefix, name, + entry->xe_name_len); + if (ret) + break; + } + } + + return ret; +} + +static int ocfs2_xattr_tree_list_index_block(struct inode *inode, + struct ocfs2_xattr_tree_root *xt, + char *buffer, + size_t buffer_size) +{ + struct ocfs2_extent_list *el = &xt->xt_list; + int ret = 0; + u32 name_hash = UINT_MAX, e_cpos = 0, num_clusters = 0; + u64 p_blkno = 0; + struct ocfs2_xattr_tree_list xl = { + .buffer = buffer, + .buffer_size = buffer_size, + .result = 0, + }; + + if (le16_to_cpu(el->l_next_free_rec) == 0) + return 0; + + while (name_hash > 0) { + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, + &e_cpos, &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, + ocfs2_list_xattr_bucket, + &xl); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (e_cpos == 0) + break; + + name_hash = e_cpos - 1; + } + + ret = xl.result; +out: + return ret; +} + +static int cmp_xe(const void *a, const void *b) +{ + const struct ocfs2_xattr_entry *l = a, *r = b; + u32 l_hash = le32_to_cpu(l->xe_name_hash); + u32 r_hash = le32_to_cpu(r->xe_name_hash); + + if (l_hash > r_hash) + return 1; + if (l_hash < r_hash) + return -1; + return 0; +} + +static void swap_xe(void *a, void *b, int size) +{ + struct ocfs2_xattr_entry *l = a, *r = b, tmp; + + tmp = *l; + memcpy(l, r, sizeof(struct ocfs2_xattr_entry)); + memcpy(r, &tmp, sizeof(struct ocfs2_xattr_entry)); +} + +/* + * When the ocfs2_xattr_block is filled up, new bucket will be created + * and all the xattr entries will be moved to the new bucket. + * Note: we need to sort the entries since they are not saved in order + * in the ocfs2_xattr_block. + */ +static void ocfs2_cp_xattr_block_to_bucket(struct inode *inode, + struct buffer_head *xb_bh, + struct buffer_head *xh_bh, + struct buffer_head *data_bh) +{ + int i, blocksize = inode->i_sb->s_blocksize; + u16 offset, size, off_change; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_header *xb_xh = &xb->xb_attrs.xb_header; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)xh_bh->b_data; + u16 count = le16_to_cpu(xb_xh->xh_count); + char *target = xh_bh->b_data, *src = xb_bh->b_data; + + mlog(0, "cp xattr from block %llu to bucket %llu\n", + (unsigned long long)xb_bh->b_blocknr, + (unsigned long long)xh_bh->b_blocknr); + + memset(xh_bh->b_data, 0, blocksize); + if (data_bh) + memset(data_bh->b_data, 0, blocksize); + /* + * Since the xe_name_offset is based on ocfs2_xattr_header, + * there is a offset change corresponding to the change of + * ocfs2_xattr_header's position. + */ + off_change = offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + xe = &xb_xh->xh_entries[count - 1]; + offset = le16_to_cpu(xe->xe_name_offset) + off_change; + size = blocksize - offset; + + /* copy all the names and values. */ + if (data_bh) + target = data_bh->b_data; + memcpy(target + offset, src + offset, size); + + /* Init new header now. */ + xh->xh_count = xb_xh->xh_count; + xh->xh_num_buckets = cpu_to_le16(1); + xh->xh_name_value_len = cpu_to_le16(size); + xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE - size); + + /* copy all the entries. */ + target = xh_bh->b_data; + offset = offsetof(struct ocfs2_xattr_header, xh_entries); + size = count * sizeof(struct ocfs2_xattr_entry); + memcpy(target + offset, (char *)xb_xh + offset, size); + + /* Change the xe offset for all the xe because of the move. */ + off_change = OCFS2_XATTR_BUCKET_SIZE - blocksize + + offsetof(struct ocfs2_xattr_block, xb_attrs.xb_header); + for (i = 0; i < count; i++) + le16_add_cpu(&xh->xh_entries[i].xe_name_offset, off_change); + + mlog(0, "copy entry: start = %u, size = %u, offset_change = %u\n", + offset, size, off_change); + + sort(target + offset, count, sizeof(struct ocfs2_xattr_entry), + cmp_xe, swap_xe); +} + +/* + * After we move xattr from block to index btree, we have to + * update ocfs2_xattr_search to the new xe and base. + * + * When the entry is in xattr block, xattr_bh indicates the storage place. + * While if the entry is in index b-tree, "bucket" indicates the + * real place of the xattr. + */ +static int ocfs2_xattr_update_xattr_search(struct inode *inode, + struct ocfs2_xattr_search *xs, + struct buffer_head *old_bh, + struct buffer_head *new_bh) +{ + int ret = 0; + char *buf = old_bh->b_data; + struct ocfs2_xattr_block *old_xb = (struct ocfs2_xattr_block *)buf; + struct ocfs2_xattr_header *old_xh = &old_xb->xb_attrs.xb_header; + int i, blocksize = inode->i_sb->s_blocksize; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + xs->bucket.bhs[0] = new_bh; + get_bh(new_bh); + xs->bucket.xh = (struct ocfs2_xattr_header *)xs->bucket.bhs[0]->b_data; + xs->header = xs->bucket.xh; + + xs->base = new_bh->b_data; + xs->end = xs->base + inode->i_sb->s_blocksize; + + if (!xs->not_found) { + if (OCFS2_XATTR_BUCKET_SIZE != blocksize) { + ret = ocfs2_read_blocks(inode, + xs->bucket.bhs[0]->b_blocknr + 1, + blk_per_bucket - 1, &xs->bucket.bhs[1], + OCFS2_BH_CACHED); + if (ret) { + mlog_errno(ret); + return ret; + } + + i = xs->here - old_xh->xh_entries; + xs->here = &xs->header->xh_entries[i]; + } + } + + return ret; +} + +static int ocfs2_xattr_create_index_block(struct inode *inode, + struct ocfs2_xattr_search *xs) +{ + int ret, credits = OCFS2_SUBALLOC_ALLOC; + u32 bit_off, len; + u64 blkno; + handle_t *handle; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_alloc_context *data_ac; + struct buffer_head *xh_bh = NULL, *data_bh = NULL; + struct buffer_head *xb_bh = xs->xattr_bh; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_tree_root *xr; + u16 xb_flags = le16_to_cpu(xb->xb_flags); + u16 bpb = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + mlog(0, "create xattr index block for %llu\n", + (unsigned long long)xb_bh->b_blocknr); + + BUG_ON(xb_flags & OCFS2_XATTR_INDEXED); + + ret = ocfs2_reserve_clusters(osb, 1, &data_ac); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* + * XXX: + * We can use this lock for now, and maybe move to a dedicated mutex + * if performance becomes a problem later. + */ + down_write(&oi->ip_alloc_sem); + + /* + * 3 more credits, one for xattr block update, one for the 1st block + * of the new xattr bucket and one for the value/data. + */ + credits += 3; + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_sem; + } + + ret = ocfs2_journal_access(handle, inode, xb_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, &len); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* + * The bucket may spread in many blocks, and + * we will only touch the 1st block and the last block + * in the whole bucket(one for entry and one for data). + */ + blkno = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); + + mlog(0, "allocate 1 cluster from %llu to xattr block\n", blkno); + + xh_bh = sb_getblk(inode->i_sb, blkno); + if (!xh_bh) { + ret = -EIO; + mlog_errno(ret); + goto out_commit; + } + + ocfs2_set_new_buffer_uptodate(inode, xh_bh); + + ret = ocfs2_journal_access(handle, inode, xh_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + if (bpb > 1) { + data_bh = sb_getblk(inode->i_sb, blkno + bpb - 1); + if (!data_bh) { + ret = -EIO; + mlog_errno(ret); + goto out_commit; + } + + ocfs2_set_new_buffer_uptodate(inode, data_bh); + + ret = ocfs2_journal_access(handle, inode, data_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + } + + ocfs2_cp_xattr_block_to_bucket(inode, xb_bh, xh_bh, data_bh); + + ocfs2_journal_dirty(handle, xh_bh); + if (data_bh) + ocfs2_journal_dirty(handle, data_bh); + + ocfs2_xattr_update_xattr_search(inode, xs, xb_bh, xh_bh); + + /* Change from ocfs2_xattr_header to ocfs2_xattr_tree_root */ + memset(&xb->xb_attrs, 0, inode->i_sb->s_blocksize - + offsetof(struct ocfs2_xattr_block, xb_attrs)); + + xr = &xb->xb_attrs.xb_root; + xr->xt_clusters = cpu_to_le32(1); + xr->xt_last_eb_blk = 0; + xr->xt_list.l_tree_depth = 0; + xr->xt_list.l_count = cpu_to_le16(ocfs2_xattr_recs_per_xb(inode->i_sb)); + xr->xt_list.l_next_free_rec = cpu_to_le16(1); + + xr->xt_list.l_recs[0].e_cpos = 0; + xr->xt_list.l_recs[0].e_blkno = cpu_to_le64(blkno); + xr->xt_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1); + + xb->xb_flags = cpu_to_le16(xb_flags | OCFS2_XATTR_INDEXED); + + ret = ocfs2_journal_dirty(handle, xb_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + +out_commit: + ocfs2_commit_trans(osb, handle); + +out_sem: + up_write(&oi->ip_alloc_sem); + +out: + if (data_ac) + ocfs2_free_alloc_context(data_ac); + + brelse(xh_bh); + brelse(data_bh); + + return ret; +} + +static int cmp_xe_offset(const void *a, const void *b) +{ + const struct ocfs2_xattr_entry *l = a, *r = b; + u32 l_name_offset = le16_to_cpu(l->xe_name_offset); + u32 r_name_offset = le16_to_cpu(r->xe_name_offset); + + if (l_name_offset < r_name_offset) + return 1; + if (l_name_offset > r_name_offset) + return -1; + return 0; +} + +/* + * defrag a xattr bucket if we find that the bucket has some + * holes beteen name/value pairs. + * We will move all the name/value pairs to the end of the bucket + * so that we can spare some space for insertion. + */ +static int ocfs2_defrag_xattr_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket) +{ + int ret, i; + size_t end, offset, len, value_len; + struct ocfs2_xattr_header *xh; + char *entries, *buf, *bucket_buf = NULL; + u64 blkno = bucket->bhs[0]->b_blocknr; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u16 xh_free_start; + size_t blocksize = inode->i_sb->s_blocksize; + handle_t *handle; + struct buffer_head **bhs; + struct ocfs2_xattr_entry *xe; + + bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, + GFP_NOFS); + if (!bhs) + return -ENOMEM; + + ret = ocfs2_read_blocks(inode, blkno, blk_per_bucket, bhs, + OCFS2_BH_CACHED); + if (ret) + goto out; + + /* + * In order to make the operation more efficient and generic, + * we copy all the blocks into a contiguous memory and do the + * defragment there, so if anything is error, we will not touch + * the real block. + */ + bucket_buf = kmalloc(OCFS2_XATTR_BUCKET_SIZE, GFP_NOFS); + if (!bucket_buf) { + ret = -EIO; + goto out; + } + + buf = bucket_buf; + for (i = 0; i < blk_per_bucket; i++, buf += blocksize) + memcpy(buf, bhs[i]->b_data, blocksize); + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), blk_per_bucket); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) { + ret = ocfs2_journal_access(handle, inode, bhs[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto commit; + } + } + + xh = (struct ocfs2_xattr_header *)bucket_buf; + entries = (char *)xh->xh_entries; + xh_free_start = le16_to_cpu(xh->xh_free_start); + + mlog(0, "adjust xattr bucket in %llu, count = %u, " + "xh_free_start = %u, xh_name_value_len = %u.\n", + blkno, le16_to_cpu(xh->xh_count), xh_free_start, + le16_to_cpu(xh->xh_name_value_len)); + + /* + * sort all the entries by their offset. + * the largest will be the first, so that we can + * move them to the end one by one. + */ + sort(entries, le16_to_cpu(xh->xh_count), + sizeof(struct ocfs2_xattr_entry), + cmp_xe_offset, swap_xe); + + /* Move all name/values to the end of the bucket. */ + xe = xh->xh_entries; + end = OCFS2_XATTR_BUCKET_SIZE; + for (i = 0; i < le16_to_cpu(xh->xh_count); i++, xe++) { + offset = le16_to_cpu(xe->xe_name_offset); + if (ocfs2_xattr_is_local(xe)) + value_len = OCFS2_XATTR_SIZE( + le64_to_cpu(xe->xe_value_size)); + else + value_len = OCFS2_XATTR_ROOT_SIZE; + len = OCFS2_XATTR_SIZE(xe->xe_name_len) + value_len; + + /* + * We must make sure that the name/value pair + * exist in the same block. So adjust end to + * the previous block end if needed. + */ + if (((end - len) / blocksize != + (end - 1) / blocksize)) + end = end - end % blocksize; + + if (end > offset + len) { + memmove(bucket_buf + end - len, + bucket_buf + offset, len); + xe->xe_name_offset = cpu_to_le16(end - len); + } + + mlog_bug_on_msg(end < offset + len, "Defrag check failed for " + "bucket %llu\n", (unsigned long long)blkno); + + end -= len; + } + + mlog_bug_on_msg(xh_free_start > end, "Defrag check failed for " + "bucket %llu\n", (unsigned long long)blkno); + + if (xh_free_start == end) + goto commit; + + memset(bucket_buf + xh_free_start, 0, end - xh_free_start); + xh->xh_free_start = cpu_to_le16(end); + + /* sort the entries by their name_hash. */ + sort(entries, le16_to_cpu(xh->xh_count), + sizeof(struct ocfs2_xattr_entry), + cmp_xe, swap_xe); + + buf = bucket_buf; + for (i = 0; i < blk_per_bucket; i++, buf += blocksize) { + memcpy(bhs[i]->b_data, buf, blocksize); + ocfs2_journal_dirty(handle, bhs[i]); + } + +commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out: + + if (bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(bhs[i]); + } + kfree(bhs); + + kfree(bucket_buf); + return ret; +} + +/* + * Move half nums of the xattr bucket in the previous cluster to this new + * cluster. We only touch the last cluster of the previous extend record. + * + * first_bh is the first buffer_head of a series of bucket in the same + * extent rec and header_bh is the header of one bucket in this cluster. + * They will be updated if we move the data header_bh contains to the new + * cluster. first_hash will be set as the 1st xe's name_hash of the new cluster. + */ +static int ocfs2_mv_xattr_bucket_cross_cluster(struct inode *inode, + handle_t *handle, + struct buffer_head **first_bh, + struct buffer_head **header_bh, + u64 new_blkno, + u64 prev_blkno, + u32 num_clusters, + u32 *first_hash) +{ + int i, ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); + int blocksize = inode->i_sb->s_blocksize; + struct buffer_head *old_bh, *new_bh, *prev_bh, *new_first_bh = NULL; + struct ocfs2_xattr_header *new_xh; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)((*first_bh)->b_data); + + BUG_ON(le16_to_cpu(xh->xh_num_buckets) < num_buckets); + BUG_ON(OCFS2_XATTR_BUCKET_SIZE == osb->s_clustersize); + + prev_bh = *first_bh; + get_bh(prev_bh); + xh = (struct ocfs2_xattr_header *)prev_bh->b_data; + + prev_blkno += (num_clusters - 1) * bpc + bpc / 2; + + mlog(0, "move half of xattrs in cluster %llu to %llu\n", + prev_blkno, new_blkno); + + /* + * We need to update the 1st half of the new cluster and + * 1 more for the update of the 1st bucket of the previous + * extent record. + */ + credits = bpc / 2 + 1; + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, prev_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < bpc / 2; i++, prev_blkno++, new_blkno++) { + old_bh = new_bh = NULL; + new_bh = sb_getblk(inode->i_sb, new_blkno); + if (!new_bh) { + ret = -EIO; + mlog_errno(ret); + goto out; + } + + ocfs2_set_new_buffer_uptodate(inode, new_bh); + + ret = ocfs2_journal_access(handle, inode, new_bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret < 0) { + mlog_errno(ret); + brelse(new_bh); + goto out; + } + + ret = ocfs2_read_block(inode, prev_blkno, &old_bh); + if (ret < 0) { + mlog_errno(ret); + brelse(new_bh); + goto out; + } + + memcpy(new_bh->b_data, old_bh->b_data, blocksize); + + if (i == 0) { + new_xh = (struct ocfs2_xattr_header *)new_bh->b_data; + new_xh->xh_num_buckets = cpu_to_le16(num_buckets / 2); + + if (first_hash) + *first_hash = le32_to_cpu( + new_xh->xh_entries[0].xe_name_hash); + new_first_bh = new_bh; + get_bh(new_first_bh); + } + + ocfs2_journal_dirty(handle, new_bh); + + if (*header_bh == old_bh) { + brelse(*header_bh); + *header_bh = new_bh; + get_bh(*header_bh); + + brelse(*first_bh); + *first_bh = new_first_bh; + get_bh(*first_bh); + } + brelse(new_bh); + brelse(old_bh); + } + + le16_add_cpu(&xh->xh_num_buckets, -(num_buckets / 2)); + + ocfs2_journal_dirty(handle, prev_bh); +out: + brelse(prev_bh); + brelse(new_first_bh); + return ret; +} + +static int ocfs2_read_xattr_bucket(struct inode *inode, + u64 blkno, + struct buffer_head **bhs, + int new) +{ + int ret = 0; + u16 i, blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + if (!new) + return ocfs2_read_blocks(inode, blkno, + blk_per_bucket, bhs, + OCFS2_BH_CACHED); + + for (i = 0; i < blk_per_bucket; i++) { + bhs[i] = sb_getblk(inode->i_sb, blkno + i); + if (bhs[i] == NULL) { + ret = -EIO; + mlog_errno(ret); + break; + } + ocfs2_set_new_buffer_uptodate(inode, bhs[i]); + } + + return ret; +} + +/* + * Move half num of the xattrs in old bucket(blk) to new bucket(new_blk). + * first_hash will record the 1st hash of the new bucket. + */ +static int ocfs2_half_xattr_bucket(struct inode *inode, + handle_t *handle, + u64 blk, + u64 new_blk, + u32 *first_hash, + int new_bucket_head) +{ + int ret, i; + u16 count, start, len, name_value_len, xe_len, name_offset; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + struct buffer_head **s_bhs, **t_bhs = NULL; + struct ocfs2_xattr_header *xh; + struct ocfs2_xattr_entry *xe; + int blocksize = inode->i_sb->s_blocksize; + + mlog(0, "move half of xattrs from bucket %llu to %llu\n", + blk, new_blk); + + s_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); + if (!s_bhs) + return -ENOMEM; + + ret = ocfs2_read_xattr_bucket(inode, blk, s_bhs, 0); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, s_bhs[0], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + t_bhs = kcalloc(blk_per_bucket, sizeof(struct buffer_head *), GFP_NOFS); + if (!t_bhs) { + ret = -ENOMEM; + goto out; + } + + ret = ocfs2_read_xattr_bucket(inode, new_blk, t_bhs, new_bucket_head); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) { + ret = ocfs2_journal_access(handle, inode, t_bhs[i], + OCFS2_JOURNAL_ACCESS_CREATE); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* copy the whole bucket to the new first. */ + for (i = 0; i < blk_per_bucket; i++) + memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize); + + /* update the new bucket. */ + xh = (struct ocfs2_xattr_header *)t_bhs[0]->b_data; + count = le16_to_cpu(xh->xh_count); + start = count / 2; + + /* + * Calculate the total name/value len and xh_free_start for + * the old bucket first. + */ + name_offset = OCFS2_XATTR_BUCKET_SIZE; + name_value_len = 0; + for (i = 0; i < start; i++) { + xe = &xh->xh_entries[i]; + xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + if (ocfs2_xattr_is_local(xe)) + xe_len += + OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); + else + xe_len += OCFS2_XATTR_ROOT_SIZE; + name_value_len += xe_len; + if (le16_to_cpu(xe->xe_name_offset) < name_offset) + name_offset = le16_to_cpu(xe->xe_name_offset); + } + + /* + * Now begin the modification to the new bucket. + * + * In the new bucket, We just move the xattr entry to the beginning + * and don't touch the name/value. So there will be some holes in the + * bucket, and they will be removed when ocfs2_defrag_xattr_bucket is + * called. + */ + xe = &xh->xh_entries[start]; + len = sizeof(struct ocfs2_xattr_entry) * (count - start); + mlog(0, "mv xattr entry len %d from %d to %d\n", len, + (int)((char *)xe - (char *)xh), + (int)((char *)xh->xh_entries - (char *)xh)); + memmove((char *)xh->xh_entries, (char *)xe, len); + xe = &xh->xh_entries[count - start]; + len = sizeof(struct ocfs2_xattr_entry) * start; + memset((char *)xe, 0, len); + + le16_add_cpu(&xh->xh_count, -start); + le16_add_cpu(&xh->xh_name_value_len, -name_value_len); + + /* Calculate xh_free_start for the new bucket. */ + xh->xh_free_start = cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + xe_len = OCFS2_XATTR_SIZE(xe->xe_name_len); + if (ocfs2_xattr_is_local(xe)) + xe_len += + OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); + else + xe_len += OCFS2_XATTR_ROOT_SIZE; + if (le16_to_cpu(xe->xe_name_offset) < + le16_to_cpu(xh->xh_free_start)) + xh->xh_free_start = xe->xe_name_offset; + } + + /* set xh->xh_num_buckets for the new xh. */ + if (new_bucket_head) + xh->xh_num_buckets = cpu_to_le16(1); + else + xh->xh_num_buckets = 0; + + for (i = 0; i < blk_per_bucket; i++) { + ocfs2_journal_dirty(handle, t_bhs[i]); + if (ret) + mlog_errno(ret); + } + + /* store the first_hash of the new bucket. */ + if (first_hash) + *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); + + /* + * Now only update the 1st block of the old bucket. + * Please note that the entry has been sorted already above. + */ + xh = (struct ocfs2_xattr_header *)s_bhs[0]->b_data; + memset(&xh->xh_entries[start], 0, + sizeof(struct ocfs2_xattr_entry) * (count - start)); + xh->xh_count = cpu_to_le16(start); + xh->xh_free_start = cpu_to_le16(name_offset); + xh->xh_name_value_len = cpu_to_le16(name_value_len); + + ocfs2_journal_dirty(handle, s_bhs[0]); + if (ret) + mlog_errno(ret); + +out: + if (s_bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(s_bhs[i]); + } + kfree(s_bhs); + + if (t_bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(t_bhs[i]); + } + kfree(t_bhs); + + return ret; +} + +/* + * Copy xattr from one bucket to another bucket. + * + * The caller must make sure that the journal transaction + * has enough space for journaling. + */ +static int ocfs2_cp_xattr_bucket(struct inode *inode, + handle_t *handle, + u64 s_blkno, + u64 t_blkno, + int t_is_new) +{ + int ret, i; + int blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int blocksize = inode->i_sb->s_blocksize; + struct buffer_head **s_bhs, **t_bhs = NULL; + + BUG_ON(s_blkno == t_blkno); + + mlog(0, "cp bucket %llu to %llu, target is %d\n", + s_blkno, t_blkno, t_is_new); + + s_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, + GFP_NOFS); + if (!s_bhs) + return -ENOMEM; + + ret = ocfs2_read_xattr_bucket(inode, s_blkno, s_bhs, 0); + if (ret) + goto out; + + t_bhs = kzalloc(sizeof(struct buffer_head *) * blk_per_bucket, + GFP_NOFS); + if (!t_bhs) { + ret = -ENOMEM; + goto out; + } + + ret = ocfs2_read_xattr_bucket(inode, t_blkno, t_bhs, t_is_new); + if (ret) + goto out; + + for (i = 0; i < blk_per_bucket; i++) { + ret = ocfs2_journal_access(handle, inode, t_bhs[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) { + memcpy(t_bhs[i]->b_data, s_bhs[i]->b_data, blocksize); + ocfs2_journal_dirty(handle, t_bhs[i]); + } + +out: + if (s_bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(s_bhs[i]); + } + kfree(s_bhs); + + if (t_bhs) { + for (i = 0; i < blk_per_bucket; i++) + brelse(t_bhs[i]); + } + kfree(t_bhs); + + return ret; +} + +/* + * Copy one xattr cluster from src_blk to to_blk. + * The to_blk will become the first bucket header of the cluster, so its + * xh_num_buckets will be initialized as the bucket num in the cluster. + */ +static int ocfs2_cp_xattr_cluster(struct inode *inode, + handle_t *handle, + struct buffer_head *first_bh, + u64 src_blk, + u64 to_blk, + u32 *first_hash) +{ + int i, ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + int num_buckets = ocfs2_xattr_buckets_per_cluster(osb); + struct buffer_head *bh = NULL; + struct ocfs2_xattr_header *xh; + u64 to_blk_start = to_blk; + + mlog(0, "cp xattrs from cluster %llu to %llu\n", src_blk, to_blk); + + /* + * We need to update the new cluster and 1 more for the update of + * the 1st bucket of the previous extent rec. + */ + credits = bpc + 1; + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, first_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < num_buckets; i++) { + ret = ocfs2_cp_xattr_bucket(inode, handle, + src_blk, to_blk, 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + src_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + to_blk += ocfs2_blocks_per_xattr_bucket(inode->i_sb); + } + + /* update the old bucket header. */ + xh = (struct ocfs2_xattr_header *)first_bh->b_data; + le16_add_cpu(&xh->xh_num_buckets, -num_buckets); + + ocfs2_journal_dirty(handle, first_bh); + + /* update the new bucket header. */ + ret = ocfs2_read_block(inode, to_blk_start, &bh); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh = (struct ocfs2_xattr_header *)bh->b_data; + xh->xh_num_buckets = cpu_to_le16(num_buckets); + + ocfs2_journal_dirty(handle, bh); + + if (first_hash) + *first_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); +out: + brelse(bh); + return ret; +} + +/* + * Move half of the xattrs in this cluster to the new cluster. + * This function should only be called when bucket size == cluster size. + * Otherwise ocfs2_mv_xattr_bucket_cross_cluster should be used instead. + */ +static int ocfs2_half_xattr_cluster(struct inode *inode, + handle_t *handle, + u64 prev_blk, + u64 new_blk, + u32 *first_hash) +{ + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + int ret, credits = 2 * blk_per_bucket; + + BUG_ON(OCFS2_XATTR_BUCKET_SIZE < OCFS2_SB(inode->i_sb)->s_clustersize); + + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + return ret; + } + + /* Move half of the xattr in start_blk to the next bucket. */ + return ocfs2_half_xattr_bucket(inode, handle, prev_blk, + new_blk, first_hash, 1); +} + +/* + * Move some xattrs from the old cluster to the new one since they are not + * contiguous in ocfs2 xattr tree. + * + * new_blk starts a new separate cluster, and we will move some xattrs from + * prev_blk to it. v_start will be set as the first name hash value in this + * new cluster so that it can be used as e_cpos during tree insertion and + * don't collide with our original b-tree operations. first_bh and header_bh + * will also be updated since they will be used in ocfs2_extend_xattr_bucket + * to extend the insert bucket. + * + * The problem is how much xattr should we move to the new one and when should + * we update first_bh and header_bh? + * 1. If cluster size > bucket size, that means the previous cluster has more + * than 1 bucket, so just move half nums of bucket into the new cluster and + * update the first_bh and header_bh if the insert bucket has been moved + * to the new cluster. + * 2. If cluster_size == bucket_size: + * a) If the previous extent rec has more than one cluster and the insert + * place isn't in the last cluster, copy the entire last cluster to the + * new one. This time, we don't need to upate the first_bh and header_bh + * since they will not be moved into the new cluster. + * b) Otherwise, move the bottom half of the xattrs in the last cluster into + * the new one. And we set the extend flag to zero if the insert place is + * moved into the new allocated cluster since no extend is needed. + */ +static int ocfs2_adjust_xattr_cross_cluster(struct inode *inode, + handle_t *handle, + struct buffer_head **first_bh, + struct buffer_head **header_bh, + u64 new_blk, + u64 prev_blk, + u32 prev_clusters, + u32 *v_start, + int *extend) +{ + int ret = 0; + int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + + mlog(0, "adjust xattrs from cluster %llu len %u to %llu\n", + prev_blk, prev_clusters, new_blk); + + if (ocfs2_xattr_buckets_per_cluster(OCFS2_SB(inode->i_sb)) > 1) + ret = ocfs2_mv_xattr_bucket_cross_cluster(inode, + handle, + first_bh, + header_bh, + new_blk, + prev_blk, + prev_clusters, + v_start); + else { + u64 last_blk = prev_blk + bpc * (prev_clusters - 1); + + if (prev_clusters > 1 && (*header_bh)->b_blocknr != last_blk) + ret = ocfs2_cp_xattr_cluster(inode, handle, *first_bh, + last_blk, new_blk, + v_start); + else { + ret = ocfs2_half_xattr_cluster(inode, handle, + last_blk, new_blk, + v_start); + + if ((*header_bh)->b_blocknr == last_blk && extend) + *extend = 0; + } + } + + return ret; +} + +/* + * Add a new cluster for xattr storage. + * + * If the new cluster is contiguous with the previous one, it will be + * appended to the same extent record, and num_clusters will be updated. + * If not, we will insert a new extent for it and move some xattrs in + * the last cluster into the new allocated one. + * We also need to limit the maximum size of a btree leaf, otherwise we'll + * lose the benefits of hashing because we'll have to search large leaves. + * So now the maximum size is OCFS2_MAX_XATTR_TREE_LEAF_SIZE(or clustersize, + * if it's bigger). + * + * first_bh is the first block of the previous extent rec and header_bh + * indicates the bucket we will insert the new xattrs. They will be updated + * when the header_bh is moved into the new cluster. + */ +static int ocfs2_add_new_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + struct buffer_head **first_bh, + struct buffer_head **header_bh, + u32 *num_clusters, + u32 prev_cpos, + u64 prev_blkno, + int *extend) +{ + int ret, credits; + u16 bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1); + u32 prev_clusters = *num_clusters; + u32 clusters_to_add = 1, bit_off, num_bits, v_start = 0; + u64 block; + handle_t *handle = NULL; + struct ocfs2_alloc_context *data_ac = NULL; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_tree et; + + mlog(0, "Add new xattr cluster for %llu, previous xattr hash = %u, " + "previous xattr blkno = %llu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + prev_cpos, prev_blkno); + + ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); + + ret = ocfs2_lock_allocators(inode, &et, clusters_to_add, 0, + &data_ac, &meta_ac); + if (ret) { + mlog_errno(ret); + goto leave; + } + + credits = ocfs2_calc_extend_credits(osb->sb, et.et_root_el, + clusters_to_add); + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto leave; + } + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + ret = __ocfs2_claim_clusters(osb, handle, data_ac, 1, + clusters_to_add, &bit_off, &num_bits); + if (ret < 0) { + if (ret != -ENOSPC) + mlog_errno(ret); + goto leave; + } + + BUG_ON(num_bits > clusters_to_add); + + block = ocfs2_clusters_to_blocks(osb->sb, bit_off); + mlog(0, "Allocating %u clusters at block %u for xattr in inode %llu\n", + num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); + + if (prev_blkno + prev_clusters * bpc == block && + (prev_clusters + num_bits) << osb->s_clustersize_bits <= + OCFS2_MAX_XATTR_TREE_LEAF_SIZE) { + /* + * If this cluster is contiguous with the old one and + * adding this new cluster, we don't surpass the limit of + * OCFS2_MAX_XATTR_TREE_LEAF_SIZE, cool. We will let it be + * initialized and used like other buckets in the previous + * cluster. + * So add it as a contiguous one. The caller will handle + * its init process. + */ + v_start = prev_cpos + prev_clusters; + *num_clusters = prev_clusters + num_bits; + mlog(0, "Add contiguous %u clusters to previous extent rec.\n", + num_bits); + } else { + ret = ocfs2_adjust_xattr_cross_cluster(inode, + handle, + first_bh, + header_bh, + block, + prev_blkno, + prev_clusters, + &v_start, + extend); + if (ret) { + mlog_errno(ret); + goto leave; + } + } + + if (handle->h_buffer_credits < credits) { + /* + * The journal has been restarted before, and don't + * have enough space for the insertion, so extend it + * here. + */ + ret = ocfs2_extend_trans(handle, credits); + if (ret) { + mlog_errno(ret); + goto leave; + } + } + mlog(0, "Insert %u clusters at block %llu for xattr at %u\n", + num_bits, block, v_start); + ret = ocfs2_insert_extent(osb, handle, inode, &et, v_start, block, + num_bits, 0, meta_ac); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + + ret = ocfs2_journal_dirty(handle, root_bh); + if (ret < 0) { + mlog_errno(ret); + goto leave; + } + +leave: + if (handle) + ocfs2_commit_trans(osb, handle); + if (data_ac) + ocfs2_free_alloc_context(data_ac); + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + return ret; +} + +/* + * Extend a new xattr bucket and move xattrs to the end one by one until + * We meet with start_bh. Only move half of the xattrs to the bucket after it. + */ +static int ocfs2_extend_xattr_bucket(struct inode *inode, + struct buffer_head *first_bh, + struct buffer_head *start_bh, + u32 num_clusters) +{ + int ret, credits; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + u64 start_blk = start_bh->b_blocknr, end_blk; + u32 num_buckets = num_clusters * ocfs2_xattr_buckets_per_cluster(osb); + handle_t *handle; + struct ocfs2_xattr_header *first_xh = + (struct ocfs2_xattr_header *)first_bh->b_data; + u16 bucket = le16_to_cpu(first_xh->xh_num_buckets); + + mlog(0, "extend xattr bucket in %llu, xattr extend rec starting " + "from %llu, len = %u\n", start_blk, + (unsigned long long)first_bh->b_blocknr, num_clusters); + + BUG_ON(bucket >= num_buckets); + + end_blk = first_bh->b_blocknr + (bucket - 1) * blk_per_bucket; + + /* + * We will touch all the buckets after the start_bh(include it). + * Add one more bucket and modify the first_bh. + */ + credits = end_blk - start_blk + 2 * blk_per_bucket + 1; + handle = ocfs2_start_trans(osb, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, first_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto commit; + } + + while (end_blk != start_blk) { + ret = ocfs2_cp_xattr_bucket(inode, handle, end_blk, + end_blk + blk_per_bucket, 0); + if (ret) + goto commit; + end_blk -= blk_per_bucket; + } + + /* Move half of the xattr in start_blk to the next bucket. */ + ret = ocfs2_half_xattr_bucket(inode, handle, start_blk, + start_blk + blk_per_bucket, NULL, 0); + + le16_add_cpu(&first_xh->xh_num_buckets, 1); + ocfs2_journal_dirty(handle, first_bh); + +commit: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +/* + * Add new xattr bucket in an extent record and adjust the buckets accordingly. + * xb_bh is the ocfs2_xattr_block. + * We will move all the buckets starting from header_bh to the next place. As + * for this one, half num of its xattrs will be moved to the next one. + * + * We will allocate a new cluster if current cluster is full and adjust + * header_bh and first_bh if the insert place is moved to the new cluster. + */ +static int ocfs2_add_new_xattr_bucket(struct inode *inode, + struct buffer_head *xb_bh, + struct buffer_head *header_bh) +{ + struct ocfs2_xattr_header *first_xh = NULL; + struct buffer_head *first_bh = NULL; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_xattr_tree_root *xb_root = &xb->xb_attrs.xb_root; + struct ocfs2_extent_list *el = &xb_root->xt_list; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)header_bh->b_data; + u32 name_hash = le32_to_cpu(xh->xh_entries[0].xe_name_hash); + struct super_block *sb = inode->i_sb; + struct ocfs2_super *osb = OCFS2_SB(sb); + int ret, num_buckets, extend = 1; + u64 p_blkno; + u32 e_cpos, num_clusters; + + mlog(0, "Add new xattr bucket starting form %llu\n", + (unsigned long long)header_bh->b_blocknr); + + /* + * Add refrence for header_bh here because it may be + * changed in ocfs2_add_new_xattr_cluster and we need + * to free it in the end. + */ + get_bh(header_bh); + + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, &e_cpos, + &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_read_block(inode, p_blkno, &first_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + num_buckets = ocfs2_xattr_buckets_per_cluster(osb) * num_clusters; + first_xh = (struct ocfs2_xattr_header *)first_bh->b_data; + + if (num_buckets == le16_to_cpu(first_xh->xh_num_buckets)) { + ret = ocfs2_add_new_xattr_cluster(inode, + xb_bh, + &first_bh, + &header_bh, + &num_clusters, + e_cpos, + p_blkno, + &extend); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + if (extend) + ret = ocfs2_extend_xattr_bucket(inode, + first_bh, + header_bh, + num_clusters); + if (ret) + mlog_errno(ret); +out: + brelse(first_bh); + brelse(header_bh); + return ret; +} + +static inline char *ocfs2_xattr_bucket_get_val(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + int offs) +{ + int block_off = offs >> inode->i_sb->s_blocksize_bits; + + offs = offs % inode->i_sb->s_blocksize; + return bucket->bhs[block_off]->b_data + offs; +} + +/* + * Handle the normal xattr set, including replace, delete and new. + * + * Note: "local" indicates the real data's locality. So we can't + * just its bucket locality by its length. + */ +static void ocfs2_xattr_set_entry_normal(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + u32 name_hash, + int local) +{ + struct ocfs2_xattr_entry *last, *xe; + int name_len = strlen(xi->name); + struct ocfs2_xattr_header *xh = xs->header; + u16 count = le16_to_cpu(xh->xh_count), start; + size_t blocksize = inode->i_sb->s_blocksize; + char *val; + size_t offs, size, new_size; + + last = &xh->xh_entries[count]; + if (!xs->not_found) { + xe = xs->here; + offs = le16_to_cpu(xe->xe_name_offset); + if (ocfs2_xattr_is_local(xe)) + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); + else + size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE); + + /* + * If the new value will be stored outside, xi->value has been + * initalized as an empty ocfs2_xattr_value_root, and the same + * goes with xi->value_len, so we can set new_size safely here. + * See ocfs2_xattr_set_in_bucket. + */ + new_size = OCFS2_XATTR_SIZE(name_len) + + OCFS2_XATTR_SIZE(xi->value_len); + + le16_add_cpu(&xh->xh_name_value_len, -size); + if (xi->value) { + if (new_size > size) + goto set_new_name_value; + + /* Now replace the old value with new one. */ + if (local) + xe->xe_value_size = cpu_to_le64(xi->value_len); + else + xe->xe_value_size = 0; + + val = ocfs2_xattr_bucket_get_val(inode, + &xs->bucket, offs); + memset(val + OCFS2_XATTR_SIZE(name_len), 0, + size - OCFS2_XATTR_SIZE(name_len)); + if (OCFS2_XATTR_SIZE(xi->value_len) > 0) + memcpy(val + OCFS2_XATTR_SIZE(name_len), + xi->value, xi->value_len); + + le16_add_cpu(&xh->xh_name_value_len, new_size); + ocfs2_xattr_set_local(xe, local); + return; + } else { + /* + * Remove the old entry if there is more than one. + * We don't remove the last entry so that we can + * use it to indicate the hash value of the empty + * bucket. + */ + last -= 1; + le16_add_cpu(&xh->xh_count, -1); + if (xh->xh_count) { + memmove(xe, xe + 1, + (void *)last - (void *)xe); + memset(last, 0, + sizeof(struct ocfs2_xattr_entry)); + } else + xh->xh_free_start = + cpu_to_le16(OCFS2_XATTR_BUCKET_SIZE); + + return; + } + } else { + /* find a new entry for insert. */ + int low = 0, high = count - 1, tmp; + struct ocfs2_xattr_entry *tmp_xe; + + while (low <= high && count) { + tmp = (low + high) / 2; + tmp_xe = &xh->xh_entries[tmp]; + + if (name_hash > le32_to_cpu(tmp_xe->xe_name_hash)) + low = tmp + 1; + else if (name_hash < + le32_to_cpu(tmp_xe->xe_name_hash)) + high = tmp - 1; + else { + low = tmp; + break; + } + } + + xe = &xh->xh_entries[low]; + if (low != count) + memmove(xe + 1, xe, (void *)last - (void *)xe); + + le16_add_cpu(&xh->xh_count, 1); + memset(xe, 0, sizeof(struct ocfs2_xattr_entry)); + xe->xe_name_hash = cpu_to_le32(name_hash); + xe->xe_name_len = name_len; + ocfs2_xattr_set_type(xe, xi->name_index); + } + +set_new_name_value: + /* Insert the new name+value. */ + size = OCFS2_XATTR_SIZE(name_len) + OCFS2_XATTR_SIZE(xi->value_len); + + /* + * We must make sure that the name/value pair + * exists in the same block. + */ + offs = le16_to_cpu(xh->xh_free_start); + start = offs - size; + + if (start >> inode->i_sb->s_blocksize_bits != + (offs - 1) >> inode->i_sb->s_blocksize_bits) { + offs = offs - offs % blocksize; + xh->xh_free_start = cpu_to_le16(offs); + } + + val = ocfs2_xattr_bucket_get_val(inode, + &xs->bucket, offs - size); + xe->xe_name_offset = cpu_to_le16(offs - size); + + memset(val, 0, size); + memcpy(val, xi->name, name_len); + memcpy(val + OCFS2_XATTR_SIZE(name_len), xi->value, xi->value_len); + + xe->xe_value_size = cpu_to_le64(xi->value_len); + ocfs2_xattr_set_local(xe, local); + xs->here = xe; + le16_add_cpu(&xh->xh_free_start, -size); + le16_add_cpu(&xh->xh_name_value_len, size); + + return; +} + +static int ocfs2_xattr_bucket_handle_journal(struct inode *inode, + handle_t *handle, + struct ocfs2_xattr_search *xs, + struct buffer_head **bhs, + u16 bh_num) +{ + int ret = 0, off, block_off; + struct ocfs2_xattr_entry *xe = xs->here; + + /* + * First calculate all the blocks we should journal_access + * and journal_dirty. The first block should always be touched. + */ + ret = ocfs2_journal_dirty(handle, bhs[0]); + if (ret) + mlog_errno(ret); + + /* calc the data. */ + off = le16_to_cpu(xe->xe_name_offset); + block_off = off >> inode->i_sb->s_blocksize_bits; + ret = ocfs2_journal_dirty(handle, bhs[block_off]); + if (ret) + mlog_errno(ret); + + return ret; +} + +/* + * Set the xattr entry in the specified bucket. + * The bucket is indicated by xs->bucket and it should have the enough + * space for the xattr insertion. + */ +static int ocfs2_xattr_set_entry_in_bucket(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs, + u32 name_hash, + int local) +{ + int i, ret; + handle_t *handle = NULL; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog(0, "Set xattr entry len = %lu index = %d in bucket %llu\n", + (unsigned long)xi->value_len, xi->name_index, + (unsigned long long)xs->bucket.bhs[0]->b_blocknr); + + if (!xs->bucket.bhs[1]) { + ret = ocfs2_read_blocks(inode, + xs->bucket.bhs[0]->b_blocknr + 1, + blk_per_bucket - 1, &xs->bucket.bhs[1], + OCFS2_BH_CACHED); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, blk_per_bucket); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + handle = NULL; + mlog_errno(ret); + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) { + ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[i], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + ocfs2_xattr_set_entry_normal(inode, xi, xs, name_hash, local); + + /*Only dirty the blocks we have touched in set xattr. */ + ret = ocfs2_xattr_bucket_handle_journal(inode, handle, xs, + xs->bucket.bhs, blk_per_bucket); + if (ret) + mlog_errno(ret); +out: + ocfs2_commit_trans(osb, handle); + + return ret; +} + +static int ocfs2_xattr_value_update_size(struct inode *inode, + struct buffer_head *xe_bh, + struct ocfs2_xattr_entry *xe, + u64 new_size) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + handle_t *handle = NULL; + + handle = ocfs2_start_trans(osb, 1); + if (handle == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, xe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret < 0) { + mlog_errno(ret); + goto out_commit; + } + + xe->xe_value_size = cpu_to_le64(new_size); + + ret = ocfs2_journal_dirty(handle, xe_bh); + if (ret < 0) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + return ret; +} + +/* + * Truncate the specified xe_off entry in xattr bucket. + * bucket is indicated by header_bh and len is the new length. + * Both the ocfs2_xattr_value_root and the entry will be updated here. + * + * Copy the new updated xe and xe_value_root to new_xe and new_xv if needed. + */ +static int ocfs2_xattr_bucket_value_truncate(struct inode *inode, + struct buffer_head *header_bh, + int xe_off, + int len) +{ + int ret, offset; + u64 value_blk; + struct buffer_head *value_bh = NULL; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_xattr_entry *xe; + struct ocfs2_xattr_header *xh = + (struct ocfs2_xattr_header *)header_bh->b_data; + size_t blocksize = inode->i_sb->s_blocksize; + + xe = &xh->xh_entries[xe_off]; + + BUG_ON(!xe || ocfs2_xattr_is_local(xe)); + + offset = le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len); + + value_blk = offset / blocksize; + + /* We don't allow ocfs2_xattr_value to be stored in different block. */ + BUG_ON(value_blk != (offset + OCFS2_XATTR_ROOT_SIZE - 1) / blocksize); + value_blk += header_bh->b_blocknr; + + ret = ocfs2_read_block(inode, value_blk, &value_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + xv = (struct ocfs2_xattr_value_root *) + (value_bh->b_data + offset % blocksize); + + mlog(0, "truncate %u in xattr bucket %llu to %d bytes.\n", + xe_off, (unsigned long long)header_bh->b_blocknr, len); + ret = ocfs2_xattr_value_truncate(inode, value_bh, xv, len); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_xattr_value_update_size(inode, header_bh, xe, len); + if (ret) { + mlog_errno(ret); + goto out; + } + +out: + brelse(value_bh); + return ret; +} + +static int ocfs2_xattr_bucket_value_truncate_xs(struct inode *inode, + struct ocfs2_xattr_search *xs, + int len) +{ + int ret, offset; + struct ocfs2_xattr_entry *xe = xs->here; + struct ocfs2_xattr_header *xh = (struct ocfs2_xattr_header *)xs->base; + + BUG_ON(!xs->bucket.bhs[0] || !xe || ocfs2_xattr_is_local(xe)); + + offset = xe - xh->xh_entries; + ret = ocfs2_xattr_bucket_value_truncate(inode, xs->bucket.bhs[0], + offset, len); + if (ret) + mlog_errno(ret); + + return ret; +} + +static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, + struct ocfs2_xattr_search *xs, + char *val, + int value_len) +{ + int offset; + struct ocfs2_xattr_value_root *xv; + struct ocfs2_xattr_entry *xe = xs->here; + + BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); + + offset = le16_to_cpu(xe->xe_name_offset) + + OCFS2_XATTR_SIZE(xe->xe_name_len); + + xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); + + return __ocfs2_xattr_set_value_outside(inode, xv, val, value_len); +} + +static int ocfs2_rm_xattr_cluster(struct inode *inode, + struct buffer_head *root_bh, + u64 blkno, + u32 cpos, + u32 len) +{ + int ret; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct inode *tl_inode = osb->osb_tl_inode; + handle_t *handle; + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)root_bh->b_data; + struct ocfs2_alloc_context *meta_ac = NULL; + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_extent_tree et; + + ocfs2_init_xattr_tree_extent_tree(&et, inode, root_bh); + + ocfs2_init_dealloc_ctxt(&dealloc); + + mlog(0, "rm xattr extent rec at %u len = %u, start from %llu\n", + cpos, len, (unsigned long long)blkno); + + ocfs2_remove_xattr_clusters_from_cache(inode, blkno, len); + + ret = ocfs2_lock_allocators(inode, &et, 0, 1, NULL, &meta_ac); + if (ret) { + mlog_errno(ret); + return ret; + } + + mutex_lock(&tl_inode->i_mutex); + + if (ocfs2_truncate_log_needs_flush(osb)) { + ret = __ocfs2_flush_truncate_log(osb); + if (ret < 0) { + mlog_errno(ret); + goto out; + } + } + + handle = ocfs2_start_trans(osb, OCFS2_REMOVE_EXTENT_CREDITS); + if (handle == NULL) { + ret = -ENOMEM; + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access(handle, inode, root_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_remove_extent(inode, &et, cpos, len, handle, meta_ac, + &dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, -len); + + ret = ocfs2_journal_dirty(handle, root_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = ocfs2_truncate_log_append(osb, handle, blkno, len); + if (ret) + mlog_errno(ret); + +out_commit: + ocfs2_commit_trans(osb, handle); +out: + ocfs2_schedule_truncate_log_flush(osb, 1); + + mutex_unlock(&tl_inode->i_mutex); + + if (meta_ac) + ocfs2_free_alloc_context(meta_ac); + + ocfs2_run_deallocs(osb, &dealloc); + + return ret; +} + +static void ocfs2_xattr_bucket_remove_xs(struct inode *inode, + struct ocfs2_xattr_search *xs) +{ + handle_t *handle = NULL; + struct ocfs2_xattr_header *xh = xs->bucket.xh; + struct ocfs2_xattr_entry *last = &xh->xh_entries[ + le16_to_cpu(xh->xh_count) - 1]; + int ret = 0; + + handle = ocfs2_start_trans((OCFS2_SB(inode->i_sb)), 1); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return; + } + + ret = ocfs2_journal_access(handle, inode, xs->bucket.bhs[0], + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + /* Remove the old entry. */ + memmove(xs->here, xs->here + 1, + (void *)last - (void *)xs->here); + memset(last, 0, sizeof(struct ocfs2_xattr_entry)); + le16_add_cpu(&xh->xh_count, -1); + + ret = ocfs2_journal_dirty(handle, xs->bucket.bhs[0]); + if (ret < 0) + mlog_errno(ret); +out_commit: + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +} + +/* + * Set the xattr name/value in the bucket specified in xs. + * + * As the new value in xi may be stored in the bucket or in an outside cluster, + * we divide the whole process into 3 steps: + * 1. insert name/value in the bucket(ocfs2_xattr_set_entry_in_bucket) + * 2. truncate of the outside cluster(ocfs2_xattr_bucket_value_truncate_xs) + * 3. Set the value to the outside cluster(ocfs2_xattr_bucket_set_value_outside) + * 4. If the clusters for the new outside value can't be allocated, we need + * to free the xattr we allocated in set. + */ +static int ocfs2_xattr_set_in_bucket(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + int ret, local = 1; + size_t value_len; + char *val = (char *)xi->value; + struct ocfs2_xattr_entry *xe = xs->here; + u32 name_hash = ocfs2_xattr_name_hash(inode, xi->name, + strlen(xi->name)); + + if (!xs->not_found && !ocfs2_xattr_is_local(xe)) { + /* + * We need to truncate the xattr storage first. + * + * If both the old and new value are stored to + * outside block, we only need to truncate + * the storage and then set the value outside. + * + * If the new value should be stored within block, + * we should free all the outside block first and + * the modification to the xattr block will be done + * by following steps. + */ + if (xi->value_len > OCFS2_XATTR_INLINE_SIZE) + value_len = xi->value_len; + else + value_len = 0; + + ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, + value_len); + if (ret) + goto out; + + if (value_len) + goto set_value_outside; + } + + value_len = xi->value_len; + /* So we have to handle the inside block change now. */ + if (value_len > OCFS2_XATTR_INLINE_SIZE) { + /* + * If the new value will be stored outside of block, + * initalize a new empty value root and insert it first. + */ + local = 0; + xi->value = &def_xv; + xi->value_len = OCFS2_XATTR_ROOT_SIZE; + } + + ret = ocfs2_xattr_set_entry_in_bucket(inode, xi, xs, name_hash, local); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (value_len <= OCFS2_XATTR_INLINE_SIZE) + goto out; + + /* allocate the space now for the outside block storage. */ + ret = ocfs2_xattr_bucket_value_truncate_xs(inode, xs, + value_len); + if (ret) { + mlog_errno(ret); + + if (xs->not_found) { + /* + * We can't allocate enough clusters for outside + * storage and we have allocated xattr already, + * so need to remove it. + */ + ocfs2_xattr_bucket_remove_xs(inode, xs); + } + goto out; + } + +set_value_outside: + ret = ocfs2_xattr_bucket_set_value_outside(inode, xs, val, value_len); +out: + return ret; +} + +/* check whether the xattr bucket is filled up with the same hash value. */ +static int ocfs2_check_xattr_bucket_collision(struct inode *inode, + struct ocfs2_xattr_bucket *bucket) +{ + struct ocfs2_xattr_header *xh = bucket->xh; + + if (xh->xh_entries[le16_to_cpu(xh->xh_count) - 1].xe_name_hash == + xh->xh_entries[0].xe_name_hash) { + mlog(ML_ERROR, "Too much hash collision in xattr bucket %llu, " + "hash = %u\n", + (unsigned long long)bucket->bhs[0]->b_blocknr, + le32_to_cpu(xh->xh_entries[0].xe_name_hash)); + return -ENOSPC; + } + + return 0; +} + +static int ocfs2_xattr_set_entry_index_block(struct inode *inode, + struct ocfs2_xattr_info *xi, + struct ocfs2_xattr_search *xs) +{ + struct ocfs2_xattr_header *xh; + struct ocfs2_xattr_entry *xe; + u16 count, header_size, xh_free_start; + int i, free, max_free, need, old; + size_t value_size = 0, name_len = strlen(xi->name); + size_t blocksize = inode->i_sb->s_blocksize; + int ret, allocation = 0; + u16 blk_per_bucket = ocfs2_blocks_per_xattr_bucket(inode->i_sb); + + mlog_entry("Set xattr %s in xattr index block\n", xi->name); + +try_again: + xh = xs->header; + count = le16_to_cpu(xh->xh_count); + xh_free_start = le16_to_cpu(xh->xh_free_start); + header_size = sizeof(struct ocfs2_xattr_header) + + count * sizeof(struct ocfs2_xattr_entry); + max_free = OCFS2_XATTR_BUCKET_SIZE - + le16_to_cpu(xh->xh_name_value_len) - header_size; + + mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size " + "of %u which exceed block size\n", + (unsigned long long)xs->bucket.bhs[0]->b_blocknr, + header_size); + + if (xi->value && xi->value_len > OCFS2_XATTR_INLINE_SIZE) + value_size = OCFS2_XATTR_ROOT_SIZE; + else if (xi->value) + value_size = OCFS2_XATTR_SIZE(xi->value_len); + + if (xs->not_found) + need = sizeof(struct ocfs2_xattr_entry) + + OCFS2_XATTR_SIZE(name_len) + value_size; + else { + need = value_size + OCFS2_XATTR_SIZE(name_len); + + /* + * We only replace the old value if the new length is smaller + * than the old one. Otherwise we will allocate new space in the + * bucket to store it. + */ + xe = xs->here; + if (ocfs2_xattr_is_local(xe)) + old = OCFS2_XATTR_SIZE(le64_to_cpu(xe->xe_value_size)); + else + old = OCFS2_XATTR_SIZE(OCFS2_XATTR_ROOT_SIZE); + + if (old >= value_size) + need = 0; + } + + free = xh_free_start - header_size; + /* + * We need to make sure the new name/value pair + * can exist in the same block. + */ + if (xh_free_start % blocksize < need) + free -= xh_free_start % blocksize; + + mlog(0, "xs->not_found = %d, in xattr bucket %llu: free = %d, " + "need = %d, max_free = %d, xh_free_start = %u, xh_name_value_len =" + " %u\n", xs->not_found, + (unsigned long long)xs->bucket.bhs[0]->b_blocknr, + free, need, max_free, le16_to_cpu(xh->xh_free_start), + le16_to_cpu(xh->xh_name_value_len)); + + if (free < need || count == ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { + if (need <= max_free && + count < ocfs2_xattr_max_xe_in_bucket(inode->i_sb)) { + /* + * We can create the space by defragment. Since only the + * name/value will be moved, the xe shouldn't be changed + * in xs. + */ + ret = ocfs2_defrag_xattr_bucket(inode, &xs->bucket); + if (ret) { + mlog_errno(ret); + goto out; + } + + xh_free_start = le16_to_cpu(xh->xh_free_start); + free = xh_free_start - header_size; + if (xh_free_start % blocksize < need) + free -= xh_free_start % blocksize; + + if (free >= need) + goto xattr_set; + + mlog(0, "Can't get enough space for xattr insert by " + "defragment. Need %u bytes, but we have %d, so " + "allocate new bucket for it.\n", need, free); + } + + /* + * We have to add new buckets or clusters and one + * allocation should leave us enough space for insert. + */ + BUG_ON(allocation); + + /* + * We do not allow for overlapping ranges between buckets. And + * the maximum number of collisions we will allow for then is + * one bucket's worth, so check it here whether we need to + * add a new bucket for the insert. + */ + ret = ocfs2_check_xattr_bucket_collision(inode, &xs->bucket); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_add_new_xattr_bucket(inode, + xs->xattr_bh, + xs->bucket.bhs[0]); + if (ret) { + mlog_errno(ret); + goto out; + } + + for (i = 0; i < blk_per_bucket; i++) + brelse(xs->bucket.bhs[i]); + + memset(&xs->bucket, 0, sizeof(xs->bucket)); + + ret = ocfs2_xattr_index_block_find(inode, xs->xattr_bh, + xi->name_index, + xi->name, xs); + if (ret && ret != -ENODATA) + goto out; + xs->not_found = ret; + allocation = 1; + goto try_again; + } + +xattr_set: + ret = ocfs2_xattr_set_in_bucket(inode, xi, xs); +out: + mlog_exit(ret); + return ret; +} + +static int ocfs2_delete_xattr_in_bucket(struct inode *inode, + struct ocfs2_xattr_bucket *bucket, + void *para) +{ + int ret = 0; + struct ocfs2_xattr_header *xh = bucket->xh; + u16 i; + struct ocfs2_xattr_entry *xe; + + for (i = 0; i < le16_to_cpu(xh->xh_count); i++) { + xe = &xh->xh_entries[i]; + if (ocfs2_xattr_is_local(xe)) + continue; + + ret = ocfs2_xattr_bucket_value_truncate(inode, + bucket->bhs[0], + i, 0); + if (ret) { + mlog_errno(ret); + break; + } + } + + return ret; +} + +static int ocfs2_delete_xattr_index_block(struct inode *inode, + struct buffer_head *xb_bh) +{ + struct ocfs2_xattr_block *xb = + (struct ocfs2_xattr_block *)xb_bh->b_data; + struct ocfs2_extent_list *el = &xb->xb_attrs.xb_root.xt_list; + int ret = 0; + u32 name_hash = UINT_MAX, e_cpos, num_clusters; + u64 p_blkno; + + if (le16_to_cpu(el->l_next_free_rec) == 0) + return 0; + + while (name_hash > 0) { + ret = ocfs2_xattr_get_rec(inode, name_hash, &p_blkno, + &e_cpos, &num_clusters, el); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_iterate_xattr_buckets(inode, p_blkno, num_clusters, + ocfs2_delete_xattr_in_bucket, + NULL); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_rm_xattr_cluster(inode, xb_bh, + p_blkno, e_cpos, num_clusters); + if (ret) { + mlog_errno(ret); + break; + } + + if (e_cpos == 0) + break; + + name_hash = e_cpos - 1; + } + +out: + return ret; +} + +/* + * 'trusted' attributes support + */ + +#define XATTR_TRUSTED_PREFIX "trusted." + +static size_t ocfs2_xattr_trusted_list(struct inode *inode, char *list, + size_t list_size, const char *name, + size_t name_len) +{ + const size_t prefix_len = sizeof(XATTR_TRUSTED_PREFIX) - 1; + const size_t total_len = prefix_len + name_len + 1; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_TRUSTED_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_trusted_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_TRUSTED, name, + buffer, size); +} + +static int ocfs2_xattr_trusted_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + if (strcmp(name, "") == 0) + return -EINVAL; + + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_TRUSTED, name, value, + size, flags); +} + +struct xattr_handler ocfs2_xattr_trusted_handler = { + .prefix = XATTR_TRUSTED_PREFIX, + .list = ocfs2_xattr_trusted_list, + .get = ocfs2_xattr_trusted_get, + .set = ocfs2_xattr_trusted_set, +}; + + +/* + * 'user' attributes support + */ + +#define XATTR_USER_PREFIX "user." + +static size_t ocfs2_xattr_user_list(struct inode *inode, char *list, + size_t list_size, const char *name, + size_t name_len) +{ + const size_t prefix_len = sizeof(XATTR_USER_PREFIX) - 1; + const size_t total_len = prefix_len + name_len + 1; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return 0; + + if (list && total_len <= list_size) { + memcpy(list, XATTR_USER_PREFIX, prefix_len); + memcpy(list + prefix_len, name, name_len); + list[prefix_len + name_len] = '\0'; + } + return total_len; +} + +static int ocfs2_xattr_user_get(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (strcmp(name, "") == 0) + return -EINVAL; + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return -EOPNOTSUPP; + return ocfs2_xattr_get(inode, OCFS2_XATTR_INDEX_USER, name, + buffer, size); +} + +static int ocfs2_xattr_user_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + if (strcmp(name, "") == 0) + return -EINVAL; + if (osb->s_mount_opt & OCFS2_MOUNT_NOUSERXATTR) + return -EOPNOTSUPP; + + return ocfs2_xattr_set(inode, OCFS2_XATTR_INDEX_USER, name, value, + size, flags); +} + +struct xattr_handler ocfs2_xattr_user_handler = { + .prefix = XATTR_USER_PREFIX, + .list = ocfs2_xattr_user_list, + .get = ocfs2_xattr_user_get, + .set = ocfs2_xattr_user_set, +}; diff --git a/fs/ocfs2/xattr.h b/fs/ocfs2/xattr.h new file mode 100644 index 00000000000..c25c7c62a05 --- /dev/null +++ b/fs/ocfs2/xattr.h @@ -0,0 +1,68 @@ +/* -*- mode: c; c-basic-offset: 8; -*- + * vim: noexpandtab sw=8 ts=8 sts=0: + * + * xattr.h + * + * Function prototypes + * + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License as published by the Free Software Foundation; either + * version 2 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef OCFS2_XATTR_H +#define OCFS2_XATTR_H + +#include <linux/init.h> +#include <linux/xattr.h> + +enum ocfs2_xattr_type { + OCFS2_XATTR_INDEX_USER = 1, + OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS, + OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, + OCFS2_XATTR_INDEX_TRUSTED, + OCFS2_XATTR_INDEX_SECURITY, + OCFS2_XATTR_MAX +}; + +extern struct xattr_handler ocfs2_xattr_user_handler; +extern struct xattr_handler ocfs2_xattr_trusted_handler; + +extern ssize_t ocfs2_listxattr(struct dentry *, char *, size_t); +extern int ocfs2_xattr_get(struct inode *, int, const char *, void *, size_t); +extern int ocfs2_xattr_set(struct inode *, int, const char *, const void *, + size_t, int); +extern int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh); +extern struct xattr_handler *ocfs2_xattr_handlers[]; + +static inline u16 ocfs2_xattr_buckets_per_cluster(struct ocfs2_super *osb) +{ + return (1 << osb->s_clustersize_bits) / OCFS2_XATTR_BUCKET_SIZE; +} + +static inline u16 ocfs2_blocks_per_xattr_bucket(struct super_block *sb) +{ + return OCFS2_XATTR_BUCKET_SIZE / (1 << sb->s_blocksize_bits); +} + +static inline u16 ocfs2_xattr_max_xe_in_bucket(struct super_block *sb) +{ + u16 len = sb->s_blocksize - + offsetof(struct ocfs2_xattr_header, xh_entries); + + return len / sizeof(struct ocfs2_xattr_entry); +} +#endif /* OCFS2_XATTR_H */ diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 66c1ab87656..b675a49c182 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -683,6 +683,7 @@ static int cmdline_read_proc(char *page, char **start, off_t off, return proc_calc_metrics(page, start, off, count, eof, len); } +#ifdef CONFIG_FILE_LOCKING static int locks_open(struct inode *inode, struct file *filp) { return seq_open(filp, &locks_seq_operations); @@ -694,6 +695,7 @@ static const struct file_operations proc_locks_operations = { .llseek = seq_lseek, .release = seq_release, }; +#endif /* CONFIG_FILE_LOCKING */ static int execdomains_read_proc(char *page, char **start, off_t off, int count, int *eof, void *data) @@ -887,7 +889,9 @@ void __init proc_misc_init(void) #ifdef CONFIG_PRINTK proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations); #endif +#ifdef CONFIG_FILE_LOCKING proc_create("locks", 0, NULL, &proc_locks_operations); +#endif proc_create("devices", 0, NULL, &proc_devinfo_operations); proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations); #ifdef CONFIG_BLOCK |