diff options
Diffstat (limited to 'fs')
146 files changed, 4203 insertions, 2938 deletions
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index 5c4e61d3c77..8f975f25b48 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -2,6 +2,7 @@ config AFS_FS tristate "Andrew File System support (AFS) (EXPERIMENTAL)" depends on INET && EXPERIMENTAL select AF_RXRPC + select DNS_RESOLVER help If you say Y here, you will get an experimental Andrew File System driver. It currently only supports unsecured read-only AFS access. diff --git a/fs/afs/cell.c b/fs/afs/cell.c index e19c13f059e..ffea35c6387 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -13,6 +13,7 @@ #include <linux/slab.h> #include <linux/key.h> #include <linux/ctype.h> +#include <linux/dns_resolver.h> #include <linux/sched.h> #include <keys/rxrpc-type.h> #include "internal.h" @@ -36,6 +37,8 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist) struct key *key; size_t namelen; char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next; + char *dvllist = NULL, *_vllist = NULL; + char delimiter = ':'; int ret; _enter("%s,%s", name, vllist); @@ -43,8 +46,10 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist) BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */ namelen = strlen(name); - if (namelen > AFS_MAXCELLNAME) + if (namelen > AFS_MAXCELLNAME) { + _leave(" = -ENAMETOOLONG"); return ERR_PTR(-ENAMETOOLONG); + } /* allocate and initialise a cell record */ cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL); @@ -64,15 +69,31 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist) INIT_LIST_HEAD(&cell->vl_list); spin_lock_init(&cell->vl_lock); + /* if the ip address is invalid, try dns query */ + if (!vllist || strlen(vllist) < 7) { + ret = dns_query("afsdb", name, namelen, "ipv4", &dvllist, NULL); + if (ret < 0) { + _leave(" = %d", ret); + return ERR_PTR(ret); + } + _vllist = dvllist; + + /* change the delimiter for user-space reply */ + delimiter = ','; + + } else { + _vllist = vllist; + } + /* fill in the VL server list from 
the rest of the string */ do { unsigned a, b, c, d; - next = strchr(vllist, ':'); + next = strchr(_vllist, delimiter); if (next) *next++ = 0; - if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) + if (sscanf(_vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4) goto bad_address; if (a > 255 || b > 255 || c > 255 || d > 255) @@ -81,7 +102,7 @@ static struct afs_cell *afs_cell_alloc(const char *name, char *vllist) cell->vl_addrs[cell->vl_naddrs++].s_addr = htonl((a << 24) | (b << 16) | (c << 8) | d); - } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (vllist = next)); + } while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (_vllist = next)); /* create a key to represent an anonymous user */ memcpy(keyname, "afs@", 4); @@ -110,6 +131,7 @@ bad_address: ret = -EINVAL; error: key_put(cell->anonymous_key); + kfree(dvllist); kfree(cell); _leave(" = %d", ret); return ERR_PTR(ret); @@ -201,14 +223,12 @@ int afs_cell_init(char *rootcell) } cp = strchr(rootcell, ':'); - if (!cp) { - printk(KERN_ERR "kAFS: no VL server IP addresses specified\n"); - _leave(" = -EINVAL"); - return -EINVAL; - } + if (!cp) + _debug("kAFS: no VL server IP addresses specified"); + else + *cp++ = 0; /* allocate a cell record for the root cell */ - *cp++ = 0; new_root = afs_cell_create(rootcell, cp); if (IS_ERR(new_root)) { _leave(" = %ld", PTR_ERR(new_root)); diff --git a/fs/afs/main.c b/fs/afs/main.c index 66d54d348c5..cfd1cbe25b2 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -111,6 +111,8 @@ static int __init afs_init(void) /* initialise the callback update process */ ret = afs_callback_update_init(); + if (ret < 0) + goto error_callback_update_init; /* create the RxRPC transport */ ret = afs_open_socket(); @@ -127,15 +129,16 @@ static int __init afs_init(void) error_fs: afs_close_socket(); error_open_socket: + afs_callback_update_kill(); +error_callback_update_init: + afs_vlocation_purge(); error_vl_update_init: + afs_cell_purge(); error_cell_init: #ifdef CONFIG_AFS_FSCACHE 
fscache_unregister_netfs(&afs_cache_netfs); error_cache: #endif - afs_callback_update_kill(); - afs_vlocation_purge(); - afs_cell_purge(); afs_proc_cleanup(); rcu_barrier(); printk(KERN_ERR "kAFS: failed to register: %d\n", ret); @@ -1277,7 +1277,7 @@ out: /* sys_io_destroy: * Destroy the aio_context specified. May cancel any outstanding * AIOs and block on completion. Will fail with -ENOSYS if not - * implemented. May fail with -EFAULT if the context pointed to + * implemented. May fail with -EINVAL if the context pointed to * is invalid. */ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx) @@ -1795,15 +1795,16 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb, /* io_getevents: * Attempts to read at least min_nr events and up to nr events from - * the completion queue for the aio_context specified by ctx_id. May - * fail with -EINVAL if ctx_id is invalid, if min_nr is out of range, - * if nr is out of range, if when is out of range. May fail with - * -EFAULT if any of the memory specified to is invalid. May return - * 0 or < min_nr if no events are available and the timeout specified - * by when has elapsed, where when == NULL specifies an infinite - * timeout. Note that the timeout pointed to by when is relative and - * will be updated if not NULL and the operation blocks. Will fail - * with -ENOSYS if not implemented. + * the completion queue for the aio_context specified by ctx_id. If + * it succeeds, the number of read events is returned. May fail with + * -EINVAL if ctx_id is invalid, if min_nr is out of range, if nr is + * out of range, if timeout is out of range. May fail with -EFAULT + * if any of the memory specified is invalid. May return 0 or + * < min_nr if the timeout specified by timeout has elapsed + * before sufficient events are available, where timeout == NULL + * specifies an infinite timeout. Note that the timeout pointed to by + * timeout is relative and will be updated if not NULL and the + * operation blocks. 
Will fail with -ENOSYS if not implemented. */ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, long, min_nr, diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index f4a7840bf42..42c7fafc8bf 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -37,9 +37,9 @@ void __cachefiles_printk_object(struct cachefiles_object *object, printk(KERN_ERR "%sobject: OBJ%x\n", prefix, object->fscache.debug_id); - printk(KERN_ERR "%sobjstate=%s fl=%lx swfl=%lx ev=%lx[%lx]\n", + printk(KERN_ERR "%sobjstate=%s fl=%lx wbusy=%x ev=%lx[%lx]\n", prefix, fscache_object_states[object->fscache.state], - object->fscache.flags, object->fscache.work.flags, + object->fscache.flags, work_busy(&object->fscache.work), object->fscache.events, object->fscache.event_mask & FSCACHE_OBJECT_EVENTS_MASK); printk(KERN_ERR "%sops=%u inp=%u exc=%u\n", @@ -212,7 +212,7 @@ wait_for_old_object: /* if the object we're waiting for is queued for processing, * then just put ourselves on the queue behind it */ - if (slow_work_is_queued(&xobject->fscache.work)) { + if (work_pending(&xobject->fscache.work)) { _debug("queue OBJ%x behind OBJ%x immediately", object->fscache.debug_id, xobject->fscache.debug_id); @@ -220,8 +220,7 @@ wait_for_old_object: } /* otherwise we sleep until either the object we're waiting for - * is done, or the slow-work facility wants the thread back to - * do other work */ + * is done, or the fscache_object is congested */ wq = bit_waitqueue(&xobject->flags, CACHEFILES_OBJECT_ACTIVE); init_wait(&wait); requeue = false; @@ -229,8 +228,8 @@ wait_for_old_object: prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); if (!test_bit(CACHEFILES_OBJECT_ACTIVE, &xobject->flags)) break; - requeue = slow_work_sleep_till_thread_needed( - &object->fscache.work, &timeout); + + requeue = fscache_object_sleep_till_congested(&timeout); } while (timeout > 0 && !requeue); finish_wait(wq, &wait); diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 0f0d41fbb03..0e3c0924cc3 100644 --- 
a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -422,7 +422,7 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, shift = PAGE_SHIFT - inode->i_sb->s_blocksize_bits; op->op.flags &= FSCACHE_OP_KEEP_FLAGS; - op->op.flags |= FSCACHE_OP_FAST; + op->op.flags |= FSCACHE_OP_ASYNC; op->op.processor = cachefiles_read_copier; pagevec_init(&pagevec, 0); @@ -729,7 +729,7 @@ int cachefiles_read_or_alloc_pages(struct fscache_retrieval *op, pagevec_init(&pagevec, 0); op->op.flags &= FSCACHE_OP_KEEP_FLAGS; - op->op.flags |= FSCACHE_OP_FAST; + op->op.flags |= FSCACHE_OP_ASYNC; op->op.processor = cachefiles_read_copier; INIT_LIST_HEAD(&backpages); diff --git a/fs/char_dev.c b/fs/char_dev.c index d6db933df2b..f80a4f25123 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -20,6 +20,7 @@ #include <linux/cdev.h> #include <linux/mutex.h> #include <linux/backing-dev.h> +#include <linux/tty.h> #include "internal.h" diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 5739fd7f88b..917b7d449bb 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -2,7 +2,6 @@ config CIFS tristate "CIFS support (advanced network filesystem, SMBFS successor)" depends on INET select NLS - select SLOW_WORK help This is the client VFS module for the Common Internet File System (CIFS) protocol which is the successor to the Server Message Block @@ -71,14 +70,14 @@ config CIFS_WEAK_PW_HASH If unsure, say N. config CIFS_UPCALL - bool "Kerberos/SPNEGO advanced session setup" - depends on CIFS && KEYS - help - Enables an upcall mechanism for CIFS which accesses - userspace helper utilities to provide SPNEGO packaged (RFC 4178) - Kerberos tickets which are needed to mount to certain secure servers - (for which more secure Kerberos authentication is required). If - unsure, say N. 
+ bool "Kerberos/SPNEGO advanced session setup" + depends on CIFS && KEYS + select DNS_RESOLVER + help + Enables an upcall mechanism for CIFS which accesses userspace helper + utilities to provide SPNEGO packaged (RFC 4178) Kerberos tickets + which are needed to mount to certain secure servers (for which more + secure Kerberos authentication is required). If unsure, say N. config CIFS_XATTR bool "CIFS extended attributes" @@ -122,6 +121,7 @@ config CIFS_DEBUG2 config CIFS_DFS_UPCALL bool "DFS feature support" depends on CIFS && KEYS + select DNS_RESOLVER help Distributed File System (DFS) support is used to access shares transparently in an enterprise name space, even if the share diff --git a/fs/cifs/README b/fs/cifs/README index a727b7cb075..a7081eeeb85 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -568,8 +568,9 @@ module can be displayed via modinfo. Misc /proc/fs/cifs Flags and Debug Info ======================================= Informational pseudo-files: -DebugData Displays information about active CIFS sessions - and shares, as well as the cifs.ko version. +DebugData Displays information about active CIFS sessions and + shares, features enabled as well as the cifs.ko + version. Stats Lists summary resource usage information as well as per share statistics, if CONFIG_CIFS_STATS in enabled in the kernel configuration. 
diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 4fce6e61b34..eb1ba493489 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -119,6 +119,31 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) "Display Internal CIFS Data Structures for Debugging\n" "---------------------------------------------------\n"); seq_printf(m, "CIFS Version %s\n", CIFS_VERSION); + seq_printf(m, "Features: "); +#ifdef CONFIG_CIFS_DFS_UPCALL + seq_printf(m, "dfs"); + seq_putc(m, ' '); +#endif +#ifdef CONFIG_CIFS_FSCACHE + seq_printf(m, "fscache"); + seq_putc(m, ' '); +#endif +#ifdef CONFIG_CIFS_WEAK_PW_HASH + seq_printf(m, "lanman"); + seq_putc(m, ' '); +#endif +#ifdef CONFIG_CIFS_POSIX + seq_printf(m, "posix"); + seq_putc(m, ' '); +#endif +#ifdef CONFIG_CIFS_UPCALL + seq_printf(m, "spnego"); + seq_putc(m, ' '); +#endif +#ifdef CONFIG_CIFS_XATTR + seq_printf(m, "xattr"); +#endif + seq_putc(m, '\n'); seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); seq_printf(m, "Servers:"); diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index dc1ed50ea06..d6ced7aa23c 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -141,7 +141,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata, } rc = dns_resolve_server_name_to_ip(*devname, &srvIP); - if (rc != 0) { + if (rc < 0) { cERROR(1, "%s: Failed to resolve server part of %s to IP: %d", __func__, *devname, rc); goto compose_mount_options_err; @@ -150,8 +150,7 @@ char *cifs_compose_mount_options(const char *sb_mountdata, * assuming that we have 'unc=' and 'ip=' in * the original sb_mountdata */ - md_len = strlen(sb_mountdata) + strlen(srvIP) + - strlen(ref->node_name) + 12; + md_len = strlen(sb_mountdata) + rc + strlen(ref->node_name) + 12; mountdata = kzalloc(md_len+1, GFP_KERNEL); if (mountdata == NULL) { rc = -ENOMEM; diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index 6effccff85a..87044906cd1 100644 --- a/fs/cifs/cifs_spnego.c +++ 
b/fs/cifs/cifs_spnego.c @@ -84,6 +84,9 @@ struct key_type cifs_spnego_key_type = { /* strlen of ";uid=0x" */ #define UID_KEY_LEN 7 +/* strlen of ";creduid=0x" */ +#define CREDUID_KEY_LEN 11 + /* strlen of ";user=" */ #define USER_KEY_LEN 6 @@ -107,6 +110,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo) IP_KEY_LEN + INET6_ADDRSTRLEN + MAX_MECH_STR_LEN + UID_KEY_LEN + (sizeof(uid_t) * 2) + + CREDUID_KEY_LEN + (sizeof(uid_t) * 2) + USER_KEY_LEN + strlen(sesInfo->userName) + PID_KEY_LEN + (sizeof(pid_t) * 2) + 1; diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8a2cf129e53..a5ed10c9afe 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -45,7 +45,6 @@ #include "cifs_fs_sb.h" #include <linux/mm.h> #include <linux/key-type.h> -#include "dns_resolve.h" #include "cifs_spnego.h" #include "fscache.h" #define CIFS_MAGIC_NUMBER 0xFF534D42 /* the first four bytes of SMB PDUs */ @@ -934,27 +933,13 @@ init_cifs(void) if (rc) goto out_unregister_filesystem; #endif -#ifdef CONFIG_CIFS_DFS_UPCALL - rc = cifs_init_dns_resolver(); - if (rc) - goto out_unregister_key_type; -#endif - rc = slow_work_register_user(THIS_MODULE); - if (rc) - goto out_unregister_resolver_key; return 0; - out_unregister_resolver_key: -#ifdef CONFIG_CIFS_DFS_UPCALL - cifs_exit_dns_resolver(); - out_unregister_key_type: -#endif #ifdef CONFIG_CIFS_UPCALL - unregister_key_type(&cifs_spnego_key_type); out_unregister_filesystem: -#endif unregister_filesystem(&cifs_fs_type); +#endif out_destroy_request_bufs: cifs_destroy_request_bufs(); out_destroy_mids: @@ -976,7 +961,6 @@ exit_cifs(void) cifs_fscache_unregister(); #ifdef CONFIG_CIFS_DFS_UPCALL cifs_dfs_release_automount_timer(); - cifs_exit_dns_resolver(); #endif #ifdef CONFIG_CIFS_UPCALL unregister_key_type(&cifs_spnego_key_type); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 59906146ad3..0cdfb8c32ac 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -22,7 +22,7 @@ #include <linux/in.h> #include <linux/in6.h> #include 
<linux/slab.h> -#include <linux/slow-work.h> +#include <linux/workqueue.h> #include "cifs_fs_sb.h" #include "cifsacl.h" /* @@ -356,7 +356,7 @@ struct cifsFileInfo { atomic_t count; /* reference count */ struct mutex fh_mutex; /* prevents reopen race after dead ses*/ struct cifs_search_info srch_inf; - struct slow_work oplock_break; /* slow_work job for oplock breaks */ + struct work_struct oplock_break; /* work for oplock breaks */ }; /* Take a reference on the file private data */ @@ -728,6 +728,10 @@ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ GLOBAL_EXTERN unsigned int cifs_min_small; /* min size of small buf pool */ GLOBAL_EXTERN unsigned int cifs_max_pending; /* MAX requests at once to server*/ +void cifs_oplock_break(struct work_struct *work); +void cifs_oplock_break_get(struct cifsFileInfo *cfile); +void cifs_oplock_break_put(struct cifsFileInfo *cfile); + extern const struct slow_work_ops cifs_oplock_break_ops; #endif /* _CIFS_GLOB_H */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 2eaebbd3113..1f545081408 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -86,8 +86,8 @@ extern unsigned int smbCalcSize(struct smb_hdr *ptr); extern unsigned int smbCalcSize_LE(struct smb_hdr *ptr); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); -extern int cifs_convert_address(struct sockaddr *dst, char *src); -extern int cifs_fill_sockaddr(struct sockaddr *dst, char *src, +extern int cifs_convert_address(struct sockaddr *dst, const char *src, int len); +extern int cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, unsigned short int port); extern int map_smb_to_linux_error(struct smb_hdr *smb, int logErr); extern void header_assemble(struct smb_hdr *, char /* command */ , diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2a43a0aca96..95c2ea67edf 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1543,6 +1543,7 @@ 
cifs_get_tcp_session(struct smb_vol *volume_info) if (volume_info->UNCip && volume_info->UNC) { rc = cifs_fill_sockaddr((struct sockaddr *)&addr, volume_info->UNCip, + strlen(volume_info->UNCip), volume_info->port); if (!rc) { /* we failed translating address */ diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index a7de5e9fff1..578d88c5b46 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -157,7 +157,7 @@ cifs_new_fileinfo(struct inode *newinode, __u16 fileHandle, mutex_init(&pCifsFile->lock_mutex); INIT_LIST_HEAD(&pCifsFile->llist); atomic_set(&pCifsFile->count, 1); - slow_work_init(&pCifsFile->oplock_break, &cifs_oplock_break_ops); + INIT_WORK(&pCifsFile->oplock_break, cifs_oplock_break); write_lock(&GlobalSMBSeslock); list_add(&pCifsFile->tlist, &cifs_sb->tcon->openFileList); diff --git a/fs/cifs/dns_resolve.c b/fs/cifs/dns_resolve.c index 3ad7f4300c4..0eb87026cad 100644 --- a/fs/cifs/dns_resolve.c +++ b/fs/cifs/dns_resolve.c @@ -4,6 +4,8 @@ * Copyright (c) 2007 Igor Mammedov * Author(s): Igor Mammedov (niallain@gmail.com) * Steve French (sfrench@us.ibm.com) + * Wang Lei (wang840925@gmail.com) + * David Howells (dhowells@redhat.com) * * Contains the CIFS DFS upcall routines used for hostname to * IP address translation. 
@@ -24,214 +26,73 @@ */ #include <linux/slab.h> -#include <linux/keyctl.h> -#include <linux/key-type.h> -#include <keys/user-type.h> +#include <linux/dns_resolver.h> #include "dns_resolve.h" #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" -static const struct cred *dns_resolver_cache; - -/* Checks if supplied name is IP address - * returns: - * 1 - name is IP - * 0 - name is not IP - */ -static int -is_ip(char *name) -{ - struct sockaddr_storage ss; - - return cifs_convert_address((struct sockaddr *)&ss, name); -} - -static int -dns_resolver_instantiate(struct key *key, const void *data, - size_t datalen) -{ - int rc = 0; - char *ip; - - ip = kmalloc(datalen + 1, GFP_KERNEL); - if (!ip) - return -ENOMEM; - - memcpy(ip, data, datalen); - ip[datalen] = '\0'; - - /* make sure this looks like an address */ - if (!is_ip(ip)) { - kfree(ip); - return -EINVAL; - } - - key->type_data.x[0] = datalen; - key->payload.data = ip; - - return rc; -} - -static void -dns_resolver_destroy(struct key *key) -{ - kfree(key->payload.data); -} - -struct key_type key_type_dns_resolver = { - .name = "dns_resolver", - .def_datalen = sizeof(struct in_addr), - .describe = user_describe, - .instantiate = dns_resolver_instantiate, - .destroy = dns_resolver_destroy, - .match = user_match, -}; - -/* Resolves server name to ip address. - * input: - * unc - server UNC - * output: - * *ip_addr - pointer to server ip, caller responcible for freeing it. - * return 0 on success +/** + * dns_resolve_server_name_to_ip - Resolve UNC server name to ip address. + * @unc: UNC path specifying the server + * @ip_addr: Where to return the IP address. + * + * The IP address will be returned in string form, and the caller is + * responsible for freeing it. + * + * Returns length of result on success, -ve on error. 
*/ int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr) { - const struct cred *saved_cred; - int rc = -EAGAIN; - struct key *rkey = ERR_PTR(-EAGAIN); + struct sockaddr_storage ss; + const char *hostname, *sep; char *name; - char *data = NULL; - int len; + int len, rc; if (!ip_addr || !unc) return -EINVAL; - /* search for server name delimiter */ len = strlen(unc); if (len < 3) { cFYI(1, "%s: unc is too short: %s", __func__, unc); return -EINVAL; } - len -= 2; - name = memchr(unc+2, '\\', len); - if (!name) { - cFYI(1, "%s: probably server name is whole unc: %s", - __func__, unc); - } else { - len = (name - unc) - 2/* leading // */; - } - - name = kmalloc(len+1, GFP_KERNEL); - if (!name) { - rc = -ENOMEM; - return rc; - } - memcpy(name, unc+2, len); - name[len] = 0; - - if (is_ip(name)) { - cFYI(1, "%s: it is IP, skipping dns upcall: %s", - __func__, name); - data = name; - goto skip_upcall; - } - saved_cred = override_creds(dns_resolver_cache); - rkey = request_key(&key_type_dns_resolver, name, ""); - revert_creds(saved_cred); - if (!IS_ERR(rkey)) { - if (!(rkey->perm & KEY_USR_VIEW)) { - down_read(&rkey->sem); - rkey->perm |= KEY_USR_VIEW; - up_read(&rkey->sem); - } - len = rkey->type_data.x[0]; - data = rkey->payload.data; - } else { - cERROR(1, "%s: unable to resolve: %s", __func__, name); - goto out; - } - -skip_upcall: - if (data) { - *ip_addr = kmalloc(len + 1, GFP_KERNEL); - if (*ip_addr) { - memcpy(*ip_addr, data, len + 1); - if (!IS_ERR(rkey)) - cFYI(1, "%s: resolved: %s to %s", __func__, - name, - *ip_addr - ); - rc = 0; - } else { - rc = -ENOMEM; - } - if (!IS_ERR(rkey)) - key_put(rkey); - } + /* Discount leading slashes for cifs */ + len -= 2; + hostname = unc + 2; -out: - kfree(name); + /* Search for server name delimiter */ + sep = memchr(hostname, '\\', len); + if (sep) + len = sep - unc; + else + cFYI(1, "%s: probably server name is whole unc: %s", + __func__, unc); + + /* Try to interpret hostname as an IPv4 or IPv6 address */ + rc = 
cifs_convert_address((struct sockaddr *)&ss, hostname, len); + if (rc > 0) + goto name_is_IP_address; + + /* Perform the upcall */ + rc = dns_query(NULL, hostname, len, NULL, ip_addr, NULL); + if (rc < 0) + cERROR(1, "%s: unable to resolve: %*.*s", + __func__, len, len, hostname); + else + cFYI(1, "%s: resolved: %*.*s to %s", + __func__, len, len, hostname, *ip_addr); return rc; -} -int __init cifs_init_dns_resolver(void) -{ - struct cred *cred; - struct key *keyring; - int ret; - - printk(KERN_NOTICE "Registering the %s key type\n", - key_type_dns_resolver.name); - - /* create an override credential set with a special thread keyring in - * which DNS requests are cached - * - * this is used to prevent malicious redirections from being installed - * with add_key(). - */ - cred = prepare_kernel_cred(NULL); - if (!cred) +name_is_IP_address: + name = kmalloc(len + 1, GFP_KERNEL); + if (!name) return -ENOMEM; - - keyring = key_alloc(&key_type_keyring, ".dns_resolver", 0, 0, cred, - (KEY_POS_ALL & ~KEY_POS_SETATTR) | - KEY_USR_VIEW | KEY_USR_READ, - KEY_ALLOC_NOT_IN_QUOTA); - if (IS_ERR(keyring)) { - ret = PTR_ERR(keyring); - goto failed_put_cred; - } - - ret = key_instantiate_and_link(keyring, NULL, 0, NULL, NULL); - if (ret < 0) - goto failed_put_key; - - ret = register_key_type(&key_type_dns_resolver); - if (ret < 0) - goto failed_put_key; - - /* instruct request_key() to use this special keyring as a cache for - * the results it looks up */ - cred->thread_keyring = keyring; - cred->jit_keyring = KEY_REQKEY_DEFL_THREAD_KEYRING; - dns_resolver_cache = cred; + memcpy(name, hostname, len); + name[len] = 0; + cFYI(1, "%s: unc is IP, skipping dns upcall: %s", __func__, name); + *ip_addr = name; return 0; - -failed_put_key: - key_put(keyring); -failed_put_cred: - put_cred(cred); - return ret; -} - -void cifs_exit_dns_resolver(void) -{ - key_revoke(dns_resolver_cache->thread_keyring); - unregister_key_type(&key_type_dns_resolver); - put_cred(dns_resolver_cache); - 
printk(KERN_NOTICE "Unregistered %s key type\n", - key_type_dns_resolver.name); } diff --git a/fs/cifs/dns_resolve.h b/fs/cifs/dns_resolve.h index 5d7f291df16..d3f5d27f4d0 100644 --- a/fs/cifs/dns_resolve.h +++ b/fs/cifs/dns_resolve.h @@ -24,8 +24,6 @@ #define _DNS_RESOLVE_H #ifdef __KERNEL__ -extern int __init cifs_init_dns_resolver(void); -extern void cifs_exit_dns_resolver(void); extern int dns_resolve_server_name_to_ip(const char *unc, char **ip_addr); #endif /* KERNEL */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index fa04a00d126..db11fdef0e9 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2307,8 +2307,7 @@ static void cifs_invalidate_page(struct page *page, unsigned long offset) cifs_fscache_invalidate_page(page, &cifsi->vfs_inode); } -static void -cifs_oplock_break(struct slow_work *work) +void cifs_oplock_break(struct work_struct *work) { struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, oplock_break); @@ -2345,33 +2344,30 @@ cifs_oplock_break(struct slow_work *work) LOCKING_ANDX_OPLOCK_RELEASE, false); cFYI(1, "Oplock release rc = %d", rc); } + + /* + * We might have kicked in before is_valid_oplock_break() + * finished grabbing reference for us. Make sure it's done by + * waiting for GlobalSMBSeslock. 
+ */ + write_lock(&GlobalSMBSeslock); + write_unlock(&GlobalSMBSeslock); + + cifs_oplock_break_put(cfile); } -static int -cifs_oplock_break_get(struct slow_work *work) +void cifs_oplock_break_get(struct cifsFileInfo *cfile) { - struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, - oplock_break); mntget(cfile->mnt); cifsFileInfo_get(cfile); - return 0; } -static void -cifs_oplock_break_put(struct slow_work *work) +void cifs_oplock_break_put(struct cifsFileInfo *cfile) { - struct cifsFileInfo *cfile = container_of(work, struct cifsFileInfo, - oplock_break); mntput(cfile->mnt); cifsFileInfo_put(cfile); } -const struct slow_work_ops cifs_oplock_break_ops = { - .get_ref = cifs_oplock_break_get, - .put_ref = cifs_oplock_break_put, - .execute = cifs_oplock_break, -}; - const struct address_space_operations cifs_addr_ops = { .readpage = cifs_readpage, .readpages = cifs_readpages, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a15b3a9bbff..dc4c47ab958 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -732,15 +732,9 @@ cifs_find_inode(struct inode *inode, void *opaque) if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT)) return 0; - /* - * uh oh -- it's a directory. We can't use it since hardlinked dirs are - * verboten. Disable serverino and return it as if it were found, the - * caller can discard it, generate a uniqueid and retry the find - */ - if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) { + /* a directory that already has dentries is a possible collision: flag it */ + if (S_ISDIR(inode->i_mode) && !list_empty(&inode->i_dentry)) fattr->cf_flags |= CIFS_FATTR_INO_COLLISION; - cifs_autodisable_serverino(CIFS_SB(inode->i_sb)); - } return 1; } @@ -754,6 +748,27 @@ cifs_init_inode(struct inode *inode, void *opaque) return 0; } +/* + * walk dentry list for an inode and report whether it has aliases that + * are hashed. We use this to determine if a directory inode can actually + * be used. 
+ */ +static bool +inode_has_hashed_dentries(struct inode *inode) +{ + struct dentry *dentry; + + spin_lock(&dcache_lock); + list_for_each_entry(dentry, &inode->i_dentry, d_alias) { + if (!d_unhashed(dentry) || IS_ROOT(dentry)) { + spin_unlock(&dcache_lock); + return true; + } + } + spin_unlock(&dcache_lock); + return false; +} + /* Given fattrs, get a corresponding inode */ struct inode * cifs_iget(struct super_block *sb, struct cifs_fattr *fattr) @@ -769,12 +784,16 @@ retry_iget5_locked: inode = iget5_locked(sb, hash, cifs_find_inode, cifs_init_inode, fattr); if (inode) { - /* was there a problematic inode number collision? */ + /* was there a potentially problematic inode collision? */ if (fattr->cf_flags & CIFS_FATTR_INO_COLLISION) { - iput(inode); - fattr->cf_uniqueid = iunique(sb, ROOT_I); fattr->cf_flags &= ~CIFS_FATTR_INO_COLLISION; - goto retry_iget5_locked; + + if (inode_has_hashed_dentries(inode)) { + cifs_autodisable_serverino(CIFS_SB(sb)); + iput(inode); + fattr->cf_uniqueid = iunique(sb, ROOT_I); + goto retry_iget5_locked; + } } cifs_fattr_to_inode(inode, fattr); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 1394aa37f26..3ccadc1326d 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -498,7 +498,6 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) struct cifsTconInfo *tcon; struct cifsInodeInfo *pCifsInode; struct cifsFileInfo *netfile; - int rc; cFYI(1, "Checking for oplock break or dnotify response"); if ((pSMB->hdr.Command == SMB_COM_NT_TRANSACT) && @@ -583,13 +582,18 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) pCifsInode->clientCanCacheAll = false; if (pSMB->OplockLevel == 0) pCifsInode->clientCanCacheRead = false; - rc = slow_work_enqueue(&netfile->oplock_break); - if (rc) { - cERROR(1, "failed to enqueue oplock " - "break: %d\n", rc); - } else { - netfile->oplock_break_cancelled = false; - } + + /* + * cifs_oplock_break_put() can't be called + * from here. 
Get reference after queueing + * succeeded. cifs_oplock_break() will + * synchronize using GlobalSMBSeslock. + */ + if (queue_work(system_nrt_wq, + &netfile->oplock_break)) + cifs_oplock_break_get(netfile); + netfile->oplock_break_cancelled = false; + read_unlock(&GlobalSMBSeslock); read_unlock(&cifs_tcp_ses_lock); return true; diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index c6721ee26db..f97851119e6 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -140,17 +140,18 @@ static const struct smb_to_posix_error mapping_table_ERRHRD[] = { * Returns 0 on failure. */ static int -cifs_inet_pton(const int address_family, const char *cp, void *dst) +cifs_inet_pton(const int address_family, const char *cp, int len, void *dst) { int ret = 0; /* calculate length by finding first slash or NULL */ if (address_family == AF_INET) - ret = in4_pton(cp, -1 /* len */, dst, '\\', NULL); + ret = in4_pton(cp, len, dst, '\\', NULL); else if (address_family == AF_INET6) - ret = in6_pton(cp, -1 /* len */, dst , '\\', NULL); + ret = in6_pton(cp, len, dst , '\\', NULL); - cFYI(DBG2, "address conversion returned %d for %s", ret, cp); + cFYI(DBG2, "address conversion returned %d for %*.*s", + ret, len, len, cp); if (ret > 0) ret = 1; return ret; @@ -165,37 +166,39 @@ cifs_inet_pton(const int address_family, const char *cp, void *dst) * Returns 0 on failure. 
*/ int -cifs_convert_address(struct sockaddr *dst, char *src) +cifs_convert_address(struct sockaddr *dst, const char *src, int len) { - int rc; - char *pct, *endp; + int rc, alen, slen; + const char *pct; + char *endp, scope_id[13]; struct sockaddr_in *s4 = (struct sockaddr_in *) dst; struct sockaddr_in6 *s6 = (struct sockaddr_in6 *) dst; /* IPv4 address */ - if (cifs_inet_pton(AF_INET, src, &s4->sin_addr.s_addr)) { + if (cifs_inet_pton(AF_INET, src, len, &s4->sin_addr.s_addr)) { s4->sin_family = AF_INET; return 1; } - /* temporarily terminate string */ - pct = strchr(src, '%'); - if (pct) - *pct = '\0'; - - rc = cifs_inet_pton(AF_INET6, src, &s6->sin6_addr.s6_addr); - - /* repair temp termination (if any) and make pct point to scopeid */ - if (pct) - *pct++ = '%'; + /* attempt to exclude the scope ID from the address part */ + pct = memchr(src, '%', len); + alen = pct ? pct - src : len; + rc = cifs_inet_pton(AF_INET6, src, alen, &s6->sin6_addr.s6_addr); if (!rc) return rc; s6->sin6_family = AF_INET6; if (pct) { + /* grab the scope ID */ + slen = len - (alen + 1); + if (slen <= 0 || slen > 12) + return 0; + memcpy(scope_id, pct + 1, slen); + scope_id[slen] = '\0'; + s6->sin6_scope_id = (u32) simple_strtoul(pct, &endp, 0); - if (!*pct || *endp) + if (endp != scope_id + slen) return 0; } @@ -203,10 +206,10 @@ cifs_convert_address(struct sockaddr *dst, char *src) } int -cifs_fill_sockaddr(struct sockaddr *dst, char *src, +cifs_fill_sockaddr(struct sockaddr *dst, const char *src, int len, const unsigned short int port) { - if (!cifs_convert_address(dst, src)) + if (!cifs_convert_address(dst, src, len)) return 0; switch (dst->sa_family) { diff --git a/fs/compat.c b/fs/compat.c index c6fda9aeb86..5976bad85f6 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -15,6 +15,7 @@ * published by the Free Software Foundation. 
*/ +#include <linux/stddef.h> #include <linux/kernel.h> #include <linux/linkage.h> #include <linux/compat.h> @@ -891,8 +892,6 @@ asmlinkage long compat_sys_mount(char __user * dev_name, char __user * dir_name, return retval; } -#define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de))) - struct compat_old_linux_dirent { compat_ulong_t d_ino; compat_ulong_t d_offset; @@ -981,7 +980,8 @@ static int compat_filldir(void *__buf, const char *name, int namlen, struct compat_linux_dirent __user * dirent; struct compat_getdents_callback *buf = __buf; compat_ulong_t d_ino; - int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(compat_long_t)); + int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) + + namlen + 2, sizeof(compat_long_t)); buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) @@ -1068,8 +1068,8 @@ static int compat_filldir64(void * __buf, const char * name, int namlen, loff_t { struct linux_dirent64 __user *dirent; struct compat_getdents_callback64 *buf = __buf; - int jj = NAME_OFFSET(dirent); - int reclen = ALIGN(jj + namlen + 1, sizeof(u64)); + int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, + sizeof(u64)); u64 off; buf->error = -EINVAL; /* only used if we fail.. 
*/ diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index c0d35c62052..37a34c2c622 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -248,7 +248,7 @@ static struct connection *assoc2con(int assoc_id) for (i = 0 ; i < CONN_HASH_SIZE; i++) { hlist_for_each_entry(con, h, &connection_hash[i], list) { - if (con && con->sctp_assoc == assoc_id) { + if (con->sctp_assoc == assoc_id) { mutex_unlock(&connections_lock); return con; } diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 2c6ad518100..ef17e0169da 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -81,24 +81,11 @@ static struct genl_ops dlm_nl_ops = { int __init dlm_netlink_init(void) { - int rv; - - rv = genl_register_family(&family); - if (rv) - return rv; - - rv = genl_register_ops(&family, &dlm_nl_ops); - if (rv < 0) - goto err; - return 0; - err: - genl_unregister_family(&family); - return rv; + return genl_register_family_with_ops(&family, &dlm_nl_ops, 1); } void dlm_netlink_exit(void) { - genl_unregister_ops(&family, &dlm_nl_ops); genl_unregister_family(&family); } diff --git a/fs/exec.c b/fs/exec.c index e19de6a8033..dab85ecad68 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -28,7 +28,6 @@ #include <linux/mm.h> #include <linux/stat.h> #include <linux/fcntl.h> -#include <linux/smp_lock.h> #include <linux/swap.h> #include <linux/string.h> #include <linux/init.h> @@ -653,6 +652,7 @@ int setup_arg_pages(struct linux_binprm *bprm, else stack_base = vma->vm_start - stack_expand; #endif + current->mm->start_stack = bprm->p; ret = expand_stack(vma, stack_base); if (ret) ret = -EFAULT; @@ -1891,13 +1891,7 @@ void do_coredump(long signr, int exit_code, struct pt_regs *regs) */ clear_thread_flag(TIF_SIGPENDING); - /* - * lock_kernel() because format_corename() is controlled by sysctl, which - * uses lock_kernel() - */ - lock_kernel(); ispipe = format_corename(corename, signr); - unlock_kernel(); if (ispipe) { int dump_count; diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig index 522b15498f4..e8c6ba0e4a3 
100644 --- a/fs/ext3/Kconfig +++ b/fs/ext3/Kconfig @@ -31,6 +31,7 @@ config EXT3_FS config EXT3_DEFAULTS_TO_ORDERED bool "Default to 'data=ordered' in ext3" depends on EXT3_FS + default y help The journal mode options for ext3 have different tradeoffs between when data is guaranteed to be on disk and diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c index 735f0190ec2..001eb0e2d48 100644 --- a/fs/ext3/inode.c +++ b/fs/ext3/inode.c @@ -1149,9 +1149,25 @@ static int walk_page_buffers( handle_t *handle, static int do_journal_get_write_access(handle_t *handle, struct buffer_head *bh) { + int dirty = buffer_dirty(bh); + int ret; + if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; - return ext3_journal_get_write_access(handle, bh); + /* + * __block_prepare_write() could have dirtied some buffers. Clean + * the dirty bit as jbd2_journal_get_write_access() could complain + * otherwise about fs integrity issues. Setting of the dirty bit + * by __block_prepare_write() isn't a real problem here as we clear + * the bit before releasing a page lock and thus writeback cannot + * ever write the buffer. 
+ */ + if (dirty) + clear_buffer_dirty(bh); + ret = ext3_journal_get_write_access(handle, bh); + if (!ret && dirty) + ret = ext3_journal_dirty_metadata(handle, bh); + return ret; } /* @@ -1625,10 +1641,7 @@ static int ext3_writeback_writepage(struct page *page, goto out_fail; } - if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode)) - ret = nobh_writepage(page, ext3_get_block, wbc); - else - ret = block_write_full_page(page, ext3_get_block, wbc); + ret = block_write_full_page(page, ext3_get_block, wbc); err = ext3_journal_stop(handle); if (!ret) @@ -1922,17 +1935,6 @@ static int ext3_block_truncate_page(handle_t *handle, struct page *page, length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - /* - * For "nobh" option, we can only work if we don't need to - * read-in the page - otherwise we create buffers to do the IO. - */ - if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && - ext3_should_writeback_data(inode) && PageUptodate(page)) { - zero_user(page, offset, length); - set_page_dirty(page); - goto unlock; - } - if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); @@ -2284,27 +2286,6 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, depth); /* - * We've probably journalled the indirect block several - * times during the truncate. But it's no longer - * needed and we now drop it from the transaction via - * journal_revoke(). - * - * That's easy if it's exclusively part of this - * transaction. But if it's part of the committing - * transaction then journal_forget() will simply - * brelse() it. That means that if the underlying - * block is reallocated in ext3_get_block(), - * unmap_underlying_metadata() will find this block - * and will try to get rid of it. damn, damn. - * - * If this block has already been committed to the - * journal, a revoke record will be written. 
And - * revoke records must be emitted *before* clearing - * this block's bit in the bitmaps. - */ - ext3_forget(handle, 1, inode, bh, bh->b_blocknr); - - /* * Everything below this this pointer has been * released. Now let this top-of-subtree go. * @@ -2327,6 +2308,31 @@ static void ext3_free_branches(handle_t *handle, struct inode *inode, truncate_restart_transaction(handle, inode); } + /* + * We've probably journalled the indirect block several + * times during the truncate. But it's no longer + * needed and we now drop it from the transaction via + * journal_revoke(). + * + * That's easy if it's exclusively part of this + * transaction. But if it's part of the committing + * transaction then journal_forget() will simply + * brelse() it. That means that if the underlying + * block is reallocated in ext3_get_block(), + * unmap_underlying_metadata() will find this block + * and will try to get rid of it. damn, damn. Thus + * we don't allow a block to be reallocated until + * a transaction freeing it has fully committed. + * + * We also have to make sure journal replay after a + * crash does not overwrite non-journaled data blocks + * with old metadata when the block got reallocated for + * data. Thus we have to store a revoke record for a + * block in the same transaction in which we free the + * block. 
+ */ + ext3_forget(handle, 1, inode, bh, bh->b_blocknr); + ext3_free_blocks(handle, inode, nr, 1); if (parent_bh) { diff --git a/fs/ext3/namei.c b/fs/ext3/namei.c index ee184084ca4..2b35ddb70d6 100644 --- a/fs/ext3/namei.c +++ b/fs/ext3/namei.c @@ -1447,7 +1447,6 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry, struct inode *inode) { struct inode *dir = dentry->d_parent->d_inode; - unsigned long offset; struct buffer_head * bh; struct ext3_dir_entry_2 *de; struct super_block * sb; @@ -1469,7 +1468,7 @@ static int ext3_add_entry (handle_t *handle, struct dentry *dentry, ext3_mark_inode_dirty(handle, dir); } blocks = dir->i_size >> sb->s_blocksize_bits; - for (block = 0, offset = 0; block < blocks; block++) { + for (block = 0; block < blocks; block++) { bh = ext3_bread(handle, dir, block, 0, &retval); if(!bh) return retval; diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 54351ac7cef..0ccd7b12b73 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -964,7 +964,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, ext3_fsblk_t n_blocks_count) { ext3_fsblk_t o_blocks_count; - unsigned long o_groups_count; ext3_grpblk_t last; ext3_grpblk_t add; struct buffer_head * bh; @@ -976,7 +975,6 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, * yet: we're going to revalidate es->s_blocks_count after * taking the s_resize_lock below. */ o_blocks_count = le32_to_cpu(es->s_blocks_count); - o_groups_count = EXT3_SB(sb)->s_groups_count; if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT3-fs: extending last group from "E3FSBLK" uto "E3FSBLK" blocks\n", diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 6c953bb255e..9650a956fd0 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -661,9 +661,6 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) */ seq_puts(seq, ",barrier="); seq_puts(seq, test_opt(sb, BARRIER) ? 
"1" : "0"); - if (test_opt(sb, NOBH)) - seq_puts(seq, ",nobh"); - seq_printf(seq, ",data=%s", data_mode_string(test_opt(sb, DATA_FLAGS))); if (test_opt(sb, DATA_ERR_ABORT)) seq_puts(seq, ",data_err=abort"); @@ -1255,10 +1252,12 @@ set_qf_format: *n_blocks_count = option; break; case Opt_nobh: - set_opt(sbi->s_mount_opt, NOBH); + ext3_msg(sb, KERN_WARNING, + "warning: ignoring deprecated nobh option"); break; case Opt_bh: - clear_opt(sbi->s_mount_opt, NOBH); + ext3_msg(sb, KERN_WARNING, + "warning: ignoring deprecated bh option"); break; default: ext3_msg(sb, KERN_ERR, @@ -2001,14 +2000,6 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) break; } - if (test_opt(sb, NOBH)) { - if (!(test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA)) { - ext3_msg(sb, KERN_WARNING, - "warning: ignoring nobh option - " - "it is supported only with writeback mode"); - clear_opt(sbi->s_mount_opt, NOBH); - } - } /* * The journal_load will have done any necessary log recovery, * so we can safely mount the rest of the filesystem now. 
diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index feaf498feaa..5e2ed4504ea 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -204,6 +204,7 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type, return error; else { inode->i_mode = mode; + inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); if (error == 0) acl = NULL; diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 95b7594c76f..bd30799a43e 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -377,14 +377,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, ext4_grpblk_t bit; unsigned int i; struct ext4_group_desc *desc; - struct ext4_super_block *es; - struct ext4_sb_info *sbi; + struct ext4_sb_info *sbi = EXT4_SB(sb); int err = 0, ret, blk_free_count; ext4_grpblk_t blocks_freed; struct ext4_group_info *grp; - sbi = EXT4_SB(sb); - es = sbi->s_es; ext4_debug("Adding block(s) %llu-%llu\n", block, block + count - 1); ext4_get_group_no_and_offset(sb, block, &block_group, &bit); @@ -477,7 +474,6 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, ret = ext4_handle_dirty_metadata(handle, NULL, gd_bh); if (!err) err = ret; - sb->s_dirt = 1; error_return: brelse(bitmap_bh); diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 5b6973fbf1b..3db5084db9b 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -229,16 +229,20 @@ int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || (start_blk + count < start_blk) || - (start_blk + count > ext4_blocks_count(sbi->s_es))) + (start_blk + count > ext4_blocks_count(sbi->s_es))) { + sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); return 0; + } while (n) { entry = rb_entry(n, struct ext4_system_zone, node); if (start_blk + count - 1 < entry->start_blk) n = n->rb_left; else if (start_blk >= (entry->start_blk + entry->count)) n = n->rb_right; - else + else { + 
sbi->s_es->s_last_error_block = cpu_to_le64(start_blk); return 0; + } } return 1; } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index ea5e6cb7e2a..374510f72ba 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -61,10 +61,11 @@ static unsigned char get_dtype(struct super_block *sb, int filetype) } -int ext4_check_dir_entry(const char *function, struct inode *dir, - struct ext4_dir_entry_2 *de, - struct buffer_head *bh, - unsigned int offset) +int __ext4_check_dir_entry(const char *function, unsigned int line, + struct inode *dir, + struct ext4_dir_entry_2 *de, + struct buffer_head *bh, + unsigned int offset) { const char *error_msg = NULL; const int rlen = ext4_rec_len_from_disk(de->rec_len, @@ -83,11 +84,10 @@ int ext4_check_dir_entry(const char *function, struct inode *dir, error_msg = "inode out of bounds"; if (error_msg != NULL) - ext4_error_inode(function, dir, - "bad entry in directory: %s - block=%llu" + ext4_error_inode(dir, function, line, bh->b_blocknr, + "bad entry in directory: %s - " "offset=%u(%u), inode=%u, rec_len=%d, name_len=%d", - error_msg, (unsigned long long) bh->b_blocknr, - (unsigned) (offset%bh->b_size), offset, + error_msg, (unsigned) (offset%bh->b_size), offset, le32_to_cpu(de->inode), rlen, de->name_len); return error_msg == NULL ? 1 : 0; @@ -121,7 +121,8 @@ static int ext4_readdir(struct file *filp, * We don't set the inode dirty flag since it's not * critical that it get flushed back to the disk. 
*/ - ext4_clear_inode_flag(filp->f_path.dentry->d_inode, EXT4_INODE_INDEX); + ext4_clear_inode_flag(filp->f_path.dentry->d_inode, + EXT4_INODE_INDEX); } stored = 0; offset = filp->f_pos & (sb->s_blocksize - 1); @@ -193,7 +194,7 @@ revalidate: while (!error && filp->f_pos < inode->i_size && offset < sb->s_blocksize) { de = (struct ext4_dir_entry_2 *) (bh->b_data + offset); - if (!ext4_check_dir_entry("ext4_readdir", inode, de, + if (!ext4_check_dir_entry(inode, de, bh, offset)) { /* * On error, skip the f_pos to the next block @@ -343,7 +344,7 @@ int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, struct dir_private_info *info; int len; - info = (struct dir_private_info *) dir_file->private_data; + info = dir_file->private_data; p = &info->root.rb_node; /* Create and allocate the fname structure */ diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 19a4de57128..e03841d9f30 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -57,10 +57,13 @@ #endif #define EXT4_ERROR_INODE(inode, fmt, a...) \ - ext4_error_inode(__func__, (inode), (fmt), ## a) + ext4_error_inode((inode), __func__, __LINE__, 0, (fmt), ## a) + +#define EXT4_ERROR_INODE_BLOCK(inode, block, fmt, a...) \ + ext4_error_inode((inode), __func__, __LINE__, (block), (fmt), ## a) #define EXT4_ERROR_FILE(file, fmt, a...) 
\ - ext4_error_file(__func__, (file), (fmt), ## a) + ext4_error_file(__func__, __LINE__, (file), (fmt), ## a) /* data type for block offset of block group */ typedef int ext4_grpblk_t; @@ -167,13 +170,15 @@ struct mpage_da_data { }; #define EXT4_IO_UNWRITTEN 0x1 typedef struct ext4_io_end { - struct list_head list; /* per-file finished AIO list */ + struct list_head list; /* per-file finished IO list */ struct inode *inode; /* file being written to */ unsigned int flag; /* unwritten or not */ struct page *page; /* page struct for buffer write */ loff_t offset; /* offset in the file */ ssize_t size; /* size of the extent */ struct work_struct work; /* data work queue */ + struct kiocb *iocb; /* iocb struct for AIO */ + int result; /* error value for AIO */ } ext4_io_end_t; /* @@ -460,7 +465,7 @@ struct ext4_new_group_data { }; /* - * Flags used by ext4_get_blocks() + * Flags used by ext4_map_blocks() */ /* Allocate any needed blocks and/or convert an unitialized extent to be an initialized ext4 */ @@ -873,7 +878,6 @@ struct ext4_inode_info { #define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */ #define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */ #define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */ -#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */ #define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */ #define EXT4_MOUNT_USRQUOTA 0x100000 /* "old" user quota */ #define EXT4_MOUNT_GRPQUOTA 0x200000 /* "old" group quota */ @@ -982,7 +986,7 @@ struct ext4_super_block { __le32 s_last_orphan; /* start of list of inodes to delete */ __le32 s_hash_seed[4]; /* HTREE hash seed */ __u8 s_def_hash_version; /* Default hash version to use */ - __u8 s_reserved_char_pad; + __u8 s_jnl_backup_type; __le16 s_desc_size; /* size of group descriptor */ /*100*/ __le32 s_default_mount_opts; __le32 s_first_meta_bg; /* First metablock block group */ @@ -1000,12 +1004,34 @@ struct ext4_super_block { __le64 s_mmp_block; /* Block for 
multi-mount protection */ __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ __u8 s_log_groups_per_flex; /* FLEX_BG group size */ - __u8 s_reserved_char_pad2; + __u8 s_reserved_char_pad; __le16 s_reserved_pad; __le64 s_kbytes_written; /* nr of lifetime kilobytes written */ - __u32 s_reserved[160]; /* Padding to the end of the block */ + __le32 s_snapshot_inum; /* Inode number of active snapshot */ + __le32 s_snapshot_id; /* sequential ID of active snapshot */ + __le64 s_snapshot_r_blocks_count; /* reserved blocks for active + snapshot's future use */ + __le32 s_snapshot_list; /* inode number of the head of the + on-disk snapshot list */ +#define EXT4_S_ERR_START offsetof(struct ext4_super_block, s_error_count) + __le32 s_error_count; /* number of fs errors */ + __le32 s_first_error_time; /* first time an error happened */ + __le32 s_first_error_ino; /* inode involved in first error */ + __le64 s_first_error_block; /* block involved of first error */ + __u8 s_first_error_func[32]; /* function where the error happened */ + __le32 s_first_error_line; /* line number where error happened */ + __le32 s_last_error_time; /* most recent time of an error */ + __le32 s_last_error_ino; /* inode involved in last error */ + __le32 s_last_error_line; /* line number where error happened */ + __le64 s_last_error_block; /* block involved of last error */ + __u8 s_last_error_func[32]; /* function where the error happened */ +#define EXT4_S_ERR_END offsetof(struct ext4_super_block, s_mount_opts) + __u8 s_mount_opts[64]; + __le32 s_reserved[112]; /* Padding to the end of the block */ }; +#define EXT4_S_ERR_LEN (EXT4_S_ERR_END - EXT4_S_ERR_START) + #ifdef __KERNEL__ /* @@ -1143,6 +1169,9 @@ struct ext4_sb_info { /* workqueue for dio unwritten */ struct workqueue_struct *dio_unwritten_wq; + + /* timer for periodic error stats printing */ + struct timer_list s_err_report; }; static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) @@ -1313,6 +1342,10 @@ 
EXT4_INODE_BIT_FNS(state, state_flags) #define EXT4_DEFM_JMODE_DATA 0x0020 #define EXT4_DEFM_JMODE_ORDERED 0x0040 #define EXT4_DEFM_JMODE_WBACK 0x0060 +#define EXT4_DEFM_NOBARRIER 0x0100 +#define EXT4_DEFM_BLOCK_VALIDITY 0x0200 +#define EXT4_DEFM_DISCARD 0x0400 +#define EXT4_DEFM_NODELALLOC 0x0800 /* * Default journal batch times @@ -1379,6 +1412,43 @@ struct ext4_dir_entry_2 { #define EXT4_MAX_REC_LEN ((1<<16)-1) /* + * If we ever get support for fs block sizes > page_size, we'll need + * to remove the #if statements in the next two functions... + */ +static inline unsigned int +ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) +{ + unsigned len = le16_to_cpu(dlen); + +#if (PAGE_CACHE_SIZE >= 65536) + if (len == EXT4_MAX_REC_LEN || len == 0) + return blocksize; + return (len & 65532) | ((len & 3) << 16); +#else + return len; +#endif +} + +static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) +{ + if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) + BUG(); +#if (PAGE_CACHE_SIZE >= 65536) + if (len < 65536) + return cpu_to_le16(len); + if (len == blocksize) { + if (blocksize == 65536) + return cpu_to_le16(EXT4_MAX_REC_LEN); + else + return cpu_to_le16(0); + } + return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); +#else + return cpu_to_le16(len); +#endif +} + +/* * Hash Tree Directory indexing * (c) Daniel Phillips, 2001 */ @@ -1510,9 +1580,11 @@ extern unsigned ext4_init_block_bitmap(struct super_block *sb, ext4_init_block_bitmap(sb, NULL, group, desc) /* dir.c */ -extern int ext4_check_dir_entry(const char *, struct inode *, - struct ext4_dir_entry_2 *, - struct buffer_head *, unsigned int); +extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, + struct ext4_dir_entry_2 *, + struct buffer_head *, unsigned int); +#define ext4_check_dir_entry(dir, de, bh, offset) \ + __ext4_check_dir_entry(__func__, __LINE__, (dir), (de), (bh), (offset)) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 
hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent); @@ -1601,8 +1673,6 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long); extern int ext4_ext_migrate(struct inode *); /* namei.c */ -extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize); -extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize); extern int ext4_orphan_add(handle_t *, struct inode *); extern int ext4_orphan_del(handle_t *, struct inode *); extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, @@ -1616,25 +1686,38 @@ extern int ext4_group_extend(struct super_block *sb, ext4_fsblk_t n_blocks_count); /* super.c */ -extern void __ext4_error(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -#define ext4_error(sb, message...) __ext4_error(sb, __func__, ## message) -extern void ext4_error_inode(const char *, struct inode *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void ext4_error_file(const char *, struct file *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void __ext4_std_error(struct super_block *, const char *, int); -extern void ext4_abort(struct super_block *, const char *, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -extern void __ext4_warning(struct super_block *, const char *, +extern void __ext4_error(struct super_block *, const char *, unsigned int, + const char *, ...) + __attribute__ ((format (printf, 4, 5))); +#define ext4_error(sb, message...) __ext4_error(sb, __func__, \ + __LINE__, ## message) +extern void ext4_error_inode(struct inode *, const char *, unsigned int, + ext4_fsblk_t, const char *, ...) + __attribute__ ((format (printf, 5, 6))); +extern void ext4_error_file(struct file *, const char *, unsigned int, + const char *, ...) 
+ __attribute__ ((format (printf, 4, 5))); +extern void __ext4_std_error(struct super_block *, const char *, + unsigned int, int); +extern void __ext4_abort(struct super_block *, const char *, unsigned int, + const char *, ...) + __attribute__ ((format (printf, 4, 5))); +#define ext4_abort(sb, message...) __ext4_abort(sb, __func__, \ + __LINE__, ## message) +extern void __ext4_warning(struct super_block *, const char *, unsigned int, const char *, ...) - __attribute__ ((format (printf, 3, 4))); -#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, ## message) + __attribute__ ((format (printf, 4, 5))); +#define ext4_warning(sb, message...) __ext4_warning(sb, __func__, \ + __LINE__, ## message) extern void ext4_msg(struct super_block *, const char *, const char *, ...) __attribute__ ((format (printf, 3, 4))); -extern void ext4_grp_locked_error(struct super_block *, ext4_group_t, - const char *, const char *, ...) - __attribute__ ((format (printf, 4, 5))); +extern void __ext4_grp_locked_error(const char *, unsigned int, \ + struct super_block *, ext4_group_t, \ + unsigned long, ext4_fsblk_t, \ + const char *, ...) + __attribute__ ((format (printf, 7, 8))); +#define ext4_grp_locked_error(sb, grp, message...) 
\ + __ext4_grp_locked_error(__func__, __LINE__, (sb), (grp), ## message) extern void ext4_update_dynamic_rev(struct super_block *sb); extern int ext4_update_compat_feature(handle_t *handle, struct super_block *sb, __u32 compat); @@ -1768,7 +1851,7 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ - __ext4_std_error((sb), __func__, (errno)); \ + __ext4_std_error((sb), __func__, __LINE__, (errno)); \ } while (0) #ifdef CONFIG_SMP @@ -1860,6 +1943,12 @@ static inline void ext4_unlock_group(struct super_block *sb, spin_unlock(ext4_group_lock_ptr(sb, group)); } +static inline void ext4_mark_super_dirty(struct super_block *sb) +{ + if (EXT4_SB(sb)->s_journal == NULL) + sb->s_dirt =1; +} + /* * Inodes and files operations */ @@ -1905,9 +1994,6 @@ extern int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, ssize_t len); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern int ext4_get_blocks(handle_t *handle, struct inode *inode, - sector_t block, unsigned int max_blocks, - struct buffer_head *bh, int flags); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); /* move_extent.c */ diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 53d2764d71c..6e272ef6ba9 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -6,29 +6,29 @@ #include <trace/events/ext4.h> -int __ext4_journal_get_undo_access(const char *where, handle_t *handle, - struct buffer_head *bh) +int __ext4_journal_get_undo_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh) { int err = 0; if (ext4_handle_valid(handle)) { err = jbd2_journal_get_undo_access(handle, bh); if (err) - ext4_journal_abort_handle(where, __func__, bh, + ext4_journal_abort_handle(where, line, __func__, bh, handle, err); } return err; } -int __ext4_journal_get_write_access(const 
char *where, handle_t *handle, - struct buffer_head *bh) +int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh) { int err = 0; if (ext4_handle_valid(handle)) { err = jbd2_journal_get_write_access(handle, bh); if (err) - ext4_journal_abort_handle(where, __func__, bh, + ext4_journal_abort_handle(where, line, __func__, bh, handle, err); } return err; @@ -46,9 +46,9 @@ int __ext4_journal_get_write_access(const char *where, handle_t *handle, * If the handle isn't valid we're not journaling, but we still need to * call into ext4_journal_revoke() to put the buffer head. */ -int __ext4_forget(const char *where, handle_t *handle, int is_metadata, - struct inode *inode, struct buffer_head *bh, - ext4_fsblk_t blocknr) +int __ext4_forget(const char *where, unsigned int line, handle_t *handle, + int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr) { int err; @@ -79,8 +79,8 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata, BUFFER_TRACE(bh, "call jbd2_journal_forget"); err = jbd2_journal_forget(handle, bh); if (err) - ext4_journal_abort_handle(where, __func__, bh, - handle, err); + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); return err; } return 0; @@ -92,15 +92,16 @@ int __ext4_forget(const char *where, handle_t *handle, int is_metadata, BUFFER_TRACE(bh, "call jbd2_journal_revoke"); err = jbd2_journal_revoke(handle, blocknr, bh); if (err) { - ext4_journal_abort_handle(where, __func__, bh, handle, err); - ext4_abort(inode->i_sb, __func__, + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + __ext4_abort(inode->i_sb, where, line, "error %d when attempting revoke", err); } BUFFER_TRACE(bh, "exit"); return err; } -int __ext4_journal_get_create_access(const char *where, +int __ext4_journal_get_create_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh) { int err = 0; @@ -108,22 +109,23 
@@ int __ext4_journal_get_create_access(const char *where, if (ext4_handle_valid(handle)) { err = jbd2_journal_get_create_access(handle, bh); if (err) - ext4_journal_abort_handle(where, __func__, bh, - handle, err); + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); } return err; } -int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, - struct inode *inode, struct buffer_head *bh) +int __ext4_handle_dirty_metadata(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct buffer_head *bh) { int err = 0; if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); if (err) - ext4_journal_abort_handle(where, __func__, bh, - handle, err); + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); } else { if (inode) mark_buffer_dirty_inode(bh, inode); @@ -132,14 +134,33 @@ int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, if (inode && inode_needs_sync(inode)) { sync_dirty_buffer(bh); if (buffer_req(bh) && !buffer_uptodate(bh)) { - ext4_error(inode->i_sb, - "IO error syncing inode, " - "inode=%lu, block=%llu", - inode->i_ino, - (unsigned long long) bh->b_blocknr); + struct ext4_super_block *es; + + es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_block = + cpu_to_le64(bh->b_blocknr); + ext4_error_inode(inode, where, line, + bh->b_blocknr, + "IO error syncing itable block"); err = -EIO; } } } return err; } + +int __ext4_handle_dirty_super(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb) +{ + struct buffer_head *bh = EXT4_SB(sb)->s_sbh; + int err = 0; + + if (ext4_handle_valid(handle)) { + err = jbd2_journal_dirty_metadata(handle, bh); + if (err) + ext4_journal_abort_handle(where, line, __func__, + bh, handle, err); + } else + sb->s_dirt = 1; + return err; +} diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index dade0c02479..b0bd792c58c 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -122,39 +122,47 
@@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); /* * Wrapper functions with which ext4 calls into JBD. */ -void ext4_journal_abort_handle(const char *caller, const char *err_fn, +void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, struct buffer_head *bh, handle_t *handle, int err); -int __ext4_journal_get_undo_access(const char *where, handle_t *handle, - struct buffer_head *bh); +int __ext4_journal_get_undo_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh); -int __ext4_journal_get_write_access(const char *where, handle_t *handle, - struct buffer_head *bh); +int __ext4_journal_get_write_access(const char *where, unsigned int line, + handle_t *handle, struct buffer_head *bh); -int __ext4_forget(const char *where, handle_t *handle, int is_metadata, - struct inode *inode, struct buffer_head *bh, - ext4_fsblk_t blocknr); +int __ext4_forget(const char *where, unsigned int line, handle_t *handle, + int is_metadata, struct inode *inode, + struct buffer_head *bh, ext4_fsblk_t blocknr); -int __ext4_journal_get_create_access(const char *where, +int __ext4_journal_get_create_access(const char *where, unsigned int line, handle_t *handle, struct buffer_head *bh); -int __ext4_handle_dirty_metadata(const char *where, handle_t *handle, - struct inode *inode, struct buffer_head *bh); +int __ext4_handle_dirty_metadata(const char *where, unsigned int line, + handle_t *handle, struct inode *inode, + struct buffer_head *bh); + +int __ext4_handle_dirty_super(const char *where, unsigned int line, + handle_t *handle, struct super_block *sb); #define ext4_journal_get_undo_access(handle, bh) \ - __ext4_journal_get_undo_access(__func__, (handle), (bh)) + __ext4_journal_get_undo_access(__func__, __LINE__, (handle), (bh)) #define ext4_journal_get_write_access(handle, bh) \ - __ext4_journal_get_write_access(__func__, (handle), (bh)) + __ext4_journal_get_write_access(__func__, __LINE__, (handle), 
(bh)) #define ext4_forget(handle, is_metadata, inode, bh, block_nr) \ - __ext4_forget(__func__, (handle), (is_metadata), (inode), (bh),\ - (block_nr)) + __ext4_forget(__func__, __LINE__, (handle), (is_metadata), (inode), \ + (bh), (block_nr)) #define ext4_journal_get_create_access(handle, bh) \ - __ext4_journal_get_create_access(__func__, (handle), (bh)) + __ext4_journal_get_create_access(__func__, __LINE__, (handle), (bh)) #define ext4_handle_dirty_metadata(handle, inode, bh) \ - __ext4_handle_dirty_metadata(__func__, (handle), (inode), (bh)) + __ext4_handle_dirty_metadata(__func__, __LINE__, (handle), (inode), \ + (bh)) +#define ext4_handle_dirty_super(handle, sb) \ + __ext4_handle_dirty_super(__func__, __LINE__, (handle), (sb)) handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks); -int __ext4_journal_stop(const char *where, handle_t *handle); +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle); #define EXT4_NOJOURNAL_MAX_REF_COUNT ((unsigned long) 4096) @@ -207,7 +215,7 @@ static inline handle_t *ext4_journal_start(struct inode *inode, int nblocks) } #define ext4_journal_stop(handle) \ - __ext4_journal_stop(__func__, (handle)) + __ext4_journal_stop(__func__, __LINE__, (handle)) static inline handle_t *ext4_journal_current_handle(void) { @@ -308,17 +316,15 @@ static inline int ext4_should_writeback_data(struct inode *inode) * This function controls whether or not we should try to go down the * dioread_nolock code paths, which makes it safe to avoid taking * i_mutex for direct I/O reads. This only works for extent-based - * files, and it doesn't work for nobh or if data journaling is - * enabled, since the dioread_nolock code uses b_private to pass - * information back to the I/O completion handler, and this conflicts - * with the jbd's use of b_private. 
+ * files, and it doesn't work if data journaling is enabled, since the + * dioread_nolock code uses b_private to pass information back to the + * I/O completion handler, and this conflicts with the jbd's use of + * b_private. */ static inline int ext4_should_dioread_nolock(struct inode *inode) { if (!test_opt(inode->i_sb, DIOREAD_NOLOCK)) return 0; - if (test_opt(inode->i_sb, NOBH)) - return 0; if (!S_ISREG(inode->i_mode)) return 0; if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index bf029c7d551..06328d3e571 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -401,9 +401,9 @@ static int ext4_valid_extent_entries(struct inode *inode, return 1; } -static int __ext4_ext_check(const char *function, struct inode *inode, - struct ext4_extent_header *eh, - int depth) +static int __ext4_ext_check(const char *function, unsigned int line, + struct inode *inode, struct ext4_extent_header *eh, + int depth) { const char *error_msg; int max = 0; @@ -436,7 +436,7 @@ static int __ext4_ext_check(const char *function, struct inode *inode, return 0; corrupted: - ext4_error_inode(function, inode, + ext4_error_inode(inode, function, line, 0, "bad header/extent: %s - magic %x, " "entries %u, max %u(%u), depth %u(%u)", error_msg, le16_to_cpu(eh->eh_magic), @@ -447,7 +447,7 @@ corrupted: } #define ext4_ext_check(inode, eh, depth) \ - __ext4_ext_check(__func__, inode, eh, depth) + __ext4_ext_check(__func__, __LINE__, inode, eh, depth) int ext4_ext_check_inode(struct inode *inode) { @@ -1083,7 +1083,6 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, { struct ext4_ext_path *curp = path; struct ext4_extent_header *neh; - struct ext4_extent_idx *fidx; struct buffer_head *bh; ext4_fsblk_t newblock; int err = 0; @@ -1144,10 +1143,10 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, ext4_idx_store_pblock(curp->p_idx, newblock); neh = ext_inode_hdr(inode); - fidx = 
EXT_FIRST_INDEX(neh); ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), - le32_to_cpu(fidx->ei_block), idx_pblock(fidx)); + le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), + idx_pblock(EXT_FIRST_INDEX(neh))); neh->eh_depth = cpu_to_le16(path->p_depth + 1); err = ext4_ext_dirty(handle, inode, curp); @@ -2954,7 +2953,6 @@ static int ext4_split_unwritten_extents(handle_t *handle, struct ext4_extent *ex1 = NULL; struct ext4_extent *ex2 = NULL; struct ext4_extent *ex3 = NULL; - struct ext4_extent_header *eh; ext4_lblk_t ee_block, eof_block; unsigned int allocated, ee_len, depth; ext4_fsblk_t newblock; @@ -2971,7 +2969,6 @@ static int ext4_split_unwritten_extents(handle_t *handle, eof_block = map->m_lblk + map->m_len; depth = ext_depth(inode); - eh = path[depth].p_hdr; ex = path[depth].p_ext; ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); @@ -3058,7 +3055,6 @@ static int ext4_split_unwritten_extents(handle_t *handle, err = PTR_ERR(path); goto out; } - eh = path[depth].p_hdr; ex = path[depth].p_ext; if (ex2 != &newex) ex2 = ex; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 5313ae4cda2..ee92b66d455 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -70,7 +70,8 @@ ext4_file_write(struct kiocb *iocb, const struct iovec *iov, struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); size_t length = iov_length(iov, nr_segs); - if (pos > sbi->s_bitmap_maxbytes) + if ((pos > sbi->s_bitmap_maxbytes || + (pos == sbi->s_bitmap_maxbytes && length > 0))) return -EFBIG; if (pos + length > sbi->s_bitmap_maxbytes) { @@ -123,7 +124,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp) if (!IS_ERR(cp)) { memcpy(sbi->s_es->s_last_mounted, cp, sizeof(sbi->s_es->s_last_mounted)); - sb->s_dirt = 1; + ext4_mark_super_dirty(sb); } } return dquot_file_open(inode, filp); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 25c4b3173fd..ac377505ed5 100644 --- a/fs/ext4/ialloc.c +++ 
b/fs/ext4/ialloc.c @@ -279,7 +279,7 @@ out: err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!fatal) fatal = err; - sb->s_dirt = 1; + ext4_mark_super_dirty(sb); } else ext4_error(sb, "bit already cleared for inode %lu", ino); @@ -965,7 +965,7 @@ got: percpu_counter_dec(&sbi->s_freeinodes_counter); if (S_ISDIR(mode)) percpu_counter_inc(&sbi->s_dirs_counter); - sb->s_dirt = 1; + ext4_mark_super_dirty(sb); if (sbi->s_log_groups_per_flex) { flex_group = ext4_flex_group(sbi, group); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0afc8c1d8cf..a0ab3754d0d 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -221,6 +221,7 @@ void ext4_delete_inode(struct inode *inode) "couldn't extend journal (err %d)", err); stop_handle: ext4_journal_stop(handle); + ext4_orphan_del(NULL, inode); goto no_delete; } } @@ -337,9 +338,11 @@ static int ext4_block_to_path(struct inode *inode, return n; } -static int __ext4_check_blockref(const char *function, struct inode *inode, +static int __ext4_check_blockref(const char *function, unsigned int line, + struct inode *inode, __le32 *p, unsigned int max) { + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; __le32 *bref = p; unsigned int blk; @@ -348,8 +351,9 @@ static int __ext4_check_blockref(const char *function, struct inode *inode, if (blk && unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), blk, 1))) { - ext4_error_inode(function, inode, - "invalid block reference %u", blk); + es->s_last_error_block = cpu_to_le64(blk); + ext4_error_inode(inode, function, line, blk, + "invalid block"); return -EIO; } } @@ -358,11 +362,13 @@ static int __ext4_check_blockref(const char *function, struct inode *inode, #define ext4_check_indirect_blockref(inode, bh) \ - __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \ + __ext4_check_blockref(__func__, __LINE__, inode, \ + (__le32 *)(bh)->b_data, \ EXT4_ADDR_PER_BLOCK((inode)->i_sb)) #define ext4_check_inode_blockref(inode) \ - __ext4_check_blockref(__func__, 
inode, EXT4_I(inode)->i_data, \ + __ext4_check_blockref(__func__, __LINE__, inode, \ + EXT4_I(inode)->i_data, \ EXT4_NDIR_BLOCKS) /** @@ -1128,20 +1134,24 @@ void ext4_da_update_reserve_space(struct inode *inode, ext4_discard_preallocations(inode); } -static int check_block_validity(struct inode *inode, const char *func, +static int __check_block_validity(struct inode *inode, const char *func, + unsigned int line, struct ext4_map_blocks *map) { if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), map->m_pblk, map->m_len)) { - ext4_error_inode(func, inode, - "lblock %lu mapped to illegal pblock %llu " - "(length %d)", (unsigned long) map->m_lblk, - map->m_pblk, map->m_len); + ext4_error_inode(inode, func, line, map->m_pblk, + "lblock %lu mapped to illegal pblock " + "(length %d)", (unsigned long) map->m_lblk, + map->m_len); return -EIO; } return 0; } +#define check_block_validity(inode, map) \ + __check_block_validity((inode), __func__, __LINE__, (map)) + /* * Return the number of contiguous dirty pages in a given inode * starting at page frame idx. 
@@ -1244,7 +1254,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, up_read((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, __func__, map); + int ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -1324,9 +1334,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { - int ret = check_block_validity(inode, - "ext4_map_blocks_after_alloc", - map); + int ret = check_block_validity(inode, map); if (ret != 0) return ret; } @@ -1519,9 +1527,25 @@ static int walk_page_buffers(handle_t *handle, static int do_journal_get_write_access(handle_t *handle, struct buffer_head *bh) { + int dirty = buffer_dirty(bh); + int ret; + if (!buffer_mapped(bh) || buffer_freed(bh)) return 0; - return ext4_journal_get_write_access(handle, bh); + /* + * __block_prepare_write() could have dirtied some buffers. Clean + * the dirty bit as jbd2_journal_get_write_access() could complain + * otherwise about fs integrity issues. Setting of the dirty bit + * by __block_prepare_write() isn't a real problem here as we clear + * the bit before releasing a page lock and thus writeback cannot + * ever write the buffer. + */ + if (dirty) + clear_buffer_dirty(bh); + ret = ext4_journal_get_write_access(handle, bh); + if (!ret && dirty) + ret = ext4_handle_dirty_metadata(handle, NULL, bh); + return ret; } /* @@ -2194,7 +2218,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) BUG_ON(!handle); /* - * Call ext4_get_blocks() to allocate any delayed allocation + * Call ext4_map_blocks() to allocate any delayed allocation * blocks, or to convert an uninitialized extent to be * initialized (in the case where we have written into * one or more preallocated blocks). @@ -2203,7 +2227,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * indicate that we are on the delayed allocation path. 
This * affects functions in many different parts of the allocation * call path. This flag exists primarily because we don't - * want to change *many* call functions, so ext4_get_blocks() + * want to change *many* call functions, so ext4_map_blocks() * will set the magic i_delalloc_reserved_flag once the * inode's allocation semaphore is taken. * @@ -2221,6 +2245,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) blks = ext4_map_blocks(handle, mpd->inode, &map, get_blocks_flags); if (blks < 0) { + struct super_block *sb = mpd->inode->i_sb; + err = blks; /* * If get block returns with error we simply @@ -2231,7 +2257,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) return 0; if (err == -ENOSPC && - ext4_count_free_blocks(mpd->inode->i_sb)) { + ext4_count_free_blocks(sb)) { mpd->retval = err; return 0; } @@ -2243,16 +2269,17 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * writepage and writepages will again try to write * the same. */ - ext4_msg(mpd->inode->i_sb, KERN_CRIT, - "delayed block allocation failed for inode %lu at " - "logical offset %llu with max blocks %zd with " - "error %d", mpd->inode->i_ino, - (unsigned long long) next, - mpd->b_size >> mpd->inode->i_blkbits, err); - printk(KERN_CRIT "This should not happen!! " - "Data will be lost\n"); - if (err == -ENOSPC) { - ext4_print_free_blocks(mpd->inode); + if (!(EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) { + ext4_msg(sb, KERN_CRIT, + "delayed block allocation failed for inode %lu " + "at logical offset %llu with max blocks %zd " + "with error %d", mpd->inode->i_ino, + (unsigned long long) next, + mpd->b_size >> mpd->inode->i_blkbits, err); + ext4_msg(sb, KERN_CRIT, + "This should not happen!! 
Data will be lost\n"); + if (err == -ENOSPC) + ext4_print_free_blocks(mpd->inode); } /* invalidate all the pages */ ext4_da_block_invalidatepages(mpd, next, @@ -2320,7 +2347,7 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd, * XXX Don't go larger than mballoc is willing to allocate * This is a stopgap solution. We eventually need to fold * mpage_da_submit_io() into this function and then call - * ext4_get_blocks() multiple times in a loop + * ext4_map_blocks() multiple times in a loop */ if (nrblocks >= 8*1024*1024/mpd->inode->i_sb->s_blocksize) goto flush_it; @@ -2553,18 +2580,16 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, /* * This function is used as a standard get_block_t calback function * when there is no desire to allocate any blocks. It is used as a - * callback function for block_prepare_write(), nobh_writepage(), and - * block_write_full_page(). These functions should only try to map a - * single block at a time. + * callback function for block_prepare_write() and block_write_full_page(). + * These functions should only try to map a single block at a time. * * Since this function doesn't do block allocations even if the caller * requests it by passing in create=1, it is critically important that * any caller checks to make sure that any buffer heads are returned * by this function are either all already mapped or marked for - * delayed allocation before calling nobh_writepage() or - * block_write_full_page(). Otherwise, b_blocknr could be left - * unitialized, and the page write functions will be taken by - * surprise. + * delayed allocation before calling block_write_full_page(). Otherwise, + * b_blocknr could be left unitialized, and the page write functions will + * be taken by surprise. 
*/ static int noalloc_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -2749,9 +2774,7 @@ static int ext4_writepage(struct page *page, return __ext4_journalled_writepage(page, len); } - if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, noalloc_get_block_write, wbc); - else if (page_bufs && buffer_uninit(page_bufs)) { + if (page_bufs && buffer_uninit(page_bufs)) { ext4_set_bh_endio(page_bufs, inode); ret = block_write_full_page_endio(page, noalloc_get_block_write, wbc, ext4_end_io_buffer_write); @@ -3146,13 +3169,10 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, int ret, retries = 0; struct page *page; pgoff_t index; - unsigned from, to; struct inode *inode = mapping->host; handle_t *handle; index = pos >> PAGE_CACHE_SHIFT; - from = pos & (PAGE_CACHE_SIZE - 1); - to = from + len; if (ext4_nonda_switch(inode->i_sb)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; @@ -3668,6 +3688,8 @@ static int ext4_end_io_nolock(ext4_io_end_t *io) return ret; } + if (io->iocb) + aio_complete(io->iocb, io->result, 0); /* clear the DIO AIO unwritten flag */ io->flag = 0; return ret; @@ -3767,6 +3789,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) io->offset = 0; io->size = 0; io->page = NULL; + io->iocb = NULL; + io->result = 0; INIT_WORK(&io->work, ext4_end_io_work); INIT_LIST_HEAD(&io->list); } @@ -3796,12 +3820,18 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, if (io_end->flag != EXT4_IO_UNWRITTEN){ ext4_free_io_end(io_end); iocb->private = NULL; - goto out; +out: + if (is_async) + aio_complete(iocb, ret, 0); + return; } io_end->offset = offset; io_end->size = size; - io_end->flag = EXT4_IO_UNWRITTEN; + if (is_async) { + io_end->iocb = iocb; + io_end->result = ret; + } wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; /* queue the work to convert unwritten extents to written */ @@ -3813,9 +3843,6 
@@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, list_add_tail(&io_end->list, &ei->i_completed_io_list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); iocb->private = NULL; -out: - if (is_async) - aio_complete(iocb, ret, 0); } static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) @@ -3941,7 +3968,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, return -ENOMEM; /* * we save the io structure for current async - * direct IO, so that later ext4_get_blocks() + * direct IO, so that later ext4_map_blocks() * could flag the io structure whether there * is a unwritten extents needs to be converted * when IO is completed. @@ -4132,17 +4159,6 @@ int ext4_block_truncate_page(handle_t *handle, length = blocksize - (offset & (blocksize - 1)); iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); - /* - * For "nobh" option, we can only work if we don't need to - * read-in the page - otherwise we create buffers to do the IO. - */ - if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) && - ext4_should_writeback_data(inode) && PageUptodate(page)) { - zero_user(page, offset, length); - set_page_dirty(page); - goto unlock; - } - if (!page_has_buffers(page)) create_empty_buffers(page, blocksize, 0); @@ -4492,9 +4508,8 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, * (should be rare). */ if (!bh) { - EXT4_ERROR_INODE(inode, - "Read failure block=%llu", - (unsigned long long) nr); + EXT4_ERROR_INODE_BLOCK(inode, nr, + "Read failure"); continue; } @@ -4506,27 +4521,6 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, depth); /* - * We've probably journalled the indirect block several - * times during the truncate. But it's no longer - * needed and we now drop it from the transaction via - * jbd2_journal_revoke(). - * - * That's easy if it's exclusively part of this - * transaction. 
But if it's part of the committing - * transaction then jbd2_journal_forget() will simply - * brelse() it. That means that if the underlying - * block is reallocated in ext4_get_block(), - * unmap_underlying_metadata() will find this block - * and will try to get rid of it. damn, damn. - * - * If this block has already been committed to the - * journal, a revoke record will be written. And - * revoke records must be emitted *before* clearing - * this block's bit in the bitmaps. - */ - ext4_forget(handle, 1, inode, bh, bh->b_blocknr); - - /* * Everything below this this pointer has been * released. Now let this top-of-subtree go. * @@ -4550,8 +4544,20 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, blocks_for_truncate(inode)); } + /* + * The forget flag here is critical because if + * we are journaling (and not doing data + * journaling), we have to make sure a revoke + * record is written to prevent the journal + * replay from overwriting the (former) + * indirect block if it gets reallocated as a + * data block. This must happen in the same + * transaction where the data blocks are + * actually freed. 
+ */ ext4_free_blocks(handle, inode, 0, nr, 1, - EXT4_FREE_BLOCKS_METADATA); + EXT4_FREE_BLOCKS_METADATA| + EXT4_FREE_BLOCKS_FORGET); if (parent_bh) { /* @@ -4809,8 +4815,8 @@ static int __ext4_get_inode_loc(struct inode *inode, bh = sb_getblk(sb, block); if (!bh) { - EXT4_ERROR_INODE(inode, "unable to read inode block - " - "block %llu", block); + EXT4_ERROR_INODE_BLOCK(inode, block, + "unable to read itable block"); return -EIO; } if (!buffer_uptodate(bh)) { @@ -4908,8 +4914,8 @@ make_io: submit_bh(READ_META, bh); wait_on_buffer(bh); if (!buffer_uptodate(bh)) { - EXT4_ERROR_INODE(inode, "unable to read inode " - "block %llu", block); + EXT4_ERROR_INODE_BLOCK(inode, block, + "unable to read itable block"); brelse(bh); return -EIO; } @@ -4980,7 +4986,7 @@ static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode, /* we are using combined 48 bit field */ i_blocks = ((u64)le16_to_cpu(raw_inode->i_blocks_high)) << 32 | le32_to_cpu(raw_inode->i_blocks_lo); - if (ei->i_flags & EXT4_HUGE_FILE_FL) { + if (ext4_test_inode_flag(inode, EXT4_INODE_HUGE_FILE)) { /* i_blocks represent file system block size */ return i_blocks << (inode->i_blkbits - 9); } else { @@ -5076,7 +5082,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) transaction_t *transaction; tid_t tid; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); if (journal->j_running_transaction) transaction = journal->j_running_transaction; else @@ -5085,7 +5091,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) tid = transaction->t_tid; else tid = journal->j_commit_sequence; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); ei->i_sync_tid = tid; ei->i_datasync_tid = tid; } @@ -5130,7 +5136,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_file_acl); ret = -EIO; goto bad_inode; - } else if (ei->i_flags & EXT4_EXTENTS_FL) { + } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { if 
(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || (S_ISLNK(inode->i_mode) && !ext4_inode_is_fast_symlink(inode))) @@ -5410,9 +5416,8 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (wbc->sync_mode == WB_SYNC_ALL) sync_dirty_buffer(iloc.bh); if (buffer_req(iloc.bh) && !buffer_uptodate(iloc.bh)) { - EXT4_ERROR_INODE(inode, - "IO error syncing inode (block=%llu)", - (unsigned long long) iloc.bh->b_blocknr); + EXT4_ERROR_INODE_BLOCK(inode, iloc.bh->b_blocknr, + "IO error syncing inode"); err = -EIO; } brelse(iloc.bh); @@ -5487,10 +5492,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - if (attr->ia_size > sbi->s_bitmap_maxbytes) { - error = -EFBIG; - goto err_out; - } + if (attr->ia_size > sbi->s_bitmap_maxbytes) + return -EFBIG; } } @@ -5692,7 +5695,7 @@ int ext4_writepage_trans_blocks(struct inode *inode) * Calculate the journal credits for a chunk of data modification. * * This is called from DIO, fallocate or whoever calling - * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks. + * ext4_map_blocks() to map/allocate a chunk of contiguous disk blocks. * * journal buffers for data blocks are not included here, as DIO * and fallocate do no need to journal data buffers. 
@@ -5758,7 +5761,6 @@ static int ext4_expand_extra_isize(struct inode *inode, { struct ext4_inode *raw_inode; struct ext4_xattr_ibody_header *header; - struct ext4_xattr_entry *entry; if (EXT4_I(inode)->i_extra_isize >= new_extra_isize) return 0; @@ -5766,7 +5768,6 @@ static int ext4_expand_extra_isize(struct inode *inode, raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - entry = IFIRST(header); /* No extended attributes present */ if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR) || diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 0e83dfd351d..4b4ad4b7ce5 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -446,10 +446,11 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += first + i; ext4_grp_locked_error(sb, e4b->bd_group, - __func__, "double-free of inode" - " %lu's block %llu(bit %u in group %u)", - inode ? inode->i_ino : 0, blocknr, - first + i, e4b->bd_group); + inode ? inode->i_ino : 0, + blocknr, + "freeing block already freed " + "(bit %u)", + first + i); } mb_clear_bit(first + i, e4b->bd_info->bb_bitmap); } @@ -712,9 +713,9 @@ void ext4_mb_generate_buddy(struct super_block *sb, grp->bb_fragments = fragments; if (free != grp->bb_free) { - ext4_grp_locked_error(sb, group, __func__, - "EXT4-fs: group %u: %u blocks in bitmap, %u in gd", - group, free, grp->bb_free); + ext4_grp_locked_error(sb, group, 0, 0, + "%u blocks in bitmap, %u in gd", + free, grp->bb_free); /* * If we intent to continue, we consider group descritor * corrupt and update bb_free using bitmap value @@ -1296,10 +1297,10 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, blocknr = ext4_group_first_block_no(sb, e4b->bd_group); blocknr += block; ext4_grp_locked_error(sb, e4b->bd_group, - __func__, "double-free of inode" - " %lu's block %llu(bit %u in group %u)", - inode ? inode->i_ino : 0, blocknr, block, - e4b->bd_group); + inode ? 
inode->i_ino : 0, + blocknr, + "freeing already freed block " + "(bit %u)", block); } mb_clear_bit(block, EXT4_MB_BITMAP(e4b)); e4b->bd_info->bb_counters[order]++; @@ -1788,8 +1789,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, * free blocks even though group info says we * we have free blocks */ - ext4_grp_locked_error(sb, e4b->bd_group, - __func__, "%d free blocks as per " + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free blocks as per " "group info. But bitmap says 0", free); break; @@ -1798,8 +1799,8 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, mb_find_extent(e4b, 0, i, ac->ac_g_ex.fe_len, &ex); BUG_ON(ex.fe_len <= 0); if (free < ex.fe_len) { - ext4_grp_locked_error(sb, e4b->bd_group, - __func__, "%d free blocks as per " + ext4_grp_locked_error(sb, e4b->bd_group, 0, 0, + "%d free blocks as per " "group info. But got %d blocks", free, ex.fe_len); /* @@ -1821,8 +1822,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac, /* * This is a special case for storages like raid5 - * we try to find stripe-aligned chunks for stripe-size requests - * XXX should do so at least for multiples of stripe size as well + * we try to find stripe-aligned chunks for stripe-size-multiple requests */ static noinline_for_stack void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, @@ -1999,7 +1999,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) ext4_group_t ngroups, group, i; int cr; int err = 0; - int bsbits; struct ext4_sb_info *sbi; struct super_block *sb; struct ext4_buddy e4b; @@ -2041,8 +2040,6 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) ac->ac_2order = i - 1; } - bsbits = ac->ac_sb->s_blocksize_bits; - /* if stream allocation is enabled, use global goal */ if (ac->ac_flags & EXT4_MB_STREAM_ALLOC) { /* TBD: may be hot point */ @@ -2094,8 +2091,8 @@ repeat: ac->ac_groups_scanned++; if (cr == 0) ext4_mb_simple_scan_group(ac, &e4b); - else if (cr == 1 && - 
ac->ac_g_ex.fe_len == sbi->s_stripe) + else if (cr == 1 && sbi->s_stripe && + !(ac->ac_g_ex.fe_len % sbi->s_stripe)) ext4_mb_scan_aligned(ac, &e4b); else ext4_mb_complex_scan_group(ac, &e4b); @@ -2221,7 +2218,7 @@ static int ext4_mb_seq_groups_open(struct inode *inode, struct file *file) rc = seq_open(file, &ext4_mb_seq_groups_ops); if (rc == 0) { - struct seq_file *m = (struct seq_file *)file->private_data; + struct seq_file *m = file->private_data; m->private = sb; } return rc; @@ -2560,6 +2557,22 @@ int ext4_mb_release(struct super_block *sb) return 0; } +static inline void ext4_issue_discard(struct super_block *sb, + ext4_group_t block_group, ext4_grpblk_t block, int count) +{ + int ret; + ext4_fsblk_t discard_block; + + discard_block = block + ext4_group_first_block_no(sb, block_group); + trace_ext4_discard_blocks(sb, + (unsigned long long) discard_block, count); + ret = sb_issue_discard(sb, discard_block, count); + if (ret == EOPNOTSUPP) { + ext4_warning(sb, "discard not supported, disabling"); + clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD); + } +} + /* * This function is called by the jbd2 layer once the commit has finished, * so we know we can free the blocks that were released with that commit. 
@@ -2579,22 +2592,9 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn) mb_debug(1, "gonna free %u blocks in group %u (0x%p):", entry->count, entry->group, entry); - if (test_opt(sb, DISCARD)) { - int ret; - ext4_fsblk_t discard_block; - - discard_block = entry->start_blk + - ext4_group_first_block_no(sb, entry->group); - trace_ext4_discard_blocks(sb, - (unsigned long long)discard_block, - entry->count); - ret = sb_issue_discard(sb, discard_block, entry->count); - if (ret == EOPNOTSUPP) { - ext4_warning(sb, - "discard not supported, disabling"); - clear_opt(EXT4_SB(sb)->s_mount_opt, DISCARD); - } - } + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, entry->group, + entry->start_blk, entry->count); err = ext4_mb_load_buddy(sb, entry->group, &e4b); /* we expect to find existing buddy because it's pinned */ @@ -2712,7 +2712,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, handle_t *handle, unsigned int reserv_blks) { struct buffer_head *bitmap_bh = NULL; - struct ext4_super_block *es; struct ext4_group_desc *gdp; struct buffer_head *gdp_bh; struct ext4_sb_info *sbi; @@ -2725,8 +2724,6 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, sb = ac->ac_sb; sbi = EXT4_SB(sb); - es = sbi->s_es; - err = -EIO; bitmap_bh = ext4_read_block_bitmap(sb, ac->ac_b_ex.fe_group); @@ -2812,7 +2809,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, err = ext4_handle_dirty_metadata(handle, NULL, gdp_bh); out_err: - sb->s_dirt = 1; + ext4_mark_super_dirty(sb); brelse(bitmap_bh); return err; } @@ -2850,7 +2847,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, int bsbits, max; ext4_lblk_t end; loff_t size, orig_size, start_off; - ext4_lblk_t start, orig_start; + ext4_lblk_t start; struct ext4_inode_info *ei = EXT4_I(ac->ac_inode); struct ext4_prealloc_space *pa; @@ -2881,6 +2878,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, size = size << bsbits; if (size < 
i_size_read(ac->ac_inode)) size = i_size_read(ac->ac_inode); + orig_size = size; /* max size of free chunks */ max = 2 << bsbits; @@ -2922,8 +2920,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, start_off = (loff_t)ac->ac_o_ex.fe_logical << bsbits; size = ac->ac_o_ex.fe_len << bsbits; } - orig_size = size = size >> bsbits; - orig_start = start = start_off >> bsbits; + size = size >> bsbits; + start = start_off >> bsbits; /* don't cover already allocated blocks in selected range */ if (ar->pleft && start <= ar->lleft) { @@ -3547,7 +3545,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, ext4_group_t group; ext4_grpblk_t bit; unsigned long long grp_blk_start; - sector_t start; int err = 0; int free = 0; @@ -3567,10 +3564,9 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); - start = ext4_group_first_block_no(sb, group) + bit; mb_debug(1, " free preallocated %u/%u in group %u\n", - (unsigned) start, (unsigned) next - bit, - (unsigned) group); + (unsigned) ext4_group_first_block_no(sb, group) + bit, + (unsigned) next - bit, (unsigned) group); free += next - bit; if (ac) { @@ -3581,7 +3577,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, trace_ext4_mballoc_discard(ac); } - trace_ext4_mb_release_inode_pa(ac, pa, grp_blk_start + bit, + trace_ext4_mb_release_inode_pa(sb, ac, pa, grp_blk_start + bit, next - bit); mb_free_blocks(pa->pa_inode, e4b, bit, next - bit); bit = next + 1; @@ -3591,8 +3587,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, (unsigned long) pa->pa_len); - ext4_grp_locked_error(sb, group, - __func__, "free %u, pa_free %u", + ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", free, pa->pa_free); /* * pa is already deleted so we use the value obtained 
@@ -3613,7 +3608,7 @@ ext4_mb_release_group_pa(struct ext4_buddy *e4b, ext4_group_t group; ext4_grpblk_t bit; - trace_ext4_mb_release_group_pa(ac, pa); + trace_ext4_mb_release_group_pa(sb, ac, pa); BUG_ON(pa->pa_deleted == 0); ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, &bit); BUG_ON(group != e4b->bd_group && pa->pa_len != 0); @@ -3889,6 +3884,9 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) struct super_block *sb = ac->ac_sb; ext4_group_t ngroups, i; + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + return; + printk(KERN_ERR "EXT4-fs: Can't allocate:" " Allocation context details:\n"); printk(KERN_ERR "EXT4-fs: status %d flags %d\n", @@ -4255,7 +4253,7 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) * to usual allocation */ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, - struct ext4_allocation_request *ar, int *errp) + struct ext4_allocation_request *ar, int *errp) { int freed; struct ext4_allocation_context *ac = NULL; @@ -4299,7 +4297,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, inquota = ar->len; if (ar->len == 0) { *errp = -EDQUOT; - goto out3; + goto out; } } @@ -4307,13 +4305,13 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, if (!ac) { ar->len = 0; *errp = -ENOMEM; - goto out1; + goto out; } *errp = ext4_mb_initialize_context(ac, ar); if (*errp) { ar->len = 0; - goto out2; + goto out; } ac->ac_op = EXT4_MB_HISTORY_PREALLOC; @@ -4322,7 +4320,9 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ext4_mb_normalize_request(ac, ar); repeat: /* allocate space in core */ - ext4_mb_regular_allocator(ac); + *errp = ext4_mb_regular_allocator(ac); + if (*errp) + goto errout; /* as we've just preallocated more space than * user requested orinally, we store allocated @@ -4333,7 +4333,7 @@ repeat: } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_blks); - if (*errp == -EAGAIN) { + if (*errp == -EAGAIN) { /* * drop the 
reference that we took * in ext4_mb_use_best_found @@ -4344,12 +4344,10 @@ repeat: ac->ac_b_ex.fe_len = 0; ac->ac_status = AC_STATUS_CONTINUE; goto repeat; - } else if (*errp) { + } else if (*errp) + errout: ext4_discard_allocated_blocks(ac); - ac->ac_b_ex.fe_len = 0; - ar->len = 0; - ext4_mb_show_ac(ac); - } else { + else { block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); ar->len = ac->ac_b_ex.fe_len; } @@ -4358,19 +4356,19 @@ repeat: if (freed) goto repeat; *errp = -ENOSPC; + } + + if (*errp) { ac->ac_b_ex.fe_len = 0; ar->len = 0; ext4_mb_show_ac(ac); } - ext4_mb_release_context(ac); - -out2: - kmem_cache_free(ext4_ac_cachep, ac); -out1: +out: + if (ac) + kmem_cache_free(ext4_ac_cachep, ac); if (inquota && ar->len < inquota) dquot_free_block(ar->inode, inquota - ar->len); -out3: if (!ar->len) { if (!EXT4_I(ar->inode)->i_delalloc_reserved_flag) /* release all the reserved blocks if non delalloc */ @@ -4402,6 +4400,7 @@ static noinline_for_stack int ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, struct ext4_free_data *new_entry) { + ext4_group_t group = e4b->bd_group; ext4_grpblk_t block; struct ext4_free_data *entry; struct ext4_group_info *db = e4b->bd_info; @@ -4434,9 +4433,9 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b, else if (block >= (entry->start_blk + entry->count)) n = &(*n)->rb_right; else { - ext4_grp_locked_error(sb, e4b->bd_group, __func__, - "Double free of blocks %d (%d %d)", - block, entry->start_blk, entry->count); + ext4_grp_locked_error(sb, group, 0, + ext4_group_first_block_no(sb, group) + block, + "Block already on to-be-freed list"); return 0; } } @@ -4494,7 +4493,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, struct super_block *sb = inode->i_sb; struct ext4_allocation_context *ac = NULL; struct ext4_group_desc *gdp; - struct ext4_super_block *es; unsigned long freed = 0; unsigned int overflow; ext4_grpblk_t bit; @@ -4513,7 +4511,6 @@ void ext4_free_blocks(handle_t *handle, struct 
inode *inode, } sbi = EXT4_SB(sb); - es = EXT4_SB(sb)->s_es; if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) && !ext4_data_block_valid(sbi, block, count)) { ext4_error(sb, "Freeing blocks not in datazone - " @@ -4647,6 +4644,8 @@ do_more: mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(inode, &e4b, bit, count); ext4_mb_return_to_preallocation(inode, &e4b, block, count); + if (test_opt(sb, DISCARD)) + ext4_issue_discard(sb, block_group, bit, count); } ret = ext4_free_blks_count(sb, gdp) + count; @@ -4680,7 +4679,7 @@ do_more: put_bh(bitmap_bh); goto do_more; } - sb->s_dirt = 1; + ext4_mark_super_dirty(sb); error_return: if (freed) dquot_free_block(inode, freed); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 6f3a27ec30b..1765c2c50a9 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -376,7 +376,7 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * We have the extent map build with the tmp inode. * Now copy the i_data across */ - ei->i_flags |= EXT4_EXTENTS_FL; + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS); memcpy(ei->i_data, tmp_ei->i_data, sizeof(ei->i_data)); /* diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 52abfa12762..5f1ed9fc913 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -148,17 +148,17 @@ mext_next_extent(struct inode *inode, struct ext4_ext_path *path, */ static int mext_check_null_inode(struct inode *inode1, struct inode *inode2, - const char *function) + const char *function, unsigned int line) { int ret = 0; if (inode1 == NULL) { - __ext4_error(inode2->i_sb, function, + __ext4_error(inode2->i_sb, function, line, "Both inodes should not be NULL: " "inode1 NULL inode2 %lu", inode2->i_ino); ret = -EIO; } else if (inode2 == NULL) { - __ext4_error(inode1->i_sb, function, + __ext4_error(inode1->i_sb, function, line, "Both inodes should not be NULL: " "inode1 %lu inode2 NULL", inode1->i_ino); ret = -EIO; @@ -1084,7 +1084,7 @@ mext_inode_double_lock(struct inode 
*inode1, struct inode *inode2) BUG_ON(inode1 == NULL && inode2 == NULL); - ret = mext_check_null_inode(inode1, inode2, __func__); + ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); if (ret < 0) goto out; @@ -1121,7 +1121,7 @@ mext_inode_double_unlock(struct inode *inode1, struct inode *inode2) BUG_ON(inode1 == NULL && inode2 == NULL); - ret = mext_check_null_inode(inode1, inode2, __func__); + ret = mext_check_null_inode(inode1, inode2, __func__, __LINE__); if (ret < 0) goto out; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a43e6617b35..314c0d3b3fa 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -179,30 +179,6 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); -unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize) -{ - unsigned len = le16_to_cpu(dlen); - - if (len == EXT4_MAX_REC_LEN || len == 0) - return blocksize; - return (len & 65532) | ((len & 3) << 16); -} - -__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) -{ - if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3)) - BUG(); - if (len < 65536) - return cpu_to_le16(len); - if (len == blocksize) { - if (blocksize == 65536) - return cpu_to_le16(EXT4_MAX_REC_LEN); - else - return cpu_to_le16(0); - } - return cpu_to_le16((len & 65532) | ((len >> 16) & 3)); -} - /* * p is at least 6 bytes before the end of page */ @@ -605,7 +581,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0)); for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) { - if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh, + if (!ext4_check_dir_entry(dir, de, bh, (block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb)) +((char *)de - bh->b_data))) { /* On error, skip the f_pos to the next block. 
*/ @@ -844,8 +820,7 @@ static inline int search_dirblock(struct buffer_head *bh, if ((char *) de + namelen <= dlimit && ext4_match (namelen, name, de)) { /* found a match - just to be sure, do a full check */ - if (!ext4_check_dir_entry("ext4_find_entry", - dir, de, bh, offset)) + if (!ext4_check_dir_entry(dir, de, bh, offset)) return -1; *res_dir = de; return 1; @@ -1019,7 +994,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q int off = (block << EXT4_BLOCK_SIZE_BITS(sb)) + ((char *) de - bh->b_data); - if (!ext4_check_dir_entry(__func__, dir, de, bh, off)) { + if (!ext4_check_dir_entry(dir, de, bh, off)) { brelse(bh); *err = ERR_BAD_DX_DIR; goto errout; @@ -1088,7 +1063,6 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru struct dentry *ext4_get_parent(struct dentry *child) { __u32 ino; - struct inode *inode; static const struct qstr dotdot = { .name = "..", .len = 2, @@ -1097,7 +1071,6 @@ struct dentry *ext4_get_parent(struct dentry *child) struct buffer_head *bh; bh = ext4_find_entry(child->d_inode, &dotdot, &de); - inode = NULL; if (!bh) return ERR_PTR(-ENOENT); ino = le32_to_cpu(de->inode); @@ -1305,8 +1278,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *)bh->b_data; top = bh->b_data + blocksize - reclen; while ((char *) de <= top) { - if (!ext4_check_dir_entry("ext4_add_entry", dir, de, - bh, offset)) + if (!ext4_check_dir_entry(dir, de, bh, offset)) return -EIO; if (ext4_match(namelen, name, de)) return -EEXIST; @@ -1673,7 +1645,7 @@ static int ext4_delete_entry(handle_t *handle, pde = NULL; de = (struct ext4_dir_entry_2 *) bh->b_data; while (i < bh->b_size) { - if (!ext4_check_dir_entry("ext4_delete_entry", dir, de, bh, i)) + if (!ext4_check_dir_entry(dir, de, bh, i)) return -EIO; if (de == de_del) { BUFFER_TRACE(bh, "get_write_access"); @@ -1956,7 +1928,7 @@ static int empty_dir(struct inode *inode) } de = (struct ext4_dir_entry_2 
*) bh->b_data; } - if (!ext4_check_dir_entry("empty_dir", inode, de, bh, offset)) { + if (!ext4_check_dir_entry(inode, de, bh, offset)) { de = (struct ext4_dir_entry_2 *)(bh->b_data + sb->s_blocksize); offset = (offset | (sb->s_blocksize - 1)) + 1; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 6df797eb9ae..ca5c8aa00a2 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -921,8 +921,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) &sbi->s_flex_groups[flex_group].free_inodes); } - ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh); - sb->s_dirt = 1; + ext4_handle_dirty_super(handle, sb); exit_journal: mutex_unlock(&sbi->s_resize_lock); @@ -953,7 +952,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, ext4_fsblk_t n_blocks_count) { ext4_fsblk_t o_blocks_count; - ext4_group_t o_groups_count; ext4_grpblk_t last; ext4_grpblk_t add; struct buffer_head *bh; @@ -965,7 +963,6 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, * yet: we're going to revalidate es->s_blocks_count after * taking the s_resize_lock below. 
*/ o_blocks_count = ext4_blocks_count(es); - o_groups_count = EXT4_SB(sb)->s_groups_count; if (test_opt(sb, DEBUG)) printk(KERN_DEBUG "EXT4-fs: extending last group from %llu uto %llu blocks\n", @@ -1045,13 +1042,12 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, goto exit_put; } ext4_blocks_count_set(es, o_blocks_count + add); - ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); - sb->s_dirt = 1; mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ ext4_add_groupblocks(handle, sb, o_blocks_count, add); + ext4_handle_dirty_super(handle, sb); ext4_debug("freed blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); if ((err = ext4_journal_stop(handle))) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e72d3235b2f..8d65575f8c8 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -241,14 +241,14 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) if (sb->s_flags & MS_RDONLY) return ERR_PTR(-EROFS); - vfs_check_frozen(sb, SB_FREEZE_WRITE); + vfs_check_frozen(sb, SB_FREEZE_TRANS); /* Special case here: if the journal has aborted behind our * backs (eg. EIO in the commit thread), then we still need to * take the FS itself readonly cleanly. */ journal = EXT4_SB(sb)->s_journal; if (journal) { if (is_journal_aborted(journal)) { - ext4_abort(sb, __func__, "Detected aborted journal"); + ext4_abort(sb, "Detected aborted journal"); return ERR_PTR(-EROFS); } return jbd2_journal_start(journal, nblocks); @@ -262,7 +262,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks) * that sync() will call the filesystem's write_super callback if * appropriate. 
*/ -int __ext4_journal_stop(const char *where, handle_t *handle) +int __ext4_journal_stop(const char *where, unsigned int line, handle_t *handle) { struct super_block *sb; int err; @@ -279,12 +279,13 @@ int __ext4_journal_stop(const char *where, handle_t *handle) if (!err) err = rc; if (err) - __ext4_std_error(sb, where, err); + __ext4_std_error(sb, where, line, err); return err; } -void ext4_journal_abort_handle(const char *caller, const char *err_fn, - struct buffer_head *bh, handle_t *handle, int err) +void ext4_journal_abort_handle(const char *caller, unsigned int line, + const char *err_fn, struct buffer_head *bh, + handle_t *handle, int err) { char nbuf[16]; const char *errstr = ext4_decode_error(NULL, err, nbuf); @@ -300,12 +301,47 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn, if (is_handle_aborted(handle)) return; - printk(KERN_ERR "%s: aborting transaction: %s in %s\n", - caller, errstr, err_fn); + printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n", + caller, line, errstr, err_fn); jbd2_journal_abort_handle(handle); } +static void __save_error_info(struct super_block *sb, const char *func, + unsigned int line) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); + es->s_last_error_time = cpu_to_le32(get_seconds()); + strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func)); + es->s_last_error_line = cpu_to_le32(line); + if (!es->s_first_error_time) { + es->s_first_error_time = es->s_last_error_time; + strncpy(es->s_first_error_func, func, + sizeof(es->s_first_error_func)); + es->s_first_error_line = cpu_to_le32(line); + es->s_first_error_ino = es->s_last_error_ino; + es->s_first_error_block = es->s_last_error_block; + } + /* + * Start the daily error reporting function if it hasn't been + * started already + */ + if (!es->s_error_count) + mod_timer(&EXT4_SB(sb)->s_err_report, jiffies + 24*60*60*HZ); + 
es->s_error_count = cpu_to_le32(le32_to_cpu(es->s_error_count) + 1); +} + +static void save_error_info(struct super_block *sb, const char *func, + unsigned int line) +{ + __save_error_info(sb, func, line); + ext4_commit_super(sb, 1); +} + + /* Deal with the reporting of failure conditions on a filesystem such as * inconsistencies detected or read IO failures. * @@ -323,11 +359,6 @@ void ext4_journal_abort_handle(const char *caller, const char *err_fn, static void ext4_handle_error(struct super_block *sb) { - struct ext4_super_block *es = EXT4_SB(sb)->s_es; - - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - if (sb->s_flags & MS_RDONLY) return; @@ -342,19 +373,19 @@ static void ext4_handle_error(struct super_block *sb) ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); sb->s_flags |= MS_RDONLY; } - ext4_commit_super(sb, 1); if (test_opt(sb, ERRORS_PANIC)) panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } void __ext4_error(struct super_block *sb, const char *function, - const char *fmt, ...) + unsigned int line, const char *fmt, ...) { va_list args; va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: comm %s: ", + sb->s_id, function, line, current->comm); vprintk(fmt, args); printk("\n"); va_end(args); @@ -362,14 +393,22 @@ void __ext4_error(struct super_block *sb, const char *function, ext4_handle_error(sb); } -void ext4_error_inode(const char *function, struct inode *inode, +void ext4_error_inode(struct inode *inode, const char *function, + unsigned int line, ext4_fsblk_t block, const char *fmt, ...) 
{ va_list args; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_ino = cpu_to_le32(inode->i_ino); + es->s_last_error_block = cpu_to_le64(block); + save_error_info(inode->i_sb, function, line); va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s: inode #%lu: (comm %s) ", - inode->i_sb->s_id, function, inode->i_ino, current->comm); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ", + inode->i_sb->s_id, function, line, inode->i_ino); + if (block) + printk("block %llu: ", block); + printk("comm %s: ", current->comm); vprintk(fmt, args); printk("\n"); va_end(args); @@ -377,20 +416,26 @@ void ext4_error_inode(const char *function, struct inode *inode, ext4_handle_error(inode->i_sb); } -void ext4_error_file(const char *function, struct file *file, - const char *fmt, ...) +void ext4_error_file(struct file *file, const char *function, + unsigned int line, const char *fmt, ...) { va_list args; + struct ext4_super_block *es; struct inode *inode = file->f_dentry->d_inode; char pathname[80], *path; + es = EXT4_SB(inode->i_sb)->s_es; + es->s_last_error_ino = cpu_to_le32(inode->i_ino); + save_error_info(inode->i_sb, function, line); va_start(args, fmt); path = d_path(&(file->f_path), pathname, sizeof(pathname)); if (!path) path = "(unknown)"; printk(KERN_CRIT - "EXT4-fs error (device %s): %s: inode #%lu (comm %s path %s): ", - inode->i_sb->s_id, function, inode->i_ino, current->comm, path); + "EXT4-fs error (device %s): %s:%d: inode #%lu " + "(comm %s path %s): ", + inode->i_sb->s_id, function, line, inode->i_ino, + current->comm, path); vprintk(fmt, args); printk("\n"); va_end(args); @@ -435,7 +480,8 @@ static const char *ext4_decode_error(struct super_block *sb, int errno, /* __ext4_std_error decodes expected errors from journaling functions * automatically and invokes the appropriate error response. 
*/ -void __ext4_std_error(struct super_block *sb, const char *function, int errno) +void __ext4_std_error(struct super_block *sb, const char *function, + unsigned int line, int errno) { char nbuf[16]; const char *errstr; @@ -448,8 +494,9 @@ void __ext4_std_error(struct super_block *sb, const char *function, int errno) return; errstr = ext4_decode_error(sb, errno, nbuf); - printk(KERN_CRIT "EXT4-fs error (device %s) in %s: %s\n", - sb->s_id, function, errstr); + printk(KERN_CRIT "EXT4-fs error (device %s) in %s:%d: %s\n", + sb->s_id, function, line, errstr); + save_error_info(sb, function, line); ext4_handle_error(sb); } @@ -464,29 +511,29 @@ void __ext4_std_error(struct super_block *sb, const char *function, int errno) * case we take the easy way out and panic immediately. */ -void ext4_abort(struct super_block *sb, const char *function, - const char *fmt, ...) +void __ext4_abort(struct super_block *sb, const char *function, + unsigned int line, const char *fmt, ...) { va_list args; + save_error_info(sb, function, line); va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: ", sb->s_id, + function, line); vprintk(fmt, args); printk("\n"); va_end(args); + if ((sb->s_flags & MS_RDONLY) == 0) { + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); + sb->s_flags |= MS_RDONLY; + EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; + if (EXT4_SB(sb)->s_journal) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); + save_error_info(sb, function, line); + } if (test_opt(sb, ERRORS_PANIC)) panic("EXT4-fs panic from previous error\n"); - - if (sb->s_flags & MS_RDONLY) - return; - - ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - sb->s_flags |= MS_RDONLY; - EXT4_SB(sb)->s_mount_flags |= EXT4_MF_FS_ABORTED; - if (EXT4_SB(sb)->s_journal) - jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO); } void ext4_msg (struct 
super_block * sb, const char *prefix, @@ -502,38 +549,47 @@ void ext4_msg (struct super_block * sb, const char *prefix, } void __ext4_warning(struct super_block *sb, const char *function, - const char *fmt, ...) + unsigned int line, const char *fmt, ...) { va_list args; va_start(args, fmt); - printk(KERN_WARNING "EXT4-fs warning (device %s): %s: ", - sb->s_id, function); + printk(KERN_WARNING "EXT4-fs warning (device %s): %s:%d: ", + sb->s_id, function, line); vprintk(fmt, args); printk("\n"); va_end(args); } -void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp, - const char *function, const char *fmt, ...) +void __ext4_grp_locked_error(const char *function, unsigned int line, + struct super_block *sb, ext4_group_t grp, + unsigned long ino, ext4_fsblk_t block, + const char *fmt, ...) __releases(bitlock) __acquires(bitlock) { va_list args; struct ext4_super_block *es = EXT4_SB(sb)->s_es; + es->s_last_error_ino = cpu_to_le32(ino); + es->s_last_error_block = cpu_to_le64(block); + __save_error_info(sb, function, line); va_start(args, fmt); - printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function); + printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: group %u", + sb->s_id, function, line, grp); + if (ino) + printk("inode %lu: ", ino); + if (block) + printk("block %llu:", (unsigned long long) block); vprintk(fmt, args); printk("\n"); va_end(args); if (test_opt(sb, ERRORS_CONT)) { - EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; - es->s_state |= cpu_to_le16(EXT4_ERROR_FS); ext4_commit_super(sb, 0); return; } + ext4_unlock_group(sb, grp); ext4_handle_error(sb); /* @@ -660,8 +716,7 @@ static void ext4_put_super(struct super_block *sb) err = jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; if (err < 0) - ext4_abort(sb, __func__, - "Couldn't clean up the journal"); + ext4_abort(sb, "Couldn't clean up the journal"); } ext4_release_system_zone(sb); @@ -946,14 +1001,12 @@ static int ext4_show_options(struct seq_file *seq, struct 
vfsmount *vfs) seq_puts(seq, ",journal_async_commit"); else if (test_opt(sb, JOURNAL_CHECKSUM)) seq_puts(seq, ",journal_checksum"); - if (test_opt(sb, NOBH)) - seq_puts(seq, ",nobh"); if (test_opt(sb, I_VERSION)) seq_puts(seq, ",i_version"); - if (!test_opt(sb, DELALLOC)) + if (!test_opt(sb, DELALLOC) && + !(def_mount_opts & EXT4_DEFM_NODELALLOC)) seq_puts(seq, ",nodelalloc"); - if (sbi->s_stripe) seq_printf(seq, ",stripe=%lu", sbi->s_stripe); /* @@ -977,7 +1030,7 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NO_AUTO_DA_ALLOC)) seq_puts(seq, ",noauto_da_alloc"); - if (test_opt(sb, DISCARD)) + if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD)) seq_puts(seq, ",discard"); if (test_opt(sb, NOLOAD)) @@ -986,6 +1039,10 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, DIOREAD_NOLOCK)) seq_puts(seq, ",dioread_nolock"); + if (test_opt(sb, BLOCK_VALIDITY) && + !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY)) + seq_puts(seq, ",block_validity"); + ext4_show_quota_options(seq, sb); return 0; @@ -1065,6 +1122,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, char *path); +static int ext4_quota_off(struct super_block *sb, int type); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); @@ -1086,7 +1144,7 @@ static const struct dquot_operations ext4_quota_operations = { static const struct quotactl_ops ext4_qctl_operations = { .quota_on = ext4_quota_on, - .quota_off = dquot_quota_off, + .quota_off = ext4_quota_off, .quota_sync = dquot_quota_sync, .get_info = dquot_get_dqinfo, .set_info = dquot_set_dqinfo, @@ -1624,10 +1682,12 @@ set_qf_format: *n_blocks_count = option; break; case Opt_nobh: - set_opt(sbi->s_mount_opt, 
NOBH); + ext4_msg(sb, KERN_WARNING, + "Ignoring deprecated nobh option"); break; case Opt_bh: - clear_opt(sbi->s_mount_opt, NOBH); + ext4_msg(sb, KERN_WARNING, + "Ignoring deprecated bh option"); break; case Opt_i_version: set_opt(sbi->s_mount_opt, I_VERSION); @@ -2249,6 +2309,8 @@ static ssize_t session_write_kbytes_show(struct ext4_attr *a, { struct super_block *sb = sbi->s_buddy_cache->i_sb; + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); return snprintf(buf, PAGE_SIZE, "%lu\n", (part_stat_read(sb->s_bdev->bd_part, sectors[1]) - sbi->s_sectors_written_start) >> 1); @@ -2259,6 +2321,8 @@ static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a, { struct super_block *sb = sbi->s_buddy_cache->i_sb; + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); return snprintf(buf, PAGE_SIZE, "%llu\n", (unsigned long long)(sbi->s_kbytes_written + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - @@ -2431,6 +2495,53 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) return 1; } +/* + * This function is called once a day if we have errors logged + * on the file system + */ +static void print_daily_error_info(unsigned long arg) +{ + struct super_block *sb = (struct super_block *) arg; + struct ext4_sb_info *sbi; + struct ext4_super_block *es; + + sbi = EXT4_SB(sb); + es = sbi->s_es; + + if (es->s_error_count) + ext4_msg(sb, KERN_NOTICE, "error count: %u", + le32_to_cpu(es->s_error_count)); + if (es->s_first_error_time) { + printk(KERN_NOTICE "EXT4-fs (%s): initial error at %u: %.*s:%d", + sb->s_id, le32_to_cpu(es->s_first_error_time), + (int) sizeof(es->s_first_error_func), + es->s_first_error_func, + le32_to_cpu(es->s_first_error_line)); + if (es->s_first_error_ino) + printk(": inode %u", + le32_to_cpu(es->s_first_error_ino)); + if (es->s_first_error_block) + printk(": block %llu", (unsigned long long) + le64_to_cpu(es->s_first_error_block)); + printk("\n"); + } + if (es->s_last_error_time) { + 
printk(KERN_NOTICE "EXT4-fs (%s): last error at %u: %.*s:%d", + sb->s_id, le32_to_cpu(es->s_last_error_time), + (int) sizeof(es->s_last_error_func), + es->s_last_error_func, + le32_to_cpu(es->s_last_error_line)); + if (es->s_last_error_ino) + printk(": inode %u", + le32_to_cpu(es->s_last_error_ino)); + if (es->s_last_error_block) + printk(": block %llu", (unsigned long long) + le64_to_cpu(es->s_last_error_block)); + printk("\n"); + } + mod_timer(&sbi->s_err_report, jiffies + 24*60*60*HZ); /* Once a day */ +} + static int ext4_fill_super(struct super_block *sb, void *data, int silent) __releases(kernel_lock) __acquires(kernel_lock) @@ -2448,7 +2559,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) struct inode *root; char *cp; const char *descr; - int ret = -EINVAL; + int ret = -ENOMEM; int blocksize; unsigned int db_count; unsigned int i; @@ -2459,13 +2570,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) - return -ENOMEM; + goto out_free_orig; sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); if (!sbi->s_blockgroup_lock) { kfree(sbi); - return -ENOMEM; + goto out_free_orig; } sb->s_fs_info = sbi; sbi->s_mount_opt = 0; @@ -2473,8 +2584,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_resgid = EXT4_DEF_RESGID; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sb_block = sb_block; - sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part, - sectors[1]); + if (sb->s_bdev->bd_part) + sbi->s_sectors_written_start = + part_stat_read(sb->s_bdev->bd_part, sectors[1]); unlock_kernel(); @@ -2482,6 +2594,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (cp = sb->s_id; (cp = strchr(cp, '/'));) *cp = '!'; + ret = -EINVAL; blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE); if (!blocksize) { ext4_msg(sb, KERN_ERR, "unable to set 
blocksize"); @@ -2546,6 +2659,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sbi->s_mount_opt, ERRORS_CONT); else set_opt(sbi->s_mount_opt, ERRORS_RO); + if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) + set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + if (def_mount_opts & EXT4_DEFM_DISCARD) + set_opt(sbi->s_mount_opt, DISCARD); sbi->s_resuid = le16_to_cpu(es->s_def_resuid); sbi->s_resgid = le16_to_cpu(es->s_def_resgid); @@ -2553,15 +2670,23 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; - set_opt(sbi->s_mount_opt, BARRIER); + if ((def_mount_opts & EXT4_DEFM_NOBARRIER) == 0) + set_opt(sbi->s_mount_opt, BARRIER); /* * enable delayed allocation by default * Use -o nodelalloc to turn it off */ - if (!IS_EXT3_SB(sb)) + if (!IS_EXT3_SB(sb) && + ((def_mount_opts & EXT4_DEFM_NODELALLOC) == 0)) set_opt(sbi->s_mount_opt, DELALLOC); + if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, + &journal_devnum, &journal_ioprio, NULL, 0)) { + ext4_msg(sb, KERN_WARNING, + "failed to parse options in superblock: %s", + sbi->s_es->s_mount_opts); + } if (!parse_options((char *) data, sb, &journal_devnum, &journal_ioprio, NULL, 0)) goto failed_mount; @@ -2912,18 +3037,7 @@ no_journal: ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount_wq; } - if (test_opt(sb, NOBH)) { - if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) { - ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - " - "its supported only with writeback mode"); - clear_opt(sbi->s_mount_opt, NOBH); - } - if (test_opt(sb, DIOREAD_NOLOCK)) { - ext4_msg(sb, KERN_WARNING, "dioread_nolock option is " - "not supported with nobh mode"); - goto failed_mount_wq; - } - } + EXT4_SB(sb)->dio_unwritten_wq = create_workqueue("ext4-dio-unwritten"); if (!EXT4_SB(sb)->dio_unwritten_wq) { printk(KERN_ERR "EXT4-fs: failed to create DIO workqueue\n"); @@ 
-3043,7 +3157,14 @@ no_journal: descr = "out journal"; ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Opts: %s", descr, orig_data); + "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, + *sbi->s_es->s_mount_opts ? "; " : "", orig_data); + + init_timer(&sbi->s_err_report); + sbi->s_err_report.function = print_daily_error_info; + sbi->s_err_report.data = (unsigned long) sb; + if (es->s_error_count) + mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ lock_kernel(); kfree(orig_data); @@ -3093,6 +3214,7 @@ out_fail: kfree(sbi->s_blockgroup_lock); kfree(sbi); lock_kernel(); +out_free_orig: kfree(orig_data); return ret; } @@ -3110,7 +3232,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_min_batch_time = sbi->s_min_batch_time; journal->j_max_batch_time = sbi->s_max_batch_time; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) journal->j_flags |= JBD2_BARRIER; else @@ -3119,7 +3241,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; else journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } static journal_t *ext4_get_journal(struct super_block *sb, @@ -3327,8 +3449,17 @@ static int ext4_load_journal(struct super_block *sb, if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) err = jbd2_journal_wipe(journal, !really_read_only); - if (!err) + if (!err) { + char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL); + if (save) + memcpy(save, ((char *) es) + + EXT4_S_ERR_START, EXT4_S_ERR_LEN); err = jbd2_journal_load(journal); + if (save) + memcpy(((char *) es) + EXT4_S_ERR_START, + save, EXT4_S_ERR_LEN); + kfree(save); + } if (err) { ext4_msg(sb, KERN_ERR, "error loading journal"); @@ -3384,10 +3515,14 @@ static int ext4_commit_super(struct super_block *sb, int sync) */ if (!(sb->s_flags & 
MS_RDONLY)) es->s_wtime = cpu_to_le32(get_seconds()); - es->s_kbytes_written = - cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + + if (sb->s_bdev->bd_part) + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written + ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) - EXT4_SB(sb)->s_sectors_written_start) >> 1)); + else + es->s_kbytes_written = + cpu_to_le64(EXT4_SB(sb)->s_kbytes_written); ext4_free_blocks_count_set(es, percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeblocks_counter)); es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( @@ -3491,7 +3626,7 @@ int ext4_force_commit(struct super_block *sb) journal = EXT4_SB(sb)->s_journal; if (journal) { - vfs_check_frozen(sb, SB_FREEZE_WRITE); + vfs_check_frozen(sb, SB_FREEZE_TRANS); ret = ext4_journal_force_commit(journal); } @@ -3616,7 +3751,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) - ext4_abort(sb, __func__, "Abort forced by user"); + ext4_abort(sb, "Abort forced by user"); sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sb, POSIX_ACL) ? MS_POSIXACL : 0); @@ -3981,6 +4116,18 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, return err; } +static int ext4_quota_off(struct super_block *sb, int type) +{ + /* Force all delayed allocation blocks to be allocated */ + if (test_opt(sb, DELALLOC)) { + down_read(&sb->s_umount); + sync_filesystem(sb); + up_read(&sb->s_umount); + } + + return dquot_quota_off(sb, type); +} + /* Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... 
As quota files are never truncated and quota code * itself serializes the operations (and noone else should touch the files) @@ -4030,7 +4177,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); int err = 0; int offset = off & (sb->s_blocksize - 1); - int journal_quota = EXT4_SB(sb)->s_qf_names[type] != NULL; struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -4055,24 +4201,16 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, bh = ext4_bread(handle, inode, blk, 1, &err); if (!bh) goto out; - if (journal_quota) { - err = ext4_journal_get_write_access(handle, bh); - if (err) { - brelse(bh); - goto out; - } + err = ext4_journal_get_write_access(handle, bh); + if (err) { + brelse(bh); + goto out; } lock_buffer(bh); memcpy(bh->b_data+offset, data, len); flush_dcache_page(bh->b_page); unlock_buffer(bh); - if (journal_quota) - err = ext4_handle_dirty_metadata(handle, NULL, bh); - else { - /* Always do at least ordered writes for quotas */ - err = ext4_jbd2_file_inode(handle, inode); - mark_buffer_dirty(bh); - } + err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); out: if (err) { diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 04338009793..a6f31424957 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -458,8 +458,7 @@ static void ext4_xattr_update_super_block(handle_t *handle, if (ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh) == 0) { EXT4_SET_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_EXT_ATTR); - sb->s_dirt = 1; - ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); + ext4_handle_dirty_super(handle, sb); } } diff --git a/fs/file.c b/fs/file.c index 34bb7f71d99..cccaead962c 100644 --- a/fs/file.c +++ b/fs/file.c @@ -178,7 +178,6 @@ static struct fdtable * alloc_fdtable(unsigned int nr) fdt->open_fds = (fd_set *)data; data += nr / BITS_PER_BYTE; fdt->close_on_exec = (fd_set *)data; - INIT_RCU_HEAD(&fdt->rcu); fdt->next = NULL; 
return fdt; @@ -312,7 +311,6 @@ struct files_struct *dup_fd(struct files_struct *oldf, int *errorp) new_fdt->close_on_exec = (fd_set *)&newf->close_on_exec_init; new_fdt->open_fds = (fd_set *)&newf->open_fds_init; new_fdt->fd = &newf->fd_array[0]; - INIT_RCU_HEAD(&new_fdt->rcu); new_fdt->next = NULL; spin_lock(&oldf->file_lock); @@ -430,7 +428,6 @@ struct files_struct init_files = { .fd = &init_files.fd_array[0], .close_on_exec = (fd_set *)&init_files.close_on_exec_init, .open_fds = (fd_set *)&init_files.open_fds_init, - .rcu = RCU_HEAD_INIT, }, .file_lock = __SPIN_LOCK_UNLOCKED(init_task.file_lock), }; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d5be1693ac9..30ac305e829 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -530,7 +530,8 @@ void writeback_inodes_wb(struct bdi_writeback *wb, { int ret = 0; - wbc->wb_start = jiffies; /* livelock avoidance */ + if (!wbc->wb_start) + wbc->wb_start = jiffies; /* livelock avoidance */ spin_lock(&inode_lock); if (!wbc->for_kupdate || list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); @@ -559,7 +560,6 @@ static void __writeback_inodes_sb(struct super_block *sb, { WARN_ON(!rwsem_is_locked(&sb->s_umount)); - wbc->wb_start = jiffies; /* livelock avoidance */ spin_lock(&inode_lock); if (!wbc->for_kupdate || list_empty(&wb->b_io)) queue_io(wb, wbc->older_than_this); @@ -625,6 +625,7 @@ static long wb_writeback(struct bdi_writeback *wb, wbc.range_end = LLONG_MAX; } + wbc.wb_start = jiffies; /* livelock avoidance */ for (;;) { /* * Stop writeback when nr_pages has been consumed diff --git a/fs/fscache/Kconfig b/fs/fscache/Kconfig index cc94bb9563f..3f6dfa98988 100644 --- a/fs/fscache/Kconfig +++ b/fs/fscache/Kconfig @@ -1,7 +1,6 @@ config FSCACHE tristate "General filesystem local caching manager" - select SLOW_WORK help This option enables a generic filesystem caching manager that can be used by various network and other filesystems to cache data locally. 
diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index edd7434ab6e..6a026441c5a 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -82,6 +82,14 @@ extern unsigned fscache_defer_lookup; extern unsigned fscache_defer_create; extern unsigned fscache_debug; extern struct kobject *fscache_root; +extern struct workqueue_struct *fscache_object_wq; +extern struct workqueue_struct *fscache_op_wq; +DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + +static inline bool fscache_object_congested(void) +{ + return workqueue_congested(WORK_CPU_UNBOUND, fscache_object_wq); +} extern int fscache_wait_bit(void *); extern int fscache_wait_bit_interruptible(void *); diff --git a/fs/fscache/main.c b/fs/fscache/main.c index add6bdb53f0..f9d856773f7 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -15,6 +15,7 @@ #include <linux/sched.h> #include <linux/completion.h> #include <linux/slab.h> +#include <linux/seq_file.h> #include "internal.h" MODULE_DESCRIPTION("FS Cache Manager"); @@ -40,22 +41,105 @@ MODULE_PARM_DESC(fscache_debug, "FS-Cache debugging mask"); struct kobject *fscache_root; +struct workqueue_struct *fscache_object_wq; +struct workqueue_struct *fscache_op_wq; + +DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); + +/* these values serve as lower bounds, will be adjusted in fscache_init() */ +static unsigned fscache_object_max_active = 4; +static unsigned fscache_op_max_active = 2; + +#ifdef CONFIG_SYSCTL +static struct ctl_table_header *fscache_sysctl_header; + +static int fscache_max_active_sysctl(struct ctl_table *table, int write, + void __user *buffer, + size_t *lenp, loff_t *ppos) +{ + struct workqueue_struct **wqp = table->extra1; + unsigned int *datap = table->data; + int ret; + + ret = proc_dointvec(table, write, buffer, lenp, ppos); + if (ret == 0) + workqueue_set_max_active(*wqp, *datap); + return ret; +} + +ctl_table fscache_sysctls[] = { + { + .procname = "object_max_active", + .data = 
&fscache_object_max_active, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = fscache_max_active_sysctl, + .extra1 = &fscache_object_wq, + }, + { + .procname = "operation_max_active", + .data = &fscache_op_max_active, + .maxlen = sizeof(unsigned), + .mode = 0644, + .proc_handler = fscache_max_active_sysctl, + .extra1 = &fscache_op_wq, + }, + {} +}; + +ctl_table fscache_sysctls_root[] = { + { + .procname = "fscache", + .mode = 0555, + .child = fscache_sysctls, + }, + {} +}; +#endif /* * initialise the fs caching module */ static int __init fscache_init(void) { + unsigned int nr_cpus = num_possible_cpus(); + unsigned int cpu; int ret; - ret = slow_work_register_user(THIS_MODULE); - if (ret < 0) - goto error_slow_work; + fscache_object_max_active = + clamp_val(nr_cpus, + fscache_object_max_active, WQ_UNBOUND_MAX_ACTIVE); + + ret = -ENOMEM; + fscache_object_wq = alloc_workqueue("fscache_object", WQ_UNBOUND, + fscache_object_max_active); + if (!fscache_object_wq) + goto error_object_wq; + + fscache_op_max_active = + clamp_val(fscache_object_max_active / 2, + fscache_op_max_active, WQ_UNBOUND_MAX_ACTIVE); + + ret = -ENOMEM; + fscache_op_wq = alloc_workqueue("fscache_operation", WQ_UNBOUND, + fscache_op_max_active); + if (!fscache_op_wq) + goto error_op_wq; + + for_each_possible_cpu(cpu) + init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); ret = fscache_proc_init(); if (ret < 0) goto error_proc; +#ifdef CONFIG_SYSCTL + ret = -ENOMEM; + fscache_sysctl_header = register_sysctl_table(fscache_sysctls_root); + if (!fscache_sysctl_header) + goto error_sysctl; +#endif + fscache_cookie_jar = kmem_cache_create("fscache_cookie_jar", sizeof(struct fscache_cookie), 0, @@ -78,10 +162,16 @@ static int __init fscache_init(void) error_kobj: kmem_cache_destroy(fscache_cookie_jar); error_cookie_jar: +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(fscache_sysctl_header); +error_sysctl: +#endif fscache_proc_cleanup(); error_proc: - 
slow_work_unregister_user(THIS_MODULE); -error_slow_work: + destroy_workqueue(fscache_op_wq); +error_op_wq: + destroy_workqueue(fscache_object_wq); +error_object_wq: return ret; } @@ -96,8 +186,12 @@ static void __exit fscache_exit(void) kobject_put(fscache_root); kmem_cache_destroy(fscache_cookie_jar); +#ifdef CONFIG_SYSCTL + unregister_sysctl_table(fscache_sysctl_header); +#endif fscache_proc_cleanup(); - slow_work_unregister_user(THIS_MODULE); + destroy_workqueue(fscache_op_wq); + destroy_workqueue(fscache_object_wq); printk(KERN_NOTICE "FS-Cache: Unloaded\n"); } diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index 4a8eb31c533..ebe29c58138 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -34,8 +34,8 @@ struct fscache_objlist_data { #define FSCACHE_OBJLIST_CONFIG_NOREADS 0x00000200 /* show objects without active reads */ #define FSCACHE_OBJLIST_CONFIG_EVENTS 0x00000400 /* show objects with events */ #define FSCACHE_OBJLIST_CONFIG_NOEVENTS 0x00000800 /* show objects without no events */ -#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with slow work */ -#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without slow work */ +#define FSCACHE_OBJLIST_CONFIG_WORK 0x00001000 /* show objects with work */ +#define FSCACHE_OBJLIST_CONFIG_NOWORK 0x00002000 /* show objects without work */ u8 buf[512]; /* key and aux data buffer */ }; @@ -231,12 +231,11 @@ static int fscache_objlist_show(struct seq_file *m, void *v) READS, NOREADS); FILTER(obj->events & obj->event_mask, EVENTS, NOEVENTS); - FILTER(obj->work.flags & ~(1UL << SLOW_WORK_VERY_SLOW), - WORK, NOWORK); + FILTER(work_busy(&obj->work), WORK, NOWORK); } seq_printf(m, - "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1lx | ", + "%8x %8x %s %5u %3u %3u %3u %2u %5u %2lx %2lx %1lx %1x | ", obj->debug_id, obj->parent ? 
obj->parent->debug_id : -1, fscache_object_states_short[obj->state], @@ -249,7 +248,7 @@ static int fscache_objlist_show(struct seq_file *m, void *v) obj->event_mask & FSCACHE_OBJECT_EVENTS_MASK, obj->events, obj->flags, - obj->work.flags); + work_busy(&obj->work)); no_cookie = true; keylen = auxlen = 0; diff --git a/fs/fscache/object.c b/fs/fscache/object.c index 0b589a9b4ff..b6b897c550a 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -14,7 +14,6 @@ #define FSCACHE_DEBUG_LEVEL COOKIE #include <linux/module.h> -#include <linux/seq_file.h> #include "internal.h" const char *fscache_object_states[FSCACHE_OBJECT__NSTATES] = { @@ -50,12 +49,8 @@ const char fscache_object_states_short[FSCACHE_OBJECT__NSTATES][5] = { [FSCACHE_OBJECT_DEAD] = "DEAD", }; -static void fscache_object_slow_work_put_ref(struct slow_work *); -static int fscache_object_slow_work_get_ref(struct slow_work *); -static void fscache_object_slow_work_execute(struct slow_work *); -#ifdef CONFIG_SLOW_WORK_DEBUG -static void fscache_object_slow_work_desc(struct slow_work *, struct seq_file *); -#endif +static int fscache_get_object(struct fscache_object *); +static void fscache_put_object(struct fscache_object *); static void fscache_initialise_object(struct fscache_object *); static void fscache_lookup_object(struct fscache_object *); static void fscache_object_available(struct fscache_object *); @@ -64,17 +59,6 @@ static void fscache_withdraw_object(struct fscache_object *); static void fscache_enqueue_dependents(struct fscache_object *); static void fscache_dequeue_object(struct fscache_object *); -const struct slow_work_ops fscache_object_slow_work_ops = { - .owner = THIS_MODULE, - .get_ref = fscache_object_slow_work_get_ref, - .put_ref = fscache_object_slow_work_put_ref, - .execute = fscache_object_slow_work_execute, -#ifdef CONFIG_SLOW_WORK_DEBUG - .desc = fscache_object_slow_work_desc, -#endif -}; -EXPORT_SYMBOL(fscache_object_slow_work_ops); - /* * we need to notify the parent when an 
op completes that we had outstanding * upon it @@ -345,7 +329,7 @@ unsupported_event: /* * execute an object */ -static void fscache_object_slow_work_execute(struct slow_work *work) +void fscache_object_work_func(struct work_struct *work) { struct fscache_object *object = container_of(work, struct fscache_object, work); @@ -359,23 +343,9 @@ static void fscache_object_slow_work_execute(struct slow_work *work) if (object->events & object->event_mask) fscache_enqueue_object(object); clear_bit(FSCACHE_OBJECT_EV_REQUEUE, &object->events); + fscache_put_object(object); } - -/* - * describe an object for slow-work debugging - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -static void fscache_object_slow_work_desc(struct slow_work *work, - struct seq_file *m) -{ - struct fscache_object *object = - container_of(work, struct fscache_object, work); - - seq_printf(m, "FSC: OBJ%x: %s", - object->debug_id, - fscache_object_states_short[object->state]); -} -#endif +EXPORT_SYMBOL(fscache_object_work_func); /* * initialise an object @@ -393,7 +363,6 @@ static void fscache_initialise_object(struct fscache_object *object) _enter(""); ASSERT(object->cookie != NULL); ASSERT(object->cookie->parent != NULL); - ASSERT(list_empty(&object->work.link)); if (object->events & ((1 << FSCACHE_OBJECT_EV_ERROR) | (1 << FSCACHE_OBJECT_EV_RELEASE) | @@ -671,10 +640,8 @@ static void fscache_drop_object(struct fscache_object *object) object->parent = NULL; } - /* this just shifts the object release to the slow work processor */ - fscache_stat(&fscache_n_cop_put_object); - object->cache->ops->put_object(object); - fscache_stat_d(&fscache_n_cop_put_object); + /* this just shifts the object release to the work processor */ + fscache_put_object(object); _leave(""); } @@ -758,12 +725,10 @@ void fscache_withdrawing_object(struct fscache_cache *cache, } /* - * allow the slow work item processor to get a ref on an object + * get a ref on an object */ -static int fscache_object_slow_work_get_ref(struct slow_work *work) 
+static int fscache_get_object(struct fscache_object *object) { - struct fscache_object *object = - container_of(work, struct fscache_object, work); int ret; fscache_stat(&fscache_n_cop_grab_object); @@ -773,13 +738,10 @@ static int fscache_object_slow_work_get_ref(struct slow_work *work) } /* - * allow the slow work item processor to discard a ref on a work item + * discard a ref on a work item */ -static void fscache_object_slow_work_put_ref(struct slow_work *work) +static void fscache_put_object(struct fscache_object *object) { - struct fscache_object *object = - container_of(work, struct fscache_object, work); - fscache_stat(&fscache_n_cop_put_object); object->cache->ops->put_object(object); fscache_stat_d(&fscache_n_cop_put_object); @@ -792,8 +754,48 @@ void fscache_enqueue_object(struct fscache_object *object) { _enter("{OBJ%x}", object->debug_id); - slow_work_enqueue(&object->work); + if (fscache_get_object(object) >= 0) { + wait_queue_head_t *cong_wq = + &get_cpu_var(fscache_object_cong_wait); + + if (queue_work(fscache_object_wq, &object->work)) { + if (fscache_object_congested()) + wake_up(cong_wq); + } else + fscache_put_object(object); + + put_cpu_var(fscache_object_cong_wait); + } +} + +/** + * fscache_object_sleep_till_congested - Sleep until object wq is congested + * @timoutp: Scheduler sleep timeout + * + * Allow an object handler to sleep until the object workqueue is congested. + * + * The caller must set up a wake up event before calling this and must have set + * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own + * condition before calling this function as no test is made here. + * + * %true is returned if the object wq is congested, %false otherwise. 
+ */ +bool fscache_object_sleep_till_congested(signed long *timeoutp) +{ + wait_queue_head_t *cong_wq = &__get_cpu_var(fscache_object_cong_wait); + DEFINE_WAIT(wait); + + if (fscache_object_congested()) + return true; + + add_wait_queue_exclusive(cong_wq, &wait); + if (!fscache_object_congested()) + *timeoutp = schedule_timeout(*timeoutp); + finish_wait(cong_wq, &wait); + + return fscache_object_congested(); } +EXPORT_SYMBOL_GPL(fscache_object_sleep_till_congested); /* * enqueue the dependents of an object for metadata-type processing @@ -819,9 +821,7 @@ static void fscache_enqueue_dependents(struct fscache_object *object) /* sort onto appropriate lists */ fscache_enqueue_object(dep); - fscache_stat(&fscache_n_cop_put_object); - dep->cache->ops->put_object(dep); - fscache_stat_d(&fscache_n_cop_put_object); + fscache_put_object(dep); if (!list_empty(&object->dependents)) cond_resched_lock(&object->lock); diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c index f17cecafae4..b9f34eaede0 100644 --- a/fs/fscache/operation.c +++ b/fs/fscache/operation.c @@ -42,16 +42,12 @@ void fscache_enqueue_operation(struct fscache_operation *op) fscache_stat(&fscache_n_op_enqueue); switch (op->flags & FSCACHE_OP_TYPE) { - case FSCACHE_OP_FAST: - _debug("queue fast"); + case FSCACHE_OP_ASYNC: + _debug("queue async"); atomic_inc(&op->usage); - if (!schedule_work(&op->fast_work)) + if (!queue_work(fscache_op_wq, &op->work)) fscache_put_operation(op); break; - case FSCACHE_OP_SLOW: - _debug("queue slow"); - slow_work_enqueue(&op->slow_work); - break; case FSCACHE_OP_MYTHREAD: _debug("queue for caller's attention"); break; @@ -455,36 +451,13 @@ void fscache_operation_gc(struct work_struct *work) } /* - * allow the slow work item processor to get a ref on an operation - */ -static int fscache_op_get_ref(struct slow_work *work) -{ - struct fscache_operation *op = - container_of(work, struct fscache_operation, slow_work); - - atomic_inc(&op->usage); - return 0; -} - -/* - * allow 
the slow work item processor to discard a ref on an operation - */ -static void fscache_op_put_ref(struct slow_work *work) -{ - struct fscache_operation *op = - container_of(work, struct fscache_operation, slow_work); - - fscache_put_operation(op); -} - -/* - * execute an operation using the slow thread pool to provide processing context - * - the caller holds a ref to this object, so we don't need to hold one + * execute an operation using fs_op_wq to provide processing context - + * the caller holds a ref to this object, so we don't need to hold one */ -static void fscache_op_execute(struct slow_work *work) +void fscache_op_work_func(struct work_struct *work) { struct fscache_operation *op = - container_of(work, struct fscache_operation, slow_work); + container_of(work, struct fscache_operation, work); unsigned long start; _enter("{OBJ%x OP%x,%d}", @@ -494,31 +467,7 @@ static void fscache_op_execute(struct slow_work *work) start = jiffies; op->processor(op); fscache_hist(fscache_ops_histogram, start); + fscache_put_operation(op); _leave(""); } - -/* - * describe an operation for slow-work debugging - */ -#ifdef CONFIG_SLOW_WORK_DEBUG -static void fscache_op_desc(struct slow_work *work, struct seq_file *m) -{ - struct fscache_operation *op = - container_of(work, struct fscache_operation, slow_work); - - seq_printf(m, "FSC: OBJ%x OP%x: %s/%s fl=%lx", - op->object->debug_id, op->debug_id, - op->name, op->state, op->flags); -} -#endif - -const struct slow_work_ops fscache_op_slow_work_ops = { - .owner = THIS_MODULE, - .get_ref = fscache_op_get_ref, - .put_ref = fscache_op_put_ref, - .execute = fscache_op_execute, -#ifdef CONFIG_SLOW_WORK_DEBUG - .desc = fscache_op_desc, -#endif -}; diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 723b889fd21..41c441c2058 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -105,7 +105,7 @@ bool __fscache_maybe_release_page(struct fscache_cookie *cookie, page_busy: /* we might want to wait here, but that could deadlock 
the allocator as - * the slow-work threads writing to the cache may all end up sleeping + * the work threads writing to the cache may all end up sleeping * on memory allocation */ fscache_stat(&fscache_n_store_vmscan_busy); return false; @@ -188,9 +188,8 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) return -ENOMEM; } - fscache_operation_init(op, NULL); - fscache_operation_init_slow(op, fscache_attr_changed_op); - op->flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_EXCLUSIVE); + fscache_operation_init(op, fscache_attr_changed_op, NULL); + op->flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_EXCLUSIVE); fscache_set_op_name(op, "Attr"); spin_lock(&cookie->lock); @@ -218,24 +217,6 @@ nobufs: EXPORT_SYMBOL(__fscache_attr_changed); /* - * handle secondary execution given to a retrieval op on behalf of the - * cache - */ -static void fscache_retrieval_work(struct work_struct *work) -{ - struct fscache_retrieval *op = - container_of(work, struct fscache_retrieval, op.fast_work); - unsigned long start; - - _enter("{OP%x}", op->op.debug_id); - - start = jiffies; - op->op.processor(&op->op); - fscache_hist(fscache_ops_histogram, start); - fscache_put_operation(&op->op); -} - -/* * release a retrieval op reference */ static void fscache_release_retrieval_op(struct fscache_operation *_op) @@ -269,13 +250,12 @@ static struct fscache_retrieval *fscache_alloc_retrieval( return NULL; } - fscache_operation_init(&op->op, fscache_release_retrieval_op); + fscache_operation_init(&op->op, NULL, fscache_release_retrieval_op); op->op.flags = FSCACHE_OP_MYTHREAD | (1 << FSCACHE_OP_WAITING); op->mapping = mapping; op->end_io_func = end_io_func; op->context = context; op->start_time = jiffies; - INIT_WORK(&op->op.fast_work, fscache_retrieval_work); INIT_LIST_HEAD(&op->to_do); fscache_set_op_name(&op->op, "Retr"); return op; @@ -795,9 +775,9 @@ int __fscache_write_page(struct fscache_cookie *cookie, if (!op) goto nomem; - fscache_operation_init(&op->op, fscache_release_write_op); - 
fscache_operation_init_slow(&op->op, fscache_write_op); - op->op.flags = FSCACHE_OP_SLOW | (1 << FSCACHE_OP_WAITING); + fscache_operation_init(&op->op, fscache_write_op, + fscache_release_write_op); + op->op.flags = FSCACHE_OP_ASYNC | (1 << FSCACHE_OP_WAITING); fscache_set_op_name(&op->op, "Write1"); ret = radix_tree_preload(gfp & ~__GFP_HIGHMEM); @@ -852,7 +832,7 @@ int __fscache_write_page(struct fscache_cookie *cookie, fscache_stat(&fscache_n_store_ops); fscache_stat(&fscache_n_stores_ok); - /* the slow work queue now carries its own ref on the object */ + /* the work queue now carries its own ref on the object */ fscache_put_operation(&op->op); _leave(" = 0"); return 0; diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 9424796d663..69ad053ffd7 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -239,7 +239,6 @@ static u64 fuse_get_unique(struct fuse_conn *fc) static void queue_request(struct fuse_conn *fc, struct fuse_req *req) { - req->in.h.unique = fuse_get_unique(fc); req->in.h.len = sizeof(struct fuse_in_header) + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); list_add_tail(&req->list, &fc->pending); @@ -261,6 +260,7 @@ static void flush_bg_queue(struct fuse_conn *fc) req = list_entry(fc->bg_queue.next, struct fuse_req, list); list_del(&req->list); fc->active_background++; + req->in.h.unique = fuse_get_unique(fc); queue_request(fc, req); } } @@ -398,6 +398,7 @@ void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req) else if (fc->conn_error) req->out.h.error = -ECONNREFUSED; else { + req->in.h.unique = fuse_get_unique(fc); queue_request(fc, req); /* acquire extra reference, since request is still needed after request_end() */ @@ -450,6 +451,23 @@ void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req) } EXPORT_SYMBOL_GPL(fuse_request_send_background); +static int fuse_request_send_notify_reply(struct fuse_conn *fc, + struct fuse_req *req, u64 unique) +{ + int err = -ENODEV; + + req->isreply = 0; + req->in.h.unique 
= unique; + spin_lock(&fc->lock); + if (fc->connected) { + queue_request(fc, req); + err = 0; + } + spin_unlock(&fc->lock); + + return err; +} + /* * Called under fc->lock * @@ -535,13 +553,13 @@ static void fuse_copy_finish(struct fuse_copy_state *cs) if (!cs->write) { buf->ops->unmap(cs->pipe, buf, cs->mapaddr); } else { - kunmap_atomic(cs->mapaddr, KM_USER0); + kunmap(buf->page); buf->len = PAGE_SIZE - cs->len; } cs->currbuf = NULL; cs->mapaddr = NULL; } else if (cs->mapaddr) { - kunmap_atomic(cs->mapaddr, KM_USER0); + kunmap(cs->pg); if (cs->write) { flush_dcache_page(cs->pg); set_page_dirty_lock(cs->pg); @@ -572,7 +590,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) BUG_ON(!cs->nr_segs); cs->currbuf = buf; - cs->mapaddr = buf->ops->map(cs->pipe, buf, 1); + cs->mapaddr = buf->ops->map(cs->pipe, buf, 0); cs->len = buf->len; cs->buf = cs->mapaddr + buf->offset; cs->pipebufs++; @@ -592,7 +610,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) buf->len = 0; cs->currbuf = buf; - cs->mapaddr = kmap_atomic(page, KM_USER0); + cs->mapaddr = kmap(page); cs->buf = cs->mapaddr; cs->len = PAGE_SIZE; cs->pipebufs++; @@ -611,7 +629,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) return err; BUG_ON(err != 1); offset = cs->addr % PAGE_SIZE; - cs->mapaddr = kmap_atomic(cs->pg, KM_USER0); + cs->mapaddr = kmap(cs->pg); cs->buf = cs->mapaddr + offset; cs->len = min(PAGE_SIZE - offset, cs->seglen); cs->seglen -= cs->len; @@ -1231,6 +1249,199 @@ err: return err; } +static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_store_out outarg; + struct inode *inode; + struct address_space *mapping; + u64 nodeid; + int err; + pgoff_t index; + unsigned int offset; + unsigned int num; + loff_t file_size; + loff_t end; + + err = -EINVAL; + if (size < sizeof(outarg)) + goto out_finish; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto out_finish; + + err = -EINVAL; + if (size 
- sizeof(outarg) != outarg.size) + goto out_finish; + + nodeid = outarg.nodeid; + + down_read(&fc->killsb); + + err = -ENOENT; + if (!fc->sb) + goto out_up_killsb; + + inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); + if (!inode) + goto out_up_killsb; + + mapping = inode->i_mapping; + index = outarg.offset >> PAGE_CACHE_SHIFT; + offset = outarg.offset & ~PAGE_CACHE_MASK; + file_size = i_size_read(inode); + end = outarg.offset + outarg.size; + if (end > file_size) { + file_size = end; + fuse_write_update_size(inode, file_size); + } + + num = outarg.size; + while (num) { + struct page *page; + unsigned int this_num; + + err = -ENOMEM; + page = find_or_create_page(mapping, index, + mapping_gfp_mask(mapping)); + if (!page) + goto out_iput; + + this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); + err = fuse_copy_page(cs, &page, offset, this_num, 0); + if (!err && offset == 0 && (num != 0 || file_size == end)) + SetPageUptodate(page); + unlock_page(page); + page_cache_release(page); + + if (err) + goto out_iput; + + num -= this_num; + offset = 0; + index++; + } + + err = 0; + +out_iput: + iput(inode); +out_up_killsb: + up_read(&fc->killsb); +out_finish: + fuse_copy_finish(cs); + return err; +} + +static void fuse_retrieve_end(struct fuse_conn *fc, struct fuse_req *req) +{ + int i; + + for (i = 0; i < req->num_pages; i++) { + struct page *page = req->pages[i]; + page_cache_release(page); + } +} + +static int fuse_retrieve(struct fuse_conn *fc, struct inode *inode, + struct fuse_notify_retrieve_out *outarg) +{ + int err; + struct address_space *mapping = inode->i_mapping; + struct fuse_req *req; + pgoff_t index; + loff_t file_size; + unsigned int num; + unsigned int offset; + size_t total_len; + + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + + offset = outarg->offset & ~PAGE_CACHE_MASK; + + req->in.h.opcode = FUSE_NOTIFY_REPLY; + req->in.h.nodeid = outarg->nodeid; + req->in.numargs = 2; + req->in.argpages = 1; + req->page_offset = 
offset; + req->end = fuse_retrieve_end; + + index = outarg->offset >> PAGE_CACHE_SHIFT; + file_size = i_size_read(inode); + num = outarg->size; + if (outarg->offset > file_size) + num = 0; + else if (outarg->offset + num > file_size) + num = file_size - outarg->offset; + + while (num) { + struct page *page; + unsigned int this_num; + + page = find_get_page(mapping, index); + if (!page) + break; + + this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset); + req->pages[req->num_pages] = page; + req->num_pages++; + + num -= this_num; + total_len += this_num; + } + req->misc.retrieve_in.offset = outarg->offset; + req->misc.retrieve_in.size = total_len; + req->in.args[0].size = sizeof(req->misc.retrieve_in); + req->in.args[0].value = &req->misc.retrieve_in; + req->in.args[1].size = total_len; + + err = fuse_request_send_notify_reply(fc, req, outarg->notify_unique); + if (err) + fuse_retrieve_end(fc, req); + + return err; +} + +static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_retrieve_out outarg; + struct inode *inode; + int err; + + err = -EINVAL; + if (size != sizeof(outarg)) + goto copy_finish; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + goto copy_finish; + + fuse_copy_finish(cs); + + down_read(&fc->killsb); + err = -ENOENT; + if (fc->sb) { + u64 nodeid = outarg.nodeid; + + inode = ilookup5(fc->sb, nodeid, fuse_inode_eq, &nodeid); + if (inode) { + err = fuse_retrieve(fc, inode, &outarg); + iput(inode); + } + } + up_read(&fc->killsb); + + return err; + +copy_finish: + fuse_copy_finish(cs); + return err; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { @@ -1244,6 +1455,12 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_INVAL_ENTRY: return fuse_notify_inval_entry(fc, size, cs); + case FUSE_NOTIFY_STORE: + return fuse_notify_store(fc, size, cs); + + 
case FUSE_NOTIFY_RETRIEVE: + return fuse_notify_retrieve(fc, size, cs); + default: fuse_copy_finish(cs); return -EINVAL; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index ada0adeb3bb..147c1f71bdb 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -706,7 +706,7 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, return 0; } -static void fuse_write_update_size(struct inode *inode, loff_t pos) +void fuse_write_update_size(struct inode *inode, loff_t pos) { struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index 8f309f04064..57d4a3a0f10 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -272,6 +272,7 @@ struct fuse_req { struct fuse_write_in in; struct fuse_write_out out; } write; + struct fuse_notify_retrieve_in retrieve_in; struct fuse_lk_in lk_in; } misc; @@ -748,4 +749,6 @@ long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg, unsigned fuse_file_poll(struct file *file, poll_table *wait); int fuse_dev_release(struct inode *inode, struct file *file); +void fuse_write_update_size(struct inode *inode, loff_t pos); + #endif /* _FS_FUSE_I_H */ diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index a47b4310711..cc966552214 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -7,7 +7,6 @@ config GFS2_FS select IP_SCTP if DLM_SCTP select FS_POSIX_ACL select CRC32 - select SLOW_WORK select QUOTACTL help A cluster filesystem. 
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 8fcbce48a12..fdbf4b366fa 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -12,7 +12,6 @@ #include <linux/fs.h> #include <linux/workqueue.h> -#include <linux/slow-work.h> #include <linux/dlm.h> #include <linux/buffer_head.h> @@ -383,7 +382,7 @@ struct gfs2_journal_extent { struct gfs2_jdesc { struct list_head jd_list; struct list_head extent_list; - struct slow_work jd_work; + struct work_struct jd_work; struct inode *jd_inode; unsigned long jd_flags; #define JDF_RECOVERY 1 diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index fb2a5f93b7c..b1e9630eb46 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -15,7 +15,6 @@ #include <linux/init.h> #include <linux/gfs2_ondisk.h> #include <asm/atomic.h> -#include <linux/slow-work.h> #include "gfs2.h" #include "incore.h" @@ -24,6 +23,7 @@ #include "util.h" #include "glock.h" #include "quota.h" +#include "recovery.h" static struct shrinker qd_shrinker = { .shrink = gfs2_shrink_qd_memory, @@ -138,9 +138,11 @@ static int __init init_gfs2_fs(void) if (error) goto fail_unregister; - error = slow_work_register_user(THIS_MODULE); - if (error) - goto fail_slow; + error = -ENOMEM; + gfs_recovery_wq = alloc_workqueue("gfs_recovery", + WQ_NON_REENTRANT | WQ_RESCUER, 0); + if (!gfs_recovery_wq) + goto fail_wq; gfs2_register_debugfs(); @@ -148,7 +150,7 @@ static int __init init_gfs2_fs(void) return 0; -fail_slow: +fail_wq: unregister_filesystem(&gfs2meta_fs_type); fail_unregister: unregister_filesystem(&gfs2_fs_type); @@ -190,7 +192,7 @@ static void __exit exit_gfs2_fs(void) gfs2_unregister_debugfs(); unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); - slow_work_unregister_user(THIS_MODULE); + destroy_workqueue(gfs_recovery_wq); kmem_cache_destroy(gfs2_quotad_cachep); kmem_cache_destroy(gfs2_rgrpd_cachep); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 45a4a36195d..4f44bdeb2f0 100644 --- a/fs/gfs2/ops_fstype.c +++ 
b/fs/gfs2/ops_fstype.c @@ -17,7 +17,6 @@ #include <linux/namei.h> #include <linux/mount.h> #include <linux/gfs2_ondisk.h> -#include <linux/slow-work.h> #include <linux/quotaops.h> #include "gfs2.h" @@ -673,7 +672,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) break; INIT_LIST_HEAD(&jd->extent_list); - slow_work_init(&jd->jd_work, &gfs2_recover_ops); + INIT_WORK(&jd->jd_work, gfs2_recover_func); jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { if (!jd->jd_inode) @@ -782,7 +781,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) if (sdp->sd_lockstruct.ls_first) { unsigned int x; for (x = 0; x < sdp->sd_journals; x++) { - error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x)); + error = gfs2_recover_journal(gfs2_jdesc_find(sdp, x), + true); if (error) { fs_err(sdp, "error recovering journal %u: %d\n", x, error); @@ -792,7 +792,7 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) gfs2_others_may_mount(sdp); } else if (!sdp->sd_args.ar_spectator) { - error = gfs2_recover_journal(sdp->sd_jdesc); + error = gfs2_recover_journal(sdp->sd_jdesc, true); if (error) { fs_err(sdp, "error recovering my journal: %d\n", error); goto fail_jinode_gh; diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 8bb643cb265..1bc6b5695e6 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -1449,10 +1449,10 @@ static int gfs2_quota_get_xstate(struct super_block *sb, switch (sdp->sd_args.ar_quota) { case GFS2_QUOTA_ON: - fqs->qs_flags |= (XFS_QUOTA_UDQ_ENFD | XFS_QUOTA_GDQ_ENFD); + fqs->qs_flags |= (FS_QUOTA_UDQ_ENFD | FS_QUOTA_GDQ_ENFD); /*FALLTHRU*/ case GFS2_QUOTA_ACCOUNT: - fqs->qs_flags |= (XFS_QUOTA_UDQ_ACCT | XFS_QUOTA_GDQ_ACCT); + fqs->qs_flags |= (FS_QUOTA_UDQ_ACCT | FS_QUOTA_GDQ_ACCT); break; case GFS2_QUOTA_OFF: break; @@ -1498,7 +1498,7 @@ static int gfs2_get_dqblk(struct super_block *sb, int type, qid_t id, qlvb = (struct gfs2_quota_lvb *)qd->qd_gl->gl_lvb; fdq->d_version = 
FS_DQUOT_VERSION; - fdq->d_flags = (type == QUOTA_USER) ? XFS_USER_QUOTA : XFS_GROUP_QUOTA; + fdq->d_flags = (type == QUOTA_USER) ? FS_USER_QUOTA : FS_GROUP_QUOTA; fdq->d_id = id; fdq->d_blk_hardlimit = be64_to_cpu(qlvb->qb_limit); fdq->d_blk_softlimit = be64_to_cpu(qlvb->qb_warn); @@ -1533,12 +1533,12 @@ static int gfs2_set_dqblk(struct super_block *sb, int type, qid_t id, switch(type) { case USRQUOTA: type = QUOTA_USER; - if (fdq->d_flags != XFS_USER_QUOTA) + if (fdq->d_flags != FS_USER_QUOTA) return -EINVAL; break; case GRPQUOTA: type = QUOTA_GROUP; - if (fdq->d_flags != XFS_GROUP_QUOTA) + if (fdq->d_flags != FS_GROUP_QUOTA) return -EINVAL; break; default: diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 4b9bece3d43..f7f89a94a5a 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -14,7 +14,6 @@ #include <linux/buffer_head.h> #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> -#include <linux/slow-work.h> #include "gfs2.h" #include "incore.h" @@ -28,6 +27,8 @@ #include "util.h" #include "dir.h" +struct workqueue_struct *gfs_recovery_wq; + int gfs2_replay_read_block(struct gfs2_jdesc *jd, unsigned int blk, struct buffer_head **bh) { @@ -443,23 +444,7 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); } -static int gfs2_recover_get_ref(struct slow_work *work) -{ - struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); - if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags)) - return -EBUSY; - return 0; -} - -static void gfs2_recover_put_ref(struct slow_work *work) -{ - struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); - clear_bit(JDF_RECOVERY, &jd->jd_flags); - smp_mb__after_clear_bit(); - wake_up_bit(&jd->jd_flags, JDF_RECOVERY); -} - -static void gfs2_recover_work(struct slow_work *work) +void gfs2_recover_func(struct work_struct *work) { struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); struct gfs2_inode *ip 
= GFS2_I(jd->jd_inode); @@ -578,7 +563,7 @@ static void gfs2_recover_work(struct slow_work *work) gfs2_glock_dq_uninit(&j_gh); fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); - return; + goto done; fail_gunlock_tr: gfs2_glock_dq_uninit(&t_gh); @@ -590,32 +575,35 @@ fail_gunlock_j: } fs_info(sdp, "jid=%u: %s\n", jd->jd_jid, (error) ? "Failed" : "Done"); - fail: gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); +done: + clear_bit(JDF_RECOVERY, &jd->jd_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&jd->jd_flags, JDF_RECOVERY); } -struct slow_work_ops gfs2_recover_ops = { - .owner = THIS_MODULE, - .get_ref = gfs2_recover_get_ref, - .put_ref = gfs2_recover_put_ref, - .execute = gfs2_recover_work, -}; - - static int gfs2_recovery_wait(void *word) { schedule(); return 0; } -int gfs2_recover_journal(struct gfs2_jdesc *jd) +int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) { int rv; - rv = slow_work_enqueue(&jd->jd_work); - if (rv) - return rv; - wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE); + + if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags)) + return -EBUSY; + + /* we have JDF_RECOVERY, queue should always succeed */ + rv = queue_work(gfs_recovery_wq, &jd->jd_work); + BUG_ON(!rv); + + if (wait) + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, + TASK_UNINTERRUPTIBLE); + return 0; } diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 1616ac22569..2226136c764 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -12,6 +12,8 @@ #include "incore.h" +extern struct workqueue_struct *gfs_recovery_wq; + static inline void gfs2_replay_incr_blk(struct gfs2_sbd *sdp, unsigned int *blk) { if (++*blk == sdp->sd_jdesc->jd_blocks) @@ -27,8 +29,8 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head); -extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); -extern struct slow_work_ops gfs2_recover_ops; +extern int 
gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd, bool wait); +extern void gfs2_recover_func(struct work_struct *work); #endif /* __RECOVERY_DOT_H__ */ diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index d019d0d55e0..ccacffd2faa 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -25,6 +25,7 @@ #include "quota.h" #include "util.h" #include "glops.h" +#include "recovery.h" struct gfs2_attr { struct attribute attr; @@ -376,7 +377,7 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { if (jd->jd_jid != jid) continue; - rv = slow_work_enqueue(&jd->jd_work); + rv = gfs2_recover_journal(jd, false); break; } out: diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 93d1e47647b..f19ce94693d 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -1281,13 +1281,9 @@ int journal_check_used_features (journal_t *journal, unsigned long compat, int journal_check_available_features (journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { - journal_superblock_t *sb; - if (!compat && !ro && !incompat) return 1; - sb = journal->j_superblock; - /* We can support any known requested features iff the * superblock is in version 2. Otherwise we fail to support any * extended sb features. 
*/ @@ -1481,7 +1477,6 @@ int journal_flush(journal_t *journal) int journal_wipe(journal_t *journal, int write) { - journal_superblock_t *sb; int err = 0; J_ASSERT (!(journal->j_flags & JFS_LOADED)); @@ -1490,8 +1485,6 @@ int journal_wipe(journal_t *journal, int write) if (err) return err; - sb = journal->j_superblock; - if (!journal->j_tail) goto no_recovery; diff --git a/fs/jbd/recovery.c b/fs/jbd/recovery.c index 54c9bc9e1b1..81051dafebf 100644 --- a/fs/jbd/recovery.c +++ b/fs/jbd/recovery.c @@ -283,12 +283,9 @@ int journal_recover(journal_t *journal) int journal_skip_recovery(journal_t *journal) { int err; - journal_superblock_t * sb; - struct recovery_info info; memset (&info, 0, sizeof(info)); - sb = journal->j_superblock; err = do_one_pass(journal, &info, PASS_SCAN); @@ -297,7 +294,8 @@ int journal_skip_recovery(journal_t *journal) ++journal->j_transaction_sequence; } else { #ifdef CONFIG_JBD_DEBUG - int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); + int dropped = info.end_transaction - + be32_to_cpu(journal->j_superblock->s_sequence); #endif jbd_debug(1, "JBD: ignoring %d transaction%s from the journal.\n", @@ -321,11 +319,6 @@ static int do_one_pass(journal_t *journal, unsigned int sequence; int blocktype; - /* Precompute the maximum metadata descriptors in a descriptor block */ - int MAX_BLOCKS_PER_DESC; - MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t)) - / sizeof(journal_block_tag_t)); - /* * First thing is to establish what we expect to find in the log * (in terms of transaction IDs), and where (in terms of log diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 076d1cc44f9..1c23a0f4e8a 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -118,13 +118,13 @@ static int __try_to_free_cp_buf(struct journal_head *jh) void __jbd2_log_wait_for_space(journal_t *journal) { int nblocks, space_left; - assert_spin_locked(&journal->j_state_lock); + /* assert_spin_locked(&journal->j_state_lock); */ nblocks 
= jbd_space_needed(journal); while (__jbd2_log_space_left(journal) < nblocks) { if (journal->j_flags & JBD2_ABORT) return; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); mutex_lock(&journal->j_checkpoint_mutex); /* @@ -138,7 +138,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) * filesystem, so abort the journal and leave a stack * trace for forensic evidence. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); nblocks = jbd_space_needed(journal); space_left = __jbd2_log_space_left(journal); @@ -149,7 +149,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) if (journal->j_committing_transaction) tid = journal->j_committing_transaction->t_tid; spin_unlock(&journal->j_list_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); if (chkpt) { jbd2_log_do_checkpoint(journal); } else if (jbd2_cleanup_journal_tail(journal) == 0) { @@ -167,7 +167,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) WARN_ON(1); jbd2_journal_abort(journal, 0); } - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } else { spin_unlock(&journal->j_list_lock); } @@ -474,7 +474,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * next transaction ID we will write, and where it will * start. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); transaction = journal->j_checkpoint_transactions; if (transaction) { @@ -496,7 +496,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) /* If the oldest pinned transaction is at the tail of the log already then there's not much we can do right now. 
*/ if (journal->j_tail_sequence == first_tid) { - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return 1; } @@ -516,7 +516,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) journal->j_free += freed; journal->j_tail_sequence = first_tid; journal->j_tail = blocknr; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); /* * If there is an external journal, we need to make sure that @@ -775,7 +775,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(transaction->t_log_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); J_ASSERT(transaction->t_checkpoint_io_list == NULL); - J_ASSERT(transaction->t_updates == 0); + J_ASSERT(atomic_read(&transaction->t_updates) == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 75716d3d2be..f52e5e8049f 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -150,11 +150,11 @@ static int journal_submit_commit_record(journal_t *journal, */ if (ret == -EOPNOTSUPP && barrier_done) { printk(KERN_WARNING - "JBD: barrier-based sync failed on %s - " - "disabling barriers\n", journal->j_devname); - spin_lock(&journal->j_state_lock); + "JBD2: Disabling barriers on %s, " + "not supported by device\n", journal->j_devname); + write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_BARRIER; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); /* And try again, without the barrier */ lock_buffer(bh); @@ -180,11 +180,11 @@ retry: wait_on_buffer(bh); if (buffer_eopnotsupp(bh) && (journal->j_flags & JBD2_BARRIER)) { printk(KERN_WARNING - "JBD2: wait_on_commit_record: sync failed on %s - " - "disabling barriers\n", journal->j_devname); - spin_lock(&journal->j_state_lock); + "JBD2: %s: disabling barries on %s - not supported " + "by device\n", __func__, journal->j_devname); + 
write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_BARRIER; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); lock_buffer(bh); clear_buffer_dirty(bh); @@ -400,7 +400,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; /* @@ -417,23 +417,23 @@ void jbd2_journal_commit_transaction(journal_t *journal) stats.run.rs_locked); spin_lock(&commit_transaction->t_handle_lock); - while (commit_transaction->t_updates) { + while (atomic_read(&commit_transaction->t_updates)) { DEFINE_WAIT(wait); prepare_to_wait(&journal->j_wait_updates, &wait, TASK_UNINTERRUPTIBLE); - if (commit_transaction->t_updates) { + if (atomic_read(&commit_transaction->t_updates)) { spin_unlock(&commit_transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); schedule(); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&commit_transaction->t_handle_lock); } finish_wait(&journal->j_wait_updates, &wait); } spin_unlock(&commit_transaction->t_handle_lock); - J_ASSERT (commit_transaction->t_outstanding_credits <= + J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= journal->j_max_transaction_buffers); /* @@ -497,7 +497,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; wake_up(&journal->j_wait_transaction_locked); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); jbd_debug (3, "JBD: commit phase 2\n"); @@ -519,19 +519,20 @@ void jbd2_journal_commit_transaction(journal_t *journal) * transaction! Now comes the tricky part: we need to write out * metadata. 
Loop over the transaction's entire buffer list: */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); commit_transaction->t_state = T_COMMIT; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); trace_jbd2_commit_logging(journal, commit_transaction); stats.run.rs_logging = jiffies; stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, stats.run.rs_logging); - stats.run.rs_blocks = commit_transaction->t_outstanding_credits; + stats.run.rs_blocks = + atomic_read(&commit_transaction->t_outstanding_credits); stats.run.rs_blocks_logged = 0; J_ASSERT(commit_transaction->t_nr_buffers <= - commit_transaction->t_outstanding_credits); + atomic_read(&commit_transaction->t_outstanding_credits)); err = 0; descriptor = NULL; @@ -616,7 +617,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * the free space in the log, but this counter is changed * by jbd2_journal_next_log_block() also. */ - commit_transaction->t_outstanding_credits--; + atomic_dec(&commit_transaction->t_outstanding_credits); /* Bump b_count to prevent truncate from stumbling over the shadowed buffer! @@@ This can go if we ever get @@ -977,7 +978,7 @@ restart_loop: * __jbd2_journal_drop_transaction(). Otherwise we could race with * other checkpointing code processing the transaction... 
*/ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); /* * Now recheck if some buffers did not get attached to the transaction @@ -985,7 +986,7 @@ restart_loop: */ if (commit_transaction->t_forget) { spin_unlock(&journal->j_list_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); goto restart_loop; } @@ -1003,7 +1004,8 @@ restart_loop: * File the transaction statistics */ stats.ts_tid = commit_transaction->t_tid; - stats.run.rs_handle_count = commit_transaction->t_handle_count; + stats.run.rs_handle_count = + atomic_read(&commit_transaction->t_handle_count); trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, commit_transaction->t_tid, &stats.run); @@ -1037,7 +1039,7 @@ restart_loop: journal->j_average_commit_time*3) / 4; else journal->j_average_commit_time = commit_time; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); if (commit_transaction->t_checkpoint_list == NULL && commit_transaction->t_checkpoint_io_list == NULL) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 036880895bf..ad5866aaf0f 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -41,6 +41,7 @@ #include <linux/hash.h> #include <linux/log2.h> #include <linux/vmalloc.h> +#include <linux/backing-dev.h> #define CREATE_TRACE_POINTS #include <trace/events/jbd2.h> @@ -48,8 +49,6 @@ #include <asm/uaccess.h> #include <asm/page.h> -EXPORT_SYMBOL(jbd2_journal_start); -EXPORT_SYMBOL(jbd2_journal_restart); EXPORT_SYMBOL(jbd2_journal_extend); EXPORT_SYMBOL(jbd2_journal_stop); EXPORT_SYMBOL(jbd2_journal_lock_updates); @@ -143,7 +142,7 @@ static int kjournald2(void *arg) /* * And now, wait forever for commit wakeup events. 
*/ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); loop: if (journal->j_flags & JBD2_UNMOUNT) @@ -154,10 +153,10 @@ loop: if (journal->j_commit_sequence != journal->j_commit_request) { jbd_debug(1, "OK, requests differ\n"); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); del_timer_sync(&journal->j_commit_timer); jbd2_journal_commit_transaction(journal); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); goto loop; } @@ -169,9 +168,9 @@ loop: * be already stopped. */ jbd_debug(1, "Now suspending kjournald2\n"); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); refrigerator(); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } else { /* * We assume on resume that commits are already there, @@ -191,9 +190,9 @@ loop: if (journal->j_flags & JBD2_UNMOUNT) should_sleep = 0; if (should_sleep) { - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); schedule(); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } finish_wait(&journal->j_wait_commit, &wait); } @@ -211,7 +210,7 @@ loop: goto loop; end_loop: - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); del_timer_sync(&journal->j_commit_timer); journal->j_task = NULL; wake_up(&journal->j_wait_done_commit); @@ -234,16 +233,16 @@ static int jbd2_journal_start_thread(journal_t *journal) static void journal_kill_thread(journal_t *journal) { - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_flags |= JBD2_UNMOUNT; while (journal->j_task) { wake_up(&journal->j_wait_commit); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); wait_event(journal->j_wait_done_commit, journal->j_task == NULL); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* @@ -310,7 
+309,17 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, */ J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); - new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); +retry_alloc: + new_bh = alloc_buffer_head(GFP_NOFS); + if (!new_bh) { + /* + * Failure is not an option, but __GFP_NOFAIL is going + * away; so we retry ourselves here. + */ + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_alloc; + } + /* keep subsequent assertions sane */ new_bh->b_state = 0; init_buffer(new_bh, NULL, NULL); @@ -442,7 +451,7 @@ int __jbd2_log_space_left(journal_t *journal) { int left = journal->j_free; - assert_spin_locked(&journal->j_state_lock); + /* assert_spin_locked(&journal->j_state_lock); */ /* * Be pessimistic here about the number of those free blocks which @@ -487,9 +496,9 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid) { int ret; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); ret = __jbd2_log_start_commit(journal, tid); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } @@ -508,7 +517,7 @@ int jbd2_journal_force_commit_nested(journal_t *journal) transaction_t *transaction = NULL; tid_t tid; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); if (journal->j_running_transaction && !current->journal_info) { transaction = journal->j_running_transaction; __jbd2_log_start_commit(journal, transaction->t_tid); @@ -516,12 +525,12 @@ int jbd2_journal_force_commit_nested(journal_t *journal) transaction = journal->j_committing_transaction; if (!transaction) { - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); return 0; /* Nothing to retry */ } tid = transaction->t_tid; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); jbd2_log_wait_commit(journal, tid); return 1; } @@ -535,7 +544,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) { int ret = 0; - spin_lock(&journal->j_state_lock); + 
write_lock(&journal->j_state_lock); if (journal->j_running_transaction) { tid_t tid = journal->j_running_transaction->t_tid; @@ -554,7 +563,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) *ptid = journal->j_committing_transaction->t_tid; ret = 1; } - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } @@ -566,26 +575,24 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) { int err = 0; + read_lock(&journal->j_state_lock); #ifdef CONFIG_JBD2_DEBUG - spin_lock(&journal->j_state_lock); if (!tid_geq(journal->j_commit_request, tid)) { printk(KERN_EMERG "%s: error: j_commit_request=%d, tid=%d\n", __func__, journal->j_commit_request, tid); } - spin_unlock(&journal->j_state_lock); #endif - spin_lock(&journal->j_state_lock); while (tid_gt(tid, journal->j_commit_sequence)) { jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); wake_up(&journal->j_wait_commit); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); wait_event(journal->j_wait_done_commit, !tid_gt(tid, journal->j_commit_sequence)); - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); } - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); if (unlikely(is_journal_aborted(journal))) { printk(KERN_EMERG "journal commit I/O error\n"); @@ -602,7 +609,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp) { unsigned long blocknr; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); J_ASSERT(journal->j_free > 1); blocknr = journal->j_head; @@ -610,7 +617,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp) journal->j_free--; if (journal->j_head == journal->j_last) journal->j_head = journal->j_first; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return jbd2_journal_bmap(journal, blocknr, retp); } @@ -830,7 +837,7 @@ static journal_t * 
journal_init_common (void) mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); spin_lock_init(&journal->j_list_lock); - spin_lock_init(&journal->j_state_lock); + rwlock_init(&journal->j_state_lock); journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); journal->j_min_batch_time = 0; @@ -1096,14 +1103,14 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) set_buffer_uptodate(bh); } - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); sb->s_start = cpu_to_be32(journal->j_tail); sb->s_errno = cpu_to_be32(journal->j_errno); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); BUFFER_TRACE(bh, "marking dirty"); mark_buffer_dirty(bh); @@ -1124,12 +1131,12 @@ out: * any future commit will have to be careful to update the * superblock again to re-record the true start of the log. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (sb->s_start) journal->j_flags &= ~JBD2_FLUSHED; else journal->j_flags |= JBD2_FLUSHED; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* @@ -1391,13 +1398,9 @@ int jbd2_journal_check_used_features (journal_t *journal, unsigned long compat, int jbd2_journal_check_available_features (journal_t *journal, unsigned long compat, unsigned long ro, unsigned long incompat) { - journal_superblock_t *sb; - if (!compat && !ro && !incompat) return 1; - sb = journal->j_superblock; - /* We can support any known requested features iff the * superblock is in version 2. Otherwise we fail to support any * extended sb features. 
*/ @@ -1545,7 +1548,7 @@ int jbd2_journal_flush(journal_t *journal) transaction_t *transaction = NULL; unsigned long old_tail; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); /* Force everything buffered to the log... */ if (journal->j_running_transaction) { @@ -1558,10 +1561,10 @@ int jbd2_journal_flush(journal_t *journal) if (transaction) { tid_t tid = transaction->t_tid; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); jbd2_log_wait_commit(journal, tid); } else { - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* ...and flush everything in the log out to disk. */ @@ -1585,12 +1588,12 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); old_tail = journal->j_tail; journal->j_tail = 0; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); jbd2_journal_update_superblock(journal, 1); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_tail = old_tail; J_ASSERT(!journal->j_running_transaction); @@ -1598,7 +1601,7 @@ int jbd2_journal_flush(journal_t *journal) J_ASSERT(!journal->j_checkpoint_transactions); J_ASSERT(journal->j_head == journal->j_tail); J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return 0; } @@ -1617,7 +1620,6 @@ int jbd2_journal_flush(journal_t *journal) int jbd2_journal_wipe(journal_t *journal, int write) { - journal_superblock_t *sb; int err = 0; J_ASSERT (!(journal->j_flags & JBD2_LOADED)); @@ -1626,8 +1628,6 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (err) return err; - sb = journal->j_superblock; - if (!journal->j_tail) goto no_recovery; @@ -1665,12 +1665,12 @@ void 
__jbd2_journal_abort_hard(journal_t *journal) printk(KERN_ERR "Aborting journal on device %s.\n", journal->j_devname); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_flags |= JBD2_ABORT; transaction = journal->j_running_transaction; if (transaction) __jbd2_log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* Soft abort: record the abort error status in the journal superblock, @@ -1755,12 +1755,12 @@ int jbd2_journal_errno(journal_t *journal) { int err; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); if (journal->j_flags & JBD2_ABORT) err = -EROFS; else err = journal->j_errno; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); return err; } @@ -1775,12 +1775,12 @@ int jbd2_journal_clear_err(journal_t *journal) { int err = 0; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (journal->j_flags & JBD2_ABORT) err = -EROFS; else journal->j_errno = 0; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return err; } @@ -1793,10 +1793,10 @@ int jbd2_journal_clear_err(journal_t *journal) */ void jbd2_journal_ack_err(journal_t *journal) { - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (journal->j_errno) journal->j_flags |= JBD2_ACK_ERR; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } int jbd2_journal_blocks_per_page(struct inode *inode) @@ -2201,8 +2201,6 @@ void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode) void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode) { - int writeout = 0; - if (!journal) return; restart: @@ -2219,9 +2217,6 @@ restart: goto restart; } - /* Do we need to wait for data writeback? 
*/ - if (journal->j_committing_transaction == jinode->i_transaction) - writeout = 1; if (jinode->i_transaction) { list_del(&jinode->i_list); jinode->i_transaction = NULL; diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 049281b7cb8..2bc4d5f116f 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -285,12 +285,10 @@ int jbd2_journal_recover(journal_t *journal) int jbd2_journal_skip_recovery(journal_t *journal) { int err; - journal_superblock_t * sb; struct recovery_info info; memset (&info, 0, sizeof(info)); - sb = journal->j_superblock; err = do_one_pass(journal, &info, PASS_SCAN); @@ -299,7 +297,8 @@ int jbd2_journal_skip_recovery(journal_t *journal) ++journal->j_transaction_sequence; } else { #ifdef CONFIG_JBD2_DEBUG - int dropped = info.end_transaction - be32_to_cpu(sb->s_sequence); + int dropped = info.end_transaction - + be32_to_cpu(journal->j_superblock->s_sequence); #endif jbd_debug(1, "JBD: ignoring %d transaction%s from the journal.\n", @@ -365,11 +364,6 @@ static int do_one_pass(journal_t *journal, int tag_bytes = journal_tag_bytes(journal); __u32 crc32_sum = ~0; /* Transactional Checksums */ - /* Precompute the maximum metadata descriptors in a descriptor block */ - int MAX_BLOCKS_PER_DESC; - MAX_BLOCKS_PER_DESC = ((journal->j_blocksize-sizeof(journal_header_t)) - / tag_bytes); - /* * First thing is to establish what we expect to find in the log * (in terms of transaction IDs), and where (in terms of log diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index b8e0806681b..d95cc9d0401 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -26,6 +26,8 @@ #include <linux/mm.h> #include <linux/highmem.h> #include <linux/hrtimer.h> +#include <linux/backing-dev.h> +#include <linux/module.h> static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); @@ -53,6 +55,9 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires 
= jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); + atomic_set(&transaction->t_updates, 0); + atomic_set(&transaction->t_outstanding_credits, 0); + atomic_set(&transaction->t_handle_count, 0); INIT_LIST_HEAD(&transaction->t_inode_list); INIT_LIST_HEAD(&transaction->t_private_list); @@ -83,65 +88,75 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) * transaction's buffer credits. */ -static int start_this_handle(journal_t *journal, handle_t *handle) +static int start_this_handle(journal_t *journal, handle_t *handle, + int gfp_mask) { transaction_t *transaction; int needed; int nblocks = handle->h_buffer_credits; transaction_t *new_transaction = NULL; - int ret = 0; unsigned long ts = jiffies; if (nblocks > journal->j_max_transaction_buffers) { printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", current->comm, nblocks, journal->j_max_transaction_buffers); - ret = -ENOSPC; - goto out; + return -ENOSPC; } alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), - GFP_NOFS|__GFP_NOFAIL); + new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask); if (!new_transaction) { - ret = -ENOMEM; - goto out; + /* + * If __GFP_FS is not present, then we may be + * being called from inside the fs writeback + * layer, so we MUST NOT fail. Since + * __GFP_NOFAIL is going away, we will arrange + * to retry the allocation ourselves. 
+ */ + if ((gfp_mask & __GFP_FS) == 0) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto alloc_transaction; + } + return -ENOMEM; } } jbd_debug(3, "New handle %p going live.\n", handle); -repeat: - /* * We need to hold j_state_lock until t_updates has been incremented, * for proper journal barrier handling */ - spin_lock(&journal->j_state_lock); -repeat_locked: +repeat: + read_lock(&journal->j_state_lock); if (is_journal_aborted(journal) || (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { - spin_unlock(&journal->j_state_lock); - ret = -EROFS; - goto out; + read_unlock(&journal->j_state_lock); + kfree(new_transaction); + return -EROFS; } /* Wait on the journal's transaction barrier if necessary */ if (journal->j_barrier_count) { - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); wait_event(journal->j_wait_transaction_locked, journal->j_barrier_count == 0); goto repeat; } if (!journal->j_running_transaction) { - if (!new_transaction) { - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); + if (!new_transaction) goto alloc_transaction; + write_lock(&journal->j_state_lock); + if (!journal->j_running_transaction) { + jbd2_get_transaction(journal, new_transaction); + new_transaction = NULL; } - jbd2_get_transaction(journal, new_transaction); - new_transaction = NULL; + write_unlock(&journal->j_state_lock); + goto repeat; } transaction = journal->j_running_transaction; @@ -155,7 +170,7 @@ repeat_locked: prepare_to_wait(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); goto repeat; @@ -166,8 +181,8 @@ repeat_locked: * buffers requested by this operation, we need to stall pending a log * checkpoint to free some more log space. 
*/ - spin_lock(&transaction->t_handle_lock); - needed = transaction->t_outstanding_credits + nblocks; + needed = atomic_add_return(nblocks, + &transaction->t_outstanding_credits); if (needed > journal->j_max_transaction_buffers) { /* @@ -178,11 +193,11 @@ repeat_locked: DEFINE_WAIT(wait); jbd_debug(2, "Handle %p starting new commit...\n", handle); - spin_unlock(&transaction->t_handle_lock); + atomic_sub(nblocks, &transaction->t_outstanding_credits); prepare_to_wait(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); __jbd2_log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); goto repeat; @@ -215,35 +230,48 @@ repeat_locked: */ if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); - spin_unlock(&transaction->t_handle_lock); - __jbd2_log_wait_for_space(journal); - goto repeat_locked; + atomic_sub(nblocks, &transaction->t_outstanding_credits); + read_unlock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); + if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) + __jbd2_log_wait_for_space(journal); + write_unlock(&journal->j_state_lock); + goto repeat; } /* OK, account for the buffers that this operation expects to - * use and add the handle to the running transaction. */ - - if (time_after(transaction->t_start, ts)) { + * use and add the handle to the running transaction. + * + * In order for t_max_wait to be reliable, it must be + * protected by a lock. But doing so will mean that + * start_this_handle() can not be run in parallel on SMP + * systems, which limits our scalability. So we only enable + * it when debugging is enabled. We may want to use a + * separate flag, eventually, so we can enable this + * independently of debugging. 
+ */ +#ifdef CONFIG_JBD2_DEBUG + if (jbd2_journal_enable_debug && + time_after(transaction->t_start, ts)) { ts = jbd2_time_diff(ts, transaction->t_start); + spin_lock(&transaction->t_handle_lock); if (ts > transaction->t_max_wait) transaction->t_max_wait = ts; + spin_unlock(&transaction->t_handle_lock); } - +#endif handle->h_transaction = transaction; - transaction->t_outstanding_credits += nblocks; - transaction->t_updates++; - transaction->t_handle_count++; + atomic_inc(&transaction->t_updates); + atomic_inc(&transaction->t_handle_count); jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", - handle, nblocks, transaction->t_outstanding_credits, + handle, nblocks, + atomic_read(&transaction->t_outstanding_credits), __jbd2_log_space_left(journal)); - spin_unlock(&transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); lock_map_acquire(&handle->h_lockdep_map); -out: - if (unlikely(new_transaction)) /* It's usually NULL */ - kfree(new_transaction); - return ret; + kfree(new_transaction); + return 0; } static struct lock_class_key jbd2_handle_key; @@ -278,7 +306,7 @@ static handle_t *new_handle(int nblocks) * * Return a pointer to a newly allocated handle, or NULL on failure */ -handle_t *jbd2_journal_start(journal_t *journal, int nblocks) +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) { handle_t *handle = journal_current_handle(); int err; @@ -298,7 +326,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) current->journal_info = handle; - err = start_this_handle(journal, handle); + err = start_this_handle(journal, handle, gfp_mask); if (err < 0) { jbd2_free_handle(handle); current->journal_info = NULL; @@ -308,6 +336,15 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) out: return handle; } +EXPORT_SYMBOL(jbd2__journal_start); + + +handle_t *jbd2_journal_start(journal_t *journal, int nblocks) +{ + return jbd2__journal_start(journal, nblocks, 
GFP_NOFS); +} +EXPORT_SYMBOL(jbd2_journal_start); + /** * int jbd2_journal_extend() - extend buffer credits. @@ -342,7 +379,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) result = 1; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); /* Don't extend a locked-down transaction! */ if (handle->h_transaction->t_state != T_RUNNING) { @@ -352,7 +389,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) } spin_lock(&transaction->t_handle_lock); - wanted = transaction->t_outstanding_credits + nblocks; + wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks; if (wanted > journal->j_max_transaction_buffers) { jbd_debug(3, "denied handle %p %d blocks: " @@ -367,14 +404,14 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) } handle->h_buffer_credits += nblocks; - transaction->t_outstanding_credits += nblocks; + atomic_add(nblocks, &transaction->t_outstanding_credits); result = 0; jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); unlock: spin_unlock(&transaction->t_handle_lock); error_out: - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); out: return result; } @@ -394,8 +431,7 @@ out: * transaction capabable of guaranteeing the requested number of * credits. */ - -int jbd2_journal_restart(handle_t *handle, int nblocks) +int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; @@ -410,29 +446,35 @@ int jbd2_journal_restart(handle_t *handle, int nblocks) * First unlink the handle from its current transaction, and start the * commit on that. 
*/ - J_ASSERT(transaction->t_updates > 0); + J_ASSERT(atomic_read(&transaction->t_updates) > 0); J_ASSERT(journal_current_handle() == handle); - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); - transaction->t_outstanding_credits -= handle->h_buffer_credits; - transaction->t_updates--; - - if (!transaction->t_updates) + atomic_sub(handle->h_buffer_credits, + &transaction->t_outstanding_credits); + if (atomic_dec_and_test(&transaction->t_updates)) wake_up(&journal->j_wait_updates); spin_unlock(&transaction->t_handle_lock); jbd_debug(2, "restarting handle %p\n", handle); __jbd2_log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); lock_map_release(&handle->h_lockdep_map); handle->h_buffer_credits = nblocks; - ret = start_this_handle(journal, handle); + ret = start_this_handle(journal, handle, gfp_mask); return ret; } +EXPORT_SYMBOL(jbd2__journal_restart); +int jbd2_journal_restart(handle_t *handle, int nblocks) +{ + return jbd2__journal_restart(handle, nblocks, GFP_NOFS); +} +EXPORT_SYMBOL(jbd2_journal_restart); + /** * void jbd2_journal_lock_updates () - establish a transaction barrier. * @journal: Journal to establish a barrier on. 
@@ -447,7 +489,7 @@ void jbd2_journal_lock_updates(journal_t *journal) { DEFINE_WAIT(wait); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); ++journal->j_barrier_count; /* Wait until there are no running updates */ @@ -458,19 +500,19 @@ void jbd2_journal_lock_updates(journal_t *journal) break; spin_lock(&transaction->t_handle_lock); - if (!transaction->t_updates) { + if (!atomic_read(&transaction->t_updates)) { spin_unlock(&transaction->t_handle_lock); break; } prepare_to_wait(&journal->j_wait_updates, &wait, TASK_UNINTERRUPTIBLE); spin_unlock(&transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); schedule(); finish_wait(&journal->j_wait_updates, &wait); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); /* * We have now established a barrier against other normal updates, but @@ -494,9 +536,9 @@ void jbd2_journal_unlock_updates (journal_t *journal) J_ASSERT(journal->j_barrier_count != 0); mutex_unlock(&journal->j_barrier); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); --journal->j_barrier_count; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_transaction_locked); } @@ -1238,7 +1280,8 @@ int jbd2_journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - int err; + int err, wait_for_commit = 0; + tid_t tid; pid_t pid; J_ASSERT(journal_current_handle() == handle); @@ -1246,7 +1289,7 @@ int jbd2_journal_stop(handle_t *handle) if (is_handle_aborted(handle)) err = -EIO; else { - J_ASSERT(transaction->t_updates > 0); + J_ASSERT(atomic_read(&transaction->t_updates) > 0); err = 0; } @@ -1291,9 +1334,9 @@ int jbd2_journal_stop(handle_t *handle) journal->j_last_sync_writer = pid; - spin_lock(&journal->j_state_lock); + 
read_lock(&journal->j_state_lock); commit_time = journal->j_average_commit_time; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); trans_time = ktime_to_ns(ktime_sub(ktime_get(), transaction->t_start_time)); @@ -1314,14 +1357,8 @@ int jbd2_journal_stop(handle_t *handle) if (handle->h_sync) transaction->t_synchronous_commit = 1; current->journal_info = NULL; - spin_lock(&transaction->t_handle_lock); - transaction->t_outstanding_credits -= handle->h_buffer_credits; - transaction->t_updates--; - if (!transaction->t_updates) { - wake_up(&journal->j_wait_updates); - if (journal->j_barrier_count) - wake_up(&journal->j_wait_transaction_locked); - } + atomic_sub(handle->h_buffer_credits, + &transaction->t_outstanding_credits); /* * If the handle is marked SYNC, we need to set another commit @@ -1330,15 +1367,13 @@ int jbd2_journal_stop(handle_t *handle) * transaction is too old now. */ if (handle->h_sync || - transaction->t_outstanding_credits > - journal->j_max_transaction_buffers || - time_after_eq(jiffies, transaction->t_expires)) { + (atomic_read(&transaction->t_outstanding_credits) > + journal->j_max_transaction_buffers) || + time_after_eq(jiffies, transaction->t_expires)) { /* Do this even for aborted journals: an abort still * completes the commit thread, it just doesn't write * anything to disk. */ - tid_t tid = transaction->t_tid; - spin_unlock(&transaction->t_handle_lock); jbd_debug(2, "transaction too old, requesting commit for " "handle %p\n", handle); /* This is non-blocking */ @@ -1349,11 +1384,25 @@ int jbd2_journal_stop(handle_t *handle) * to wait for the commit to complete. */ if (handle->h_sync && !(current->flags & PF_MEMALLOC)) - err = jbd2_log_wait_commit(journal, tid); - } else { - spin_unlock(&transaction->t_handle_lock); + wait_for_commit = 1; } + /* + * Once we drop t_updates, if it goes to zero the transaction + * could start committing on us and eventually disappear.
So + * once we do this, we must not dereference transaction + * pointer again. + */ + tid = transaction->t_tid; + if (atomic_dec_and_test(&transaction->t_updates)) { + wake_up(&journal->j_wait_updates); + if (journal->j_barrier_count) + wake_up(&journal->j_wait_transaction_locked); + } + + if (wait_for_commit) + err = jbd2_log_wait_commit(journal, tid); + lock_map_release(&handle->h_lockdep_map); jbd2_free_handle(handle); @@ -1719,7 +1768,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) goto zap_buffer_unlocked; /* OK, we have data buffer in journaled mode */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); @@ -1772,7 +1821,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } else { /* There is no currently-running transaction. So the @@ -1786,7 +1835,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } else { /* The orphan record's transaction has @@ -1810,7 +1859,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return 0; } else { /* Good, the buffer belongs to the running transaction. 
@@ -1829,7 +1878,7 @@ zap_buffer: zap_buffer_no_jh: spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); zap_buffer_unlocked: clear_buffer_dirty(bh); J_ASSERT_BH(bh, !buffer_jbddirty(bh)); @@ -2136,9 +2185,9 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal, /* Locks are here just to force reading of recent values, it is * enough that the transaction was not committing before we started * a transaction adding the inode to orphan list */ - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); commit_trans = journal->j_committing_transaction; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); inode_trans = jinode->i_transaction; spin_unlock(&journal->j_list_lock); diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig index a43d07e7b92..cc1bb33b59b 100644 --- a/fs/nfs/Kconfig +++ b/fs/nfs/Kconfig @@ -61,8 +61,8 @@ config NFS_V3_ACL If unsure, say N. config NFS_V4 - bool "NFS client support for NFS version 4 (EXPERIMENTAL)" - depends on NFS_FS && EXPERIMENTAL + bool "NFS client support for NFS version 4" + depends on NFS_FS select RPCSEC_GSS_KRB5 help This option enables support for version 4 of the NFS protocol @@ -72,16 +72,16 @@ config NFS_V4 space programs which can be found in the Linux nfs-utils package, available from http://linux-nfs.org/. - If unsure, say N. + If unsure, say Y. config NFS_V4_1 - bool "NFS client support for NFSv4.1 (DEVELOPER ONLY)" + bool "NFS client support for NFSv4.1 (EXPERIMENTAL)" depends on NFS_V4 && EXPERIMENTAL help This option enables support for minor version 1 of the NFSv4 protocol (draft-ietf-nfsv4-minorversion1) in the kernel's NFS client. - Unless you're an NFS developer, say N. + If unsure, say N. 
config ROOT_NFS bool "Root file system on NFS" diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index a08770a7e85..930d10fecda 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -37,8 +37,8 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres * if (inode == NULL) goto out_putclient; nfsi = NFS_I(inode); - down_read(&nfsi->rwsem); - delegation = nfsi->delegation; + rcu_read_lock(); + delegation = rcu_dereference(nfsi->delegation); if (delegation == NULL || (delegation->type & FMODE_WRITE) == 0) goto out_iput; res->size = i_size_read(inode); @@ -53,7 +53,7 @@ __be32 nfs4_callback_getattr(struct cb_getattrargs *args, struct cb_getattrres * args->bitmap[1]; res->status = 0; out_iput: - up_read(&nfsi->rwsem); + rcu_read_unlock(); iput(inode); out_putclient: nfs_put_client(clp); @@ -62,16 +62,6 @@ out: return res->status; } -static int (*nfs_validate_delegation_stateid(struct nfs_client *clp))(struct nfs_delegation *, const nfs4_stateid *) -{ -#if defined(CONFIG_NFS_V4_1) - if (clp->cl_minorversion > 0) - return nfs41_validate_delegation_stateid; -#endif - return nfs4_validate_delegation_stateid; -} - - __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) { struct nfs_client *clp; @@ -92,8 +82,7 @@ __be32 nfs4_callback_recall(struct cb_recallargs *args, void *dummy) inode = nfs_delegation_find_inode(clp, &args->fh); if (inode != NULL) { /* Set up a helper thread to actually return the delegation */ - switch (nfs_async_inode_return_delegation(inode, &args->stateid, - nfs_validate_delegation_stateid(clp))) { + switch (nfs_async_inode_return_delegation(inode, &args->stateid)) { case 0: res = 0; break; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index d25b5257b7a..4e7df2adb21 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -150,6 +150,7 @@ static struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_ clp->cl_boot_time = CURRENT_TIME; clp->cl_state = 1 << 
NFS4CLNT_LEASE_EXPIRED; clp->cl_minorversion = cl_init->minorversion; + clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; #endif cred = rpc_lookup_machine_cred(); if (!IS_ERR(cred)) @@ -178,7 +179,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp) clp->cl_session = NULL; } - clp->cl_call_sync = _nfs4_call_sync; + clp->cl_mvops = nfs_v4_minor_ops[0]; #endif /* CONFIG_NFS_V4_1 */ } @@ -188,7 +189,7 @@ static void nfs4_clear_client_minor_version(struct nfs_client *clp) static void nfs4_destroy_callback(struct nfs_client *clp) { if (__test_and_clear_bit(NFS_CS_CALLBACK, &clp->cl_res_state)) - nfs_callback_down(clp->cl_minorversion); + nfs_callback_down(clp->cl_mvops->minor_version); } static void nfs4_shutdown_client(struct nfs_client *clp) @@ -1126,7 +1127,7 @@ static int nfs4_init_callback(struct nfs_client *clp) return error; } - error = nfs_callback_up(clp->cl_minorversion, + error = nfs_callback_up(clp->cl_mvops->minor_version, clp->cl_rpcclient->cl_xprt); if (error < 0) { dprintk("%s: failed to start callback. Error = %d\n", @@ -1143,10 +1144,8 @@ static int nfs4_init_callback(struct nfs_client *clp) */ static int nfs4_init_client_minor_version(struct nfs_client *clp) { - clp->cl_call_sync = _nfs4_call_sync; - #if defined(CONFIG_NFS_V4_1) - if (clp->cl_minorversion) { + if (clp->cl_mvops->minor_version) { struct nfs4_session *session = NULL; /* * Create the session and mark it expired. @@ -1158,7 +1157,13 @@ static int nfs4_init_client_minor_version(struct nfs_client *clp) return -ENOMEM; clp->cl_session = session; - clp->cl_call_sync = _nfs4_call_sync_session; + /* + * The create session reply races with the server back + * channel probe. 
Mark the client NFS_CS_SESSION_INITING + * so that the client back channel can find the + * nfs_client struct + */ + clp->cl_cons_state = NFS_CS_SESSION_INITING; } #endif /* CONFIG_NFS_V4_1 */ @@ -1454,7 +1459,7 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, data->authflavor, parent_server->client->cl_xprt->prot, parent_server->client->cl_timeout, - parent_client->cl_minorversion); + parent_client->cl_mvops->minor_version); if (error < 0) goto error; diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 30163454397..b9c3c43cea1 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -268,14 +268,6 @@ out: return status; } -/* Sync all data to disk upon delegation return */ -static void nfs_msync_inode(struct inode *inode) -{ - filemap_fdatawrite(inode->i_mapping); - nfs_wb_all(inode); - filemap_fdatawait(inode->i_mapping); -} - /* * Basic procedure for returning a delegation to the server */ @@ -367,7 +359,7 @@ int nfs_inode_return_delegation(struct inode *inode) delegation = nfs_detach_delegation_locked(nfsi, NULL, clp); spin_unlock(&clp->cl_lock); if (delegation != NULL) { - nfs_msync_inode(inode); + nfs_wb_all(inode); err = __nfs_inode_return_delegation(inode, delegation, 1); } } @@ -471,9 +463,7 @@ void nfs_expire_unreferenced_delegations(struct nfs_client *clp) /* * Asynchronous delegation recall! 
*/ -int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, - int (*validate_stateid)(struct nfs_delegation *delegation, - const nfs4_stateid *stateid)) +int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid) { struct nfs_client *clp = NFS_SERVER(inode)->nfs_client; struct nfs_delegation *delegation; @@ -481,7 +471,7 @@ int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *s rcu_read_lock(); delegation = rcu_dereference(NFS_I(inode)->delegation); - if (!validate_stateid(delegation, stateid)) { + if (!clp->cl_mvops->validate_stateid(delegation, stateid)) { rcu_read_unlock(); return -ENOENT; } diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 69e7b814012..2026304bda1 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -34,9 +34,7 @@ enum { int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res); int nfs_inode_return_delegation(struct inode *inode); -int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid, - int (*validate_stateid)(struct nfs_delegation *delegation, - const nfs4_stateid *stateid)); +int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid); void nfs_inode_return_delegation_noreclaim(struct inode *inode); struct inode *nfs_delegation_find_inode(struct nfs_client *clp, const struct nfs_fh *fhandle); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 832e9e23932..29539ceeb74 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1652,16 +1652,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry, } } - /* - * ... prune child dentries and writebacks if needed. 
- */ - if (atomic_read(&old_dentry->d_count) > 1) { - if (S_ISREG(old_inode->i_mode)) - nfs_wb_all(old_inode); - shrink_dcache_parent(old_dentry); - } nfs_inode_return_delegation(old_inode); - if (new_inode != NULL) nfs_inode_return_delegation(new_inode); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index ad4cd31d605..064a8096167 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -69,6 +69,7 @@ struct nfs_direct_req { /* I/O parameters */ struct nfs_open_context *ctx; /* file open context info */ + struct nfs_lock_context *l_ctx; /* Lock context info */ struct kiocb * iocb; /* controlling i/o request */ struct inode * inode; /* target file of i/o */ @@ -160,6 +161,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void) INIT_LIST_HEAD(&dreq->rewrite_list); dreq->iocb = NULL; dreq->ctx = NULL; + dreq->l_ctx = NULL; spin_lock_init(&dreq->lock); atomic_set(&dreq->io_count, 0); dreq->count = 0; @@ -173,6 +175,8 @@ static void nfs_direct_req_free(struct kref *kref) { struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref); + if (dreq->l_ctx != NULL) + nfs_put_lock_context(dreq->l_ctx); if (dreq->ctx != NULL) put_nfs_open_context(dreq->ctx); kmem_cache_free(nfs_direct_cachep, dreq); @@ -336,6 +340,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); data->args.context = ctx; + data->args.lock_context = dreq->l_ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -416,24 +421,28 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, static ssize_t nfs_direct_read(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { - ssize_t result = 0; + ssize_t result = -ENOMEM; struct inode *inode = iocb->ki_filp->f_mapping->host; struct nfs_direct_req *dreq; dreq = nfs_direct_req_alloc(); - if (!dreq) - return -ENOMEM; + if (dreq == NULL) + goto out; dreq->inode = 
inode; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + dreq->l_ctx = nfs_get_lock_context(dreq->ctx); + if (dreq->l_ctx == NULL) + goto out_release; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; result = nfs_direct_read_schedule_iovec(dreq, iov, nr_segs, pos); if (!result) result = nfs_direct_wait(dreq); +out_release: nfs_direct_req_release(dreq); - +out: return result; } @@ -574,6 +583,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) data->args.offset = 0; data->args.count = 0; data->args.context = dreq->ctx; + data->args.lock_context = dreq->l_ctx; data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; @@ -761,6 +771,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); data->args.context = ctx; + data->args.lock_context = dreq->l_ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -845,7 +856,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos, size_t count) { - ssize_t result = 0; + ssize_t result = -ENOMEM; struct inode *inode = iocb->ki_filp->f_mapping->host; struct nfs_direct_req *dreq; size_t wsize = NFS_SERVER(inode)->wsize; @@ -853,7 +864,7 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, dreq = nfs_direct_req_alloc(); if (!dreq) - return -ENOMEM; + goto out; nfs_alloc_commit_data(dreq); if (dreq->commit_data == NULL || count < wsize) @@ -861,14 +872,18 @@ static ssize_t nfs_direct_write(struct kiocb *iocb, const struct iovec *iov, dreq->inode = inode; dreq->ctx = get_nfs_open_context(nfs_file_open_context(iocb->ki_filp)); + dreq->l_ctx = nfs_get_lock_context(dreq->ctx); + if (dreq->l_ctx == NULL) + goto out_release; if (!is_sync_kiocb(iocb)) dreq->iocb = iocb; result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, sync); if (!result) result =
nfs_direct_wait(dreq); +out_release: nfs_direct_req_release(dreq); - +out: return result; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index f036153d9f5..2d141a74ae8 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -203,37 +203,11 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) } /* - * Helper for nfs_file_flush() and nfs_file_fsync() - * - * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to - * disk, but it retrieves and clears ctx->error after synching, despite - * the two being set at the same time in nfs_context_set_write_error(). - * This is because the former is used to notify the _next_ call to - * nfs_file_write() that a write error occured, and hence cause it to - * fall back to doing a synchronous write. - */ -static int nfs_do_fsync(struct nfs_open_context *ctx, struct inode *inode) -{ - int have_error, status; - int ret = 0; - - have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); - status = nfs_wb_all(inode); - have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); - if (have_error) - ret = xchg(&ctx->error, 0); - if (!ret) - ret = status; - return ret; -} - -/* * Flush all dirty pages, and check for write errors. */ static int nfs_file_flush(struct file *file, fl_owner_t id) { - struct nfs_open_context *ctx = nfs_file_open_context(file); struct dentry *dentry = file->f_path.dentry; struct inode *inode = dentry->d_inode; @@ -246,7 +220,7 @@ nfs_file_flush(struct file *file, fl_owner_t id) return 0; /* Flush writes to the server and return any errors */ - return nfs_do_fsync(ctx, inode); + return vfs_fsync(file, 0); } static ssize_t @@ -321,6 +295,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma) * Flush any dirty pages for this process, and check for write errors. * The return status from this call provides a reliable indication of * whether any write errors occurred for this process. 
+ * + * Notice that it clears the NFS_CONTEXT_ERROR_WRITE before synching to + * disk, but it retrieves and clears ctx->error after synching, despite + * the two being set at the same time in nfs_context_set_write_error(). + * This is because the former is used to notify the _next_ call to + * nfs_file_write() that a write error occurred, and hence cause it to + * fall back to doing a synchronous write. */ static int nfs_file_fsync(struct file *file, int datasync) @@ -328,13 +309,23 @@ nfs_file_fsync(struct file *file, int datasync) struct dentry *dentry = file->f_path.dentry; struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = dentry->d_inode; + int have_error, status; + int ret = 0; + dprintk("NFS: fsync file(%s/%s) datasync %d\n", dentry->d_parent->d_name.name, dentry->d_name.name, datasync); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); - return nfs_do_fsync(ctx, inode); + have_error = test_and_clear_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + status = nfs_commit_inode(inode, FLUSH_SYNC); + have_error |= test_bit(NFS_CONTEXT_ERROR_WRITE, &ctx->flags); + if (have_error) + ret = xchg(&ctx->error, 0); + if (!ret) + ret = status; + return ret; } /* @@ -648,7 +639,7 @@ static ssize_t nfs_file_write(struct kiocb *iocb, const struct iovec *iov, /* Return error values for O_DSYNC and IS_SYNC() */ if (result >= 0 && nfs_need_sync_write(iocb->ki_filp, inode)) { - int err = nfs_do_fsync(nfs_file_open_context(iocb->ki_filp), inode); + int err = vfs_fsync(iocb->ki_filp, 0); if (err < 0) result = err; } @@ -684,7 +675,7 @@ static ssize_t nfs_file_splice_write(struct pipe_inode_info *pipe, written = ret; if (ret >= 0 && nfs_need_sync_write(filp, inode)) { - int err = nfs_do_fsync(nfs_file_open_context(filp), inode); + int err = vfs_fsync(filp, 0); if (err < 0) ret = err; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 099b3518fee..581d8f081e6 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -413,10 +413,8 @@ nfs_setattr(struct dentry *dentry,
struct iattr *attr) return 0; /* Write all dirty data */ - if (S_ISREG(inode->i_mode)) { - filemap_write_and_wait(inode->i_mapping); + if (S_ISREG(inode->i_mode)) nfs_wb_all(inode); - } fattr = nfs_alloc_fattr(); if (fattr == NULL) @@ -530,6 +528,68 @@ out: return err; } +static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) +{ + atomic_set(&l_ctx->count, 1); + l_ctx->lockowner = current->files; + l_ctx->pid = current->tgid; + INIT_LIST_HEAD(&l_ctx->list); +} + +static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *pos; + + list_for_each_entry(pos, &ctx->lock_context.list, list) { + if (pos->lockowner != current->files) + continue; + if (pos->pid != current->tgid) + continue; + atomic_inc(&pos->count); + return pos; + } + return NULL; +} + +struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) +{ + struct nfs_lock_context *res, *new = NULL; + struct inode *inode = ctx->path.dentry->d_inode; + + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + spin_unlock(&inode->i_lock); + new = kmalloc(sizeof(*new), GFP_KERNEL); + if (new == NULL) + return NULL; + nfs_init_lock_context(new); + spin_lock(&inode->i_lock); + res = __nfs_find_lock_context(ctx); + if (res == NULL) { + list_add_tail(&new->list, &ctx->lock_context.list); + new->open_context = ctx; + res = new; + new = NULL; + } + } + spin_unlock(&inode->i_lock); + kfree(new); + return res; +} + +void nfs_put_lock_context(struct nfs_lock_context *l_ctx) +{ + struct nfs_open_context *ctx = l_ctx->open_context; + struct inode *inode = ctx->path.dentry->d_inode; + + if (!atomic_dec_and_lock(&l_ctx->count, &inode->i_lock)) + return; + list_del(&l_ctx->list); + spin_unlock(&inode->i_lock); + kfree(l_ctx); +} + /** * nfs_close_context - Common close_context() routine NFSv2/v3 * @ctx: pointer to context @@ -566,11 +626,11 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path 
*path, struct path_get(&ctx->path); ctx->cred = get_rpccred(cred); ctx->state = NULL; - ctx->lockowner = current->files; ctx->flags = 0; ctx->error = 0; ctx->dir_cookie = 0; - atomic_set(&ctx->count, 1); + nfs_init_lock_context(&ctx->lock_context); + ctx->lock_context.open_context = ctx; } return ctx; } @@ -578,7 +638,7 @@ static struct nfs_open_context *alloc_nfs_open_context(struct path *path, struct struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx) { if (ctx != NULL) - atomic_inc(&ctx->count); + atomic_inc(&ctx->lock_context.count); return ctx; } @@ -586,7 +646,7 @@ static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync) { struct inode *inode = ctx->path.dentry->d_inode; - if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock)) + if (!atomic_dec_and_lock(&ctx->lock_context.count, &inode->i_lock)) return; list_del(&ctx->list); spin_unlock(&inode->i_lock); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index e70f44b9b3f..4c2150d8671 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -370,10 +370,9 @@ unsigned int nfs_page_array_len(unsigned int base, size_t len) * Helper for restarting RPC calls in the possible presence of NFSv4.1 * sessions. 
*/ -static inline void nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) +static inline int nfs_restart_rpc(struct rpc_task *task, const struct nfs_client *clp) { if (nfs4_has_session(clp)) - rpc_restart_call_prepare(task); - else - rpc_restart_call(task); + return rpc_restart_call_prepare(task); + return rpc_restart_call(task); } diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 81cf1425791..db8846a0e82 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -233,7 +233,7 @@ nfs_xdr_removeargs(struct rpc_rqst *req, __be32 *p, const struct nfs_removeargs static int nfs_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; u32 offset = (u32)args->offset; u32 count = args->count; @@ -393,8 +393,7 @@ nfs_xdr_symlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_symlinkargs *arg static int nfs_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs_readdirargs *args) { - struct rpc_task *task = req->rq_task; - struct rpc_auth *auth = task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; u32 count = args->count; @@ -575,7 +574,7 @@ nfs_xdr_diropres(struct rpc_rqst *req, __be32 *p, struct nfs_diropok *res) static int nfs_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs_readlinkargs *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; p = xdr_encode_fhandle(p, args->fh); diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 75dcfc7da36..9769704f8ce 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -330,7 +330,7 @@ nfs3_xdr_accessargs(struct rpc_rqst *req, __be32 *p, struct nfs3_accessargs *arg static int nfs3_xdr_readargs(struct rpc_rqst *req, __be32 *p, struct nfs_readargs *args) { - struct rpc_auth *auth = 
req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; u32 count = args->count; @@ -471,7 +471,7 @@ nfs3_xdr_linkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_linkargs *args) static int nfs3_xdr_readdirargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readdirargs *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; u32 count = args->count; @@ -675,7 +675,7 @@ static int nfs3_xdr_getaclargs(struct rpc_rqst *req, __be32 *p, struct nfs3_getaclargs *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; p = xdr_encode_fhandle(p, args->fh); @@ -802,7 +802,7 @@ nfs3_xdr_accessres(struct rpc_rqst *req, __be32 *p, struct nfs3_accessres *res) static int nfs3_xdr_readlinkargs(struct rpc_rqst *req, __be32 *p, struct nfs3_readlinkargs *args) { - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; unsigned int replen; p = xdr_encode_fhandle(p, args->fh); diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index c538c6106e1..311e15cc8af 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -45,10 +45,29 @@ enum nfs4_client_state { NFS4CLNT_RECLAIM_NOGRACE, NFS4CLNT_DELEGRETURN, NFS4CLNT_SESSION_RESET, - NFS4CLNT_SESSION_DRAINING, NFS4CLNT_RECALL_SLOT, }; +enum nfs4_session_state { + NFS4_SESSION_INITING, + NFS4_SESSION_DRAINING, +}; + +struct nfs4_minor_version_ops { + u32 minor_version; + + int (*call_sync)(struct nfs_server *server, + struct rpc_message *msg, + struct nfs4_sequence_args *args, + struct nfs4_sequence_res *res, + int cache_reply); + int (*validate_stateid)(struct nfs_delegation *, + const nfs4_stateid *); + const struct nfs4_state_recovery_ops *reboot_recovery_ops; + const struct nfs4_state_recovery_ops *nograce_recovery_ops; + const struct nfs4_state_maintenance_ops 
*state_renewal_ops; +}; + /* * struct rpc_sequence ensures that RPC calls are sent in the exact * order that they appear on the list. @@ -89,7 +108,6 @@ struct nfs_unique_id { */ struct nfs4_state_owner { struct nfs_unique_id so_owner_id; - struct nfs_client *so_client; struct nfs_server *so_server; struct rb_node so_client_node; @@ -99,7 +117,6 @@ struct nfs4_state_owner { atomic_t so_count; unsigned long so_flags; struct list_head so_states; - struct list_head so_delegations; struct nfs_seqid_counter so_seqid; struct rpc_sequence so_sequence; }; @@ -125,10 +142,20 @@ enum { * LOCK: one nfs4_state (LOCK) to hold the lock stateid nfs4_state(OPEN) */ +struct nfs4_lock_owner { + unsigned int lo_type; +#define NFS4_ANY_LOCK_TYPE (0U) +#define NFS4_FLOCK_LOCK_TYPE (1U << 0) +#define NFS4_POSIX_LOCK_TYPE (1U << 1) + union { + fl_owner_t posix_owner; + pid_t flock_owner; + } lo_u; +}; + struct nfs4_lock_state { struct list_head ls_locks; /* Other lock stateids */ struct nfs4_state * ls_state; /* Pointer to open state */ - fl_owner_t ls_owner; /* POSIX lock owner */ #define NFS_LOCK_INITIALIZED 1 int ls_flags; struct nfs_seqid_counter ls_seqid; @@ -136,6 +163,7 @@ struct nfs4_lock_state { struct nfs_unique_id ls_id; nfs4_stateid ls_stateid; atomic_t ls_count; + struct nfs4_lock_owner ls_owner; }; /* bits for nfs4_state->flags */ @@ -219,11 +247,15 @@ extern int nfs4_open_revalidate(struct inode *, struct dentry *, int, struct nam extern int nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle); extern int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name, struct nfs4_fs_locations *fs_locations, struct page *page); +extern void nfs4_release_lockowner(const struct nfs4_lock_state *); -extern struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[]; -extern struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[]; #if defined(CONFIG_NFS_V4_1) -extern int nfs4_setup_sequence(struct nfs_client *clp, +static inline struct nfs4_session 
*nfs4_get_session(const struct nfs_server *server) +{ + return server->nfs_client->cl_session; +} + +extern int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply, struct rpc_task *task); extern void nfs4_destroy_session(struct nfs4_session *session); @@ -234,7 +266,12 @@ extern int nfs4_init_session(struct nfs_server *server); extern int nfs4_proc_get_lease_time(struct nfs_client *clp, struct nfs_fsinfo *fsinfo); #else /* CONFIG_NFS_v4_1 */ -static inline int nfs4_setup_sequence(struct nfs_client *clp, +static inline struct nfs4_session *nfs4_get_session(const struct nfs_server *server) +{ + return NULL; +} + +static inline int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply, struct rpc_task *task) { @@ -247,7 +284,7 @@ static inline int nfs4_init_session(struct nfs_server *server) } #endif /* CONFIG_NFS_V4_1 */ -extern struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[]; +extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; extern const u32 nfs4_fattr_bitmap[2]; extern const u32 nfs4_statfs_bitmap[2]; @@ -284,7 +321,7 @@ extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) extern void nfs41_handle_recall_slot(struct nfs_client *clp); extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); -extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t); +extern void nfs4_copy_stateid(nfs4_stateid *, struct nfs4_state *, fl_owner_t, pid_t); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); extern int nfs_wait_on_sequence(struct nfs_seqid *seqid, struct rpc_task *task); diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 70015dd60a9..7ffbb98ddec 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ 
-303,15 +303,19 @@ do_state_recovery: } -static void renew_lease(const struct nfs_server *server, unsigned long timestamp) +static void do_renew_lease(struct nfs_client *clp, unsigned long timestamp) { - struct nfs_client *clp = server->nfs_client; spin_lock(&clp->cl_lock); if (time_before(clp->cl_last_renewal,timestamp)) clp->cl_last_renewal = timestamp; spin_unlock(&clp->cl_lock); } +static void renew_lease(const struct nfs_server *server, unsigned long timestamp) +{ + do_renew_lease(server->nfs_client, timestamp); +} + #if defined(CONFIG_NFS_V4_1) /* @@ -356,7 +360,7 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses) { struct rpc_task *task; - if (!test_bit(NFS4CLNT_SESSION_DRAINING, &ses->clp->cl_state)) { + if (!test_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { task = rpc_wake_up_next(&ses->fc_slot_table.slot_tbl_waitq); if (task) rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); @@ -370,12 +374,11 @@ static void nfs41_check_drain_session_complete(struct nfs4_session *ses) complete(&ses->complete); } -static void nfs41_sequence_free_slot(const struct nfs_client *clp, - struct nfs4_sequence_res *res) +static void nfs41_sequence_free_slot(struct nfs4_sequence_res *res) { struct nfs4_slot_table *tbl; - tbl = &clp->cl_session->fc_slot_table; + tbl = &res->sr_session->fc_slot_table; if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) { /* just wake up the next guy waiting since * we may have not consumed a slot after all */ @@ -385,18 +388,17 @@ static void nfs41_sequence_free_slot(const struct nfs_client *clp, spin_lock(&tbl->slot_tbl_lock); nfs4_free_slot(tbl, res->sr_slotid); - nfs41_check_drain_session_complete(clp->cl_session); + nfs41_check_drain_session_complete(res->sr_session); spin_unlock(&tbl->slot_tbl_lock); res->sr_slotid = NFS4_MAX_SLOT_TABLE; } -static void nfs41_sequence_done(struct nfs_client *clp, - struct nfs4_sequence_res *res, - int rpc_status) +static int nfs41_sequence_done(struct rpc_task *task, struct 
nfs4_sequence_res *res) { unsigned long timestamp; struct nfs4_slot_table *tbl; struct nfs4_slot *slot; + struct nfs_client *clp; /* * sr_status remains 1 if an RPC level error occurred. The server @@ -411,25 +413,51 @@ static void nfs41_sequence_done(struct nfs_client *clp, if (res->sr_slotid == NFS4_MAX_SLOT_TABLE) goto out; + tbl = &res->sr_session->fc_slot_table; + slot = tbl->slots + res->sr_slotid; + /* Check the SEQUENCE operation status */ - if (res->sr_status == 0) { - tbl = &clp->cl_session->fc_slot_table; - slot = tbl->slots + res->sr_slotid; + switch (res->sr_status) { + case 0: /* Update the slot's sequence and clientid lease timer */ ++slot->seq_nr; timestamp = res->sr_renewal_time; - spin_lock(&clp->cl_lock); - if (time_before(clp->cl_last_renewal, timestamp)) - clp->cl_last_renewal = timestamp; - spin_unlock(&clp->cl_lock); + clp = res->sr_session->clp; + do_renew_lease(clp, timestamp); /* Check sequence flags */ if (atomic_read(&clp->cl_count) > 1) nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags); + break; + case -NFS4ERR_DELAY: + /* The server detected a resend of the RPC call and + * returned NFS4ERR_DELAY as per Section 2.10.6.2 + * of RFC5661. + */ + dprintk("%s: slot=%d seq=%d: Operation in progress\n", + __func__, res->sr_slotid, slot->seq_nr); + goto out_retry; + default: + /* Just update the slot sequence no. */ + ++slot->seq_nr; } out: /* The session may be reset by one of the error handlers. 
*/ dprintk("%s: Error %d free the slot \n", __func__, res->sr_status); - nfs41_sequence_free_slot(clp, res); + nfs41_sequence_free_slot(res); + return 1; +out_retry: + if (!rpc_restart_call(task)) + goto out; + rpc_delay(task, NFS4_POLL_RETRY_MAX); + return 0; +} + +static int nfs4_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + if (res->sr_session == NULL) + return 1; + return nfs41_sequence_done(task, res); } /* @@ -480,12 +508,11 @@ static int nfs41_setup_sequence(struct nfs4_session *session, if (res->sr_slotid != NFS4_MAX_SLOT_TABLE) return 0; - memset(res, 0, sizeof(*res)); res->sr_slotid = NFS4_MAX_SLOT_TABLE; tbl = &session->fc_slot_table; spin_lock(&tbl->slot_tbl_lock); - if (test_bit(NFS4CLNT_SESSION_DRAINING, &session->clp->cl_state) && + if (test_bit(NFS4_SESSION_DRAINING, &session->session_state) && !rpc_task_has_priority(task, RPC_PRIORITY_PRIVILEGED)) { /* * The state manager will wait until the slot table is empty. @@ -525,6 +552,7 @@ static int nfs41_setup_sequence(struct nfs4_session *session, res->sr_session = session; res->sr_slotid = slotid; res->sr_renewal_time = jiffies; + res->sr_status_flags = 0; /* * sr_status is only set in decode_sequence, and so will remain * set to 1 if an rpc level failure occurs. 
@@ -533,33 +561,33 @@ static int nfs41_setup_sequence(struct nfs4_session *session, return 0; } -int nfs4_setup_sequence(struct nfs_client *clp, +int nfs4_setup_sequence(const struct nfs_server *server, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, int cache_reply, struct rpc_task *task) { + struct nfs4_session *session = nfs4_get_session(server); int ret = 0; + if (session == NULL) { + args->sa_session = NULL; + res->sr_session = NULL; + goto out; + } + dprintk("--> %s clp %p session %p sr_slotid %d\n", - __func__, clp, clp->cl_session, res->sr_slotid); + __func__, session->clp, session, res->sr_slotid); - if (!nfs4_has_session(clp)) - goto out; - ret = nfs41_setup_sequence(clp->cl_session, args, res, cache_reply, + ret = nfs41_setup_sequence(session, args, res, cache_reply, task); - if (ret && ret != -EAGAIN) { - /* terminate rpc task */ - task->tk_status = ret; - task->tk_action = NULL; - } out: dprintk("<-- %s status=%d\n", __func__, ret); return ret; } struct nfs41_call_sync_data { - struct nfs_client *clp; + const struct nfs_server *seq_server; struct nfs4_sequence_args *seq_args; struct nfs4_sequence_res *seq_res; int cache_reply; @@ -569,9 +597,9 @@ static void nfs41_call_sync_prepare(struct rpc_task *task, void *calldata) { struct nfs41_call_sync_data *data = calldata; - dprintk("--> %s data->clp->cl_session %p\n", __func__, - data->clp->cl_session); - if (nfs4_setup_sequence(data->clp, data->seq_args, + dprintk("--> %s data->seq_server %p\n", __func__, data->seq_server); + + if (nfs4_setup_sequence(data->seq_server, data->seq_args, data->seq_res, data->cache_reply, task)) return; rpc_call_start(task); @@ -587,7 +615,7 @@ static void nfs41_call_sync_done(struct rpc_task *task, void *calldata) { struct nfs41_call_sync_data *data = calldata; - nfs41_sequence_done(data->clp, data->seq_res, task->tk_status); + nfs41_sequence_done(task, data->seq_res); } struct rpc_call_ops nfs41_call_sync_ops = { @@ -600,8 +628,7 @@ struct rpc_call_ops 
nfs41_call_priv_sync_ops = { .rpc_call_done = nfs41_call_sync_done, }; -static int nfs4_call_sync_sequence(struct nfs_client *clp, - struct rpc_clnt *clnt, +static int nfs4_call_sync_sequence(struct nfs_server *server, struct rpc_message *msg, struct nfs4_sequence_args *args, struct nfs4_sequence_res *res, @@ -611,13 +638,13 @@ static int nfs4_call_sync_sequence(struct nfs_client *clp, int ret; struct rpc_task *task; struct nfs41_call_sync_data data = { - .clp = clp, + .seq_server = server, .seq_args = args, .seq_res = res, .cache_reply = cache_reply, }; struct rpc_task_setup task_setup = { - .rpc_client = clnt, + .rpc_client = server->client, .rpc_message = msg, .callback_ops = &nfs41_call_sync_ops, .callback_data = &data @@ -642,10 +669,15 @@ int _nfs4_call_sync_session(struct nfs_server *server, struct nfs4_sequence_res *res, int cache_reply) { - return nfs4_call_sync_sequence(server->nfs_client, server->client, - msg, args, res, cache_reply, 0); + return nfs4_call_sync_sequence(server, msg, args, res, cache_reply, 0); } +#else +static int nfs4_sequence_done(struct rpc_task *task, + struct nfs4_sequence_res *res) +{ + return 1; +} #endif /* CONFIG_NFS_V4_1 */ int _nfs4_call_sync(struct nfs_server *server, @@ -659,18 +691,9 @@ int _nfs4_call_sync(struct nfs_server *server, } #define nfs4_call_sync(server, msg, args, res, cache_reply) \ - (server)->nfs_client->cl_call_sync((server), (msg), &(args)->seq_args, \ + (server)->nfs_client->cl_mvops->call_sync((server), (msg), &(args)->seq_args, \ &(res)->seq_res, (cache_reply)) -static void nfs4_sequence_done(const struct nfs_server *server, - struct nfs4_sequence_res *res, int rpc_status) -{ -#ifdef CONFIG_NFS_V4_1 - if (nfs4_has_session(server->nfs_client)) - nfs41_sequence_done(server->nfs_client, res, rpc_status); -#endif /* CONFIG_NFS_V4_1 */ -} - static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) { struct nfs_inode *nfsi = NFS_I(dir); @@ -745,19 +768,14 @@ static struct nfs4_opendata 
*nfs4_opendata_alloc(struct path *path, p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; - if (flags & O_EXCL) { - if (nfs4_has_persistent_session(server->nfs_client)) { - /* GUARDED */ - p->o_arg.u.attrs = &p->attrs; - memcpy(&p->attrs, attrs, sizeof(p->attrs)); - } else { /* EXCLUSIVE4_1 */ - u32 *s = (u32 *) p->o_arg.u.verifier.data; - s[0] = jiffies; - s[1] = current->pid; - } - } else if (flags & O_CREAT) { + if (flags & O_CREAT) { + u32 *s; + p->o_arg.u.attrs = &p->attrs; memcpy(&p->attrs, attrs, sizeof(p->attrs)); + s = (u32 *) p->o_arg.u.verifier.data; + s[0] = jiffies; + s[1] = current->pid; } p->c_arg.fh = &p->o_res.fh; p->c_arg.stateid = &p->o_res.stateid; @@ -1255,8 +1273,6 @@ static void nfs4_open_confirm_done(struct rpc_task *task, void *calldata) struct nfs4_opendata *data = calldata; data->rpc_status = task->tk_status; - if (RPC_ASSASSINATED(task)) - return; if (data->rpc_status == 0) { memcpy(data->o_res.stateid.data, data->c_res.stateid.data, sizeof(data->o_res.stateid.data)); @@ -1356,13 +1372,13 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) } /* Update sequence id. 
*/ data->o_arg.id = sp->so_owner_id.id; - data->o_arg.clientid = sp->so_client->cl_clientid; + data->o_arg.clientid = sp->so_server->nfs_client->cl_clientid; if (data->o_arg.claim == NFS4_OPEN_CLAIM_PREVIOUS) { task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_OPEN_NOATTR]; nfs_copy_fh(&data->o_res.fh, data->o_arg.fh); } data->timestamp = jiffies; - if (nfs4_setup_sequence(data->o_arg.server->nfs_client, + if (nfs4_setup_sequence(data->o_arg.server, &data->o_arg.seq_args, &data->o_res.seq_res, 1, task)) return; @@ -1385,11 +1401,9 @@ static void nfs4_open_done(struct rpc_task *task, void *calldata) data->rpc_status = task->tk_status; - nfs4_sequence_done(data->o_arg.server, &data->o_res.seq_res, - task->tk_status); - - if (RPC_ASSASSINATED(task)) + if (!nfs4_sequence_done(task, &data->o_res.seq_res)) return; + if (task->tk_status == 0) { switch (data->o_res.f_attr->mode & S_IFMT) { case S_IFREG: @@ -1773,7 +1787,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, if (nfs4_copy_delegation_stateid(&arg.stateid, inode)) { /* Use that stateid */ } else if (state != NULL) { - nfs4_copy_stateid(&arg.stateid, state, current->files); + nfs4_copy_stateid(&arg.stateid, state, current->files, current->tgid); } else memcpy(&arg.stateid, &zero_stateid, sizeof(arg.stateid)); @@ -1838,8 +1852,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) struct nfs4_state *state = calldata->state; struct nfs_server *server = NFS_SERVER(calldata->inode); - nfs4_sequence_done(server, &calldata->res.seq_res, task->tk_status); - if (RPC_ASSASSINATED(task)) + if (!nfs4_sequence_done(task, &calldata->res.seq_res)) return; /* hmm. we are done with the inode, and in the process of freeing * the state_owner. 
we keep this around to process errors @@ -1903,7 +1916,7 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) nfs_fattr_init(calldata->res.fattr); calldata->timestamp = jiffies; - if (nfs4_setup_sequence((NFS_SERVER(calldata->inode))->nfs_client, + if (nfs4_setup_sequence(NFS_SERVER(calldata->inode), &calldata->arg.seq_args, &calldata->res.seq_res, 1, task)) return; @@ -2648,7 +2661,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) { struct nfs_removeres *res = task->tk_msg.rpc_resp; - nfs4_sequence_done(res->server, &res->seq_res, task->tk_status); + if (!nfs4_sequence_done(task, &res->seq_res)) + return 0; if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) return 0; update_changeattr(dir, &res->cinfo); @@ -3093,7 +3107,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_read_data *data) dprintk("--> %s\n", __func__); - nfs4_sequence_done(server, &data->res.seq_res, task->tk_status); + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; if (nfs4_async_handle_error(task, server, data->args.context->state) == -EAGAIN) { nfs_restart_rpc(task, server->nfs_client); @@ -3116,8 +3131,8 @@ static int nfs4_write_done(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->inode; - nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, - task->tk_status); + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; if (nfs4_async_handle_error(task, NFS_SERVER(inode), data->args.context->state) == -EAGAIN) { nfs_restart_rpc(task, NFS_SERVER(inode)->nfs_client); @@ -3145,8 +3160,9 @@ static int nfs4_commit_done(struct rpc_task *task, struct nfs_write_data *data) { struct inode *inode = data->inode; - nfs4_sequence_done(NFS_SERVER(inode), &data->res.seq_res, - task->tk_status); + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return -EAGAIN; + if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { nfs_restart_rpc(task, 
NFS_SERVER(inode)->nfs_client); return -EAGAIN; @@ -3196,10 +3212,7 @@ static void nfs4_renew_done(struct rpc_task *task, void *calldata) nfs4_schedule_state_recovery(clp); return; } - spin_lock(&clp->cl_lock); - if (time_before(clp->cl_last_renewal,timestamp)) - clp->cl_last_renewal = timestamp; - spin_unlock(&clp->cl_lock); + do_renew_lease(clp, timestamp); } static const struct rpc_call_ops nfs4_renew_ops = { @@ -3240,10 +3253,7 @@ int nfs4_proc_renew(struct nfs_client *clp, struct rpc_cred *cred) status = rpc_call_sync(clp->cl_rpcclient, &msg, 0); if (status < 0) return status; - spin_lock(&clp->cl_lock); - if (time_before(clp->cl_last_renewal,now)) - clp->cl_last_renewal = now; - spin_unlock(&clp->cl_lock); + do_renew_lease(clp, now); return 0; } @@ -3464,9 +3474,11 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, size_t buflen } static int -_nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs_client *clp, struct nfs4_state *state) +nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) { - if (!clp || task->tk_status >= 0) + struct nfs_client *clp = server->nfs_client; + + if (task->tk_status >= 0) return 0; switch(task->tk_status) { case -NFS4ERR_ADMIN_REVOKED: @@ -3498,8 +3510,7 @@ _nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, return -EAGAIN; #endif /* CONFIG_NFS_V4_1 */ case -NFS4ERR_DELAY: - if (server) - nfs_inc_server_stats(server, NFSIOS_DELAY); + nfs_inc_server_stats(server, NFSIOS_DELAY); case -NFS4ERR_GRACE: case -EKEYEXPIRED: rpc_delay(task, NFS4_POLL_RETRY_MAX); @@ -3520,12 +3531,6 @@ do_state_recovery: return -EAGAIN; } -static int -nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) -{ - return _nfs4_async_handle_error(task, server, server->nfs_client, state); -} - int nfs4_proc_setclientid(struct nfs_client *clp, u32 program, unsigned short port, 
struct rpc_cred *cred, struct nfs4_setclientid_res *res) @@ -3641,8 +3646,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) { struct nfs4_delegreturndata *data = calldata; - nfs4_sequence_done(data->res.server, &data->res.seq_res, - task->tk_status); + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; switch (task->tk_status) { case -NFS4ERR_STALE_STATEID: @@ -3672,7 +3677,7 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) d_data = (struct nfs4_delegreturndata *)data; - if (nfs4_setup_sequence(d_data->res.server->nfs_client, + if (nfs4_setup_sequence(d_data->res.server, &d_data->args.seq_args, &d_data->res.seq_res, 1, task)) return; @@ -3892,9 +3897,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) { struct nfs4_unlockdata *calldata = data; - nfs4_sequence_done(calldata->server, &calldata->res.seq_res, - task->tk_status); - if (RPC_ASSASSINATED(task)) + if (!nfs4_sequence_done(task, &calldata->res.seq_res)) return; switch (task->tk_status) { case 0: @@ -3927,7 +3930,7 @@ static void nfs4_locku_prepare(struct rpc_task *task, void *data) return; } calldata->timestamp = jiffies; - if (nfs4_setup_sequence(calldata->server->nfs_client, + if (nfs4_setup_sequence(calldata->server, &calldata->arg.seq_args, &calldata->res.seq_res, 1, task)) return; @@ -4082,7 +4085,8 @@ static void nfs4_lock_prepare(struct rpc_task *task, void *calldata) } else data->arg.new_lock_owner = 0; data->timestamp = jiffies; - if (nfs4_setup_sequence(data->server->nfs_client, &data->arg.seq_args, + if (nfs4_setup_sequence(data->server, + &data->arg.seq_args, &data->res.seq_res, 1, task)) return; rpc_call_start(task); @@ -4101,12 +4105,10 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata) dprintk("%s: begin!\n", __func__); - nfs4_sequence_done(data->server, &data->res.seq_res, - task->tk_status); + if (!nfs4_sequence_done(task, &data->res.seq_res)) + return; data->rpc_status = task->tk_status; - if 
(RPC_ASSASSINATED(task)) - goto out; if (data->arg.new_lock_owner != 0) { if (data->rpc_status == 0) nfs_confirm_seqid(&data->lsp->ls_seqid, 0); @@ -4424,6 +4426,34 @@ out: return err; } +static void nfs4_release_lockowner_release(void *calldata) +{ + kfree(calldata); +} + +const struct rpc_call_ops nfs4_release_lockowner_ops = { + .rpc_release = nfs4_release_lockowner_release, +}; + +void nfs4_release_lockowner(const struct nfs4_lock_state *lsp) +{ + struct nfs_server *server = lsp->ls_state->owner->so_server; + struct nfs_release_lockowner_args *args; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_RELEASE_LOCKOWNER], + }; + + if (server->nfs_client->cl_mvops->minor_version != 0) + return; + args = kmalloc(sizeof(*args), GFP_NOFS); + if (!args) + return; + args->lock_owner.clientid = server->nfs_client->cl_clientid; + args->lock_owner.id = lsp->ls_id.id; + msg.rpc_argp = args; + rpc_call_async(server->client, &msg, 0, &nfs4_release_lockowner_ops, args); +} + #define XATTR_NAME_NFSV4_ACL "system.nfs4_acl" int nfs4_setxattr(struct dentry *dentry, const char *key, const void *buf, @@ -4611,7 +4641,8 @@ static void nfs4_get_lease_time_done(struct rpc_task *task, void *calldata) (struct nfs4_get_lease_time_data *)calldata; dprintk("--> %s\n", __func__); - nfs41_sequence_done(data->clp, &data->res->lr_seq_res, task->tk_status); + if (!nfs41_sequence_done(task, &data->res->lr_seq_res)) + return; switch (task->tk_status) { case -NFS4ERR_DELAY: case -NFS4ERR_GRACE: @@ -4805,13 +4836,6 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) if (!session) return NULL; - /* - * The create session reply races with the server back - * channel probe. 
Mark the client NFS_CS_SESSION_INITING - * so that the client back channel can find the - * nfs_client struct - */ - clp->cl_cons_state = NFS_CS_SESSION_INITING; init_completion(&session->complete); tbl = &session->fc_slot_table; @@ -4824,6 +4848,8 @@ struct nfs4_session *nfs4_alloc_session(struct nfs_client *clp) spin_lock_init(&tbl->slot_tbl_lock); rpc_init_wait_queue(&tbl->slot_tbl_waitq, "BackChannel Slot table"); + session->session_state = 1<<NFS4_SESSION_INITING; + session->clp = clp; return session; } @@ -5040,6 +5066,10 @@ int nfs4_init_session(struct nfs_server *server) if (!nfs4_has_session(clp)) return 0; + session = clp->cl_session; + if (!test_and_clear_bit(NFS4_SESSION_INITING, &session->session_state)) + return 0; + rsize = server->rsize; if (rsize == 0) rsize = NFS_MAX_FILE_IO_SIZE; @@ -5047,7 +5077,6 @@ int nfs4_init_session(struct nfs_server *server) if (wsize == 0) wsize = NFS_MAX_FILE_IO_SIZE; - session = clp->cl_session; session->fc_attrs.max_rqst_sz = wsize + nfs41_maxwrite_overhead; session->fc_attrs.max_resp_sz = rsize + nfs41_maxread_overhead; @@ -5060,69 +5089,70 @@ int nfs4_init_session(struct nfs_server *server) /* * Renew the cl_session lease. 
*/ -static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) -{ +struct nfs4_sequence_data { + struct nfs_client *clp; struct nfs4_sequence_args args; struct nfs4_sequence_res res; - - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], - .rpc_argp = &args, - .rpc_resp = &res, - .rpc_cred = cred, - }; - - args.sa_cache_this = 0; - - return nfs4_call_sync_sequence(clp, clp->cl_rpcclient, &msg, &args, - &res, args.sa_cache_this, 1); -} +}; static void nfs41_sequence_release(void *data) { - struct nfs_client *clp = (struct nfs_client *)data; + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; if (atomic_read(&clp->cl_count) > 1) nfs4_schedule_state_renewal(clp); nfs_put_client(clp); + kfree(calldata); +} + +static int nfs41_sequence_handle_errors(struct rpc_task *task, struct nfs_client *clp) +{ + switch(task->tk_status) { + case -NFS4ERR_DELAY: + case -EKEYEXPIRED: + rpc_delay(task, NFS4_POLL_RETRY_MAX); + return -EAGAIN; + default: + nfs4_schedule_state_recovery(clp); + } + return 0; } static void nfs41_sequence_call_done(struct rpc_task *task, void *data) { - struct nfs_client *clp = (struct nfs_client *)data; + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; - nfs41_sequence_done(clp, task->tk_msg.rpc_resp, task->tk_status); + if (!nfs41_sequence_done(task, task->tk_msg.rpc_resp)) + return; if (task->tk_status < 0) { dprintk("%s ERROR %d\n", __func__, task->tk_status); if (atomic_read(&clp->cl_count) == 1) goto out; - if (_nfs4_async_handle_error(task, NULL, clp, NULL) - == -EAGAIN) { - nfs_restart_rpc(task, clp); + if (nfs41_sequence_handle_errors(task, clp) == -EAGAIN) { + rpc_restart_call_prepare(task); return; } } dprintk("%s rpc_cred %p\n", __func__, task->tk_msg.rpc_cred); out: - kfree(task->tk_msg.rpc_argp); - kfree(task->tk_msg.rpc_resp); - dprintk("<-- %s\n", __func__); } static void nfs41_sequence_prepare(struct rpc_task *task, 
void *data) { - struct nfs_client *clp; + struct nfs4_sequence_data *calldata = data; + struct nfs_client *clp = calldata->clp; struct nfs4_sequence_args *args; struct nfs4_sequence_res *res; - clp = (struct nfs_client *)data; args = task->tk_msg.rpc_argp; res = task->tk_msg.rpc_resp; - if (nfs4_setup_sequence(clp, args, res, 0, task)) + if (nfs41_setup_sequence(clp->cl_session, args, res, 0, task)) return; rpc_call_start(task); } @@ -5133,32 +5163,67 @@ static const struct rpc_call_ops nfs41_sequence_ops = { .rpc_release = nfs41_sequence_release, }; -static int nfs41_proc_async_sequence(struct nfs_client *clp, - struct rpc_cred *cred) +static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) { - struct nfs4_sequence_args *args; - struct nfs4_sequence_res *res; + struct nfs4_sequence_data *calldata; struct rpc_message msg = { .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEQUENCE], .rpc_cred = cred, }; + struct rpc_task_setup task_setup_data = { + .rpc_client = clp->cl_rpcclient, + .rpc_message = &msg, + .callback_ops = &nfs41_sequence_ops, + .flags = RPC_TASK_ASYNC | RPC_TASK_SOFT, + }; if (!atomic_inc_not_zero(&clp->cl_count)) - return -EIO; - args = kzalloc(sizeof(*args), GFP_NOFS); - res = kzalloc(sizeof(*res), GFP_NOFS); - if (!args || !res) { - kfree(args); - kfree(res); + return ERR_PTR(-EIO); + calldata = kmalloc(sizeof(*calldata), GFP_NOFS); + if (calldata == NULL) { nfs_put_client(clp); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } - res->sr_slotid = NFS4_MAX_SLOT_TABLE; - msg.rpc_argp = args; - msg.rpc_resp = res; + calldata->res.sr_slotid = NFS4_MAX_SLOT_TABLE; + msg.rpc_argp = &calldata->args; + msg.rpc_resp = &calldata->res; + calldata->clp = clp; + task_setup_data.callback_data = calldata; - return rpc_call_async(clp->cl_rpcclient, &msg, RPC_TASK_SOFT, - &nfs41_sequence_ops, (void *)clp); + return rpc_run_task(&task_setup_data); +} + +static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cred) 
+{ + struct rpc_task *task; + int ret = 0; + + task = _nfs41_proc_sequence(clp, cred); + if (IS_ERR(task)) + ret = PTR_ERR(task); + else + rpc_put_task(task); + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; +} + +static int nfs4_proc_sequence(struct nfs_client *clp, struct rpc_cred *cred) +{ + struct rpc_task *task; + int ret; + + task = _nfs41_proc_sequence(clp, cred); + if (IS_ERR(task)) { + ret = PTR_ERR(task); + goto out; + } + ret = rpc_wait_for_completion_task(task); + if (!ret) + ret = task->tk_status; + rpc_put_task(task); +out: + dprintk("<-- %s status=%d\n", __func__, ret); + return ret; } struct nfs4_reclaim_complete_data { @@ -5172,13 +5237,31 @@ static void nfs4_reclaim_complete_prepare(struct rpc_task *task, void *data) struct nfs4_reclaim_complete_data *calldata = data; rpc_task_set_priority(task, RPC_PRIORITY_PRIVILEGED); - if (nfs4_setup_sequence(calldata->clp, &calldata->arg.seq_args, + if (nfs41_setup_sequence(calldata->clp->cl_session, + &calldata->arg.seq_args, &calldata->res.seq_res, 0, task)) return; rpc_call_start(task); } +static int nfs41_reclaim_complete_handle_errors(struct rpc_task *task, struct nfs_client *clp) +{ + switch(task->tk_status) { + case 0: + case -NFS4ERR_COMPLETE_ALREADY: + case -NFS4ERR_WRONG_CRED: /* What to do here? 
*/ + break; + case -NFS4ERR_DELAY: + case -EKEYEXPIRED: + rpc_delay(task, NFS4_POLL_RETRY_MAX); + return -EAGAIN; + default: + nfs4_schedule_state_recovery(clp); + } + return 0; +} + static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) { struct nfs4_reclaim_complete_data *calldata = data; @@ -5186,32 +5269,13 @@ static void nfs4_reclaim_complete_done(struct rpc_task *task, void *data) struct nfs4_sequence_res *res = &calldata->res.seq_res; dprintk("--> %s\n", __func__); - nfs41_sequence_done(clp, res, task->tk_status); - switch (task->tk_status) { - case 0: - case -NFS4ERR_COMPLETE_ALREADY: - break; - case -NFS4ERR_BADSESSION: - case -NFS4ERR_DEADSESSION: - /* - * Handle the session error, but do not retry the operation, as - * we have no way of telling whether the clientid had to be - * reset before we got our reply. If reset, a new wave of - * reclaim operations will follow, containing their own reclaim - * complete. We don't want our retry to get on the way of - * recovery by incorrectly indicating to the server that we're - * done reclaiming state since the process had to be restarted. 
- */ - _nfs4_async_handle_error(task, NULL, clp, NULL); - break; - default: - if (_nfs4_async_handle_error( - task, NULL, clp, NULL) == -EAGAIN) { - rpc_restart_call_prepare(task); - return; - } - } + if (!nfs41_sequence_done(task, res)) + return; + if (nfs41_reclaim_complete_handle_errors(task, clp) == -EAGAIN) { + rpc_restart_call_prepare(task); + return; + } dprintk("<-- %s\n", __func__); } @@ -5325,28 +5389,30 @@ struct nfs4_state_maintenance_ops nfs41_state_renewal_ops = { }; #endif -/* - * Per minor version reboot and network partition recovery ops - */ - -struct nfs4_state_recovery_ops *nfs4_reboot_recovery_ops[] = { - &nfs40_reboot_recovery_ops, -#if defined(CONFIG_NFS_V4_1) - &nfs41_reboot_recovery_ops, -#endif +static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = { + .minor_version = 0, + .call_sync = _nfs4_call_sync, + .validate_stateid = nfs4_validate_delegation_stateid, + .reboot_recovery_ops = &nfs40_reboot_recovery_ops, + .nograce_recovery_ops = &nfs40_nograce_recovery_ops, + .state_renewal_ops = &nfs40_state_renewal_ops, }; -struct nfs4_state_recovery_ops *nfs4_nograce_recovery_ops[] = { - &nfs40_nograce_recovery_ops, #if defined(CONFIG_NFS_V4_1) - &nfs41_nograce_recovery_ops, -#endif +static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { + .minor_version = 1, + .call_sync = _nfs4_call_sync_session, + .validate_stateid = nfs41_validate_delegation_stateid, + .reboot_recovery_ops = &nfs41_reboot_recovery_ops, + .nograce_recovery_ops = &nfs41_nograce_recovery_ops, + .state_renewal_ops = &nfs41_state_renewal_ops, }; +#endif -struct nfs4_state_maintenance_ops *nfs4_state_renewal_ops[] = { - &nfs40_state_renewal_ops, +const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { + [0] = &nfs_v4_0_minor_ops, #if defined(CONFIG_NFS_V4_1) - &nfs41_state_renewal_ops, + [1] = &nfs_v4_1_minor_ops, #endif }; diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index d87f10327b7..72b6c580af1 100644 --- a/fs/nfs/nfs4renewd.c +++ 
b/fs/nfs/nfs4renewd.c @@ -54,14 +54,14 @@ void nfs4_renew_state(struct work_struct *work) { - struct nfs4_state_maintenance_ops *ops; + const struct nfs4_state_maintenance_ops *ops; struct nfs_client *clp = container_of(work, struct nfs_client, cl_renewd.work); struct rpc_cred *cred; long lease; unsigned long last, now; - ops = nfs4_state_renewal_ops[clp->cl_minorversion]; + ops = clp->cl_mvops->state_renewal_ops; dprintk("%s: start\n", __func__); /* Are there any active superblocks? */ if (list_empty(&clp->cl_superblocks)) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 34acf5926fd..3e2f19b04c0 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -145,7 +145,9 @@ static void nfs4_end_drain_session(struct nfs_client *clp) struct nfs4_session *ses = clp->cl_session; int max_slots; - if (test_and_clear_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state)) { + if (ses == NULL) + return; + if (test_and_clear_bit(NFS4_SESSION_DRAINING, &ses->session_state)) { spin_lock(&ses->fc_slot_table.slot_tbl_lock); max_slots = ses->fc_slot_table.max_slots; while (max_slots--) { @@ -167,7 +169,7 @@ static int nfs4_begin_drain_session(struct nfs_client *clp) struct nfs4_slot_table *tbl = &ses->fc_slot_table; spin_lock(&tbl->slot_tbl_lock); - set_bit(NFS4CLNT_SESSION_DRAINING, &clp->cl_state); + set_bit(NFS4_SESSION_DRAINING, &ses->session_state); if (tbl->highest_used_slotid != -1) { INIT_COMPLETION(ses->complete); spin_unlock(&tbl->slot_tbl_lock); @@ -371,7 +373,6 @@ nfs4_alloc_state_owner(void) return NULL; spin_lock_init(&sp->so_lock); INIT_LIST_HEAD(&sp->so_states); - INIT_LIST_HEAD(&sp->so_delegations); rpc_init_wait_queue(&sp->so_sequence.wait, "Seqid_waitqueue"); sp->so_seqid.sequence = &sp->so_sequence; spin_lock_init(&sp->so_sequence.lock); @@ -384,7 +385,7 @@ static void nfs4_drop_state_owner(struct nfs4_state_owner *sp) { if (!RB_EMPTY_NODE(&sp->so_client_node)) { - struct nfs_client *clp = sp->so_client; + struct nfs_client *clp = sp->so_server->nfs_client; 
spin_lock(&clp->cl_lock); rb_erase(&sp->so_client_node, &clp->cl_state_owners); @@ -406,7 +407,6 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct new = nfs4_alloc_state_owner(); if (new == NULL) return NULL; - new->so_client = clp; new->so_server = server; new->so_cred = cred; spin_lock(&clp->cl_lock); @@ -423,7 +423,7 @@ struct nfs4_state_owner *nfs4_get_state_owner(struct nfs_server *server, struct void nfs4_put_state_owner(struct nfs4_state_owner *sp) { - struct nfs_client *clp = sp->so_client; + struct nfs_client *clp = sp->so_server->nfs_client; struct rpc_cred *cred = sp->so_cred; if (!atomic_dec_and_lock(&sp->so_count, &clp->cl_lock)) @@ -602,12 +602,21 @@ void nfs4_close_sync(struct path *path, struct nfs4_state *state, fmode_t fmode) * that is compatible with current->files */ static struct nfs4_lock_state * -__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) +__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) { struct nfs4_lock_state *pos; list_for_each_entry(pos, &state->lock_states, ls_locks) { - if (pos->ls_owner != fl_owner) + if (type != NFS4_ANY_LOCK_TYPE && pos->ls_owner.lo_type != type) continue; + switch (pos->ls_owner.lo_type) { + case NFS4_POSIX_LOCK_TYPE: + if (pos->ls_owner.lo_u.posix_owner != fl_owner) + continue; + break; + case NFS4_FLOCK_LOCK_TYPE: + if (pos->ls_owner.lo_u.flock_owner != fl_pid) + continue; + } atomic_inc(&pos->ls_count); return pos; } @@ -619,10 +628,10 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) * exists, return an uninitialized one. 
* */ -static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) +static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid, unsigned int type) { struct nfs4_lock_state *lsp; - struct nfs_client *clp = state->owner->so_client; + struct nfs_client *clp = state->owner->so_server->nfs_client; lsp = kzalloc(sizeof(*lsp), GFP_NOFS); if (lsp == NULL) @@ -633,7 +642,18 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f lsp->ls_seqid.sequence = &lsp->ls_sequence; atomic_set(&lsp->ls_count, 1); lsp->ls_state = state; - lsp->ls_owner = fl_owner; + lsp->ls_owner.lo_type = type; + switch (lsp->ls_owner.lo_type) { + case NFS4_FLOCK_LOCK_TYPE: + lsp->ls_owner.lo_u.flock_owner = fl_pid; + break; + case NFS4_POSIX_LOCK_TYPE: + lsp->ls_owner.lo_u.posix_owner = fl_owner; + break; + default: + kfree(lsp); + return NULL; + } spin_lock(&clp->cl_lock); nfs_alloc_unique_id(&clp->cl_lockowner_id, &lsp->ls_id, 1, 64); spin_unlock(&clp->cl_lock); @@ -643,7 +663,7 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) { - struct nfs_client *clp = lsp->ls_state->owner->so_client; + struct nfs_client *clp = lsp->ls_state->owner->so_server->nfs_client; spin_lock(&clp->cl_lock); nfs_free_unique_id(&clp->cl_lockowner_id, &lsp->ls_id); @@ -657,13 +677,13 @@ static void nfs4_free_lock_state(struct nfs4_lock_state *lsp) * exists, return an uninitialized one. 
* */ -static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner) +static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_owner_t owner, pid_t pid, unsigned int type) { struct nfs4_lock_state *lsp, *new = NULL; for(;;) { spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, owner); + lsp = __nfs4_find_lock_state(state, owner, pid, type); if (lsp != NULL) break; if (new != NULL) { @@ -674,7 +694,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ break; } spin_unlock(&state->state_lock); - new = nfs4_alloc_lock_state(state, owner); + new = nfs4_alloc_lock_state(state, owner, pid, type); if (new == NULL) return NULL; } @@ -701,6 +721,8 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) if (list_empty(&state->lock_states)) clear_bit(LK_STATE_IN_USE, &state->flags); spin_unlock(&state->state_lock); + if (lsp->ls_flags & NFS_LOCK_INITIALIZED) + nfs4_release_lockowner(lsp); nfs4_free_lock_state(lsp); } @@ -728,7 +750,12 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) if (fl->fl_ops != NULL) return 0; - lsp = nfs4_get_lock_state(state, fl->fl_owner); + if (fl->fl_flags & FL_POSIX) + lsp = nfs4_get_lock_state(state, fl->fl_owner, 0, NFS4_POSIX_LOCK_TYPE); + else if (fl->fl_flags & FL_FLOCK) + lsp = nfs4_get_lock_state(state, 0, fl->fl_pid, NFS4_FLOCK_LOCK_TYPE); + else + return -EINVAL; if (lsp == NULL) return -ENOMEM; fl->fl_u.nfs4_fl.owner = lsp; @@ -740,7 +767,7 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) * Byte-range lock aware utility to initialize the stateid of read/write * requests. 
*/ -void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner) +void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t fl_owner, pid_t fl_pid) { struct nfs4_lock_state *lsp; int seq; @@ -753,7 +780,7 @@ void nfs4_copy_stateid(nfs4_stateid *dst, struct nfs4_state *state, fl_owner_t f return; spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, fl_owner); + lsp = __nfs4_find_lock_state(state, fl_owner, fl_pid, NFS4_ANY_LOCK_TYPE); if (lsp != NULL && (lsp->ls_flags & NFS_LOCK_INITIALIZED) != 0) memcpy(dst, &lsp->ls_stateid, sizeof(*dst)); spin_unlock(&state->state_lock); @@ -1041,11 +1068,11 @@ restart: case -NFS4ERR_BAD_STATEID: case -NFS4ERR_RECLAIM_BAD: case -NFS4ERR_RECLAIM_CONFLICT: - nfs4_state_mark_reclaim_nograce(sp->so_client, state); + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); break; case -NFS4ERR_EXPIRED: case -NFS4ERR_NO_GRACE: - nfs4_state_mark_reclaim_nograce(sp->so_client, state); + nfs4_state_mark_reclaim_nograce(sp->so_server->nfs_client, state); case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_BADSESSION: case -NFS4ERR_BADSLOT: @@ -1120,8 +1147,7 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp) if (!test_and_clear_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) return; - nfs4_reclaim_complete(clp, - nfs4_reboot_recovery_ops[clp->cl_minorversion]); + nfs4_reclaim_complete(clp, clp->cl_mvops->reboot_recovery_ops); for (pos = rb_first(&clp->cl_state_owners); pos != NULL; pos = rb_next(pos)) { sp = rb_entry(pos, struct nfs4_state_owner, so_client_node); @@ -1211,8 +1237,8 @@ restart: static int nfs4_check_lease(struct nfs_client *clp) { struct rpc_cred *cred; - struct nfs4_state_maintenance_ops *ops = - nfs4_state_renewal_ops[clp->cl_minorversion]; + const struct nfs4_state_maintenance_ops *ops = + clp->cl_mvops->state_renewal_ops; int status = -NFS4ERR_EXPIRED; /* Is the client already known to have an expired lease? 
*/ @@ -1235,8 +1261,8 @@ out: static int nfs4_reclaim_lease(struct nfs_client *clp) { struct rpc_cred *cred; - struct nfs4_state_recovery_ops *ops = - nfs4_reboot_recovery_ops[clp->cl_minorversion]; + const struct nfs4_state_recovery_ops *ops = + clp->cl_mvops->reboot_recovery_ops; int status = -ENOENT; cred = ops->get_clid_cred(clp); @@ -1444,7 +1470,7 @@ static void nfs4_state_manager(struct nfs_client *clp) /* First recover reboot state... */ if (test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) { status = nfs4_do_reclaim(clp, - nfs4_reboot_recovery_ops[clp->cl_minorversion]); + clp->cl_mvops->reboot_recovery_ops); if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) continue; @@ -1458,7 +1484,7 @@ static void nfs4_state_manager(struct nfs_client *clp) /* Now recover expired state... */ if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) { status = nfs4_do_reclaim(clp, - nfs4_nograce_recovery_ops[clp->cl_minorversion]); + clp->cl_mvops->nograce_recovery_ops); if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 65c8dae4b26..08ef9129113 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -202,14 +202,17 @@ static int nfs4_stat_to_errno(int); #define encode_link_maxsz (op_encode_hdr_maxsz + \ nfs4_name_maxsz) #define decode_link_maxsz (op_decode_hdr_maxsz + decode_change_info_maxsz) +#define encode_lockowner_maxsz (7) #define encode_lock_maxsz (op_encode_hdr_maxsz + \ 7 + \ - 1 + encode_stateid_maxsz + 8) + 1 + encode_stateid_maxsz + 1 + \ + encode_lockowner_maxsz) #define decode_lock_denied_maxsz \ (8 + decode_lockowner_maxsz) #define decode_lock_maxsz (op_decode_hdr_maxsz + \ decode_lock_denied_maxsz) -#define encode_lockt_maxsz (op_encode_hdr_maxsz + 12) +#define encode_lockt_maxsz (op_encode_hdr_maxsz + 5 + \ + 
encode_lockowner_maxsz) #define decode_lockt_maxsz (op_decode_hdr_maxsz + \ decode_lock_denied_maxsz) #define encode_locku_maxsz (op_encode_hdr_maxsz + 3 + \ @@ -217,6 +220,11 @@ static int nfs4_stat_to_errno(int); 4) #define decode_locku_maxsz (op_decode_hdr_maxsz + \ decode_stateid_maxsz) +#define encode_release_lockowner_maxsz \ + (op_encode_hdr_maxsz + \ + encode_lockowner_maxsz) +#define decode_release_lockowner_maxsz \ + (op_decode_hdr_maxsz) #define encode_access_maxsz (op_encode_hdr_maxsz + 1) #define decode_access_maxsz (op_decode_hdr_maxsz + 2) #define encode_symlink_maxsz (op_encode_hdr_maxsz + \ @@ -471,6 +479,12 @@ static int nfs4_stat_to_errno(int); decode_sequence_maxsz + \ decode_putfh_maxsz + \ decode_locku_maxsz) +#define NFS4_enc_release_lockowner_sz \ + (compound_encode_hdr_maxsz + \ + encode_lockowner_maxsz) +#define NFS4_dec_release_lockowner_sz \ + (compound_decode_hdr_maxsz + \ + decode_lockowner_maxsz) #define NFS4_enc_access_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -744,7 +758,7 @@ static void encode_compound_hdr(struct xdr_stream *xdr, struct compound_hdr *hdr) { __be32 *p; - struct rpc_auth *auth = req->rq_task->tk_msg.rpc_cred->cr_auth; + struct rpc_auth *auth = req->rq_cred->cr_auth; /* initialize running count of expected bytes in reply. 
* NOTE: the replied tag SHOULD be the same is the one sent, @@ -1042,6 +1056,17 @@ static inline uint64_t nfs4_lock_length(struct file_lock *fl) return fl->fl_end - fl->fl_start + 1; } +static void encode_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner) +{ + __be32 *p; + + p = reserve_space(xdr, 28); + p = xdr_encode_hyper(p, lowner->clientid); + *p++ = cpu_to_be32(16); + p = xdr_encode_opaque_fixed(p, "lock id:", 8); + xdr_encode_hyper(p, lowner->id); +} + /* * opcode,type,reclaim,offset,length,new_lock_owner = 32 * open_seqid,open_stateid,lock_seqid,lock_owner.clientid, lock_owner.id = 40 @@ -1058,14 +1083,11 @@ static void encode_lock(struct xdr_stream *xdr, const struct nfs_lock_args *args p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); *p = cpu_to_be32(args->new_lock_owner); if (args->new_lock_owner){ - p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+32); + p = reserve_space(xdr, 4+NFS4_STATEID_SIZE+4); *p++ = cpu_to_be32(args->open_seqid->sequence->counter); p = xdr_encode_opaque_fixed(p, args->open_stateid->data, NFS4_STATEID_SIZE); *p++ = cpu_to_be32(args->lock_seqid->sequence->counter); - p = xdr_encode_hyper(p, args->lock_owner.clientid); - *p++ = cpu_to_be32(16); - p = xdr_encode_opaque_fixed(p, "lock id:", 8); - xdr_encode_hyper(p, args->lock_owner.id); + encode_lockowner(xdr, &args->lock_owner); } else { p = reserve_space(xdr, NFS4_STATEID_SIZE+4); @@ -1080,15 +1102,12 @@ static void encode_lockt(struct xdr_stream *xdr, const struct nfs_lockt_args *ar { __be32 *p; - p = reserve_space(xdr, 52); + p = reserve_space(xdr, 24); *p++ = cpu_to_be32(OP_LOCKT); *p++ = cpu_to_be32(nfs4_lock_type(args->fl, 0)); p = xdr_encode_hyper(p, args->fl->fl_start); p = xdr_encode_hyper(p, nfs4_lock_length(args->fl)); - p = xdr_encode_hyper(p, args->lock_owner.clientid); - *p++ = cpu_to_be32(16); - p = xdr_encode_opaque_fixed(p, "lock id:", 8); - xdr_encode_hyper(p, args->lock_owner.id); + encode_lockowner(xdr, &args->lock_owner); hdr->nops++; hdr->replen 
+= decode_lockt_maxsz; } @@ -1108,6 +1127,17 @@ static void encode_locku(struct xdr_stream *xdr, const struct nfs_locku_args *ar hdr->replen += decode_locku_maxsz; } +static void encode_release_lockowner(struct xdr_stream *xdr, const struct nfs_lowner *lowner, struct compound_hdr *hdr) +{ + __be32 *p; + + p = reserve_space(xdr, 4); + *p = cpu_to_be32(OP_RELEASE_LOCKOWNER); + encode_lockowner(xdr, lowner); + hdr->nops++; + hdr->replen += decode_release_lockowner_maxsz; +} + static void encode_lookup(struct xdr_stream *xdr, const struct qstr *name, struct compound_hdr *hdr) { int len = name->len; @@ -1172,7 +1202,7 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op break; default: clp = arg->server->nfs_client; - if (clp->cl_minorversion > 0) { + if (clp->cl_mvops->minor_version > 0) { if (nfs4_has_persistent_session(clp)) { *p = cpu_to_be32(NFS4_CREATE_GUARDED); encode_attrs(xdr, arg->u.attrs, arg->server); @@ -1324,14 +1354,14 @@ static void encode_putrootfh(struct xdr_stream *xdr, struct compound_hdr *hdr) hdr->replen += decode_putrootfh_maxsz; } -static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx) +static void encode_stateid(struct xdr_stream *xdr, const struct nfs_open_context *ctx, const struct nfs_lock_context *l_ctx) { nfs4_stateid stateid; __be32 *p; p = reserve_space(xdr, NFS4_STATEID_SIZE); if (ctx->state != NULL) { - nfs4_copy_stateid(&stateid, ctx->state, ctx->lockowner); + nfs4_copy_stateid(&stateid, ctx->state, l_ctx->lockowner, l_ctx->pid); xdr_encode_opaque_fixed(p, stateid.data, NFS4_STATEID_SIZE); } else xdr_encode_opaque_fixed(p, zero_stateid.data, NFS4_STATEID_SIZE); @@ -1344,7 +1374,7 @@ static void encode_read(struct xdr_stream *xdr, const struct nfs_readargs *args, p = reserve_space(xdr, 4); *p = cpu_to_be32(OP_READ); - encode_stateid(xdr, args->context); + encode_stateid(xdr, args->context, args->lock_context); p = reserve_space(xdr, 12); p = xdr_encode_hyper(p, 
args->offset); @@ -1523,7 +1553,7 @@ static void encode_write(struct xdr_stream *xdr, const struct nfs_writeargs *arg p = reserve_space(xdr, 4); *p = cpu_to_be32(OP_WRITE); - encode_stateid(xdr, args->context); + encode_stateid(xdr, args->context, args->lock_context); p = reserve_space(xdr, 16); p = xdr_encode_hyper(p, args->offset); @@ -1704,7 +1734,7 @@ static u32 nfs4_xdr_minorversion(const struct nfs4_sequence_args *args) { #if defined(CONFIG_NFS_V4_1) if (args->sa_session) - return args->sa_session->clp->cl_minorversion; + return args->sa_session->clp->cl_mvops->minor_version; #endif /* CONFIG_NFS_V4_1 */ return 0; } @@ -2048,6 +2078,20 @@ static int nfs4_xdr_enc_locku(struct rpc_rqst *req, __be32 *p, struct nfs_locku_ return 0; } +static int nfs4_xdr_enc_release_lockowner(struct rpc_rqst *req, __be32 *p, struct nfs_release_lockowner_args *args) +{ + struct xdr_stream xdr; + struct compound_hdr hdr = { + .minorversion = 0, + }; + + xdr_init_encode(&xdr, &req->rq_snd_buf, p); + encode_compound_hdr(&xdr, req, &hdr); + encode_release_lockowner(&xdr, &args->lock_owner, &hdr); + encode_nops(&hdr); + return 0; +} + /* * Encode a READLINK request */ @@ -2395,7 +2439,7 @@ static int nfs4_xdr_enc_exchange_id(struct rpc_rqst *req, uint32_t *p, { struct xdr_stream xdr; struct compound_hdr hdr = { - .minorversion = args->client->cl_minorversion, + .minorversion = args->client->cl_mvops->minor_version, }; xdr_init_encode(&xdr, &req->rq_snd_buf, p); @@ -2413,7 +2457,7 @@ static int nfs4_xdr_enc_create_session(struct rpc_rqst *req, uint32_t *p, { struct xdr_stream xdr; struct compound_hdr hdr = { - .minorversion = args->client->cl_minorversion, + .minorversion = args->client->cl_mvops->minor_version, }; xdr_init_encode(&xdr, &req->rq_snd_buf, p); @@ -2431,7 +2475,7 @@ static int nfs4_xdr_enc_destroy_session(struct rpc_rqst *req, uint32_t *p, { struct xdr_stream xdr; struct compound_hdr hdr = { - .minorversion = session->clp->cl_minorversion, + .minorversion = 
session->clp->cl_mvops->minor_version, }; xdr_init_encode(&xdr, &req->rq_snd_buf, p); @@ -3973,6 +4017,11 @@ static int decode_locku(struct xdr_stream *xdr, struct nfs_locku_res *res) return status; } +static int decode_release_lockowner(struct xdr_stream *xdr) +{ + return decode_op_hdr(xdr, OP_RELEASE_LOCKOWNER); +} + static int decode_lookup(struct xdr_stream *xdr) { return decode_op_hdr(xdr, OP_LOOKUP); @@ -5259,6 +5308,19 @@ out: return status; } +static int nfs4_xdr_dec_release_lockowner(struct rpc_rqst *rqstp, __be32 *p, void *dummy) +{ + struct xdr_stream xdr; + struct compound_hdr hdr; + int status; + + xdr_init_decode(&xdr, &rqstp->rq_rcv_buf, p); + status = decode_compound_hdr(&xdr, &hdr); + if (!status) + status = decode_release_lockowner(&xdr); + return status; +} + /* * Decode READLINK response */ @@ -5866,6 +5928,7 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(GETACL, enc_getacl, dec_getacl), PROC(SETACL, enc_setacl, dec_setacl), PROC(FS_LOCATIONS, enc_fs_locations, dec_fs_locations), + PROC(RELEASE_LOCKOWNER, enc_release_lockowner, dec_release_lockowner), #if defined(CONFIG_NFS_V4_1) PROC(EXCHANGE_ID, enc_exchange_id, dec_exchange_id), PROC(CREATE_SESSION, enc_create_session, dec_create_session), diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index a3654e57b58..919490232e1 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -79,6 +79,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct inode *inode, req->wb_pgbase = offset; req->wb_bytes = count; req->wb_context = get_nfs_open_context(ctx); + req->wb_lock_context = nfs_get_lock_context(ctx); kref_init(&req->wb_kref); return req; } @@ -141,11 +142,16 @@ void nfs_clear_request(struct nfs_page *req) { struct page *page = req->wb_page; struct nfs_open_context *ctx = req->wb_context; + struct nfs_lock_context *l_ctx = req->wb_lock_context; if (page != NULL) { page_cache_release(page); req->wb_page = NULL; } + if (l_ctx != NULL) { + nfs_put_lock_context(l_ctx); + req->wb_lock_context 
= NULL; + } if (ctx != NULL) { put_nfs_open_context(ctx); req->wb_context = NULL; @@ -235,7 +241,7 @@ static int nfs_can_coalesce_requests(struct nfs_page *prev, { if (req->wb_context->cred != prev->wb_context->cred) return 0; - if (req->wb_context->lockowner != prev->wb_context->lockowner) + if (req->wb_lock_context->lockowner != prev->wb_lock_context->lockowner) return 0; if (req->wb_context->state != prev->wb_context->state) return 0; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 6e2b06e6ca7..87adc274424 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -190,6 +190,7 @@ static int nfs_read_rpcsetup(struct nfs_page *req, struct nfs_read_data *data, data->args.pages = data->pagevec; data->args.count = count; data->args.context = get_nfs_open_context(req->wb_context); + data->args.lock_context = req->wb_lock_context; data->res.fattr = &data->fattr; data->res.count = count; @@ -410,7 +411,7 @@ void nfs_read_prepare(struct rpc_task *task, void *calldata) { struct nfs_read_data *data = calldata; - if (nfs4_setup_sequence(NFS_SERVER(data->inode)->nfs_client, + if (nfs4_setup_sequence(NFS_SERVER(data->inode), &data->args.seq_args, &data->res.seq_res, 0, task)) return; diff --git a/fs/nfs/super.c b/fs/nfs/super.c index f9df16de4a5..f1ae39f6cb0 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -546,6 +546,9 @@ static void nfs_show_mountd_options(struct seq_file *m, struct nfs_server *nfss, { struct sockaddr *sap = (struct sockaddr *)&nfss->mountd_address; + if (nfss->flags & NFS_MOUNT_LEGACY_INTERFACE) + return; + switch (sap->sa_family) { case AF_INET: { struct sockaddr_in *sin = (struct sockaddr_in *)sap; @@ -1780,6 +1783,7 @@ static int nfs_validate_mount_data(void *options, * can deal with. 
*/ args->flags = data->flags & NFS_MOUNT_FLAGMASK; + args->flags |= NFS_MOUNT_LEGACY_INTERFACE; args->rsize = data->rsize; args->wsize = data->wsize; args->timeo = data->timeo; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index a2242af6a17..2f84adaad42 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -110,7 +110,7 @@ void nfs_unlink_prepare(struct rpc_task *task, void *calldata) struct nfs_unlinkdata *data = calldata; struct nfs_server *server = NFS_SERVER(data->dir); - if (nfs4_setup_sequence(server->nfs_client, &data->args.seq_args, + if (nfs4_setup_sequence(server, &data->args.seq_args, &data->res.seq_res, 1, task)) return; rpc_call_start(task); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 9f81bdd91c5..874972d9427 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -700,7 +700,9 @@ int nfs_flush_incompatible(struct file *file, struct page *page) req = nfs_page_find_request(page); if (req == NULL) return 0; - do_flush = req->wb_page != page || req->wb_context != ctx; + do_flush = req->wb_page != page || req->wb_context != ctx || + req->wb_lock_context->lockowner != current->files || + req->wb_lock_context->pid != current->tgid; nfs_release_request(req); if (!do_flush) return 0; @@ -824,6 +826,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req, data->args.pages = data->pagevec; data->args.count = count; data->args.context = get_nfs_open_context(req->wb_context); + data->args.lock_context = req->wb_lock_context; data->args.stable = NFS_UNSTABLE; if (how & FLUSH_STABLE) { data->args.stable = NFS_DATA_SYNC; @@ -1047,9 +1050,9 @@ out: void nfs_write_prepare(struct rpc_task *task, void *calldata) { struct nfs_write_data *data = calldata; - struct nfs_client *clp = (NFS_SERVER(data->inode))->nfs_client; - if (nfs4_setup_sequence(clp, &data->args.seq_args, + if (nfs4_setup_sequence(NFS_SERVER(data->inode), + &data->args.seq_args, &data->res.seq_res, 1, task)) return; rpc_call_start(task); diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 
3d68f45a37b..5b7e3021e06 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -168,7 +168,7 @@ nfsd3_proc_read(struct svc_rqst *rqstp, struct nfsd3_readargs *argp, svc_reserve_auth(rqstp, ((1 + NFS3_POST_OP_ATTR_WORDS + 3)<<2) + resp->count +4); fh_copy(&resp->fh, &argp->fh); - nfserr = nfsd_read(rqstp, &resp->fh, NULL, + nfserr = nfsd_read(rqstp, &resp->fh, argp->offset, rqstp->rq_vec, argp->vlen, &resp->count); @@ -271,7 +271,7 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp, struct nfsd3_createargs *argp, fh_init(&resp->fh, NFS3_FHSIZE); nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, &argp->attrs, S_IFDIR, 0, &resp->fh); - + fh_unlock(&resp->dirfh); RETURN_STATUS(nfserr); } @@ -327,7 +327,7 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp, struct nfsd3_mknodargs *argp, type = nfs3_ftypes[argp->ftype]; nfserr = nfsd_create(rqstp, &resp->dirfh, argp->name, argp->len, &argp->attrs, type, rdev, &resp->fh); - + fh_unlock(&resp->dirfh); RETURN_STATUS(nfserr); } @@ -348,6 +348,7 @@ nfsd3_proc_remove(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp, /* Unlink. 
-S_IFDIR means file must not be a directory */ fh_copy(&resp->fh, &argp->fh); nfserr = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, argp->name, argp->len); + fh_unlock(&resp->fh); RETURN_STATUS(nfserr); } @@ -367,6 +368,7 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp, struct nfsd3_diropargs *argp, fh_copy(&resp->fh, &argp->fh); nfserr = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len); + fh_unlock(&resp->fh); RETURN_STATUS(nfserr); } diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index eb78e7e2207..988cbb3a19b 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -143,8 +143,6 @@ struct nfs4_cb_compound_hdr { u32 minorversion; /* res */ int status; - u32 taglen; - char *tag; }; static struct { @@ -205,6 +203,16 @@ nfs_cb_stat_to_errno(int stat) */ static void +encode_stateid(struct xdr_stream *xdr, stateid_t *sid) +{ + __be32 *p; + + RESERVE_SPACE(sizeof(stateid_t)); + WRITE32(sid->si_generation); + WRITEMEM(&sid->si_opaque, sizeof(stateid_opaque_t)); +} + +static void encode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) { __be32 * p; @@ -229,10 +237,10 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_delegation *dp, __be32 *p; int len = dp->dl_fh.fh_size; - RESERVE_SPACE(12+sizeof(dp->dl_stateid) + len); + RESERVE_SPACE(4); WRITE32(OP_CB_RECALL); - WRITE32(dp->dl_stateid.si_generation); - WRITEMEM(&dp->dl_stateid.si_opaque, sizeof(stateid_opaque_t)); + encode_stateid(xdr, &dp->dl_stateid); + RESERVE_SPACE(8 + (XDR_QUADLEN(len) << 2)); WRITE32(0); /* truncate optimization not implemented */ WRITE32(len); WRITEMEM(&dp->dl_fh.fh_base, len); @@ -293,13 +301,14 @@ nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, __be32 *p, static int decode_cb_compound_hdr(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr){ __be32 *p; + u32 taglen; READ_BUF(8); READ32(hdr->status); - READ32(hdr->taglen); - READ_BUF(hdr->taglen + 4); - hdr->tag = (char *)p; - p += XDR_QUADLEN(hdr->taglen); + /* We've got no use for 
the tag; ignore it: */ + READ32(taglen); + READ_BUF(taglen + 4); + p += XDR_QUADLEN(taglen); READ32(hdr->nops); return 0; } @@ -667,28 +676,28 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) } switch (task->tk_status) { - case -EIO: + case 0: + return; + case -EBADHANDLE: + case -NFS4ERR_BAD_STATEID: + /* Race: client probably got cb_recall + * before open reply granting delegation */ + break; + default: /* Network partition? */ atomic_set(&clp->cl_cb_set, 0); warn_no_callback_path(clp, task->tk_status); if (current_rpc_client != task->tk_client) { /* queue a callback on the new connection: */ + atomic_inc(&dp->dl_count); nfsd4_cb_recall(dp); return; } - case -EBADHANDLE: - case -NFS4ERR_BAD_STATEID: - /* Race: client probably got cb_recall - * before open reply granting delegation */ - break; - default: - /* success, or error we can't handle */ - return; } if (dp->dl_retries--) { rpc_delay(task, 2*HZ); task->tk_status = 0; - rpc_restart_call(task); + rpc_restart_call_prepare(task); return; } else { atomic_set(&clp->cl_cb_set, 0); @@ -752,18 +761,16 @@ static void _nfsd4_cb_recall(struct nfs4_delegation *dp) .rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL], .rpc_cred = callback_cred }; - int status; - if (clnt == NULL) + if (clnt == NULL) { + nfs4_put_delegation(dp); return; /* Client is shutting down; give up. 
*/ + } args->args_op = dp; msg.rpc_argp = args; dp->dl_retries = 1; - status = rpc_call_async(clnt, &msg, RPC_TASK_SOFT, - &nfsd4_cb_recall_ops, dp); - if (status) - nfs4_put_delegation(dp); + rpc_call_async(clnt, &msg, RPC_TASK_SOFT, &nfsd4_cb_recall_ops, dp); } void nfsd4_do_callback_rpc(struct work_struct *w) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 4a273475877..2e7357104cf 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -51,7 +51,6 @@ static time_t boot_time; static u32 current_ownerid = 1; static u32 current_fileid = 1; static u32 current_delegid = 1; -static u32 nfs4_init; static stateid_t zerostateid; /* bits all 0 */ static stateid_t onestateid; /* bits all 1 */ static u64 current_sessionid = 1; @@ -163,6 +162,46 @@ static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; static struct list_head file_hashtbl[FILE_HASH_SIZE]; static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; +static void __nfs4_file_get_access(struct nfs4_file *fp, int oflag) +{ + BUG_ON(!(fp->fi_fds[oflag] || fp->fi_fds[O_RDWR])); + atomic_inc(&fp->fi_access[oflag]); +} + +static void nfs4_file_get_access(struct nfs4_file *fp, int oflag) +{ + if (oflag == O_RDWR) { + __nfs4_file_get_access(fp, O_RDONLY); + __nfs4_file_get_access(fp, O_WRONLY); + } else + __nfs4_file_get_access(fp, oflag); +} + +static void nfs4_file_put_fd(struct nfs4_file *fp, int oflag) +{ + if (fp->fi_fds[oflag]) { + fput(fp->fi_fds[oflag]); + fp->fi_fds[oflag] = NULL; + } +} + +static void __nfs4_file_put_access(struct nfs4_file *fp, int oflag) +{ + if (atomic_dec_and_test(&fp->fi_access[oflag])) { + nfs4_file_put_fd(fp, O_RDWR); + nfs4_file_put_fd(fp, oflag); + } +} + +static void nfs4_file_put_access(struct nfs4_file *fp, int oflag) +{ + if (oflag == O_RDWR) { + __nfs4_file_put_access(fp, O_RDONLY); + __nfs4_file_put_access(fp, O_WRONLY); + } else + __nfs4_file_put_access(fp, oflag); +} + static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct 
nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) { @@ -171,6 +210,13 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f struct nfs4_cb_conn *cb = &stp->st_stateowner->so_client->cl_cb_conn; dprintk("NFSD alloc_init_deleg\n"); + /* + * Major work on the lease subsystem (for example, to support + * callbacks on stat) will be required before we can support + * write delegations properly. + */ + if (type != NFS4_OPEN_DELEGATE_READ) + return NULL; if (fp->fi_had_conflict) return NULL; if (num_delegations > max_delegations) @@ -185,9 +231,8 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f dp->dl_client = clp; get_nfs4_file(fp); dp->dl_file = fp; + nfs4_file_get_access(fp, O_RDONLY); dp->dl_flock = NULL; - get_file(stp->st_vfs_file); - dp->dl_vfs_file = stp->st_vfs_file; dp->dl_type = type; dp->dl_ident = cb->cb_ident; dp->dl_stateid.si_boot = boot_time; @@ -222,15 +267,12 @@ nfs4_put_delegation(struct nfs4_delegation *dp) static void nfs4_close_delegation(struct nfs4_delegation *dp) { - struct file *filp = dp->dl_vfs_file; + struct file *filp = find_readable_file(dp->dl_file); dprintk("NFSD: close_delegation dp %p\n",dp); - dp->dl_vfs_file = NULL; - /* The following nfsd_close may not actually close the file, - * but we want to remove the lease in any case. */ if (dp->dl_flock) vfs_setlease(filp, F_UNLCK, &dp->dl_flock); - nfsd_close(filp); + nfs4_file_put_access(dp->dl_file, O_RDONLY); } /* Called under the state lock. 
*/ @@ -302,8 +344,12 @@ static void free_generic_stateid(struct nfs4_stateid *stp) static void release_lock_stateid(struct nfs4_stateid *stp) { + struct file *file; + unhash_generic_stateid(stp); - locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner); + file = find_any_file(stp->st_file); + if (file) + locks_remove_posix(file, (fl_owner_t)stp->st_stateowner); free_generic_stateid(stp); } @@ -341,11 +387,85 @@ release_stateid_lockowners(struct nfs4_stateid *open_stp) } } +/* + * We store the NONE, READ, WRITE, and BOTH bits separately in the + * st_{access,deny}_bmap field of the stateid, in order to track not + * only what share bits are currently in force, but also what + * combinations of share bits previous opens have used. This allows us + * to enforce the recommendation of rfc 3530 14.2.19 that the server + * return an error if the client attempt to downgrade to a combination + * of share bits not explicable by closing some of its previous opens. + * + * XXX: This enforcement is actually incomplete, since we don't keep + * track of access/deny bit combinations; so, e.g., we allow: + * + * OPEN allow read, deny write + * OPEN allow both, deny none + * DOWNGRADE allow read, deny none + * + * which we should reject. 
+ */ +static void +set_access(unsigned int *access, unsigned long bmap) { + int i; + + *access = 0; + for (i = 1; i < 4; i++) { + if (test_bit(i, &bmap)) + *access |= i; + } +} + +static void +set_deny(unsigned int *deny, unsigned long bmap) { + int i; + + *deny = 0; + for (i = 0; i < 4; i++) { + if (test_bit(i, &bmap)) + *deny |= i ; + } +} + +static int +test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) { + unsigned int access, deny; + + set_access(&access, stp->st_access_bmap); + set_deny(&deny, stp->st_deny_bmap); + if ((access & open->op_share_deny) || (deny & open->op_share_access)) + return 0; + return 1; +} + +static int nfs4_access_to_omode(u32 access) +{ + switch (access) { + case NFS4_SHARE_ACCESS_READ: + return O_RDONLY; + case NFS4_SHARE_ACCESS_WRITE: + return O_WRONLY; + case NFS4_SHARE_ACCESS_BOTH: + return O_RDWR; + } + BUG(); +} + +static int nfs4_access_bmap_to_omode(struct nfs4_stateid *stp) +{ + unsigned int access; + + set_access(&access, stp->st_access_bmap); + return nfs4_access_to_omode(access); +} + static void release_open_stateid(struct nfs4_stateid *stp) { + int oflag = nfs4_access_bmap_to_omode(stp); + unhash_generic_stateid(stp); release_stateid_lockowners(stp); - nfsd_close(stp->st_vfs_file); + nfs4_file_put_access(stp->st_file, oflag); free_generic_stateid(stp); } @@ -457,7 +577,7 @@ static int set_forechannel_drc_size(struct nfsd4_channel_attrs *fchan) spin_unlock(&nfsd_drc_lock); if (fchan->maxreqs == 0) - return nfserr_serverfault; + return nfserr_jukebox; fchan->maxresp_cached = size + NFSD_MIN_HDR_SEQ_SZ; return 0; @@ -542,7 +662,7 @@ alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp, BUILD_BUG_ON(NFSD_MAX_SLOTS_PER_SESSION * sizeof(struct nfsd4_slot) + sizeof(struct nfsd4_session) > PAGE_SIZE); - status = nfserr_serverfault; + status = nfserr_jukebox; /* allocate struct nfsd4_session and slot table pointers in one piece */ slotsize = tmp.se_fchannel.maxreqs * sizeof(struct nfsd4_slot *); new = 
kzalloc(sizeof(*new) + slotsize, GFP_KERNEL); @@ -591,10 +711,8 @@ find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid) dump_sessionid(__func__, sessionid); idx = hash_sessionid(sessionid); - dprintk("%s: idx is %d\n", __func__, idx); /* Search in the appropriate list */ list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) { - dump_sessionid("list traversal", &elem->se_sessionid); if (!memcmp(elem->se_sessionid.data, sessionid->data, NFS4_MAX_SESSIONID_LEN)) { return elem; @@ -714,7 +832,6 @@ release_session_client(struct nfsd4_session *session) } else renew_client_locked(clp); spin_unlock(&client_lock); - nfsd4_put_session(session); } /* must be called under the client_lock */ @@ -1220,7 +1337,7 @@ out_new: /* Normal case */ new = create_client(exid->clname, dname, rqstp, &verf); if (new == NULL) { - status = nfserr_serverfault; + status = nfserr_jukebox; goto out; } @@ -1760,6 +1877,8 @@ alloc_init_file(struct inode *ino) fp->fi_inode = igrab(ino); fp->fi_id = current_fileid++; fp->fi_had_conflict = false; + memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); + memset(fp->fi_access, 0, sizeof(fp->fi_access)); spin_lock(&recall_lock); list_add(&fp->fi_hash, &file_hashtbl[hashval]); spin_unlock(&recall_lock); @@ -1971,57 +2090,6 @@ static inline int deny_valid(u32 x) } /* - * We store the NONE, READ, WRITE, and BOTH bits separately in the - * st_{access,deny}_bmap field of the stateid, in order to track not - * only what share bits are currently in force, but also what - * combinations of share bits previous opens have used. This allows us - * to enforce the recommendation of rfc 3530 14.2.19 that the server - * return an error if the client attempt to downgrade to a combination - * of share bits not explicable by closing some of its previous opens. 
- * - * XXX: This enforcement is actually incomplete, since we don't keep - * track of access/deny bit combinations; so, e.g., we allow: - * - * OPEN allow read, deny write - * OPEN allow both, deny none - * DOWNGRADE allow read, deny none - * - * which we should reject. - */ -static void -set_access(unsigned int *access, unsigned long bmap) { - int i; - - *access = 0; - for (i = 1; i < 4; i++) { - if (test_bit(i, &bmap)) - *access |= i; - } -} - -static void -set_deny(unsigned int *deny, unsigned long bmap) { - int i; - - *deny = 0; - for (i = 0; i < 4; i++) { - if (test_bit(i, &bmap)) - *deny |= i ; - } -} - -static int -test_share(struct nfs4_stateid *stp, struct nfsd4_open *open) { - unsigned int access, deny; - - set_access(&access, stp->st_access_bmap); - set_deny(&deny, stp->st_deny_bmap); - if ((access & open->op_share_deny) || (deny & open->op_share_access)) - return 0; - return 1; -} - -/* * Called to check deny when READ with all zero stateid or * WRITE with all zero or all one stateid */ @@ -2052,14 +2120,12 @@ out: } static inline void -nfs4_file_downgrade(struct file *filp, unsigned int share_access) +nfs4_file_downgrade(struct nfs4_file *fp, unsigned int share_access) { - if (share_access & NFS4_SHARE_ACCESS_WRITE) { - drop_file_write_access(filp); - spin_lock(&filp->f_lock); - filp->f_mode = (filp->f_mode | FMODE_READ) & ~FMODE_WRITE; - spin_unlock(&filp->f_lock); - } + if (share_access & NFS4_SHARE_ACCESS_WRITE) + nfs4_file_put_access(fp, O_WRONLY); + if (share_access & NFS4_SHARE_ACCESS_READ) + nfs4_file_put_access(fp, O_RDONLY); } /* @@ -2255,6 +2321,13 @@ find_delegation_file(struct nfs4_file *fp, stateid_t *stid) return NULL; } +int share_access_to_flags(u32 share_access) +{ + share_access &= ~NFS4_SHARE_WANT_MASK; + + return share_access == NFS4_SHARE_ACCESS_READ ? 
RD_STATE : WR_STATE; +} + static __be32 nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, struct nfs4_delegation **dp) @@ -2265,8 +2338,7 @@ nfs4_check_deleg(struct nfs4_file *fp, struct nfsd4_open *open, *dp = find_delegation_file(fp, &open->op_delegate_stateid); if (*dp == NULL) goto out; - flags = open->op_share_access == NFS4_SHARE_ACCESS_READ ? - RD_STATE : WR_STATE; + flags = share_access_to_flags(open->op_share_access); status = nfs4_check_delegmode(*dp, flags); if (status) *dp = NULL; @@ -2308,30 +2380,53 @@ nfs4_alloc_stateid(void) return kmem_cache_alloc(stateid_slab, GFP_KERNEL); } +static inline int nfs4_access_to_access(u32 nfs4_access) +{ + int flags = 0; + + if (nfs4_access & NFS4_SHARE_ACCESS_READ) + flags |= NFSD_MAY_READ; + if (nfs4_access & NFS4_SHARE_ACCESS_WRITE) + flags |= NFSD_MAY_WRITE; + return flags; +} + +static __be32 nfs4_get_vfs_file(struct svc_rqst *rqstp, struct nfs4_file +*fp, struct svc_fh *cur_fh, u32 nfs4_access) +{ + __be32 status; + int oflag = nfs4_access_to_omode(nfs4_access); + int access = nfs4_access_to_access(nfs4_access); + + if (!fp->fi_fds[oflag]) { + status = nfsd_open(rqstp, cur_fh, S_IFREG, access, + &fp->fi_fds[oflag]); + if (status == nfserr_dropit) + status = nfserr_jukebox; + if (status) + return status; + } + nfs4_file_get_access(fp, oflag); + + return nfs_ok; +} + static __be32 nfs4_new_open(struct svc_rqst *rqstp, struct nfs4_stateid **stpp, - struct nfs4_delegation *dp, - struct svc_fh *cur_fh, int flags) + struct nfs4_file *fp, struct svc_fh *cur_fh, + struct nfsd4_open *open) { struct nfs4_stateid *stp; + __be32 status; stp = nfs4_alloc_stateid(); if (stp == NULL) return nfserr_resource; - if (dp) { - get_file(dp->dl_vfs_file); - stp->st_vfs_file = dp->dl_vfs_file; - } else { - __be32 status; - status = nfsd_open(rqstp, cur_fh, S_IFREG, flags, - &stp->st_vfs_file); - if (status) { - if (status == nfserr_dropit) - status = nfserr_jukebox; - kmem_cache_free(stateid_slab, stp); - return status; 
- } + status = nfs4_get_vfs_file(rqstp, fp, cur_fh, open->op_share_access); + if (status) { + kmem_cache_free(stateid_slab, stp); + return status; } *stpp = stp; return 0; @@ -2353,35 +2448,30 @@ nfsd4_truncate(struct svc_rqst *rqstp, struct svc_fh *fh, } static __be32 -nfs4_upgrade_open(struct svc_rqst *rqstp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) +nfs4_upgrade_open(struct svc_rqst *rqstp, struct nfs4_file *fp, struct svc_fh *cur_fh, struct nfs4_stateid *stp, struct nfsd4_open *open) { - struct file *filp = stp->st_vfs_file; - struct inode *inode = filp->f_path.dentry->d_inode; - unsigned int share_access, new_writer; + u32 op_share_access, new_access; __be32 status; - set_access(&share_access, stp->st_access_bmap); - new_writer = (~share_access) & open->op_share_access - & NFS4_SHARE_ACCESS_WRITE; - - if (new_writer) { - int err = get_write_access(inode); - if (err) - return nfserrno(err); - err = mnt_want_write(cur_fh->fh_export->ex_path.mnt); - if (err) - return nfserrno(err); - file_take_write(filp); + set_access(&new_access, stp->st_access_bmap); + new_access = (~new_access) & open->op_share_access & ~NFS4_SHARE_WANT_MASK; + + if (new_access) { + status = nfs4_get_vfs_file(rqstp, fp, cur_fh, new_access); + if (status) + return status; } status = nfsd4_truncate(rqstp, cur_fh, open); if (status) { - if (new_writer) - put_write_access(inode); + if (new_access) { + int oflag = nfs4_access_to_omode(new_access); + nfs4_file_put_access(fp, oflag); + } return status; } /* remember the open */ - filp->f_mode |= open->op_share_access; - __set_bit(open->op_share_access, &stp->st_access_bmap); + op_share_access = open->op_share_access & ~NFS4_SHARE_WANT_MASK; + __set_bit(op_share_access, &stp->st_access_bmap); __set_bit(open->op_share_deny, &stp->st_deny_bmap); return nfs_ok; @@ -2444,13 +2534,14 @@ nfs4_open_delegation(struct svc_fh *fh, struct nfsd4_open *open, struct nfs4_sta fl.fl_type = flag == NFS4_OPEN_DELEGATE_READ? 
F_RDLCK: F_WRLCK; fl.fl_end = OFFSET_MAX; fl.fl_owner = (fl_owner_t)dp; - fl.fl_file = stp->st_vfs_file; + fl.fl_file = find_readable_file(stp->st_file); + BUG_ON(!fl.fl_file); fl.fl_pid = current->tgid; /* vfs_setlease checks to see if delegation should be handed out. * the lock_manager callbacks fl_mylease and fl_change are used */ - if ((status = vfs_setlease(stp->st_vfs_file, fl.fl_type, &flp))) { + if ((status = vfs_setlease(fl.fl_file, fl.fl_type, &flp))) { dprintk("NFSD: setlease failed [%d], no delegation\n", status); unhash_delegation(dp); flag = NFS4_OPEN_DELEGATE_NONE; @@ -2514,18 +2605,12 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf */ if (stp) { /* Stateid was found, this is an OPEN upgrade */ - status = nfs4_upgrade_open(rqstp, current_fh, stp, open); + status = nfs4_upgrade_open(rqstp, fp, current_fh, stp, open); if (status) goto out; update_stateid(&stp->st_stateid); } else { - /* Stateid was not found, this is a new OPEN */ - int flags = 0; - if (open->op_share_access & NFS4_SHARE_ACCESS_READ) - flags |= NFSD_MAY_READ; - if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) - flags |= NFSD_MAY_WRITE; - status = nfs4_new_open(rqstp, &stp, dp, current_fh, flags); + status = nfs4_new_open(rqstp, &stp, fp, current_fh, open); if (status) goto out; init_stateid(stp, fp, open); @@ -2727,7 +2812,7 @@ search_close_lru(u32 st_id, int flags) static inline int nfs4_check_fh(struct svc_fh *fhp, struct nfs4_stateid *stp) { - return fhp->fh_dentry->d_inode != stp->st_vfs_file->f_path.dentry->d_inode; + return fhp->fh_dentry->d_inode != stp->st_file->fi_inode; } static int @@ -2760,6 +2845,9 @@ __be32 nfs4_check_openmode(struct nfs4_stateid *stp, int flags) { __be32 status = nfserr_openmode; + /* For lock stateid's, we test the parent open, not the lock: */ + if (stp->st_openstp) + stp = stp->st_openstp; if ((flags & WR_STATE) && (!access_permit_write(stp->st_access_bmap))) goto out; if ((flags & RD_STATE) && 
(!access_permit_read(stp->st_access_bmap))) @@ -2872,7 +2960,8 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, goto out; renew_client(dp->dl_client); if (filpp) - *filpp = dp->dl_vfs_file; + *filpp = find_readable_file(dp->dl_file); + BUG_ON(!*filpp); } else { /* open or lock stateid */ stp = find_stateid(stateid, flags); if (!stp) @@ -2889,8 +2978,13 @@ nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate, if (status) goto out; renew_client(stp->st_stateowner->so_client); - if (filpp) - *filpp = stp->st_vfs_file; + if (filpp) { + if (flags & RD_STATE) + *filpp = find_readable_file(stp->st_file); + else + *filpp = find_writeable_file(stp->st_file); + BUG_ON(!*filpp); /* assured by check_openmode */ + } } status = nfs_ok; out: @@ -3126,8 +3220,7 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp, goto out; } set_access(&share_access, stp->st_access_bmap); - nfs4_file_downgrade(stp->st_vfs_file, - share_access & ~od->od_share_access); + nfs4_file_downgrade(stp->st_file, share_access & ~od->od_share_access); reset_union_bmap_access(od->od_share_access, &stp->st_access_bmap); reset_union_bmap_deny(od->od_share_deny, &stp->st_deny_bmap); @@ -3346,11 +3439,9 @@ static inline void nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) { struct nfs4_stateowner *sop; - unsigned int hval; if (fl->fl_lmops == &nfsd_posix_mng_ops) { sop = (struct nfs4_stateowner *) fl->fl_owner; - hval = lockownerid_hashval(sop->so_id); kref_get(&sop->so_ref); deny->ld_sop = sop; deny->ld_clientid = sop->so_client->cl_clientid; @@ -3446,8 +3537,6 @@ alloc_init_lock_stateid(struct nfs4_stateowner *sop, struct nfs4_file *fp, struc stp->st_stateid.si_stateownerid = sop->so_id; stp->st_stateid.si_fileid = fp->fi_id; stp->st_stateid.si_generation = 0; - stp->st_vfs_file = open_stp->st_vfs_file; /* FIXME refcount?? 
*/ - stp->st_access_bmap = open_stp->st_access_bmap; stp->st_deny_bmap = open_stp->st_deny_bmap; stp->st_openstp = open_stp; @@ -3547,7 +3636,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, lock_sop = lock->lk_replay_owner; } /* lock->lk_replay_owner and lock_stp have been created or found */ - filp = lock_stp->st_vfs_file; status = nfserr_grace; if (locks_in_grace() && !lock->lk_reclaim) @@ -3560,11 +3648,13 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, switch (lock->lk_type) { case NFS4_READ_LT: case NFS4_READW_LT: + filp = find_readable_file(lock_stp->st_file); file_lock.fl_type = F_RDLCK; cmd = F_SETLK; break; case NFS4_WRITE_LT: case NFS4_WRITEW_LT: + filp = find_writeable_file(lock_stp->st_file); file_lock.fl_type = F_WRLCK; cmd = F_SETLK; break; @@ -3572,6 +3662,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_inval; goto out; } + if (!filp) { + status = nfserr_openmode; + goto out; + } file_lock.fl_owner = (fl_owner_t)lock_sop; file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; @@ -3740,7 +3834,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &locku->lu_stateowner, &stp, NULL))) goto out; - filp = stp->st_vfs_file; + filp = find_any_file(stp->st_file); + if (!filp) { + status = nfserr_lock_range; + goto out; + } BUG_ON(!filp); locks_init_lock(&file_lock); file_lock.fl_type = F_UNLCK; @@ -3787,10 +3885,10 @@ out_nfserr: * 0: no locks held by lockowner */ static int -check_for_locks(struct file *filp, struct nfs4_stateowner *lowner) +check_for_locks(struct nfs4_file *filp, struct nfs4_stateowner *lowner) { struct file_lock **flpp; - struct inode *inode = filp->f_path.dentry->d_inode; + struct inode *inode = filp->fi_inode; int status = 0; lock_kernel(); @@ -3841,7 +3939,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, continue; list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) { - if 
(check_for_locks(stp->st_vfs_file, sop)) + if (check_for_locks(stp->st_file, sop)) goto out; /* Note: so_perclient unused for lockowners, * so it's OK to fool with here. */ @@ -4066,16 +4164,8 @@ out_free_laundry: int nfs4_state_start(void) { - int ret; - - if (nfs4_init) - return 0; nfsd4_load_reboot_recovery_data(); - ret = __nfs4_state_start(); - if (ret) - return ret; - nfs4_init = 1; - return 0; + return __nfs4_state_start(); } static void @@ -4110,7 +4200,6 @@ __nfs4_state_shutdown(void) } nfsd4_shutdown_recdir(); - nfs4_init = 0; } void diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index ac17a708023..f8931acb05f 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2630,7 +2630,7 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, __be32 nfserr, } read->rd_vlen = v; - nfserr = nfsd_read(read->rd_rqstp, read->rd_fhp, read->rd_filp, + nfserr = nfsd_read_file(read->rd_rqstp, read->rd_fhp, read->rd_filp, read->rd_offset, resp->rqstp->rq_vec, read->rd_vlen, &maxcount); @@ -3325,6 +3325,7 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo } /* Renew the clientid on success and on replay */ release_session_client(cs->session); + nfsd4_put_session(cs->session); } return 1; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 508941c23af..b53b1d042f1 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -949,15 +949,12 @@ static ssize_t __write_ports_addfd(char *buf) if (err != 0) return err; - err = lockd_up(); - if (err != 0) - goto out; - err = svc_addsock(nfsd_serv, fd, buf, SIMPLE_TRANSACTION_LIMIT); - if (err < 0) - lockd_down(); + if (err < 0) { + svc_destroy(nfsd_serv); + return err; + } -out: /* Decrease the count, but don't shut down the service */ nfsd_serv->sv_nrthreads--; return err; @@ -978,9 +975,6 @@ static ssize_t __write_ports_delfd(char *buf) if (nfsd_serv != NULL) len = svc_sock_names(nfsd_serv, buf, SIMPLE_TRANSACTION_LIMIT, toclose); - if (len >= 0) - lockd_down(); - kfree(toclose); return len; } 
@@ -1014,6 +1008,9 @@ static ssize_t __write_ports_addxprt(char *buf) PF_INET6, port, SVC_SOCK_ANONYMOUS); if (err < 0 && err != -EAFNOSUPPORT) goto out_close; + + /* Decrease the count, but don't shut down the service */ + nfsd_serv->sv_nrthreads--; return 0; out_close: xprt = svc_find_xprt(nfsd_serv, transport, PF_INET, port); @@ -1022,8 +1019,7 @@ out_close: svc_xprt_put(xprt); } out_err: - /* Decrease the count, but don't shut down the service */ - nfsd_serv->sv_nrthreads--; + svc_destroy(nfsd_serv); return err; } @@ -1194,7 +1190,7 @@ static ssize_t write_maxblksize(struct file *file, char *buf, size_t size) bsize = NFSSVC_MAXBLKSIZE; bsize &= ~(1024-1); mutex_lock(&nfsd_mutex); - if (nfsd_serv && nfsd_serv->sv_nrthreads) { + if (nfsd_serv) { mutex_unlock(&nfsd_mutex); return -EBUSY; } @@ -1310,6 +1306,8 @@ static ssize_t __write_recoverydir(struct file *file, char *buf, size_t size) return -EINVAL; status = nfs4_reset_recoverydir(recdir); + if (status) + return status; } return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%s\n", diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 72377761270..b76ac3a82e3 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -153,6 +153,7 @@ void nfsd_lockd_shutdown(void); #define nfserr_bad_seqid cpu_to_be32(NFSERR_BAD_SEQID) #define nfserr_symlink cpu_to_be32(NFSERR_SYMLINK) #define nfserr_not_same cpu_to_be32(NFSERR_NOT_SAME) +#define nfserr_lock_range cpu_to_be32(NFSERR_LOCK_RANGE) #define nfserr_restorefh cpu_to_be32(NFSERR_RESTOREFH) #define nfserr_attrnotsupp cpu_to_be32(NFSERR_ATTRNOTSUPP) #define nfserr_bad_xdr cpu_to_be32(NFSERR_BAD_XDR) diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index a047ad6111e..08e17264784 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -144,7 +144,7 @@ nfsd_proc_read(struct svc_rqst *rqstp, struct nfsd_readargs *argp, svc_reserve_auth(rqstp, (19<<2) + argp->count + 4); resp->count = argp->count; - nfserr = nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, + nfserr = 
nfsd_read(rqstp, fh_copy(&resp->fh, &argp->fh), argp->offset, rqstp->rq_vec, argp->vlen, &resp->count); @@ -290,7 +290,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, * gospel of sun micro */ if (type != S_IFREG) { - int is_borc = 0; if (type != S_IFBLK && type != S_IFCHR) { rdev = 0; } else if (type == S_IFCHR && !(attr->ia_valid & ATTR_SIZE)) { @@ -298,7 +297,6 @@ nfsd_proc_create(struct svc_rqst *rqstp, struct nfsd_createargs *argp, type = S_IFIFO; } else { /* Okay, char or block special */ - is_borc = 1; if (!rdev) rdev = wanted; } diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 06b2a26edfe..e2c43464f23 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -180,15 +180,80 @@ int nfsd_nrthreads(void) return rv; } +static int nfsd_init_socks(int port) +{ + int error; + if (!list_empty(&nfsd_serv->sv_permsocks)) + return 0; + + error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port, + SVC_SOCK_DEFAULTS); + if (error < 0) + return error; + + error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port, + SVC_SOCK_DEFAULTS); + if (error < 0) + return error; + + return 0; +} + +static bool nfsd_up = false; + +static int nfsd_startup(unsigned short port, int nrservs) +{ + int ret; + + if (nfsd_up) + return 0; + /* + * Readahead param cache - will no-op if it already exists. + * (Note therefore results will be suboptimal if number of + * threads is modified after nfsd start.) 
+ */ + ret = nfsd_racache_init(2*nrservs); + if (ret) + return ret; + ret = nfsd_init_socks(port); + if (ret) + goto out_racache; + ret = lockd_up(); + if (ret) + goto out_racache; + ret = nfs4_state_start(); + if (ret) + goto out_lockd; + nfsd_up = true; + return 0; +out_lockd: + lockd_down(); +out_racache: + nfsd_racache_shutdown(); + return ret; +} + +static void nfsd_shutdown(void) +{ + /* + * write_ports can create the server without actually starting + * any threads--if we get shut down before any threads are + * started, then nfsd_last_thread will be run before any of this + * other initialization has been done. + */ + if (!nfsd_up) + return; + nfs4_state_shutdown(); + lockd_down(); + nfsd_racache_shutdown(); + nfsd_up = false; +} + static void nfsd_last_thread(struct svc_serv *serv) { /* When last nfsd thread exits we need to do some clean-up */ - struct svc_xprt *xprt; - list_for_each_entry(xprt, &serv->sv_permsocks, xpt_list) - lockd_down(); nfsd_serv = NULL; - nfsd_racache_shutdown(); - nfs4_state_shutdown(); + nfsd_shutdown(); printk(KERN_WARNING "nfsd: last server has exited, flushing export " "cache\n"); @@ -263,45 +328,18 @@ int nfsd_create_serv(void) nfsd_max_blksize >= 8*1024*2) nfsd_max_blksize /= 2; } + nfsd_reset_versions(); nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, nfsd_last_thread, nfsd, THIS_MODULE); if (nfsd_serv == NULL) - err = -ENOMEM; - else - set_max_drc(); + return -ENOMEM; + set_max_drc(); do_gettimeofday(&nfssvc_boot); /* record boot time */ return err; } -static int nfsd_init_socks(int port) -{ - int error; - if (!list_empty(&nfsd_serv->sv_permsocks)) - return 0; - - error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port, - SVC_SOCK_DEFAULTS); - if (error < 0) - return error; - - error = lockd_up(); - if (error < 0) - return error; - - error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port, - SVC_SOCK_DEFAULTS); - if (error < 0) - return error; - - error = lockd_up(); - if (error < 0) - return error; - - 
return 0; -} - int nfsd_nrpools(void) { if (nfsd_serv == NULL) @@ -376,10 +414,16 @@ int nfsd_set_nrthreads(int n, int *nthreads) return err; } +/* + * Adjust the number of threads and return the new number of threads. + * This is also the function that starts the server if necessary, if + * this is the first time nrservs is nonzero. + */ int nfsd_svc(unsigned short port, int nrservs) { int error; + bool nfsd_up_before; mutex_lock(&nfsd_mutex); dprintk("nfsd: creating service\n"); @@ -391,34 +435,29 @@ nfsd_svc(unsigned short port, int nrservs) if (nrservs == 0 && nfsd_serv == NULL) goto out; - /* Readahead param cache - will no-op if it already exists */ - error = nfsd_racache_init(2*nrservs); - if (error<0) - goto out; - error = nfs4_state_start(); + error = nfsd_create_serv(); if (error) goto out; - nfsd_reset_versions(); - - error = nfsd_create_serv(); + nfsd_up_before = nfsd_up; + error = nfsd_startup(port, nrservs); if (error) - goto out; - error = nfsd_init_socks(port); - if (error) - goto failure; - + goto out_destroy; error = svc_set_num_threads(nfsd_serv, NULL, nrservs); - if (error == 0) - /* We are holding a reference to nfsd_serv which - * we don't want to count in the return value, - * so subtract 1 - */ - error = nfsd_serv->sv_nrthreads - 1; - failure: + if (error) + goto out_shutdown; + /* We are holding a reference to nfsd_serv which + * we don't want to count in the return value, + * so subtract 1 + */ + error = nfsd_serv->sv_nrthreads - 1; +out_shutdown: + if (error < 0 && !nfsd_up_before) + nfsd_shutdown(); +out_destroy: svc_destroy(nfsd_serv); /* Release server */ - out: +out: mutex_unlock(&nfsd_mutex); return error; } diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 006c84230c7..7731a75971d 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -88,7 +88,6 @@ struct nfs4_delegation { struct nfs4_client *dl_client; struct nfs4_file *dl_file; struct file_lock *dl_flock; - struct file *dl_vfs_file; u32 dl_type; time_t dl_time; /* For recall: 
*/ @@ -342,12 +341,50 @@ struct nfs4_file { struct list_head fi_hash; /* hash by "struct inode *" */ struct list_head fi_stateids; struct list_head fi_delegations; + /* One each for O_RDONLY, O_WRONLY, O_RDWR: */ + struct file * fi_fds[3]; + /* One each for O_RDONLY, O_WRONLY: */ + atomic_t fi_access[2]; + /* + * Each open stateid contributes 1 to either fi_readers or + * fi_writers, or both, depending on the open mode. A + * delegation also takes an fi_readers reference. Lock + * stateid's take none. + */ + atomic_t fi_readers; + atomic_t fi_writers; struct inode *fi_inode; u32 fi_id; /* used with stateowner->so_id * for stateid_hashtbl hash */ bool fi_had_conflict; }; +/* XXX: for first cut may fall back on returning file that doesn't work + * at all? */ +static inline struct file *find_writeable_file(struct nfs4_file *f) +{ + if (f->fi_fds[O_RDWR]) + return f->fi_fds[O_RDWR]; + return f->fi_fds[O_WRONLY]; +} + +static inline struct file *find_readable_file(struct nfs4_file *f) +{ + if (f->fi_fds[O_RDWR]) + return f->fi_fds[O_RDWR]; + return f->fi_fds[O_RDONLY]; +} + +static inline struct file *find_any_file(struct nfs4_file *f) +{ + if (f->fi_fds[O_RDWR]) + return f->fi_fds[O_RDWR]; + else if (f->fi_fds[O_WRONLY]) + return f->fi_fds[O_WRONLY]; + else + return f->fi_fds[O_RDONLY]; +} + /* * nfs4_stateid can either be an open stateid or (eventually) a lock stateid * @@ -373,7 +410,6 @@ struct nfs4_stateid { struct nfs4_stateowner * st_stateowner; struct nfs4_file * st_file; stateid_t st_stateid; - struct file * st_vfs_file; unsigned long st_access_bmap; unsigned long st_deny_bmap; struct nfs4_stateid * st_openstp; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 3c111120b61..9df85a13af2 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -604,7 +604,7 @@ nfsd4_get_nfs4_acl(struct svc_rqst *rqstp, struct dentry *dentry, struct nfs4_ac return error; } -#endif /* defined(CONFIG_NFS_V4) */ +#endif /* defined(CONFIG_NFSD_V4) */ #ifdef CONFIG_NFSD_V3 /* @@ -903,7 +903,6 @@ 
nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { struct inode *inode; - struct raparms *ra; mm_segment_t oldfs; __be32 err; int host_err; @@ -914,12 +913,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (svc_msnfs(fhp) && !lock_may_read(inode, offset, *count)) goto out; - /* Get readahead parameters */ - ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); - - if (ra && ra->p_set) - file->f_ra = ra->p_ra; - if (file->f_op->splice_read && rqstp->rq_splice_ok) { struct splice_desc sd = { .len = 0, @@ -937,16 +930,6 @@ nfsd_vfs_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, set_fs(oldfs); } - /* Write back readahead params */ - if (ra) { - struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; - spin_lock(&rab->pb_lock); - ra->p_ra = file->f_ra; - ra->p_set = 1; - ra->p_count--; - spin_unlock(&rab->pb_lock); - } - if (host_err >= 0) { nfsdstats.io_read += host_err; *count = host_err; @@ -1086,8 +1069,45 @@ out: * on entry. On return, *count contains the number of bytes actually read. * N.B. 
After this call fhp needs an fh_put */ +__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, struct kvec *vec, int vlen, unsigned long *count) +{ + struct file *file; + struct inode *inode; + struct raparms *ra; + __be32 err; + + err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); + if (err) + return err; + + inode = file->f_path.dentry->d_inode; + + /* Get readahead parameters */ + ra = nfsd_get_raparms(inode->i_sb->s_dev, inode->i_ino); + + if (ra && ra->p_set) + file->f_ra = ra->p_ra; + + err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); + + /* Write back readahead params */ + if (ra) { + struct raparm_hbucket *rab = &raparm_hash[ra->p_hindex]; + spin_lock(&rab->pb_lock); + ra->p_ra = file->f_ra; + ra->p_set = 1; + ra->p_count--; + spin_unlock(&rab->pb_lock); + } + + nfsd_close(file); + return err; +} + +/* As above, but use the provided file descriptor. */ __be32 -nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, +nfsd_read_file(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, loff_t offset, struct kvec *vec, int vlen, unsigned long *count) { @@ -1099,13 +1119,8 @@ nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, if (err) goto out; err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); - } else { - err = nfsd_open(rqstp, fhp, S_IFREG, NFSD_MAY_READ, &file); - if (err) - goto out; - err = nfsd_vfs_read(rqstp, fhp, file, offset, vec, vlen, count); - nfsd_close(file); - } + } else /* Note file may still be NULL in NFSv4 special stateid case: */ + err = nfsd_read(rqstp, fhp, offset, vec, vlen, count); out: return err; } @@ -1631,7 +1646,7 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *name, int len, struct svc_fh *tfhp) { struct dentry *ddir, *dnew, *dold; - struct inode *dirp, *dest; + struct inode *dirp; __be32 err; int host_err; @@ -1659,7 +1674,6 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, goto out_nfserr; dold = 
tfhp->fh_dentry; - dest = dold->d_inode; host_err = mnt_want_write(tfhp->fh_export->ex_path.mnt); if (host_err) { @@ -2038,7 +2052,6 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, struct dentry *dentry, int acc) { struct inode *inode = dentry->d_inode; - struct path path; int err; if (acc == NFSD_MAY_NOP) @@ -2111,15 +2124,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp, if (err == -EACCES && S_ISREG(inode->i_mode) && acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE)) err = inode_permission(inode, MAY_EXEC); - if (err) - goto nfsd_out; - /* Do integrity (permission) checking now, but defer incrementing - * IMA counts to the actual file open. - */ - path.mnt = exp->ex_path.mnt; - path.dentry = dentry; -nfsd_out: return err? nfserrno(err) : 0; } diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index 217a62c2a35..9a370a5e36b 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -64,7 +64,9 @@ __be32 nfsd_commit(struct svc_rqst *, struct svc_fh *, __be32 nfsd_open(struct svc_rqst *, struct svc_fh *, int, int, struct file **); void nfsd_close(struct file *); -__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, struct file *, +__be32 nfsd_read(struct svc_rqst *, struct svc_fh *, + loff_t, struct kvec *, int, unsigned long *); +__be32 nfsd_read_file(struct svc_rqst *, struct svc_fh *, struct file *, loff_t, struct kvec *, int, unsigned long *); __be32 nfsd_write(struct svc_rqst *, struct svc_fh *,struct file *, loff_t, struct kvec *,int, unsigned long *, int *); diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c index effdbdbe6c1..3dbdc1d356b 100644 --- a/fs/nilfs2/bmap.c +++ b/fs/nilfs2/bmap.c @@ -26,6 +26,8 @@ #include "nilfs.h" #include "bmap.h" #include "sb.h" +#include "btree.h" +#include "direct.h" #include "btnode.h" #include "mdt.h" #include "dat.h" @@ -533,7 +535,7 @@ void nilfs_bmap_init_gc(struct nilfs_bmap *bmap) void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) { - memcpy(gcbmap, bmap, sizeof(union 
nilfs_bmap_union)); + memcpy(gcbmap, bmap, sizeof(*bmap)); init_rwsem(&gcbmap->b_sem); lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode; @@ -541,7 +543,7 @@ void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap) { - memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union)); + memcpy(bmap, gcbmap, sizeof(*bmap)); init_rwsem(&bmap->b_sem); lockdep_set_class(&bmap->b_sem, &nilfs_bmap_dat_lock_key); bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode; diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h index 9980d7dbab9..a20569b1992 100644 --- a/fs/nilfs2/bmap.h +++ b/fs/nilfs2/bmap.h @@ -32,11 +32,6 @@ #define NILFS_BMAP_INVALID_PTR 0 -#define nilfs_bmap_dkey_to_key(dkey) le64_to_cpu(dkey) -#define nilfs_bmap_key_to_dkey(key) cpu_to_le64(key) -#define nilfs_bmap_dptr_to_ptr(dptr) le64_to_cpu(dptr) -#define nilfs_bmap_ptr_to_dptr(ptr) cpu_to_le64(ptr) - #define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? 
-(diff) : (diff)) @@ -71,7 +66,7 @@ struct nilfs_bmap_operations { int (*bop_delete)(struct nilfs_bmap *, __u64); void (*bop_clear)(struct nilfs_bmap *); - int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *); + int (*bop_propagate)(struct nilfs_bmap *, struct buffer_head *); void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *, struct list_head *); @@ -110,6 +105,7 @@ static inline int nilfs_bmap_is_new_ptr(unsigned long ptr) * @b_last_allocated_ptr: last allocated ptr for data block * @b_ptr_type: pointer type * @b_state: state + * @b_nchildren_per_block: maximum number of child nodes for non-root nodes */ struct nilfs_bmap { union { @@ -123,6 +119,7 @@ struct nilfs_bmap { __u64 b_last_allocated_ptr; int b_ptr_type; int b_state; + __u16 b_nchildren_per_block; }; /* pointer type */ @@ -224,6 +221,13 @@ static inline void nilfs_bmap_abort_end_ptr(struct nilfs_bmap *bmap, nilfs_dat_abort_end(dat, &req->bpr_req); } +static inline void nilfs_bmap_set_target_v(struct nilfs_bmap *bmap, __u64 key, + __u64 ptr) +{ + bmap->b_last_allocated_key = key; + bmap->b_last_allocated_ptr = ptr; +} + __u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *, const struct buffer_head *); diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h deleted file mode 100644 index d41509bff47..00000000000 --- a/fs/nilfs2/bmap_union.h +++ /dev/null @@ -1,42 +0,0 @@ -/* - * bmap_union.h - NILFS block mapping. - * - * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License as published by - * the Free Software Foundation; either version 2 of the License, or - * (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - * - * Written by Koji Sato <koji@osrg.net>. - */ - -#ifndef _NILFS_BMAP_UNION_H -#define _NILFS_BMAP_UNION_H - -#include "bmap.h" -#include "direct.h" -#include "btree.h" - -/** - * nilfs_bmap_union - - * @bi_bmap: bmap structure - * @bi_btree: direct map structure - * @bi_direct: B-tree structure - */ -union nilfs_bmap_union { - struct nilfs_bmap bi_bmap; - struct nilfs_direct bi_direct; - struct nilfs_btree bi_btree; -}; - -#endif /* _NILFS_BMAP_UNION_H */ diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 447ce47a330..f78ab1044d1 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -96,10 +96,12 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) } int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, - sector_t pblocknr, struct buffer_head **pbh) + sector_t pblocknr, int mode, + struct buffer_head **pbh, sector_t *submit_ptr) { struct buffer_head *bh; struct inode *inode = NILFS_BTNC_I(btnc); + struct page *page; int err; bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node); @@ -107,6 +109,7 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, return -ENOMEM; err = -EEXIST; /* internal code */ + page = bh->b_page; if (buffer_uptodate(bh) || buffer_dirty(bh)) goto found; @@ -125,7 +128,16 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, } } } - lock_buffer(bh); + + if (mode == READA) { + if (pblocknr != *submit_ptr + 1 || !trylock_buffer(bh)) { + err = -EBUSY; /* internal code */ + brelse(bh); + goto out_locked; + } + } else { /* mode == READ */ + lock_buffer(bh); + } if (buffer_uptodate(bh)) { unlock_buffer(bh); err = -EEXIST; /* internal code */ @@ -136,15 +148,16 @@ int 
nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, bh->b_blocknr = pblocknr; /* set block address for read */ bh->b_end_io = end_buffer_read_sync; get_bh(bh); - submit_bh(READ, bh); + submit_bh(mode, bh); bh->b_blocknr = blocknr; /* set back to the given block address */ + *submit_ptr = pblocknr; err = 0; found: *pbh = bh; out_locked: - unlock_page(bh->b_page); - page_cache_release(bh->b_page); + unlock_page(page); + page_cache_release(page); return err; } diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h index 07da83f0771..79037494f1e 100644 --- a/fs/nilfs2/btnode.h +++ b/fs/nilfs2/btnode.h @@ -42,8 +42,8 @@ void nilfs_btnode_cache_init(struct address_space *, struct backing_dev_info *); void nilfs_btnode_cache_clear(struct address_space *); struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr); -int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, - struct buffer_head **); +int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t, int, + struct buffer_head **, sector_t *); void nilfs_btnode_delete(struct buffer_head *); int nilfs_btnode_prepare_change_key(struct address_space *, struct nilfs_btnode_chkey_ctxt *); diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index b27a342c5af..300c2bc00c3 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -66,30 +66,10 @@ static void nilfs_btree_free_path(struct nilfs_btree_path *path) /* * B-tree node operations */ -static int nilfs_btree_get_block(const struct nilfs_btree *btree, __u64 ptr, - struct buffer_head **bhp) -{ - struct address_space *btnc = - &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache; - int err; - - err = nilfs_btnode_submit_block(btnc, ptr, 0, bhp); - if (err) - return err == -EEXIST ? 
0 : err; - - wait_on_buffer(*bhp); - if (!buffer_uptodate(*bhp)) { - brelse(*bhp); - return -EIO; - } - return 0; -} - -static int nilfs_btree_get_new_block(const struct nilfs_btree *btree, +static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree, __u64 ptr, struct buffer_head **bhp) { - struct address_space *btnc = - &NILFS_BMAP_I((struct nilfs_bmap *)btree)->i_btnode_cache; + struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache; struct buffer_head *bh; bh = nilfs_btnode_create_block(btnc, ptr); @@ -101,71 +81,55 @@ static int nilfs_btree_get_new_block(const struct nilfs_btree *btree, return 0; } -static inline int -nilfs_btree_node_get_flags(const struct nilfs_btree_node *node) +static int nilfs_btree_node_get_flags(const struct nilfs_btree_node *node) { return node->bn_flags; } -static inline void +static void nilfs_btree_node_set_flags(struct nilfs_btree_node *node, int flags) { node->bn_flags = flags; } -static inline int nilfs_btree_node_root(const struct nilfs_btree_node *node) +static int nilfs_btree_node_root(const struct nilfs_btree_node *node) { return nilfs_btree_node_get_flags(node) & NILFS_BTREE_NODE_ROOT; } -static inline int -nilfs_btree_node_get_level(const struct nilfs_btree_node *node) +static int nilfs_btree_node_get_level(const struct nilfs_btree_node *node) { return node->bn_level; } -static inline void +static void nilfs_btree_node_set_level(struct nilfs_btree_node *node, int level) { node->bn_level = level; } -static inline int -nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node) +static int nilfs_btree_node_get_nchildren(const struct nilfs_btree_node *node) { return le16_to_cpu(node->bn_nchildren); } -static inline void +static void nilfs_btree_node_set_nchildren(struct nilfs_btree_node *node, int nchildren) { node->bn_nchildren = cpu_to_le16(nchildren); } -static inline int nilfs_btree_node_size(const struct nilfs_btree *btree) +static int nilfs_btree_node_size(const struct nilfs_bmap *btree) { - 
return 1 << btree->bt_bmap.b_inode->i_blkbits; + return 1 << btree->b_inode->i_blkbits; } -static inline int -nilfs_btree_node_nchildren_min(const struct nilfs_btree_node *node, - const struct nilfs_btree *btree) +static int nilfs_btree_nchildren_per_block(const struct nilfs_bmap *btree) { - return nilfs_btree_node_root(node) ? - NILFS_BTREE_ROOT_NCHILDREN_MIN : - NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); + return btree->b_nchildren_per_block; } -static inline int -nilfs_btree_node_nchildren_max(const struct nilfs_btree_node *node, - const struct nilfs_btree *btree) -{ - return nilfs_btree_node_root(node) ? - NILFS_BTREE_ROOT_NCHILDREN_MAX : - NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree)); -} - -static inline __le64 * +static __le64 * nilfs_btree_node_dkeys(const struct nilfs_btree_node *node) { return (__le64 *)((char *)(node + 1) + @@ -173,45 +137,40 @@ nilfs_btree_node_dkeys(const struct nilfs_btree_node *node) 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE)); } -static inline __le64 * -nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, - const struct nilfs_btree *btree) +static __le64 * +nilfs_btree_node_dptrs(const struct nilfs_btree_node *node, int ncmax) { - return (__le64 *)(nilfs_btree_node_dkeys(node) + - nilfs_btree_node_nchildren_max(node, btree)); + return (__le64 *)(nilfs_btree_node_dkeys(node) + ncmax); } -static inline __u64 +static __u64 nilfs_btree_node_get_key(const struct nilfs_btree_node *node, int index) { - return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(node) + index)); + return le64_to_cpu(*(nilfs_btree_node_dkeys(node) + index)); } -static inline void +static void nilfs_btree_node_set_key(struct nilfs_btree_node *node, int index, __u64 key) { - *(nilfs_btree_node_dkeys(node) + index) = nilfs_bmap_key_to_dkey(key); + *(nilfs_btree_node_dkeys(node) + index) = cpu_to_le64(key); } -static inline __u64 -nilfs_btree_node_get_ptr(const struct nilfs_btree *btree, - const struct nilfs_btree_node *node, int 
index) +static __u64 +nilfs_btree_node_get_ptr(const struct nilfs_btree_node *node, int index, + int ncmax) { - return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(node, btree) + - index)); + return le64_to_cpu(*(nilfs_btree_node_dptrs(node, ncmax) + index)); } -static inline void -nilfs_btree_node_set_ptr(struct nilfs_btree *btree, - struct nilfs_btree_node *node, int index, __u64 ptr) +static void +nilfs_btree_node_set_ptr(struct nilfs_btree_node *node, int index, __u64 ptr, + int ncmax) { - *(nilfs_btree_node_dptrs(node, btree) + index) = - nilfs_bmap_ptr_to_dptr(ptr); + *(nilfs_btree_node_dptrs(node, ncmax) + index) = cpu_to_le64(ptr); } -static void nilfs_btree_node_init(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - int flags, int level, int nchildren, +static void nilfs_btree_node_init(struct nilfs_btree_node *node, int flags, + int level, int nchildren, int ncmax, const __u64 *keys, const __u64 *ptrs) { __le64 *dkeys; @@ -223,29 +182,28 @@ static void nilfs_btree_node_init(struct nilfs_btree *btree, nilfs_btree_node_set_nchildren(node, nchildren); dkeys = nilfs_btree_node_dkeys(node); - dptrs = nilfs_btree_node_dptrs(node, btree); + dptrs = nilfs_btree_node_dptrs(node, ncmax); for (i = 0; i < nchildren; i++) { - dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]); - dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]); + dkeys[i] = cpu_to_le64(keys[i]); + dptrs[i] = cpu_to_le64(ptrs[i]); } } /* Assume the buffer heads corresponding to left and right are locked. 
*/ -static void nilfs_btree_node_move_left(struct nilfs_btree *btree, - struct nilfs_btree_node *left, +static void nilfs_btree_node_move_left(struct nilfs_btree_node *left, struct nilfs_btree_node *right, - int n) + int n, int lncmax, int rncmax) { __le64 *ldkeys, *rdkeys; __le64 *ldptrs, *rdptrs; int lnchildren, rnchildren; ldkeys = nilfs_btree_node_dkeys(left); - ldptrs = nilfs_btree_node_dptrs(left, btree); + ldptrs = nilfs_btree_node_dptrs(left, lncmax); lnchildren = nilfs_btree_node_get_nchildren(left); rdkeys = nilfs_btree_node_dkeys(right); - rdptrs = nilfs_btree_node_dptrs(right, btree); + rdptrs = nilfs_btree_node_dptrs(right, rncmax); rnchildren = nilfs_btree_node_get_nchildren(right); memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys)); @@ -260,21 +218,20 @@ static void nilfs_btree_node_move_left(struct nilfs_btree *btree, } /* Assume that the buffer heads corresponding to left and right are locked. */ -static void nilfs_btree_node_move_right(struct nilfs_btree *btree, - struct nilfs_btree_node *left, +static void nilfs_btree_node_move_right(struct nilfs_btree_node *left, struct nilfs_btree_node *right, - int n) + int n, int lncmax, int rncmax) { __le64 *ldkeys, *rdkeys; __le64 *ldptrs, *rdptrs; int lnchildren, rnchildren; ldkeys = nilfs_btree_node_dkeys(left); - ldptrs = nilfs_btree_node_dptrs(left, btree); + ldptrs = nilfs_btree_node_dptrs(left, lncmax); lnchildren = nilfs_btree_node_get_nchildren(left); rdkeys = nilfs_btree_node_dkeys(right); - rdptrs = nilfs_btree_node_dptrs(right, btree); + rdptrs = nilfs_btree_node_dptrs(right, rncmax); rnchildren = nilfs_btree_node_get_nchildren(right); memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys)); @@ -289,16 +246,15 @@ static void nilfs_btree_node_move_right(struct nilfs_btree *btree, } /* Assume that the buffer head corresponding to node is locked. 
*/ -static void nilfs_btree_node_insert(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - __u64 key, __u64 ptr, int index) +static void nilfs_btree_node_insert(struct nilfs_btree_node *node, int index, + __u64 key, __u64 ptr, int ncmax) { __le64 *dkeys; __le64 *dptrs; int nchildren; dkeys = nilfs_btree_node_dkeys(node); - dptrs = nilfs_btree_node_dptrs(node, btree); + dptrs = nilfs_btree_node_dptrs(node, ncmax); nchildren = nilfs_btree_node_get_nchildren(node); if (index < nchildren) { memmove(dkeys + index + 1, dkeys + index, @@ -306,16 +262,15 @@ static void nilfs_btree_node_insert(struct nilfs_btree *btree, memmove(dptrs + index + 1, dptrs + index, (nchildren - index) * sizeof(*dptrs)); } - dkeys[index] = nilfs_bmap_key_to_dkey(key); - dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr); + dkeys[index] = cpu_to_le64(key); + dptrs[index] = cpu_to_le64(ptr); nchildren++; nilfs_btree_node_set_nchildren(node, nchildren); } /* Assume that the buffer head corresponding to node is locked. 
*/ -static void nilfs_btree_node_delete(struct nilfs_btree *btree, - struct nilfs_btree_node *node, - __u64 *keyp, __u64 *ptrp, int index) +static void nilfs_btree_node_delete(struct nilfs_btree_node *node, int index, + __u64 *keyp, __u64 *ptrp, int ncmax) { __u64 key; __u64 ptr; @@ -324,9 +279,9 @@ static void nilfs_btree_node_delete(struct nilfs_btree *btree, int nchildren; dkeys = nilfs_btree_node_dkeys(node); - dptrs = nilfs_btree_node_dptrs(node, btree); - key = nilfs_bmap_dkey_to_key(dkeys[index]); - ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]); + dptrs = nilfs_btree_node_dptrs(node, ncmax); + key = le64_to_cpu(dkeys[index]); + ptr = le64_to_cpu(dptrs[index]); nchildren = nilfs_btree_node_get_nchildren(node); if (keyp != NULL) *keyp = key; @@ -382,40 +337,92 @@ static int nilfs_btree_node_lookup(const struct nilfs_btree_node *node, return s == 0; } -static inline struct nilfs_btree_node * -nilfs_btree_get_root(const struct nilfs_btree *btree) +/** + * nilfs_btree_node_broken - verify consistency of btree node + * @node: btree node block to be examined + * @size: node size (in bytes) + * @blocknr: block number + * + * Return Value: If node is broken, 1 is returned. Otherwise, 0 is returned. 
+ */ +static int nilfs_btree_node_broken(const struct nilfs_btree_node *node, + size_t size, sector_t blocknr) { - return (struct nilfs_btree_node *)btree->bt_bmap.b_u.u_data; + int level, flags, nchildren; + int ret = 0; + + level = nilfs_btree_node_get_level(node); + flags = nilfs_btree_node_get_flags(node); + nchildren = nilfs_btree_node_get_nchildren(node); + + if (unlikely(level < NILFS_BTREE_LEVEL_NODE_MIN || + level >= NILFS_BTREE_LEVEL_MAX || + (flags & NILFS_BTREE_NODE_ROOT) || + nchildren < 0 || + nchildren > NILFS_BTREE_NODE_NCHILDREN_MAX(size))) { + printk(KERN_CRIT "NILFS: bad btree node (blocknr=%llu): " + "level = %d, flags = 0x%x, nchildren = %d\n", + (unsigned long long)blocknr, level, flags, nchildren); + ret = 1; + } + return ret; } -static inline struct nilfs_btree_node * +int nilfs_btree_broken_node_block(struct buffer_head *bh) +{ + int ret; + + if (buffer_nilfs_checked(bh)) + return 0; + + ret = nilfs_btree_node_broken((struct nilfs_btree_node *)bh->b_data, + bh->b_size, bh->b_blocknr); + if (likely(!ret)) + set_buffer_nilfs_checked(bh); + return ret; +} + +static struct nilfs_btree_node * +nilfs_btree_get_root(const struct nilfs_bmap *btree) +{ + return (struct nilfs_btree_node *)btree->b_u.u_data; +} + +static struct nilfs_btree_node * nilfs_btree_get_nonroot_node(const struct nilfs_btree_path *path, int level) { return (struct nilfs_btree_node *)path[level].bp_bh->b_data; } -static inline struct nilfs_btree_node * +static struct nilfs_btree_node * nilfs_btree_get_sib_node(const struct nilfs_btree_path *path, int level) { return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data; } -static inline int nilfs_btree_height(const struct nilfs_btree *btree) +static int nilfs_btree_height(const struct nilfs_bmap *btree) { return nilfs_btree_node_get_level(nilfs_btree_get_root(btree)) + 1; } -static inline struct nilfs_btree_node * -nilfs_btree_get_node(const struct nilfs_btree *btree, +static struct nilfs_btree_node * 
+nilfs_btree_get_node(const struct nilfs_bmap *btree, const struct nilfs_btree_path *path, - int level) + int level, int *ncmaxp) { - return (level == nilfs_btree_height(btree) - 1) ? - nilfs_btree_get_root(btree) : - nilfs_btree_get_nonroot_node(path, level); + struct nilfs_btree_node *node; + + if (level == nilfs_btree_height(btree) - 1) { + node = nilfs_btree_get_root(btree); + *ncmaxp = NILFS_BTREE_ROOT_NCHILDREN_MAX; + } else { + node = nilfs_btree_get_nonroot_node(path, level); + *ncmaxp = nilfs_btree_nchildren_per_block(btree); + } + return node; } -static inline int +static int nilfs_btree_bad_node(struct nilfs_btree_node *node, int level) { if (unlikely(nilfs_btree_node_get_level(node) != level)) { @@ -427,13 +434,83 @@ nilfs_btree_bad_node(struct nilfs_btree_node *node, int level) return 0; } -static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, +struct nilfs_btree_readahead_info { + struct nilfs_btree_node *node; /* parent node */ + int max_ra_blocks; /* max nof blocks to read ahead */ + int index; /* current index on the parent node */ + int ncmax; /* nof children in the parent node */ +}; + +static int __nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, + struct buffer_head **bhp, + const struct nilfs_btree_readahead_info *ra) +{ + struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache; + struct buffer_head *bh, *ra_bh; + sector_t submit_ptr = 0; + int ret; + + ret = nilfs_btnode_submit_block(btnc, ptr, 0, READ, &bh, &submit_ptr); + if (ret) { + if (ret != -EEXIST) + return ret; + goto out_check; + } + + if (ra) { + int i, n; + __u64 ptr2; + + /* read ahead sibling nodes */ + for (n = ra->max_ra_blocks, i = ra->index + 1; + n > 0 && i < ra->ncmax; n--, i++) { + ptr2 = nilfs_btree_node_get_ptr(ra->node, i, ra->ncmax); + + ret = nilfs_btnode_submit_block(btnc, ptr2, 0, READA, + &ra_bh, &submit_ptr); + if (likely(!ret || ret == -EEXIST)) + brelse(ra_bh); + else if (ret != -EBUSY) + break; + if (!buffer_locked(bh)) + 
goto out_no_wait; + } + } + + wait_on_buffer(bh); + + out_no_wait: + if (!buffer_uptodate(bh)) { + brelse(bh); + return -EIO; + } + + out_check: + if (nilfs_btree_broken_node_block(bh)) { + clear_buffer_uptodate(bh); + brelse(bh); + return -EINVAL; + } + + *bhp = bh; + return 0; +} + +static int nilfs_btree_get_block(const struct nilfs_bmap *btree, __u64 ptr, + struct buffer_head **bhp) +{ + return __nilfs_btree_get_block(btree, ptr, bhp, NULL); +} + +static int nilfs_btree_do_lookup(const struct nilfs_bmap *btree, struct nilfs_btree_path *path, - __u64 key, __u64 *ptrp, int minlevel) + __u64 key, __u64 *ptrp, int minlevel, + int readahead) { struct nilfs_btree_node *node; + struct nilfs_btree_readahead_info p, *ra; __u64 ptr; - int level, index, found, ret; + int level, index, found, ncmax, ret; node = nilfs_btree_get_root(btree); level = nilfs_btree_node_get_level(node); @@ -441,14 +518,27 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, return -ENOENT; found = nilfs_btree_node_lookup(node, key, &index); - ptr = nilfs_btree_node_get_ptr(btree, node, index); + ptr = nilfs_btree_node_get_ptr(node, index, + NILFS_BTREE_ROOT_NCHILDREN_MAX); path[level].bp_bh = NULL; path[level].bp_index = index; - for (level--; level >= minlevel; level--) { - ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); + ncmax = nilfs_btree_nchildren_per_block(btree); + + while (--level >= minlevel) { + ra = NULL; + if (level == NILFS_BTREE_LEVEL_NODE_MIN && readahead) { + p.node = nilfs_btree_get_node(btree, path, level + 1, + &p.ncmax); + p.index = index; + p.max_ra_blocks = 7; + ra = &p; + } + ret = __nilfs_btree_get_block(btree, ptr, &path[level].bp_bh, + ra); if (ret < 0) return ret; + node = nilfs_btree_get_nonroot_node(path, level); if (nilfs_btree_bad_node(node, level)) return -EINVAL; @@ -456,9 +546,9 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, found = nilfs_btree_node_lookup(node, key, &index); else index = 0; - if (index < 
nilfs_btree_node_nchildren_max(node, btree)) - ptr = nilfs_btree_node_get_ptr(btree, node, index); - else { + if (index < ncmax) { + ptr = nilfs_btree_node_get_ptr(node, index, ncmax); + } else { WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN); /* insert */ ptr = NILFS_BMAP_INVALID_PTR; @@ -474,22 +564,24 @@ static int nilfs_btree_do_lookup(const struct nilfs_btree *btree, return 0; } -static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, +static int nilfs_btree_do_lookup_last(const struct nilfs_bmap *btree, struct nilfs_btree_path *path, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node; __u64 ptr; - int index, level, ret; + int index, level, ncmax, ret; node = nilfs_btree_get_root(btree); index = nilfs_btree_node_get_nchildren(node) - 1; if (index < 0) return -ENOENT; level = nilfs_btree_node_get_level(node); - ptr = nilfs_btree_node_get_ptr(btree, node, index); + ptr = nilfs_btree_node_get_ptr(node, index, + NILFS_BTREE_ROOT_NCHILDREN_MAX); path[level].bp_bh = NULL; path[level].bp_index = index; + ncmax = nilfs_btree_nchildren_per_block(btree); for (level--; level > 0; level--) { ret = nilfs_btree_get_block(btree, ptr, &path[level].bp_bh); @@ -499,7 +591,7 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, if (nilfs_btree_bad_node(node, level)) return -EINVAL; index = nilfs_btree_node_get_nchildren(node) - 1; - ptr = nilfs_btree_node_get_ptr(btree, node, index); + ptr = nilfs_btree_node_get_ptr(node, index, ncmax); path[level].bp_index = index; } @@ -511,51 +603,45 @@ static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree, return 0; } -static int nilfs_btree_lookup(const struct nilfs_bmap *bmap, +static int nilfs_btree_lookup(const struct nilfs_bmap *btree, __u64 key, int level, __u64 *ptrp) { - struct nilfs_btree *btree; struct nilfs_btree_path *path; - __u64 ptr; int ret; - btree = (struct nilfs_btree *)bmap; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - ret = 
nilfs_btree_do_lookup(btree, path, key, &ptr, level); - - if (ptrp != NULL) - *ptrp = ptr; + ret = nilfs_btree_do_lookup(btree, path, key, ptrp, level, 0); nilfs_btree_free_path(path); return ret; } -static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, +static int nilfs_btree_lookup_contig(const struct nilfs_bmap *btree, __u64 key, __u64 *ptrp, unsigned maxblocks) { - struct nilfs_btree *btree = (struct nilfs_btree *)bmap; struct nilfs_btree_path *path; struct nilfs_btree_node *node; struct inode *dat = NULL; __u64 ptr, ptr2; sector_t blocknr; int level = NILFS_BTREE_LEVEL_NODE_MIN; - int ret, cnt, index, maxlevel; + int ret, cnt, index, maxlevel, ncmax; + struct nilfs_btree_readahead_info p; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level); + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level, 1); if (ret < 0) goto out; - if (NILFS_BMAP_USE_VBN(bmap)) { - dat = nilfs_bmap_get_dat(bmap); + if (NILFS_BMAP_USE_VBN(btree)) { + dat = nilfs_bmap_get_dat(btree); ret = nilfs_dat_translate(dat, ptr, &blocknr); if (ret < 0) goto out; @@ -566,14 +652,14 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, goto end; maxlevel = nilfs_btree_height(btree) - 1; - node = nilfs_btree_get_node(btree, path, level); + node = nilfs_btree_get_node(btree, path, level, &ncmax); index = path[level].bp_index + 1; for (;;) { while (index < nilfs_btree_node_get_nchildren(node)) { if (nilfs_btree_node_get_key(node, index) != key + cnt) goto end; - ptr2 = nilfs_btree_node_get_ptr(btree, node, index); + ptr2 = nilfs_btree_node_get_ptr(node, index, ncmax); if (dat) { ret = nilfs_dat_translate(dat, ptr2, &blocknr); if (ret < 0) @@ -589,20 +675,24 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, break; /* look-up right sibling node */ - node = nilfs_btree_get_node(btree, path, level + 1); - index = path[level + 1].bp_index + 1; - if (index >= 
nilfs_btree_node_get_nchildren(node) || - nilfs_btree_node_get_key(node, index) != key + cnt) + p.node = nilfs_btree_get_node(btree, path, level + 1, &p.ncmax); + p.index = path[level + 1].bp_index + 1; + p.max_ra_blocks = 7; + if (p.index >= nilfs_btree_node_get_nchildren(p.node) || + nilfs_btree_node_get_key(p.node, p.index) != key + cnt) break; - ptr2 = nilfs_btree_node_get_ptr(btree, node, index); - path[level + 1].bp_index = index; + ptr2 = nilfs_btree_node_get_ptr(p.node, p.index, p.ncmax); + path[level + 1].bp_index = p.index; brelse(path[level].bp_bh); path[level].bp_bh = NULL; - ret = nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh); + + ret = __nilfs_btree_get_block(btree, ptr2, &path[level].bp_bh, + &p); if (ret < 0) goto out; node = nilfs_btree_get_nonroot_node(path, level); + ncmax = nilfs_btree_nchildren_per_block(btree); index = 0; path[level].bp_index = index; } @@ -614,7 +704,7 @@ static int nilfs_btree_lookup_contig(const struct nilfs_bmap *bmap, return ret; } -static void nilfs_btree_promote_key(struct nilfs_btree *btree, +static void nilfs_btree_promote_key(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 key) { @@ -636,16 +726,18 @@ static void nilfs_btree_promote_key(struct nilfs_btree *btree, } } -static void nilfs_btree_do_insert(struct nilfs_btree *btree, +static void nilfs_btree_do_insert(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node; + int ncblk; if (level < nilfs_btree_height(btree) - 1) { node = nilfs_btree_get_nonroot_node(path, level); - nilfs_btree_node_insert(btree, node, *keyp, *ptrp, - path[level].bp_index); + ncblk = nilfs_btree_nchildren_per_block(btree); + nilfs_btree_node_insert(node, path[level].bp_index, + *keyp, *ptrp, ncblk); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); @@ -655,22 +747,24 @@ static void nilfs_btree_do_insert(struct nilfs_btree *btree, 0)); } else { node = 
nilfs_btree_get_root(btree); - nilfs_btree_node_insert(btree, node, *keyp, *ptrp, - path[level].bp_index); + nilfs_btree_node_insert(node, path[level].bp_index, + *keyp, *ptrp, + NILFS_BTREE_ROOT_NCHILDREN_MAX); } } -static void nilfs_btree_carry_left(struct nilfs_btree *btree, +static void nilfs_btree_carry_left(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *left; - int nchildren, lnchildren, n, move; + int nchildren, lnchildren, n, move, ncblk; node = nilfs_btree_get_nonroot_node(path, level); left = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); lnchildren = nilfs_btree_node_get_nchildren(left); + ncblk = nilfs_btree_nchildren_per_block(btree); move = 0; n = (nchildren + lnchildren + 1) / 2 - lnchildren; @@ -680,7 +774,7 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, move = 1; } - nilfs_btree_node_move_left(btree, left, node, n); + nilfs_btree_node_move_left(left, node, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); @@ -705,17 +799,18 @@ static void nilfs_btree_carry_left(struct nilfs_btree *btree, nilfs_btree_do_insert(btree, path, level, keyp, ptrp); } -static void nilfs_btree_carry_right(struct nilfs_btree *btree, +static void nilfs_btree_carry_right(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *right; - int nchildren, rnchildren, n, move; + int nchildren, rnchildren, n, move, ncblk; node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); rnchildren = nilfs_btree_node_get_nchildren(right); + ncblk = nilfs_btree_nchildren_per_block(btree); move = 0; n = (nchildren + rnchildren + 1) / 2 - rnchildren; @@ -725,7 +820,7 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, move = 1; } - 
nilfs_btree_node_move_right(btree, node, right, n); + nilfs_btree_node_move_right(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); @@ -751,18 +846,19 @@ static void nilfs_btree_carry_right(struct nilfs_btree *btree, nilfs_btree_do_insert(btree, path, level, keyp, ptrp); } -static void nilfs_btree_split(struct nilfs_btree *btree, +static void nilfs_btree_split(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *right; __u64 newkey; __u64 newptr; - int nchildren, n, move; + int nchildren, n, move, ncblk; node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); + ncblk = nilfs_btree_nchildren_per_block(btree); move = 0; n = (nchildren + 1) / 2; @@ -771,7 +867,7 @@ static void nilfs_btree_split(struct nilfs_btree *btree, move = 1; } - nilfs_btree_node_move_right(btree, node, right, n); + nilfs_btree_node_move_right(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); @@ -783,8 +879,8 @@ static void nilfs_btree_split(struct nilfs_btree *btree, if (move) { path[level].bp_index -= nilfs_btree_node_get_nchildren(node); - nilfs_btree_node_insert(btree, right, *keyp, *ptrp, - path[level].bp_index); + nilfs_btree_node_insert(right, path[level].bp_index, + *keyp, *ptrp, ncblk); *keyp = nilfs_btree_node_get_key(right, 0); *ptrp = path[level].bp_newreq.bpr_ptr; @@ -805,19 +901,21 @@ static void nilfs_btree_split(struct nilfs_btree *btree, path[level + 1].bp_index++; } -static void nilfs_btree_grow(struct nilfs_btree *btree, +static void nilfs_btree_grow(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *root, *child; - int n; + int n, ncblk; root = nilfs_btree_get_root(btree); child = nilfs_btree_get_sib_node(path, level); + 
ncblk = nilfs_btree_nchildren_per_block(btree); n = nilfs_btree_node_get_nchildren(root); - nilfs_btree_node_move_right(btree, root, child, n); + nilfs_btree_node_move_right(root, child, n, + NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk); nilfs_btree_node_set_level(root, level + 1); if (!buffer_dirty(path[level].bp_sib_bh)) @@ -832,11 +930,11 @@ static void nilfs_btree_grow(struct nilfs_btree *btree, *ptrp = path[level].bp_newreq.bpr_ptr; } -static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree, +static __u64 nilfs_btree_find_near(const struct nilfs_bmap *btree, const struct nilfs_btree_path *path) { struct nilfs_btree_node *node; - int level; + int level, ncmax; if (path == NULL) return NILFS_BMAP_INVALID_PTR; @@ -844,29 +942,30 @@ static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree, /* left sibling */ level = NILFS_BTREE_LEVEL_NODE_MIN; if (path[level].bp_index > 0) { - node = nilfs_btree_get_node(btree, path, level); - return nilfs_btree_node_get_ptr(btree, node, - path[level].bp_index - 1); + node = nilfs_btree_get_node(btree, path, level, &ncmax); + return nilfs_btree_node_get_ptr(node, + path[level].bp_index - 1, + ncmax); } /* parent */ level = NILFS_BTREE_LEVEL_NODE_MIN + 1; if (level <= nilfs_btree_height(btree) - 1) { - node = nilfs_btree_get_node(btree, path, level); - return nilfs_btree_node_get_ptr(btree, node, - path[level].bp_index); + node = nilfs_btree_get_node(btree, path, level, &ncmax); + return nilfs_btree_node_get_ptr(node, path[level].bp_index, + ncmax); } return NILFS_BMAP_INVALID_PTR; } -static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree, +static __u64 nilfs_btree_find_target_v(const struct nilfs_bmap *btree, const struct nilfs_btree_path *path, __u64 key) { __u64 ptr; - ptr = nilfs_bmap_find_target_seq(&btree->bt_bmap, key); + ptr = nilfs_bmap_find_target_seq(btree, key); if (ptr != NILFS_BMAP_INVALID_PTR) /* sequential access */ return ptr; @@ -877,17 +976,10 @@ static __u64 
nilfs_btree_find_target_v(const struct nilfs_btree *btree, return ptr; } /* block group */ - return nilfs_bmap_find_target_in_group(&btree->bt_bmap); -} - -static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key, - __u64 ptr) -{ - btree->bt_bmap.b_last_allocated_key = key; - btree->bt_bmap.b_last_allocated_ptr = ptr; + return nilfs_bmap_find_target_in_group(btree); } -static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, +static int nilfs_btree_prepare_insert(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int *levelp, __u64 key, __u64 ptr, struct nilfs_bmap_stats *stats) @@ -895,79 +987,78 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, struct buffer_head *bh; struct nilfs_btree_node *node, *parent, *sib; __u64 sibptr; - int pindex, level, ret; + int pindex, level, ncmax, ncblk, ret; struct inode *dat = NULL; stats->bs_nblocks = 0; level = NILFS_BTREE_LEVEL_DATA; /* allocate a new ptr for data block */ - if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { + if (NILFS_BMAP_USE_VBN(btree)) { path[level].bp_newreq.bpr_ptr = nilfs_btree_find_target_v(btree, path, key); - dat = nilfs_bmap_get_dat(&btree->bt_bmap); + dat = nilfs_bmap_get_dat(btree); } - ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq, dat); + ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat); if (ret < 0) goto err_out_data; + ncblk = nilfs_btree_nchildren_per_block(btree); + for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < nilfs_btree_height(btree) - 1; level++) { node = nilfs_btree_get_nonroot_node(path, level); - if (nilfs_btree_node_get_nchildren(node) < - nilfs_btree_node_nchildren_max(node, btree)) { + if (nilfs_btree_node_get_nchildren(node) < ncblk) { path[level].bp_op = nilfs_btree_do_insert; stats->bs_nblocks++; goto out; } - parent = nilfs_btree_get_node(btree, path, level + 1); + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); pindex = path[level + 1].bp_index; /* left sibling */ 
if (pindex > 0) { - sibptr = nilfs_btree_node_get_ptr(btree, parent, - pindex - 1); + sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1, + ncmax); ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_child_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(sib) < - nilfs_btree_node_nchildren_max(sib, btree)) { + if (nilfs_btree_node_get_nchildren(sib) < ncblk) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_carry_left; stats->bs_nblocks++; goto out; - } else + } else { brelse(bh); + } } /* right sibling */ - if (pindex < - nilfs_btree_node_get_nchildren(parent) - 1) { - sibptr = nilfs_btree_node_get_ptr(btree, parent, - pindex + 1); + if (pindex < nilfs_btree_node_get_nchildren(parent) - 1) { + sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1, + ncmax); ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_child_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(sib) < - nilfs_btree_node_nchildren_max(sib, btree)) { + if (nilfs_btree_node_get_nchildren(sib) < ncblk) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_carry_right; stats->bs_nblocks++; goto out; - } else + } else { brelse(bh); + } } /* split */ path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; - ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, + ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat); if (ret < 0) goto err_out_child_node; @@ -979,9 +1070,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, stats->bs_nblocks++; - nilfs_btree_node_init(btree, - (struct nilfs_btree_node *)bh->b_data, - 0, level, 0, NULL, NULL); + sib = (struct nilfs_btree_node *)bh->b_data; + nilfs_btree_node_init(sib, 0, level, 0, ncblk, NULL, NULL); path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_split; } @@ -989,7 +1079,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* root */ node 
= nilfs_btree_get_root(btree); if (nilfs_btree_node_get_nchildren(node) < - nilfs_btree_node_nchildren_max(node, btree)) { + NILFS_BTREE_ROOT_NCHILDREN_MAX) { path[level].bp_op = nilfs_btree_do_insert; stats->bs_nblocks++; goto out; @@ -997,8 +1087,7 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* grow */ path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1; - ret = nilfs_bmap_prepare_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq, dat); + ret = nilfs_bmap_prepare_alloc_ptr(btree, &path[level].bp_newreq, dat); if (ret < 0) goto err_out_child_node; ret = nilfs_btree_get_new_block(btree, path[level].bp_newreq.bpr_ptr, @@ -1006,8 +1095,8 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, if (ret < 0) goto err_out_curr_node; - nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data, - 0, level, 0, NULL, NULL); + nilfs_btree_node_init((struct nilfs_btree_node *)bh->b_data, + 0, level, 0, ncblk, NULL, NULL); path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_grow; @@ -1024,25 +1113,22 @@ static int nilfs_btree_prepare_insert(struct nilfs_btree *btree, /* error */ err_out_curr_node: - nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, - dat); + nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat); err_out_child_node: for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) { nilfs_btnode_delete(path[level].bp_sib_bh); - nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, - &path[level].bp_newreq, dat); + nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat); } - nilfs_bmap_abort_alloc_ptr(&btree->bt_bmap, &path[level].bp_newreq, - dat); + nilfs_bmap_abort_alloc_ptr(btree, &path[level].bp_newreq, dat); err_out_data: *levelp = level; stats->bs_nblocks = 0; return ret; } -static void nilfs_btree_commit_insert(struct nilfs_btree *btree, +static void nilfs_btree_commit_insert(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int maxlevel, __u64 key, __u64 ptr) { 
@@ -1051,35 +1137,33 @@ static void nilfs_btree_commit_insert(struct nilfs_btree *btree, set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr; - if (NILFS_BMAP_USE_VBN(&btree->bt_bmap)) { - nilfs_btree_set_target_v(btree, key, ptr); - dat = nilfs_bmap_get_dat(&btree->bt_bmap); + if (NILFS_BMAP_USE_VBN(btree)) { + nilfs_bmap_set_target_v(btree, key, ptr); + dat = nilfs_bmap_get_dat(btree); } for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { - nilfs_bmap_commit_alloc_ptr(&btree->bt_bmap, + nilfs_bmap_commit_alloc_ptr(btree, &path[level - 1].bp_newreq, dat); path[level].bp_op(btree, path, level, &key, &ptr); } - if (!nilfs_bmap_dirty(&btree->bt_bmap)) - nilfs_bmap_set_dirty(&btree->bt_bmap); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); } -static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) +static int nilfs_btree_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr) { - struct nilfs_btree *btree; struct nilfs_btree_path *path; struct nilfs_bmap_stats stats; int level, ret; - btree = (struct nilfs_btree *)bmap; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; ret = nilfs_btree_do_lookup(btree, path, key, NULL, - NILFS_BTREE_LEVEL_NODE_MIN); + NILFS_BTREE_LEVEL_NODE_MIN, 0); if (ret != -ENOENT) { if (ret == 0) ret = -EEXIST; @@ -1090,23 +1174,25 @@ static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) if (ret < 0) goto out; nilfs_btree_commit_insert(btree, path, level, key, ptr); - nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + nilfs_bmap_add_blocks(btree, stats.bs_nblocks); out: nilfs_btree_free_path(path); return ret; } -static void nilfs_btree_do_delete(struct nilfs_btree *btree, +static void nilfs_btree_do_delete(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node; + int ncblk; if (level < nilfs_btree_height(btree) - 
1) { node = nilfs_btree_get_nonroot_node(path, level); - nilfs_btree_node_delete(btree, node, keyp, ptrp, - path[level].bp_index); + ncblk = nilfs_btree_nchildren_per_block(btree); + nilfs_btree_node_delete(node, path[level].bp_index, + keyp, ptrp, ncblk); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); if (path[level].bp_index == 0) @@ -1114,17 +1200,18 @@ static void nilfs_btree_do_delete(struct nilfs_btree *btree, nilfs_btree_node_get_key(node, 0)); } else { node = nilfs_btree_get_root(btree); - nilfs_btree_node_delete(btree, node, keyp, ptrp, - path[level].bp_index); + nilfs_btree_node_delete(node, path[level].bp_index, + keyp, ptrp, + NILFS_BTREE_ROOT_NCHILDREN_MAX); } } -static void nilfs_btree_borrow_left(struct nilfs_btree *btree, +static void nilfs_btree_borrow_left(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *left; - int nchildren, lnchildren, n; + int nchildren, lnchildren, n, ncblk; nilfs_btree_do_delete(btree, path, level, keyp, ptrp); @@ -1132,10 +1219,11 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, left = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); lnchildren = nilfs_btree_node_get_nchildren(left); + ncblk = nilfs_btree_nchildren_per_block(btree); n = (nchildren + lnchildren) / 2 - nchildren; - nilfs_btree_node_move_right(btree, left, node, n); + nilfs_btree_node_move_right(left, node, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); @@ -1150,12 +1238,12 @@ static void nilfs_btree_borrow_left(struct nilfs_btree *btree, path[level].bp_index += n; } -static void nilfs_btree_borrow_right(struct nilfs_btree *btree, +static void nilfs_btree_borrow_right(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *right; - int nchildren, rnchildren, n; + int 
nchildren, rnchildren, n, ncblk; nilfs_btree_do_delete(btree, path, level, keyp, ptrp); @@ -1163,10 +1251,11 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, right = nilfs_btree_get_sib_node(path, level); nchildren = nilfs_btree_node_get_nchildren(node); rnchildren = nilfs_btree_node_get_nchildren(right); + ncblk = nilfs_btree_nchildren_per_block(btree); n = (nchildren + rnchildren) / 2 - nchildren; - nilfs_btree_node_move_left(btree, node, right, n); + nilfs_btree_node_move_left(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); @@ -1182,21 +1271,22 @@ static void nilfs_btree_borrow_right(struct nilfs_btree *btree, path[level].bp_sib_bh = NULL; } -static void nilfs_btree_concat_left(struct nilfs_btree *btree, +static void nilfs_btree_concat_left(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *left; - int n; + int n, ncblk; nilfs_btree_do_delete(btree, path, level, keyp, ptrp); node = nilfs_btree_get_nonroot_node(path, level); left = nilfs_btree_get_sib_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); n = nilfs_btree_node_get_nchildren(node); - nilfs_btree_node_move_left(btree, left, node, n); + nilfs_btree_node_move_left(left, node, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_sib_bh)) nilfs_btnode_mark_dirty(path[level].bp_sib_bh); @@ -1207,21 +1297,22 @@ static void nilfs_btree_concat_left(struct nilfs_btree *btree, path[level].bp_index += nilfs_btree_node_get_nchildren(left); } -static void nilfs_btree_concat_right(struct nilfs_btree *btree, +static void nilfs_btree_concat_right(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *node, *right; - int n; + int n, ncblk; nilfs_btree_do_delete(btree, path, level, keyp, ptrp); node = nilfs_btree_get_nonroot_node(path, level); right = nilfs_btree_get_sib_node(path, 
level); + ncblk = nilfs_btree_nchildren_per_block(btree); n = nilfs_btree_node_get_nchildren(right); - nilfs_btree_node_move_left(btree, node, right, n); + nilfs_btree_node_move_left(node, right, n, ncblk, ncblk); if (!buffer_dirty(path[level].bp_bh)) nilfs_btnode_mark_dirty(path[level].bp_bh); @@ -1231,29 +1322,32 @@ static void nilfs_btree_concat_right(struct nilfs_btree *btree, path[level + 1].bp_index++; } -static void nilfs_btree_shrink(struct nilfs_btree *btree, +static void nilfs_btree_shrink(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, __u64 *keyp, __u64 *ptrp) { struct nilfs_btree_node *root, *child; - int n; + int n, ncblk; nilfs_btree_do_delete(btree, path, level, keyp, ptrp); root = nilfs_btree_get_root(btree); child = nilfs_btree_get_nonroot_node(path, level); + ncblk = nilfs_btree_nchildren_per_block(btree); - nilfs_btree_node_delete(btree, root, NULL, NULL, 0); + nilfs_btree_node_delete(root, 0, NULL, NULL, + NILFS_BTREE_ROOT_NCHILDREN_MAX); nilfs_btree_node_set_level(root, level); n = nilfs_btree_node_get_nchildren(child); - nilfs_btree_node_move_left(btree, root, child, n); + nilfs_btree_node_move_left(root, child, n, + NILFS_BTREE_ROOT_NCHILDREN_MAX, ncblk); nilfs_btnode_delete(path[level].bp_bh); path[level].bp_bh = NULL; } -static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, +static int nilfs_btree_prepare_delete(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int *levelp, struct nilfs_bmap_stats *stats, @@ -1262,42 +1356,43 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, struct buffer_head *bh; struct nilfs_btree_node *node, *parent, *sib; __u64 sibptr; - int pindex, level, ret; + int pindex, level, ncmin, ncmax, ncblk, ret; ret = 0; stats->bs_nblocks = 0; + ncmin = NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree)); + ncblk = nilfs_btree_nchildren_per_block(btree); + for (level = NILFS_BTREE_LEVEL_NODE_MIN; level < nilfs_btree_height(btree) - 1; level++) { node = 
nilfs_btree_get_nonroot_node(path, level); path[level].bp_oldreq.bpr_ptr = - nilfs_btree_node_get_ptr(btree, node, - path[level].bp_index); - ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, + nilfs_btree_node_get_ptr(node, path[level].bp_index, + ncblk); + ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat); if (ret < 0) goto err_out_child_node; - if (nilfs_btree_node_get_nchildren(node) > - nilfs_btree_node_nchildren_min(node, btree)) { + if (nilfs_btree_node_get_nchildren(node) > ncmin) { path[level].bp_op = nilfs_btree_do_delete; stats->bs_nblocks++; goto out; } - parent = nilfs_btree_get_node(btree, path, level + 1); + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); pindex = path[level + 1].bp_index; if (pindex > 0) { /* left sibling */ - sibptr = nilfs_btree_node_get_ptr(btree, parent, - pindex - 1); + sibptr = nilfs_btree_node_get_ptr(parent, pindex - 1, + ncmax); ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_curr_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(sib) > - nilfs_btree_node_nchildren_min(sib, btree)) { + if (nilfs_btree_node_get_nchildren(sib) > ncmin) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_borrow_left; stats->bs_nblocks++; @@ -1311,14 +1406,13 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, } else if (pindex < nilfs_btree_node_get_nchildren(parent) - 1) { /* right sibling */ - sibptr = nilfs_btree_node_get_ptr(btree, parent, - pindex + 1); + sibptr = nilfs_btree_node_get_ptr(parent, pindex + 1, + ncmax); ret = nilfs_btree_get_block(btree, sibptr, &bh); if (ret < 0) goto err_out_curr_node; sib = (struct nilfs_btree_node *)bh->b_data; - if (nilfs_btree_node_get_nchildren(sib) > - nilfs_btree_node_nchildren_min(sib, btree)) { + if (nilfs_btree_node_get_nchildren(sib) > ncmin) { path[level].bp_sib_bh = bh; path[level].bp_op = nilfs_btree_borrow_right; stats->bs_nblocks++; @@ -1349,10 +1443,10 @@ static int 
nilfs_btree_prepare_delete(struct nilfs_btree *btree, node = nilfs_btree_get_root(btree); path[level].bp_oldreq.bpr_ptr = - nilfs_btree_node_get_ptr(btree, node, path[level].bp_index); + nilfs_btree_node_get_ptr(node, path[level].bp_index, + NILFS_BTREE_ROOT_NCHILDREN_MAX); - ret = nilfs_bmap_prepare_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq, dat); + ret = nilfs_bmap_prepare_end_ptr(btree, &path[level].bp_oldreq, dat); if (ret < 0) goto err_out_child_node; @@ -1367,75 +1461,68 @@ static int nilfs_btree_prepare_delete(struct nilfs_btree *btree, /* error */ err_out_curr_node: - nilfs_bmap_abort_end_ptr(&btree->bt_bmap, &path[level].bp_oldreq, dat); + nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat); err_out_child_node: for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) { brelse(path[level].bp_sib_bh); - nilfs_bmap_abort_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq, dat); + nilfs_bmap_abort_end_ptr(btree, &path[level].bp_oldreq, dat); } *levelp = level; stats->bs_nblocks = 0; return ret; } -static void nilfs_btree_commit_delete(struct nilfs_btree *btree, +static void nilfs_btree_commit_delete(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int maxlevel, struct inode *dat) { int level; for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) { - nilfs_bmap_commit_end_ptr(&btree->bt_bmap, - &path[level].bp_oldreq, dat); + nilfs_bmap_commit_end_ptr(btree, &path[level].bp_oldreq, dat); path[level].bp_op(btree, path, level, NULL, NULL); } - if (!nilfs_bmap_dirty(&btree->bt_bmap)) - nilfs_bmap_set_dirty(&btree->bt_bmap); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); } -static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key) +static int nilfs_btree_delete(struct nilfs_bmap *btree, __u64 key) { - struct nilfs_btree *btree; struct nilfs_btree_path *path; struct nilfs_bmap_stats stats; struct inode *dat; int level, ret; - btree = (struct nilfs_btree *)bmap; path = nilfs_btree_alloc_path(); if 
(path == NULL) return -ENOMEM; ret = nilfs_btree_do_lookup(btree, path, key, NULL, - NILFS_BTREE_LEVEL_NODE_MIN); + NILFS_BTREE_LEVEL_NODE_MIN, 0); if (ret < 0) goto out; - dat = NILFS_BMAP_USE_VBN(&btree->bt_bmap) ? - nilfs_bmap_get_dat(&btree->bt_bmap) : NULL; + dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL; ret = nilfs_btree_prepare_delete(btree, path, &level, &stats, dat); if (ret < 0) goto out; nilfs_btree_commit_delete(btree, path, level, dat); - nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks); + nilfs_bmap_sub_blocks(btree, stats.bs_nblocks); out: nilfs_btree_free_path(path); return ret; } -static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) +static int nilfs_btree_last_key(const struct nilfs_bmap *btree, __u64 *keyp) { - struct nilfs_btree *btree; struct nilfs_btree_path *path; int ret; - btree = (struct nilfs_btree *)bmap; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; @@ -1447,16 +1534,14 @@ static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) return ret; } -static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) +static int nilfs_btree_check_delete(struct nilfs_bmap *btree, __u64 key) { struct buffer_head *bh; - struct nilfs_btree *btree; struct nilfs_btree_node *root, *node; __u64 maxkey, nextmaxkey; __u64 ptr; int nchildren, ret; - btree = (struct nilfs_btree *)bmap; root = nilfs_btree_get_root(btree); switch (nilfs_btree_height(btree)) { case 2: @@ -1467,7 +1552,8 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) nchildren = nilfs_btree_node_get_nchildren(root); if (nchildren > 1) return 0; - ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); + ptr = nilfs_btree_node_get_ptr(root, nchildren - 1, + NILFS_BTREE_ROOT_NCHILDREN_MAX); ret = nilfs_btree_get_block(btree, ptr, &bh); if (ret < 0) return ret; @@ -1487,32 +1573,33 @@ static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key) return (maxkey == key) 
&& (nextmaxkey < NILFS_BMAP_LARGE_LOW); } -static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, +static int nilfs_btree_gather_data(struct nilfs_bmap *btree, __u64 *keys, __u64 *ptrs, int nitems) { struct buffer_head *bh; - struct nilfs_btree *btree; struct nilfs_btree_node *node, *root; __le64 *dkeys; __le64 *dptrs; __u64 ptr; - int nchildren, i, ret; + int nchildren, ncmax, i, ret; - btree = (struct nilfs_btree *)bmap; root = nilfs_btree_get_root(btree); switch (nilfs_btree_height(btree)) { case 2: bh = NULL; node = root; + ncmax = NILFS_BTREE_ROOT_NCHILDREN_MAX; break; case 3: nchildren = nilfs_btree_node_get_nchildren(root); WARN_ON(nchildren > 1); - ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1); + ptr = nilfs_btree_node_get_ptr(root, nchildren - 1, + NILFS_BTREE_ROOT_NCHILDREN_MAX); ret = nilfs_btree_get_block(btree, ptr, &bh); if (ret < 0) return ret; node = (struct nilfs_btree_node *)bh->b_data; + ncmax = nilfs_btree_nchildren_per_block(btree); break; default: node = NULL; @@ -1523,10 +1610,10 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, if (nchildren < nitems) nitems = nchildren; dkeys = nilfs_btree_node_dkeys(node); - dptrs = nilfs_btree_node_dptrs(node, btree); + dptrs = nilfs_btree_node_dptrs(node, ncmax); for (i = 0; i < nitems; i++) { - keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]); - ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]); + keys[i] = le64_to_cpu(dkeys[i]); + ptrs[i] = le64_to_cpu(dptrs[i]); } if (bh != NULL) @@ -1536,14 +1623,13 @@ static int nilfs_btree_gather_data(struct nilfs_bmap *bmap, } static int -nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, +nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *btree, __u64 key, union nilfs_bmap_ptr_req *dreq, union nilfs_bmap_ptr_req *nreq, struct buffer_head **bhp, struct nilfs_bmap_stats *stats) { struct buffer_head *bh; - struct nilfs_btree *btree = (struct nilfs_btree *)bmap; struct inode *dat = NULL; int ret; @@ -1551,12 +1637,12 
@@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, /* for data */ /* cannot find near ptr */ - if (NILFS_BMAP_USE_VBN(bmap)) { + if (NILFS_BMAP_USE_VBN(btree)) { dreq->bpr_ptr = nilfs_btree_find_target_v(btree, NULL, key); - dat = nilfs_bmap_get_dat(bmap); + dat = nilfs_bmap_get_dat(btree); } - ret = nilfs_bmap_prepare_alloc_ptr(bmap, dreq, dat); + ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat); if (ret < 0) return ret; @@ -1564,7 +1650,7 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, stats->bs_nblocks++; if (nreq != NULL) { nreq->bpr_ptr = dreq->bpr_ptr + 1; - ret = nilfs_bmap_prepare_alloc_ptr(bmap, nreq, dat); + ret = nilfs_bmap_prepare_alloc_ptr(btree, nreq, dat); if (ret < 0) goto err_out_dreq; @@ -1581,16 +1667,16 @@ nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key, /* error */ err_out_nreq: - nilfs_bmap_abort_alloc_ptr(bmap, nreq, dat); + nilfs_bmap_abort_alloc_ptr(btree, nreq, dat); err_out_dreq: - nilfs_bmap_abort_alloc_ptr(bmap, dreq, dat); + nilfs_bmap_abort_alloc_ptr(btree, dreq, dat); stats->bs_nblocks = 0; return ret; } static void -nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, +nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr, const __u64 *keys, const __u64 *ptrs, int n, @@ -1598,57 +1684,59 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, union nilfs_bmap_ptr_req *nreq, struct buffer_head *bh) { - struct nilfs_btree *btree = (struct nilfs_btree *)bmap; struct nilfs_btree_node *node; struct inode *dat; __u64 tmpptr; + int ncblk; /* free resources */ - if (bmap->b_ops->bop_clear != NULL) - bmap->b_ops->bop_clear(bmap); + if (btree->b_ops->bop_clear != NULL) + btree->b_ops->bop_clear(btree); /* ptr must be a pointer to a buffer head. */ set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr)); /* convert and insert */ - dat = NILFS_BMAP_USE_VBN(bmap) ? 
nilfs_bmap_get_dat(bmap) : NULL; - nilfs_btree_init(bmap); + dat = NILFS_BMAP_USE_VBN(btree) ? nilfs_bmap_get_dat(btree) : NULL; + nilfs_btree_init(btree); if (nreq != NULL) { - nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); - nilfs_bmap_commit_alloc_ptr(bmap, nreq, dat); + nilfs_bmap_commit_alloc_ptr(btree, dreq, dat); + nilfs_bmap_commit_alloc_ptr(btree, nreq, dat); /* create child node at level 1 */ node = (struct nilfs_btree_node *)bh->b_data; - nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs); - nilfs_btree_node_insert(btree, node, - key, dreq->bpr_ptr, n); + ncblk = nilfs_btree_nchildren_per_block(btree); + nilfs_btree_node_init(node, 0, 1, n, ncblk, keys, ptrs); + nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, ncblk); if (!buffer_dirty(bh)) nilfs_btnode_mark_dirty(bh); - if (!nilfs_bmap_dirty(bmap)) - nilfs_bmap_set_dirty(bmap); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); brelse(bh); /* create root node at level 2 */ node = nilfs_btree_get_root(btree); tmpptr = nreq->bpr_ptr; - nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, - 2, 1, &keys[0], &tmpptr); + nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 2, 1, + NILFS_BTREE_ROOT_NCHILDREN_MAX, + &keys[0], &tmpptr); } else { - nilfs_bmap_commit_alloc_ptr(bmap, dreq, dat); + nilfs_bmap_commit_alloc_ptr(btree, dreq, dat); /* create root node at level 1 */ node = nilfs_btree_get_root(btree); - nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT, - 1, n, keys, ptrs); - nilfs_btree_node_insert(btree, node, - key, dreq->bpr_ptr, n); - if (!nilfs_bmap_dirty(bmap)) - nilfs_bmap_set_dirty(bmap); + nilfs_btree_node_init(node, NILFS_BTREE_NODE_ROOT, 1, n, + NILFS_BTREE_ROOT_NCHILDREN_MAX, + keys, ptrs); + nilfs_btree_node_insert(node, n, key, dreq->bpr_ptr, + NILFS_BTREE_ROOT_NCHILDREN_MAX); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); } - if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_btree_set_target_v(btree, key, dreq->bpr_ptr); + if (NILFS_BMAP_USE_VBN(btree)) + 
nilfs_bmap_set_target_v(btree, key, dreq->bpr_ptr); } /** @@ -1660,7 +1748,7 @@ nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap, * @ptrs: * @n: */ -int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap, +int nilfs_btree_convert_and_insert(struct nilfs_bmap *btree, __u64 key, __u64 ptr, const __u64 *keys, const __u64 *ptrs, int n) { @@ -1673,7 +1761,7 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap, di = &dreq; ni = NULL; } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX( - 1 << bmap->b_inode->i_blkbits)) { + 1 << btree->b_inode->i_blkbits)) { di = &dreq; ni = &nreq; } else { @@ -1682,17 +1770,17 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap, BUG(); } - ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh, + ret = nilfs_btree_prepare_convert_and_insert(btree, key, di, ni, &bh, &stats); if (ret < 0) return ret; - nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n, + nilfs_btree_commit_convert_and_insert(btree, key, ptr, keys, ptrs, n, di, ni, bh); - nilfs_bmap_add_blocks(bmap, stats.bs_nblocks); + nilfs_bmap_add_blocks(btree, stats.bs_nblocks); return 0; } -static int nilfs_btree_propagate_p(struct nilfs_btree *btree, +static int nilfs_btree_propagate_p(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct buffer_head *bh) @@ -1704,17 +1792,17 @@ static int nilfs_btree_propagate_p(struct nilfs_btree *btree, return 0; } -static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, +static int nilfs_btree_prepare_update_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct inode *dat) { struct nilfs_btree_node *parent; - int ret; + int ncmax, ret; - parent = nilfs_btree_get_node(btree, path, level + 1); + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); path[level].bp_oldreq.bpr_ptr = - nilfs_btree_node_get_ptr(btree, parent, - path[level + 1].bp_index); + nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, + ncmax); 
path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1; ret = nilfs_dat_prepare_update(dat, &path[level].bp_oldreq.bpr_req, &path[level].bp_newreq.bpr_req); @@ -1726,7 +1814,7 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr; path[level].bp_ctxt.bh = path[level].bp_bh; ret = nilfs_btnode_prepare_change_key( - &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &NILFS_BMAP_I(btree)->i_btnode_cache, &path[level].bp_ctxt); if (ret < 0) { nilfs_dat_abort_update(dat, @@ -1739,30 +1827,31 @@ static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree, return 0; } -static void nilfs_btree_commit_update_v(struct nilfs_btree *btree, +static void nilfs_btree_commit_update_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct inode *dat) { struct nilfs_btree_node *parent; + int ncmax; nilfs_dat_commit_update(dat, &path[level].bp_oldreq.bpr_req, &path[level].bp_newreq.bpr_req, - btree->bt_bmap.b_ptr_type == NILFS_BMAP_PTR_VS); + btree->b_ptr_type == NILFS_BMAP_PTR_VS); if (buffer_nilfs_node(path[level].bp_bh)) { nilfs_btnode_commit_change_key( - &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &NILFS_BMAP_I(btree)->i_btnode_cache, &path[level].bp_ctxt); path[level].bp_bh = path[level].bp_ctxt.bh; } set_buffer_nilfs_volatile(path[level].bp_bh); - parent = nilfs_btree_get_node(btree, path, level + 1); - nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index, - path[level].bp_newreq.bpr_ptr); + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, + path[level].bp_newreq.bpr_ptr, ncmax); } -static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, +static void nilfs_btree_abort_update_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct inode *dat) { @@ -1770,11 +1859,11 @@ static void nilfs_btree_abort_update_v(struct nilfs_btree *btree, 
&path[level].bp_newreq.bpr_req); if (buffer_nilfs_node(path[level].bp_bh)) nilfs_btnode_abort_change_key( - &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &NILFS_BMAP_I(btree)->i_btnode_cache, &path[level].bp_ctxt); } -static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, +static int nilfs_btree_prepare_propagate_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int minlevel, int *maxlevelp, struct inode *dat) @@ -1809,7 +1898,7 @@ static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree, return ret; } -static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, +static void nilfs_btree_commit_propagate_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int minlevel, int maxlevel, struct buffer_head *bh, @@ -1824,14 +1913,15 @@ static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree, nilfs_btree_commit_update_v(btree, path, level, dat); } -static int nilfs_btree_propagate_v(struct nilfs_btree *btree, +static int nilfs_btree_propagate_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct buffer_head *bh) { int maxlevel = 0, ret; struct nilfs_btree_node *parent; - struct inode *dat = nilfs_bmap_get_dat(&btree->bt_bmap); + struct inode *dat = nilfs_bmap_get_dat(btree); __u64 ptr; + int ncmax; get_bh(bh); path[level].bp_bh = bh; @@ -1841,9 +1931,10 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree, goto out; if (buffer_nilfs_volatile(path[level].bp_bh)) { - parent = nilfs_btree_get_node(btree, path, level + 1); - ptr = nilfs_btree_node_get_ptr(btree, parent, - path[level + 1].bp_index); + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + ptr = nilfs_btree_node_get_ptr(parent, + path[level + 1].bp_index, + ncmax); ret = nilfs_dat_mark_dirty(dat, ptr); if (ret < 0) goto out; @@ -1857,10 +1948,9 @@ static int nilfs_btree_propagate_v(struct nilfs_btree *btree, return ret; } -static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, +static int 
nilfs_btree_propagate(struct nilfs_bmap *btree, struct buffer_head *bh) { - struct nilfs_btree *btree; struct nilfs_btree_path *path; struct nilfs_btree_node *node; __u64 key; @@ -1868,7 +1958,6 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, WARN_ON(!buffer_dirty(bh)); - btree = (struct nilfs_btree *)bmap; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; @@ -1878,11 +1967,11 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, key = nilfs_btree_node_get_key(node, 0); level = nilfs_btree_node_get_level(node); } else { - key = nilfs_bmap_data_get_key(bmap, bh); + key = nilfs_bmap_data_get_key(btree, bh); level = NILFS_BTREE_LEVEL_DATA; } - ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); + ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); if (ret < 0) { if (unlikely(ret == -ENOENT)) printk(KERN_CRIT "%s: key = %llu, level == %d\n", @@ -1890,7 +1979,7 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, goto out; } - ret = NILFS_BMAP_USE_VBN(bmap) ? + ret = NILFS_BMAP_USE_VBN(btree) ? 
nilfs_btree_propagate_v(btree, path, level, bh) : nilfs_btree_propagate_p(btree, path, level, bh); @@ -1900,13 +1989,13 @@ static int nilfs_btree_propagate(const struct nilfs_bmap *bmap, return ret; } -static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap, +static int nilfs_btree_propagate_gc(struct nilfs_bmap *btree, struct buffer_head *bh) { - return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), bh->b_blocknr); + return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(btree), bh->b_blocknr); } -static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, +static void nilfs_btree_add_dirty_buffer(struct nilfs_bmap *btree, struct list_head *lists, struct buffer_head *bh) { @@ -1920,6 +2009,18 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, node = (struct nilfs_btree_node *)bh->b_data; key = nilfs_btree_node_get_key(node, 0); level = nilfs_btree_node_get_level(node); + if (level < NILFS_BTREE_LEVEL_NODE_MIN || + level >= NILFS_BTREE_LEVEL_MAX) { + dump_stack(); + printk(KERN_WARNING + "%s: invalid btree level: %d (key=%llu, ino=%lu, " + "blocknr=%llu)\n", + __func__, level, (unsigned long long)key, + NILFS_BMAP_I(btree)->vfs_inode.i_ino, + (unsigned long long)bh->b_blocknr); + return; + } + list_for_each(head, &lists[level]) { cbh = list_entry(head, struct buffer_head, b_assoc_buffers); cnode = (struct nilfs_btree_node *)cbh->b_data; @@ -1930,11 +2031,10 @@ static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree, list_add_tail(&bh->b_assoc_buffers, head); } -static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap, +static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree, struct list_head *listp) { - struct nilfs_btree *btree = (struct nilfs_btree *)bmap; - struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache; + struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache; struct list_head lists[NILFS_BTREE_LEVEL_MAX]; struct pagevec pvec; struct buffer_head *bh, *head; @@ 
-1968,7 +2068,7 @@ static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap, list_splice_tail(&lists[level], listp); } -static int nilfs_btree_assign_p(struct nilfs_btree *btree, +static int nilfs_btree_assign_p(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct buffer_head **bh, @@ -1978,38 +2078,38 @@ static int nilfs_btree_assign_p(struct nilfs_btree *btree, struct nilfs_btree_node *parent; __u64 key; __u64 ptr; - int ret; + int ncmax, ret; - parent = nilfs_btree_get_node(btree, path, level + 1); - ptr = nilfs_btree_node_get_ptr(btree, parent, - path[level + 1].bp_index); + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, + ncmax); if (buffer_nilfs_node(*bh)) { path[level].bp_ctxt.oldkey = ptr; path[level].bp_ctxt.newkey = blocknr; path[level].bp_ctxt.bh = *bh; ret = nilfs_btnode_prepare_change_key( - &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &NILFS_BMAP_I(btree)->i_btnode_cache, &path[level].bp_ctxt); if (ret < 0) return ret; nilfs_btnode_commit_change_key( - &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache, + &NILFS_BMAP_I(btree)->i_btnode_cache, &path[level].bp_ctxt); *bh = path[level].bp_ctxt.bh; } - nilfs_btree_node_set_ptr(btree, parent, - path[level + 1].bp_index, blocknr); + nilfs_btree_node_set_ptr(parent, path[level + 1].bp_index, blocknr, + ncmax); key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); /* on-disk format */ - binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); + binfo->bi_dat.bi_blkoff = cpu_to_le64(key); binfo->bi_dat.bi_level = level; return 0; } -static int nilfs_btree_assign_v(struct nilfs_btree *btree, +static int nilfs_btree_assign_v(struct nilfs_bmap *btree, struct nilfs_btree_path *path, int level, struct buffer_head **bh, @@ -2017,15 +2117,15 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree, union nilfs_binfo *binfo) { struct nilfs_btree_node *parent; - struct inode *dat = 
nilfs_bmap_get_dat(&btree->bt_bmap); + struct inode *dat = nilfs_bmap_get_dat(btree); __u64 key; __u64 ptr; union nilfs_bmap_ptr_req req; - int ret; + int ncmax, ret; - parent = nilfs_btree_get_node(btree, path, level + 1); - ptr = nilfs_btree_node_get_ptr(btree, parent, - path[level + 1].bp_index); + parent = nilfs_btree_get_node(btree, path, level + 1, &ncmax); + ptr = nilfs_btree_node_get_ptr(parent, path[level + 1].bp_index, + ncmax); req.bpr_ptr = ptr; ret = nilfs_dat_prepare_start(dat, &req.bpr_req); if (ret < 0) @@ -2034,24 +2134,22 @@ static int nilfs_btree_assign_v(struct nilfs_btree *btree, key = nilfs_btree_node_get_key(parent, path[level + 1].bp_index); /* on-disk format */ - binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); - binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr); + binfo->bi_v.bi_blkoff = cpu_to_le64(key); return 0; } -static int nilfs_btree_assign(struct nilfs_bmap *bmap, +static int nilfs_btree_assign(struct nilfs_bmap *btree, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { - struct nilfs_btree *btree; struct nilfs_btree_path *path; struct nilfs_btree_node *node; __u64 key; int level, ret; - btree = (struct nilfs_btree *)bmap; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; @@ -2061,17 +2159,17 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, key = nilfs_btree_node_get_key(node, 0); level = nilfs_btree_node_get_level(node); } else { - key = nilfs_bmap_data_get_key(bmap, *bh); + key = nilfs_bmap_data_get_key(btree, *bh); level = NILFS_BTREE_LEVEL_DATA; } - ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1); + ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); if (ret < 0) { WARN_ON(ret == -ENOENT); goto out; } - ret = NILFS_BMAP_USE_VBN(bmap) ? + ret = NILFS_BMAP_USE_VBN(btree) ? 
nilfs_btree_assign_v(btree, path, level, bh, blocknr, binfo) : nilfs_btree_assign_p(btree, path, level, bh, blocknr, binfo); @@ -2081,7 +2179,7 @@ static int nilfs_btree_assign(struct nilfs_bmap *bmap, return ret; } -static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, +static int nilfs_btree_assign_gc(struct nilfs_bmap *btree, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) @@ -2090,7 +2188,7 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, __u64 key; int ret; - ret = nilfs_dat_move(nilfs_bmap_get_dat(bmap), (*bh)->b_blocknr, + ret = nilfs_dat_move(nilfs_bmap_get_dat(btree), (*bh)->b_blocknr, blocknr); if (ret < 0) return ret; @@ -2099,29 +2197,27 @@ static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap, node = (struct nilfs_btree_node *)(*bh)->b_data; key = nilfs_btree_node_get_key(node, 0); } else - key = nilfs_bmap_data_get_key(bmap, *bh); + key = nilfs_bmap_data_get_key(btree, *bh); /* on-disk format */ binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr); - binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + binfo->bi_v.bi_blkoff = cpu_to_le64(key); return 0; } -static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) +static int nilfs_btree_mark(struct nilfs_bmap *btree, __u64 key, int level) { struct buffer_head *bh; - struct nilfs_btree *btree; struct nilfs_btree_path *path; __u64 ptr; int ret; - btree = (struct nilfs_btree *)bmap; path = nilfs_btree_alloc_path(); if (path == NULL) return -ENOMEM; - ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1); + ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1, 0); if (ret < 0) { WARN_ON(ret == -ENOENT); goto out; @@ -2135,8 +2231,8 @@ static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level) if (!buffer_dirty(bh)) nilfs_btnode_mark_dirty(bh); brelse(bh); - if (!nilfs_bmap_dirty(&btree->bt_bmap)) - nilfs_bmap_set_dirty(&btree->bt_bmap); + if (!nilfs_bmap_dirty(btree)) + nilfs_bmap_set_dirty(btree); out: 
nilfs_btree_free_path(path); @@ -2186,10 +2282,14 @@ static const struct nilfs_bmap_operations nilfs_btree_ops_gc = { int nilfs_btree_init(struct nilfs_bmap *bmap) { bmap->b_ops = &nilfs_btree_ops; + bmap->b_nchildren_per_block = + NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap)); return 0; } void nilfs_btree_init_gc(struct nilfs_bmap *bmap) { bmap->b_ops = &nilfs_btree_ops_gc; + bmap->b_nchildren_per_block = + NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(bmap)); } diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h index 43c8c5b541f..22c02e35b6e 100644 --- a/fs/nilfs2/btree.h +++ b/fs/nilfs2/btree.h @@ -31,14 +31,6 @@ #include "bmap.h" /** - * struct nilfs_btree - B-tree structure - * @bt_bmap: bmap base structure - */ -struct nilfs_btree { - struct nilfs_bmap bt_bmap; -}; - -/** * struct nilfs_btree_path - A path on which B-tree operations are executed * @bp_bh: buffer head of node block * @bp_sib_bh: buffer head of sibling node block @@ -54,7 +46,7 @@ struct nilfs_btree_path { union nilfs_bmap_ptr_req bp_oldreq; union nilfs_bmap_ptr_req bp_newreq; struct nilfs_btnode_chkey_ctxt bp_ctxt; - void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *, + void (*bp_op)(struct nilfs_bmap *, struct nilfs_btree_path *, int, __u64 *, __u64 *); }; @@ -80,4 +72,6 @@ int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64, const __u64 *, const __u64 *, int); void nilfs_btree_init_gc(struct nilfs_bmap *); +int nilfs_btree_broken_node_block(struct buffer_head *bh); + #endif /* _NILFS_BTREE_H */ diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c index 85c89dfc71f..b60277b4446 100644 --- a/fs/nilfs2/dir.c +++ b/fs/nilfs2/dir.c @@ -141,7 +141,7 @@ static void nilfs_check_page(struct page *page) } for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) { p = (struct nilfs_dir_entry *)(kaddr + offs); - rec_len = le16_to_cpu(p->rec_len); + rec_len = nilfs_rec_len_from_disk(p->rec_len); if (rec_len < NILFS_DIR_REC_LEN(1)) goto Eshort; @@ 
-199,13 +199,10 @@ fail: static struct page *nilfs_get_page(struct inode *dir, unsigned long n) { struct address_space *mapping = dir->i_mapping; - struct page *page = read_cache_page(mapping, n, - (filler_t *)mapping->a_ops->readpage, NULL); + struct page *page = read_mapping_page(mapping, n, NULL); + if (!IS_ERR(page)) { - wait_on_page_locked(page); kmap(page); - if (!PageUptodate(page)) - goto fail; if (!PageChecked(page)) nilfs_check_page(page); if (PageError(page)) @@ -238,7 +235,8 @@ nilfs_match(int len, const unsigned char *name, struct nilfs_dir_entry *de) */ static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p) { - return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len)); + return (struct nilfs_dir_entry *)((char *)p + + nilfs_rec_len_from_disk(p->rec_len)); } static unsigned char @@ -329,7 +327,7 @@ static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir) goto success; } } - filp->f_pos += le16_to_cpu(de->rec_len); + filp->f_pos += nilfs_rec_len_from_disk(de->rec_len); } nilfs_put_page(page); } @@ -444,7 +442,7 @@ void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de, struct page *page, struct inode *inode) { unsigned from = (char *) de - (char *) page_address(page); - unsigned to = from + le16_to_cpu(de->rec_len); + unsigned to = from + nilfs_rec_len_from_disk(de->rec_len); struct address_space *mapping = page->mapping; int err; @@ -500,7 +498,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) /* We hit i_size */ name_len = 0; rec_len = chunk_size; - de->rec_len = cpu_to_le16(chunk_size); + de->rec_len = nilfs_rec_len_to_disk(chunk_size); de->inode = 0; goto got_it; } @@ -514,7 +512,7 @@ int nilfs_add_link(struct dentry *dentry, struct inode *inode) if (nilfs_match(namelen, name, de)) goto out_unlock; name_len = NILFS_DIR_REC_LEN(de->name_len); - rec_len = le16_to_cpu(de->rec_len); + rec_len = nilfs_rec_len_from_disk(de->rec_len); if (!de->inode && rec_len >= reclen) goto 
got_it; if (rec_len >= name_len + reclen) @@ -537,8 +535,8 @@ got_it: struct nilfs_dir_entry *de1; de1 = (struct nilfs_dir_entry *)((char *)de + name_len); - de1->rec_len = cpu_to_le16(rec_len - name_len); - de->rec_len = cpu_to_le16(name_len); + de1->rec_len = nilfs_rec_len_to_disk(rec_len - name_len); + de->rec_len = nilfs_rec_len_to_disk(name_len); de = de1; } de->name_len = namelen; @@ -569,7 +567,8 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) struct inode *inode = mapping->host; char *kaddr = page_address(page); unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1); - unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len); + unsigned to = ((char *)dir - kaddr) + + nilfs_rec_len_from_disk(dir->rec_len); struct nilfs_dir_entry *pde = NULL; struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from); int err; @@ -590,7 +589,7 @@ int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page) err = nilfs_prepare_chunk(page, mapping, from, to); BUG_ON(err); if (pde) - pde->rec_len = cpu_to_le16(to - from); + pde->rec_len = nilfs_rec_len_to_disk(to - from); dir->inode = 0; nilfs_commit_chunk(page, mapping, from, to); inode->i_ctime = inode->i_mtime = CURRENT_TIME; @@ -624,14 +623,14 @@ int nilfs_make_empty(struct inode *inode, struct inode *parent) memset(kaddr, 0, chunk_size); de = (struct nilfs_dir_entry *)kaddr; de->name_len = 1; - de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1)); + de->rec_len = nilfs_rec_len_to_disk(NILFS_DIR_REC_LEN(1)); memcpy(de->name, ".\0\0", 4); de->inode = cpu_to_le64(inode->i_ino); nilfs_set_de_type(de, inode); de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1)); de->name_len = 2; - de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1)); + de->rec_len = nilfs_rec_len_to_disk(chunk_size - NILFS_DIR_REC_LEN(1)); de->inode = cpu_to_le64(parent->i_ino); memcpy(de->name, "..\0", 4); nilfs_set_de_type(de, inode); diff --git a/fs/nilfs2/direct.c 
b/fs/nilfs2/direct.c index 236753df5cd..324d80c5751 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -27,47 +27,43 @@ #include "alloc.h" #include "dat.h" -static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct) +static inline __le64 *nilfs_direct_dptrs(const struct nilfs_bmap *direct) { return (__le64 *) - ((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1); + ((struct nilfs_direct_node *)direct->b_u.u_data + 1); } static inline __u64 -nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key) +nilfs_direct_get_ptr(const struct nilfs_bmap *direct, __u64 key) { - return nilfs_bmap_dptr_to_ptr(*(nilfs_direct_dptrs(direct) + key)); + return le64_to_cpu(*(nilfs_direct_dptrs(direct) + key)); } -static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct, +static inline void nilfs_direct_set_ptr(struct nilfs_bmap *direct, __u64 key, __u64 ptr) { - *(nilfs_direct_dptrs(direct) + key) = nilfs_bmap_ptr_to_dptr(ptr); + *(nilfs_direct_dptrs(direct) + key) = cpu_to_le64(ptr); } -static int nilfs_direct_lookup(const struct nilfs_bmap *bmap, +static int nilfs_direct_lookup(const struct nilfs_bmap *direct, __u64 key, int level, __u64 *ptrp) { - struct nilfs_direct *direct; __u64 ptr; - direct = (struct nilfs_direct *)bmap; /* XXX: use macro for level 1 */ if (key > NILFS_DIRECT_KEY_MAX || level != 1) return -ENOENT; ptr = nilfs_direct_get_ptr(direct, key); if (ptr == NILFS_BMAP_INVALID_PTR) return -ENOENT; - if (ptrp != NULL) - *ptrp = ptr; + *ptrp = ptr; return 0; } -static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap, +static int nilfs_direct_lookup_contig(const struct nilfs_bmap *direct, __u64 key, __u64 *ptrp, unsigned maxblocks) { - struct nilfs_direct *direct = (struct nilfs_direct *)bmap; struct inode *dat = NULL; __u64 ptr, ptr2; sector_t blocknr; @@ -79,8 +75,8 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap, if (ptr == NILFS_BMAP_INVALID_PTR) return -ENOENT; - if 
(NILFS_BMAP_USE_VBN(bmap)) { - dat = nilfs_bmap_get_dat(bmap); + if (NILFS_BMAP_USE_VBN(direct)) { + dat = nilfs_bmap_get_dat(direct); ret = nilfs_dat_translate(dat, ptr, &blocknr); if (ret < 0) return ret; @@ -106,29 +102,21 @@ static int nilfs_direct_lookup_contig(const struct nilfs_bmap *bmap, } static __u64 -nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key) +nilfs_direct_find_target_v(const struct nilfs_bmap *direct, __u64 key) { __u64 ptr; - ptr = nilfs_bmap_find_target_seq(&direct->d_bmap, key); + ptr = nilfs_bmap_find_target_seq(direct, key); if (ptr != NILFS_BMAP_INVALID_PTR) /* sequential access */ return ptr; else /* block group */ - return nilfs_bmap_find_target_in_group(&direct->d_bmap); -} - -static void nilfs_direct_set_target_v(struct nilfs_direct *direct, - __u64 key, __u64 ptr) -{ - direct->d_bmap.b_last_allocated_key = key; - direct->d_bmap.b_last_allocated_ptr = ptr; + return nilfs_bmap_find_target_in_group(direct); } static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) { - struct nilfs_direct *direct = (struct nilfs_direct *)bmap; union nilfs_bmap_ptr_req req; struct inode *dat = NULL; struct buffer_head *bh; @@ -136,11 +124,11 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) if (key > NILFS_DIRECT_KEY_MAX) return -ENOENT; - if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR) + if (nilfs_direct_get_ptr(bmap, key) != NILFS_BMAP_INVALID_PTR) return -EEXIST; if (NILFS_BMAP_USE_VBN(bmap)) { - req.bpr_ptr = nilfs_direct_find_target_v(direct, key); + req.bpr_ptr = nilfs_direct_find_target_v(bmap, key); dat = nilfs_bmap_get_dat(bmap); } ret = nilfs_bmap_prepare_alloc_ptr(bmap, &req, dat); @@ -150,13 +138,13 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) set_buffer_nilfs_volatile(bh); nilfs_bmap_commit_alloc_ptr(bmap, &req, dat); - nilfs_direct_set_ptr(direct, key, req.bpr_ptr); + nilfs_direct_set_ptr(bmap, key, req.bpr_ptr); if 
(!nilfs_bmap_dirty(bmap)) nilfs_bmap_set_dirty(bmap); if (NILFS_BMAP_USE_VBN(bmap)) - nilfs_direct_set_target_v(direct, key, req.bpr_ptr); + nilfs_bmap_set_target_v(bmap, key, req.bpr_ptr); nilfs_bmap_add_blocks(bmap, 1); } @@ -165,33 +153,30 @@ static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr) static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key) { - struct nilfs_direct *direct = (struct nilfs_direct *)bmap; union nilfs_bmap_ptr_req req; struct inode *dat; int ret; if (key > NILFS_DIRECT_KEY_MAX || - nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR) + nilfs_direct_get_ptr(bmap, key) == NILFS_BMAP_INVALID_PTR) return -ENOENT; dat = NILFS_BMAP_USE_VBN(bmap) ? nilfs_bmap_get_dat(bmap) : NULL; - req.bpr_ptr = nilfs_direct_get_ptr(direct, key); + req.bpr_ptr = nilfs_direct_get_ptr(bmap, key); ret = nilfs_bmap_prepare_end_ptr(bmap, &req, dat); if (!ret) { nilfs_bmap_commit_end_ptr(bmap, &req, dat); - nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR); + nilfs_direct_set_ptr(bmap, key, NILFS_BMAP_INVALID_PTR); nilfs_bmap_sub_blocks(bmap, 1); } return ret; } -static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp) +static int nilfs_direct_last_key(const struct nilfs_bmap *direct, __u64 *keyp) { - struct nilfs_direct *direct; __u64 key, lastkey; - direct = (struct nilfs_direct *)bmap; lastkey = NILFS_DIRECT_KEY_MAX + 1; for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++) if (nilfs_direct_get_ptr(direct, key) != @@ -211,15 +196,13 @@ static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key) return key > NILFS_DIRECT_KEY_MAX; } -static int nilfs_direct_gather_data(struct nilfs_bmap *bmap, +static int nilfs_direct_gather_data(struct nilfs_bmap *direct, __u64 *keys, __u64 *ptrs, int nitems) { - struct nilfs_direct *direct; __u64 key; __u64 ptr; int n; - direct = (struct nilfs_direct *)bmap; if (nitems > NILFS_DIRECT_NBLOCKS) nitems = NILFS_DIRECT_NBLOCKS; n = 
0; @@ -237,7 +220,6 @@ static int nilfs_direct_gather_data(struct nilfs_bmap *bmap, int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, __u64 key, __u64 *keys, __u64 *ptrs, int n) { - struct nilfs_direct *direct; __le64 *dptrs; int ret, i, j; @@ -253,12 +235,11 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, bmap->b_ops->bop_clear(bmap); /* convert */ - direct = (struct nilfs_direct *)bmap; - dptrs = nilfs_direct_dptrs(direct); + dptrs = nilfs_direct_dptrs(bmap); for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) { if ((j < n) && (i == keys[j])) { dptrs[i] = (i != key) ? - nilfs_bmap_ptr_to_dptr(ptrs[j]) : + cpu_to_le64(ptrs[j]) : NILFS_BMAP_INVALID_PTR; j++; } else @@ -269,10 +250,9 @@ int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap, return 0; } -static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, +static int nilfs_direct_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh) { - struct nilfs_direct *direct = (struct nilfs_direct *)bmap; struct nilfs_palloc_req oldreq, newreq; struct inode *dat; __u64 key; @@ -284,7 +264,7 @@ static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, dat = nilfs_bmap_get_dat(bmap); key = nilfs_bmap_data_get_key(bmap, bh); - ptr = nilfs_direct_get_ptr(direct, key); + ptr = nilfs_direct_get_ptr(bmap, key); if (!buffer_nilfs_volatile(bh)) { oldreq.pr_entry_nr = ptr; newreq.pr_entry_nr = ptr; @@ -294,20 +274,20 @@ static int nilfs_direct_propagate(const struct nilfs_bmap *bmap, nilfs_dat_commit_update(dat, &oldreq, &newreq, bmap->b_ptr_type == NILFS_BMAP_PTR_VS); set_buffer_nilfs_volatile(bh); - nilfs_direct_set_ptr(direct, key, newreq.pr_entry_nr); + nilfs_direct_set_ptr(bmap, key, newreq.pr_entry_nr); } else ret = nilfs_dat_mark_dirty(dat, ptr); return ret; } -static int nilfs_direct_assign_v(struct nilfs_direct *direct, +static int nilfs_direct_assign_v(struct nilfs_bmap *direct, __u64 key, __u64 ptr, struct buffer_head **bh, sector_t blocknr, union nilfs_binfo *binfo) { - 
struct inode *dat = nilfs_bmap_get_dat(&direct->d_bmap); + struct inode *dat = nilfs_bmap_get_dat(direct); union nilfs_bmap_ptr_req req; int ret; @@ -315,13 +295,13 @@ static int nilfs_direct_assign_v(struct nilfs_direct *direct, ret = nilfs_dat_prepare_start(dat, &req.bpr_req); if (!ret) { nilfs_dat_commit_start(dat, &req.bpr_req, blocknr); - binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr); - binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key); + binfo->bi_v.bi_vblocknr = cpu_to_le64(ptr); + binfo->bi_v.bi_blkoff = cpu_to_le64(key); } return ret; } -static int nilfs_direct_assign_p(struct nilfs_direct *direct, +static int nilfs_direct_assign_p(struct nilfs_bmap *direct, __u64 key, __u64 ptr, struct buffer_head **bh, sector_t blocknr, @@ -329,7 +309,7 @@ static int nilfs_direct_assign_p(struct nilfs_direct *direct, { nilfs_direct_set_ptr(direct, key, blocknr); - binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key); + binfo->bi_dat.bi_blkoff = cpu_to_le64(key); binfo->bi_dat.bi_level = 0; return 0; @@ -340,18 +320,16 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap, sector_t blocknr, union nilfs_binfo *binfo) { - struct nilfs_direct *direct; __u64 key; __u64 ptr; - direct = (struct nilfs_direct *)bmap; key = nilfs_bmap_data_get_key(bmap, *bh); if (unlikely(key > NILFS_DIRECT_KEY_MAX)) { printk(KERN_CRIT "%s: invalid key: %llu\n", __func__, (unsigned long long)key); return -EINVAL; } - ptr = nilfs_direct_get_ptr(direct, key); + ptr = nilfs_direct_get_ptr(bmap, key); if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) { printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__, (unsigned long long)ptr); @@ -359,8 +337,8 @@ static int nilfs_direct_assign(struct nilfs_bmap *bmap, } return NILFS_BMAP_USE_VBN(bmap) ? 
- nilfs_direct_assign_v(direct, key, ptr, bh, blocknr, binfo) : - nilfs_direct_assign_p(direct, key, ptr, bh, blocknr, binfo); + nilfs_direct_assign_v(bmap, key, ptr, bh, blocknr, binfo) : + nilfs_direct_assign_p(bmap, key, ptr, bh, blocknr, binfo); } static const struct nilfs_bmap_operations nilfs_direct_ops = { diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h index a5ffd66e25d..dc643de20a2 100644 --- a/fs/nilfs2/direct.h +++ b/fs/nilfs2/direct.h @@ -28,8 +28,6 @@ #include "bmap.h" -struct nilfs_direct; - /** * struct nilfs_direct_node - direct node * @dn_flags: flags @@ -40,15 +38,6 @@ struct nilfs_direct_node { __u8 pad[7]; }; -/** - * struct nilfs_direct - direct mapping - * @d_bmap: bmap structure - */ -struct nilfs_direct { - struct nilfs_bmap d_bmap; -}; - - #define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1) #define NILFS_DIRECT_KEY_MIN 0 #define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1) diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index 145f03cd7d3..bed3a783129 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -48,6 +48,8 @@ #include <linux/slab.h> #include <linux/swap.h> #include "nilfs.h" +#include "btree.h" +#include "btnode.h" #include "page.h" #include "mdt.h" #include "dat.h" @@ -149,8 +151,10 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn, __u64 vbn, struct buffer_head **out_bh) { - int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, - vbn ? : pbn, pbn, out_bh); + int ret; + + ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache, + vbn ? 
: pbn, pbn, READ, out_bh, &pbn); if (ret == -EEXIST) /* internal code (cache hit) */ ret = 0; return ret; @@ -164,10 +168,15 @@ int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh) if (buffer_dirty(bh)) return -EEXIST; - if (buffer_nilfs_node(bh)) + if (buffer_nilfs_node(bh)) { + if (nilfs_btree_broken_node_block(bh)) { + clear_buffer_uptodate(bh); + return -EIO; + } nilfs_btnode_mark_dirty(bh); - else + } else { nilfs_mdt_mark_buffer_dirty(bh); + } return 0; } diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 024be8c35bb..d01aff4957d 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -28,6 +28,7 @@ #include <linux/swap.h> #include <linux/slab.h> #include "nilfs.h" +#include "btnode.h" #include "segment.h" #include "page.h" #include "mdt.h" diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 47d6d792812..0842d775b3e 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -32,7 +32,6 @@ #include "the_nilfs.h" #include "sb.h" #include "bmap.h" -#include "bmap_union.h" /* * nilfs inode data in memory @@ -41,7 +40,7 @@ struct nilfs_inode_info { __u32 i_flags; unsigned long i_state; /* Dynamic state flags */ struct nilfs_bmap *i_bmap; - union nilfs_bmap_union i_bmap_union; + struct nilfs_bmap i_bmap_data; __u64 i_xattr; /* sector_t ??? 
*/ __u32 i_dir_start_lookup; __u64 i_cno; /* check point number for GC inode */ @@ -71,9 +70,7 @@ static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode) static inline struct nilfs_inode_info * NILFS_BMAP_I(const struct nilfs_bmap *bmap) { - return container_of((union nilfs_bmap_union *)bmap, - struct nilfs_inode_info, - i_bmap_union); + return container_of(bmap, struct nilfs_inode_info, i_bmap_data); } static inline struct inode *NILFS_BTNC_I(struct address_space *btnc) @@ -107,6 +104,14 @@ enum { }; /* + * commit flags for nilfs_commit_super and nilfs_sync_super + */ +enum { + NILFS_SB_COMMIT = 0, /* Commit a super block alternately */ + NILFS_SB_COMMIT_ALL /* Commit both super blocks */ +}; + +/* * Macros to check inode numbers */ #define NILFS_MDT_INO_BITS \ @@ -270,7 +275,14 @@ extern struct nilfs_super_block * nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **); extern int nilfs_store_magic_and_option(struct super_block *, struct nilfs_super_block *, char *); +extern int nilfs_check_feature_compatibility(struct super_block *, + struct nilfs_super_block *); +extern void nilfs_set_log_cursor(struct nilfs_super_block *, + struct the_nilfs *); +extern struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *, + int flip); extern int nilfs_commit_super(struct nilfs_sb_info *, int); +extern int nilfs_cleanup_super(struct nilfs_sb_info *); extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64); extern void nilfs_detach_checkpoint(struct nilfs_sb_info *); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 8de3e1e4813..aab11db2cb0 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -37,7 +37,8 @@ #define NILFS_BUFFER_INHERENT_BITS \ ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \ - (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated)) + (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated) | \ + (1UL << BH_NILFS_Checked)) static struct buffer_head * 
__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, @@ -129,6 +130,7 @@ void nilfs_forget_buffer(struct buffer_head *bh) lock_buffer(bh); clear_buffer_nilfs_volatile(bh); + clear_buffer_nilfs_checked(bh); clear_buffer_dirty(bh); if (nilfs_page_buffers_clean(page)) __nilfs_clear_page_dirty(page); @@ -480,6 +482,7 @@ void nilfs_clear_dirty_pages(struct address_space *mapping) lock_buffer(bh); clear_buffer_dirty(bh); clear_buffer_nilfs_volatile(bh); + clear_buffer_nilfs_checked(bh); clear_buffer_uptodate(bh); clear_buffer_mapped(bh); unlock_buffer(bh); diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h index 8abca4d1c1f..f53d8da41ed 100644 --- a/fs/nilfs2/page.h +++ b/fs/nilfs2/page.h @@ -34,11 +34,13 @@ enum { BH_NILFS_Allocated = BH_PrivateStart, BH_NILFS_Node, BH_NILFS_Volatile, + BH_NILFS_Checked, }; BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */ BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */ BUFFER_FNS(NILFS_Volatile, nilfs_volatile) +BUFFER_FNS(NILFS_Checked, nilfs_checked) /* buffer is verified */ void nilfs_mark_buffer_dirty(struct buffer_head *bh); diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c index bae2a516b4e..83e3d8c61a0 100644 --- a/fs/nilfs2/recovery.c +++ b/fs/nilfs2/recovery.c @@ -91,27 +91,9 @@ static int nilfs_warn_segment_error(int err) return -EINVAL; } -static void store_segsum_info(struct nilfs_segsum_info *ssi, - struct nilfs_segment_summary *sum, - unsigned int blocksize) -{ - ssi->flags = le16_to_cpu(sum->ss_flags); - ssi->seg_seq = le64_to_cpu(sum->ss_seq); - ssi->ctime = le64_to_cpu(sum->ss_create); - ssi->next = le64_to_cpu(sum->ss_next); - ssi->nblocks = le32_to_cpu(sum->ss_nblocks); - ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo); - ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes); - - ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize); - ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi); - - /* need to verify ->ss_bytes field if read ->ss_cno */ -} 
- /** - * calc_crc_cont - check CRC of blocks continuously - * @sbi: nilfs_sb_info + * nilfs_compute_checksum - compute checksum of blocks continuously + * @nilfs: nilfs object * @bhs: buffer head of start block * @sum: place to store result * @offset: offset bytes in the first block @@ -119,23 +101,25 @@ static void store_segsum_info(struct nilfs_segsum_info *ssi, * @start: DBN of start block * @nblock: number of blocks to be checked */ -static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs, - u32 *sum, unsigned long offset, u64 check_bytes, - sector_t start, unsigned long nblock) +static int nilfs_compute_checksum(struct the_nilfs *nilfs, + struct buffer_head *bhs, u32 *sum, + unsigned long offset, u64 check_bytes, + sector_t start, unsigned long nblock) { - unsigned long blocksize = sbi->s_super->s_blocksize; + unsigned int blocksize = nilfs->ns_blocksize; unsigned long size; u32 crc; BUG_ON(offset >= blocksize); check_bytes -= offset; size = min_t(u64, check_bytes, blocksize - offset); - crc = crc32_le(sbi->s_nilfs->ns_crc_seed, + crc = crc32_le(nilfs->ns_crc_seed, (unsigned char *)bhs->b_data + offset, size); if (--nblock > 0) { do { - struct buffer_head *bh - = sb_bread(sbi->s_super, ++start); + struct buffer_head *bh; + + bh = __bread(nilfs->ns_bdev, ++start, blocksize); if (!bh) return -EIO; check_bytes -= size; @@ -150,12 +134,12 @@ static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs, /** * nilfs_read_super_root_block - read super root block - * @sb: super_block + * @nilfs: nilfs object * @sr_block: disk block number of the super root block * @pbh: address of a buffer_head pointer to return super root buffer * @check: CRC check flag */ -int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block, +int nilfs_read_super_root_block(struct the_nilfs *nilfs, sector_t sr_block, struct buffer_head **pbh, int check) { struct buffer_head *bh_sr; @@ -164,7 +148,7 @@ int nilfs_read_super_root_block(struct 
super_block *sb, sector_t sr_block, int ret; *pbh = NULL; - bh_sr = sb_bread(sb, sr_block); + bh_sr = __bread(nilfs->ns_bdev, sr_block, nilfs->ns_blocksize); if (unlikely(!bh_sr)) { ret = NILFS_SEG_FAIL_IO; goto failed; @@ -174,12 +158,13 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block, if (check) { unsigned bytes = le16_to_cpu(sr->sr_bytes); - if (bytes == 0 || bytes > sb->s_blocksize) { + if (bytes == 0 || bytes > nilfs->ns_blocksize) { ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT; goto failed_bh; } - if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc, - sizeof(sr->sr_sum), bytes, sr_block, 1)) { + if (nilfs_compute_checksum( + nilfs, bh_sr, &crc, sizeof(sr->sr_sum), bytes, + sr_block, 1)) { ret = NILFS_SEG_FAIL_IO; goto failed_bh; } @@ -199,64 +184,76 @@ int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block, } /** - * load_segment_summary - read segment summary of the specified partial segment - * @sbi: nilfs_sb_info - * @pseg_start: start disk block number of partial segment - * @seg_seq: sequence number requested - * @ssi: pointer to nilfs_segsum_info struct to store information + * nilfs_read_log_header - read summary header of the specified log + * @nilfs: nilfs object + * @start_blocknr: start block number of the log + * @sum: pointer to return segment summary structure */ -static int -load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start, - u64 seg_seq, struct nilfs_segsum_info *ssi) +static struct buffer_head * +nilfs_read_log_header(struct the_nilfs *nilfs, sector_t start_blocknr, + struct nilfs_segment_summary **sum) { struct buffer_head *bh_sum; - struct nilfs_segment_summary *sum; + + bh_sum = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); + if (bh_sum) + *sum = (struct nilfs_segment_summary *)bh_sum->b_data; + return bh_sum; +} + +/** + * nilfs_validate_log - verify consistency of log + * @nilfs: nilfs object + * @seg_seq: sequence number of segment + * @bh_sum: buffer head of summary 
block + * @sum: segment summary struct + */ +static int nilfs_validate_log(struct the_nilfs *nilfs, u64 seg_seq, + struct buffer_head *bh_sum, + struct nilfs_segment_summary *sum) +{ unsigned long nblock; u32 crc; - int ret = NILFS_SEG_FAIL_IO; + int ret; - bh_sum = sb_bread(sbi->s_super, pseg_start); - if (!bh_sum) + ret = NILFS_SEG_FAIL_MAGIC; + if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) goto out; - sum = (struct nilfs_segment_summary *)bh_sum->b_data; - - /* Check consistency of segment summary */ - if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) { - ret = NILFS_SEG_FAIL_MAGIC; - goto failed; - } - store_segsum_info(ssi, sum, sbi->s_super->s_blocksize); - if (seg_seq != ssi->seg_seq) { - ret = NILFS_SEG_FAIL_SEQ; - goto failed; - } + ret = NILFS_SEG_FAIL_SEQ; + if (le64_to_cpu(sum->ss_seq) != seg_seq) + goto out; - nblock = ssi->nblocks; - if (unlikely(nblock == 0 || - nblock > sbi->s_nilfs->ns_blocks_per_segment)) { + nblock = le32_to_cpu(sum->ss_nblocks); + ret = NILFS_SEG_FAIL_CONSISTENCY; + if (unlikely(nblock == 0 || nblock > nilfs->ns_blocks_per_segment)) /* This limits the number of blocks read in the CRC check */ - ret = NILFS_SEG_FAIL_CONSISTENCY; - goto failed; - } - if (calc_crc_cont(sbi, bh_sum, &crc, sizeof(sum->ss_datasum), - ((u64)nblock << sbi->s_super->s_blocksize_bits), - pseg_start, nblock)) { - ret = NILFS_SEG_FAIL_IO; - goto failed; - } - if (crc == le32_to_cpu(sum->ss_datasum)) - ret = 0; - else - ret = NILFS_SEG_FAIL_CHECKSUM_FULL; - failed: - brelse(bh_sum); - out: + goto out; + + ret = NILFS_SEG_FAIL_IO; + if (nilfs_compute_checksum(nilfs, bh_sum, &crc, sizeof(sum->ss_datasum), + ((u64)nblock << nilfs->ns_blocksize_bits), + bh_sum->b_blocknr, nblock)) + goto out; + + ret = NILFS_SEG_FAIL_CHECKSUM_FULL; + if (crc != le32_to_cpu(sum->ss_datasum)) + goto out; + ret = 0; +out: return ret; } -static void *segsum_get(struct super_block *sb, struct buffer_head **pbh, - unsigned int *offset, unsigned int bytes) +/** + * 
nilfs_read_summary_info - read an item on summary blocks of a log + * @nilfs: nilfs object + * @pbh: the current buffer head on summary blocks [in, out] + * @offset: the current byte offset on summary blocks [in, out] + * @bytes: byte size of the item to be read + */ +static void *nilfs_read_summary_info(struct the_nilfs *nilfs, + struct buffer_head **pbh, + unsigned int *offset, unsigned int bytes) { void *ptr; sector_t blocknr; @@ -265,7 +262,8 @@ static void *segsum_get(struct super_block *sb, struct buffer_head **pbh, if (bytes > (*pbh)->b_size - *offset) { blocknr = (*pbh)->b_blocknr; brelse(*pbh); - *pbh = sb_bread(sb, blocknr + 1); + *pbh = __bread(nilfs->ns_bdev, blocknr + 1, + nilfs->ns_blocksize); if (unlikely(!*pbh)) return NULL; *offset = 0; @@ -275,9 +273,18 @@ static void *segsum_get(struct super_block *sb, struct buffer_head **pbh, return ptr; } -static void segsum_skip(struct super_block *sb, struct buffer_head **pbh, - unsigned int *offset, unsigned int bytes, - unsigned long count) +/** + * nilfs_skip_summary_info - skip items on summary blocks of a log + * @nilfs: nilfs object + * @pbh: the current buffer head on summary blocks [in, out] + * @offset: the current byte offset on summary blocks [in, out] + * @bytes: byte size of the item to be skipped + * @count: number of items to be skipped + */ +static void nilfs_skip_summary_info(struct the_nilfs *nilfs, + struct buffer_head **pbh, + unsigned int *offset, unsigned int bytes, + unsigned long count) { unsigned int rest_item_in_current_block = ((*pbh)->b_size - *offset) / bytes; @@ -294,36 +301,46 @@ static void segsum_skip(struct super_block *sb, struct buffer_head **pbh, *offset = bytes * (count - (bcnt - 1) * nitem_per_block); brelse(*pbh); - *pbh = sb_bread(sb, blocknr + bcnt); + *pbh = __bread(nilfs->ns_bdev, blocknr + bcnt, + nilfs->ns_blocksize); } } -static int -collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr, - struct nilfs_segsum_info *ssi, - struct list_head 
*head) +/** + * nilfs_scan_dsync_log - get block information of a log written for data sync + * @nilfs: nilfs object + * @start_blocknr: start block number of the log + * @sum: log summary information + * @head: list head to add nilfs_recovery_block struct + */ +static int nilfs_scan_dsync_log(struct the_nilfs *nilfs, sector_t start_blocknr, + struct nilfs_segment_summary *sum, + struct list_head *head) { struct buffer_head *bh; unsigned int offset; - unsigned long nfinfo = ssi->nfinfo; - sector_t blocknr = sum_blocknr + ssi->nsumblk; + u32 nfinfo, sumbytes; + sector_t blocknr; ino_t ino; int err = -EIO; + nfinfo = le32_to_cpu(sum->ss_nfinfo); if (!nfinfo) return 0; - bh = sb_bread(sbi->s_super, sum_blocknr); + sumbytes = le32_to_cpu(sum->ss_sumbytes); + blocknr = start_blocknr + DIV_ROUND_UP(sumbytes, nilfs->ns_blocksize); + bh = __bread(nilfs->ns_bdev, start_blocknr, nilfs->ns_blocksize); if (unlikely(!bh)) goto out; - offset = le16_to_cpu( - ((struct nilfs_segment_summary *)bh->b_data)->ss_bytes); + offset = le16_to_cpu(sum->ss_bytes); for (;;) { unsigned long nblocks, ndatablk, nnodeblk; struct nilfs_finfo *finfo; - finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo)); + finfo = nilfs_read_summary_info(nilfs, &bh, &offset, + sizeof(*finfo)); if (unlikely(!finfo)) goto out; @@ -336,8 +353,8 @@ collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr, struct nilfs_recovery_block *rb; struct nilfs_binfo_v *binfo; - binfo = segsum_get(sbi->s_super, &bh, &offset, - sizeof(*binfo)); + binfo = nilfs_read_summary_info(nilfs, &bh, &offset, + sizeof(*binfo)); if (unlikely(!binfo)) goto out; @@ -355,9 +372,9 @@ collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr, } if (--nfinfo == 0) break; - blocknr += nnodeblk; /* always 0 for the data sync segments */ - segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64), - nnodeblk); + blocknr += nnodeblk; /* always 0 for data sync logs */ + nilfs_skip_summary_info(nilfs, &bh, 
&offset, sizeof(__le64), + nnodeblk); if (unlikely(!bh)) goto out; } @@ -467,14 +484,14 @@ static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs, return err; } -static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi, +static int nilfs_recovery_copy_block(struct the_nilfs *nilfs, struct nilfs_recovery_block *rb, struct page *page) { struct buffer_head *bh_org; void *kaddr; - bh_org = sb_bread(sbi->s_super, rb->blocknr); + bh_org = __bread(nilfs->ns_bdev, rb->blocknr, nilfs->ns_blocksize); if (unlikely(!bh_org)) return -EIO; @@ -485,13 +502,14 @@ static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi, return 0; } -static int recover_dsync_blocks(struct nilfs_sb_info *sbi, - struct list_head *head, - unsigned long *nr_salvaged_blocks) +static int nilfs_recover_dsync_blocks(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, + struct list_head *head, + unsigned long *nr_salvaged_blocks) { struct inode *inode; struct nilfs_recovery_block *rb, *n; - unsigned blocksize = sbi->s_super->s_blocksize; + unsigned blocksize = nilfs->ns_blocksize; struct page *page; loff_t pos; int err = 0, err2 = 0; @@ -511,7 +529,7 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi, if (unlikely(err)) goto failed_inode; - err = nilfs_recovery_copy_block(sbi, rb, page); + err = nilfs_recovery_copy_block(nilfs, rb, page); if (unlikely(err)) goto failed_page; @@ -551,18 +569,20 @@ static int recover_dsync_blocks(struct nilfs_sb_info *sbi, /** * nilfs_do_roll_forward - salvage logical segments newer than the latest * checkpoint + * @nilfs: nilfs object * @sbi: nilfs_sb_info - * @nilfs: the_nilfs * @ri: pointer to a nilfs_recovery_info */ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, struct nilfs_recovery_info *ri) { - struct nilfs_segsum_info ssi; + struct buffer_head *bh_sum = NULL; + struct nilfs_segment_summary *sum; sector_t pseg_start; sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */ 
unsigned long nsalvaged_blocks = 0; + unsigned int flags; u64 seg_seq; __u64 segnum, nextnum = 0; int empty_seg = 0; @@ -581,8 +601,14 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end); while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) { + brelse(bh_sum); + bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); + if (!bh_sum) { + err = -EIO; + goto failed; + } - ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi); + ret = nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); if (ret) { if (ret == NILFS_SEG_FAIL_IO) { err = -EIO; @@ -590,33 +616,38 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, } goto strayed; } - if (unlikely(NILFS_SEG_HAS_SR(&ssi))) + + flags = le16_to_cpu(sum->ss_flags); + if (flags & NILFS_SS_SR) goto confused; /* Found a valid partial segment; do recovery actions */ - nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next); + nextnum = nilfs_get_segnum_of_block(nilfs, + le64_to_cpu(sum->ss_next)); empty_seg = 0; - nilfs->ns_ctime = ssi.ctime; - if (!(ssi.flags & NILFS_SS_GC)) - nilfs->ns_nongc_ctime = ssi.ctime; + nilfs->ns_ctime = le64_to_cpu(sum->ss_create); + if (!(flags & NILFS_SS_GC)) + nilfs->ns_nongc_ctime = nilfs->ns_ctime; switch (state) { case RF_INIT_ST: - if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi)) + if (!(flags & NILFS_SS_LOGBGN) || + !(flags & NILFS_SS_SYNDT)) goto try_next_pseg; state = RF_DSYNC_ST; /* Fall through */ case RF_DSYNC_ST: - if (!NILFS_SEG_DSYNC(&ssi)) + if (!(flags & NILFS_SS_SYNDT)) goto confused; - err = collect_blocks_from_segsum( - sbi, pseg_start, &ssi, &dsync_blocks); + err = nilfs_scan_dsync_log(nilfs, pseg_start, sum, + &dsync_blocks); if (unlikely(err)) goto failed; - if (NILFS_SEG_LOGEND(&ssi)) { - err = recover_dsync_blocks( - sbi, &dsync_blocks, &nsalvaged_blocks); + if (flags & NILFS_SS_LOGEND) { + err = nilfs_recover_dsync_blocks( + nilfs, sbi, &dsync_blocks, + &nsalvaged_blocks); if 
(unlikely(err)) goto failed; state = RF_INIT_ST; @@ -627,7 +658,7 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, try_next_pseg: if (pseg_start == ri->ri_lsegs_end) break; - pseg_start += ssi.nblocks; + pseg_start += le32_to_cpu(sum->ss_nblocks); if (pseg_start < seg_end) continue; goto feed_segment; @@ -652,8 +683,9 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE; } out: + brelse(bh_sum); dispose_recovery_list(&dsync_blocks); - nilfs_detach_writer(sbi->s_nilfs, sbi); + nilfs_detach_writer(nilfs, sbi); return err; confused: @@ -667,7 +699,6 @@ static int nilfs_do_roll_forward(struct the_nilfs *nilfs, } static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, - struct nilfs_sb_info *sbi, struct nilfs_recovery_info *ri) { struct buffer_head *bh; @@ -677,7 +708,7 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, nilfs_get_segnum_of_block(nilfs, ri->ri_super_root)) return; - bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start); + bh = __getblk(nilfs->ns_bdev, ri->ri_lsegs_start, nilfs->ns_blocksize); BUG_ON(!bh); memset(bh->b_data, 0, bh->b_size); set_buffer_dirty(bh); @@ -690,9 +721,8 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, } /** - * nilfs_recover_logical_segments - salvage logical segments written after - * the latest super root - * @nilfs: the_nilfs + * nilfs_salvage_orphan_logs - salvage logs written after the latest checkpoint + * @nilfs: nilfs object * @sbi: nilfs_sb_info * @ri: pointer to a nilfs_recovery_info struct to store search results. * @@ -709,9 +739,9 @@ static void nilfs_finish_roll_forward(struct the_nilfs *nilfs, * * %-ENOMEM - Insufficient memory available. 
*/ -int nilfs_recover_logical_segments(struct the_nilfs *nilfs, - struct nilfs_sb_info *sbi, - struct nilfs_recovery_info *ri) +int nilfs_salvage_orphan_logs(struct the_nilfs *nilfs, + struct nilfs_sb_info *sbi, + struct nilfs_recovery_info *ri) { int err; @@ -751,7 +781,7 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs, goto failed; } - nilfs_finish_roll_forward(nilfs, sbi, ri); + nilfs_finish_roll_forward(nilfs, ri); } failed: @@ -762,7 +792,6 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs, /** * nilfs_search_super_root - search the latest valid super root * @nilfs: the_nilfs - * @sbi: nilfs_sb_info * @ri: pointer to a nilfs_recovery_info struct to store search results. * * nilfs_search_super_root() looks for the latest super-root from a partial @@ -775,14 +804,19 @@ int nilfs_recover_logical_segments(struct the_nilfs *nilfs, * %-EINVAL - No valid segment found * * %-EIO - I/O error + * + * %-ENOMEM - Insufficient memory available. */ -int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, +int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_recovery_info *ri) { - struct nilfs_segsum_info ssi; + struct buffer_head *bh_sum = NULL; + struct nilfs_segment_summary *sum; sector_t pseg_start, pseg_end, sr_pseg_start = 0; sector_t seg_start, seg_end; /* range of full segment (block number) */ sector_t b, end; + unsigned long nblocks; + unsigned int flags; u64 seg_seq; __u64 segnum, nextnum = 0; __u64 cno; @@ -801,17 +835,24 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, /* Read ahead segment */ b = seg_start; while (b <= seg_end) - sb_breadahead(sbi->s_super, b++); + __breadahead(nilfs->ns_bdev, b++, nilfs->ns_blocksize); for (;;) { - /* Load segment summary */ - ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi); + brelse(bh_sum); + ret = NILFS_SEG_FAIL_IO; + bh_sum = nilfs_read_log_header(nilfs, pseg_start, &sum); + if (!bh_sum) + goto failed; + + ret = 
nilfs_validate_log(nilfs, seg_seq, bh_sum, sum); if (ret) { if (ret == NILFS_SEG_FAIL_IO) goto failed; goto strayed; } - pseg_end = pseg_start + ssi.nblocks - 1; + + nblocks = le32_to_cpu(sum->ss_nblocks); + pseg_end = pseg_start + nblocks - 1; if (unlikely(pseg_end > seg_end)) { ret = NILFS_SEG_FAIL_CONSISTENCY; goto strayed; @@ -821,11 +862,13 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, ri->ri_pseg_start = pseg_start; ri->ri_seq = seg_seq; ri->ri_segnum = segnum; - nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next); + nextnum = nilfs_get_segnum_of_block(nilfs, + le64_to_cpu(sum->ss_next)); ri->ri_nextnum = nextnum; empty_seg = 0; - if (!NILFS_SEG_HAS_SR(&ssi) && !scan_newer) { + flags = le16_to_cpu(sum->ss_flags); + if (!(flags & NILFS_SS_SR) && !scan_newer) { /* This will never happen because a superblock (last_segment) always points to a pseg having a super root. */ @@ -836,14 +879,15 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, if (pseg_start == seg_start) { nilfs_get_segment_range(nilfs, nextnum, &b, &end); while (b <= end) - sb_breadahead(sbi->s_super, b++); + __breadahead(nilfs->ns_bdev, b++, + nilfs->ns_blocksize); } - if (!NILFS_SEG_HAS_SR(&ssi)) { - if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) { + if (!(flags & NILFS_SS_SR)) { + if (!ri->ri_lsegs_start && (flags & NILFS_SS_LOGBGN)) { ri->ri_lsegs_start = pseg_start; ri->ri_lsegs_start_seq = seg_seq; } - if (NILFS_SEG_LOGEND(&ssi)) + if (flags & NILFS_SS_LOGEND) ri->ri_lsegs_end = pseg_start; goto try_next_pseg; } @@ -854,12 +898,12 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, ri->ri_lsegs_start = ri->ri_lsegs_end = 0; nilfs_dispose_segment_list(&segments); - nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start) - + ssi.nblocks - seg_start; + sr_pseg_start = pseg_start; + nilfs->ns_pseg_offset = pseg_start + nblocks - seg_start; nilfs->ns_seg_seq = seg_seq; nilfs->ns_segnum = segnum; 
nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */ - nilfs->ns_ctime = ssi.ctime; + nilfs->ns_ctime = le64_to_cpu(sum->ss_create); nilfs->ns_nextnum = nextnum; if (scan_newer) @@ -870,15 +914,9 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, scan_newer = 1; } - /* reset region for roll-forward */ - pseg_start += ssi.nblocks; - if (pseg_start < seg_end) - continue; - goto feed_segment; - try_next_pseg: /* Standing on a course, or met an inconsistent state */ - pseg_start += ssi.nblocks; + pseg_start += nblocks; if (pseg_start < seg_end) continue; goto feed_segment; @@ -909,6 +947,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, super_root_found: /* Updating pointers relating to the latest checkpoint */ + brelse(bh_sum); list_splice_tail(&segments, &ri->ri_used_segments); nilfs->ns_last_pseg = sr_pseg_start; nilfs->ns_last_seq = nilfs->ns_seg_seq; @@ -916,6 +955,7 @@ int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, return 0; failed: + brelse(bh_sum); nilfs_dispose_segment_list(&segments); return (ret < 0) ? 
ret : nilfs_warn_segment_error(ret); } diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h index 85fbb66455e..b04f08cc239 100644 --- a/fs/nilfs2/segbuf.h +++ b/fs/nilfs2/segbuf.h @@ -54,17 +54,6 @@ struct nilfs_segsum_info { sector_t next; }; -/* macro for the flags */ -#define NILFS_SEG_HAS_SR(sum) ((sum)->flags & NILFS_SS_SR) -#define NILFS_SEG_LOGBGN(sum) ((sum)->flags & NILFS_SS_LOGBGN) -#define NILFS_SEG_LOGEND(sum) ((sum)->flags & NILFS_SS_LOGEND) -#define NILFS_SEG_DSYNC(sum) ((sum)->flags & NILFS_SS_SYNDT) -#define NILFS_SEG_SIMPLEX(sum) \ - (((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == \ - (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) - -#define NILFS_SEG_EMPTY(sum) ((sum)->nblocks == (sum)->nsumblk) - /** * struct nilfs_segment_buffer - Segment buffer * @sb_super: back pointer to a superblock struct @@ -141,6 +130,19 @@ int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *, struct buffer_head **); void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *); +static inline int nilfs_segbuf_simplex(struct nilfs_segment_buffer *segbuf) +{ + unsigned int flags = segbuf->sb_sum.flags; + + return (flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == + (NILFS_SS_LOGBGN | NILFS_SS_LOGEND); +} + +static inline int nilfs_segbuf_empty(struct nilfs_segment_buffer *segbuf) +{ + return segbuf->sb_sum.nblocks == segbuf->sb_sum.nsumblk; +} + static inline void nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf, struct buffer_head *bh) diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index c9201649cc4..9fd051a33c4 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1914,12 +1914,12 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) } } - if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) { - if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) { + if (!nilfs_segbuf_simplex(segbuf)) { + if (segbuf->sb_sum.flags & NILFS_SS_LOGBGN) { set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); sci->sc_lseg_stime = jiffies; } - if 
(NILFS_SEG_LOGEND(&segbuf->sb_sum)) + if (segbuf->sb_sum.flags & NILFS_SS_LOGEND) clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags); } } @@ -1951,7 +1951,6 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) if (update_sr) { nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, segbuf->sb_sum.seg_seq, nilfs->ns_cno++); - set_nilfs_sb_dirty(nilfs); clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags); clear_bit(NILFS_SC_DIRTY, &sci->sc_flags); @@ -2082,7 +2081,7 @@ static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode) /* Avoid empty segment */ if (sci->sc_stage.scnt == NILFS_ST_DONE && - NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) { + nilfs_segbuf_empty(sci->sc_curseg)) { nilfs_segctor_abort_construction(sci, nilfs, 1); goto out; } @@ -2408,6 +2407,7 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) { struct nilfs_sb_info *sbi = sci->sc_sbi; struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_super_block **sbp; int err = 0; nilfs_segctor_accept(sci); @@ -2423,8 +2423,13 @@ static int nilfs_segctor_construct(struct nilfs_sc_info *sci, int mode) if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) && nilfs_discontinued(nilfs)) { down_write(&nilfs->ns_sem); - err = nilfs_commit_super( - sbi, nilfs_altsb_need_update(nilfs)); + err = -EIO; + sbp = nilfs_prepare_super(sbi, + nilfs_sb_will_flip(nilfs)); + if (likely(sbp)) { + nilfs_set_log_cursor(sbp[0], nilfs); + err = nilfs_commit_super(sbi, NILFS_SB_COMMIT); + } up_write(&nilfs->ns_sem); } } diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index 01e20dbb217..17c487bd815 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -234,13 +234,13 @@ extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *); extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *); /* recovery.c */ -extern int nilfs_read_super_root_block(struct super_block *, sector_t, +extern int nilfs_read_super_root_block(struct the_nilfs *, sector_t, struct buffer_head **, 
int); -extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *, +extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_recovery_info *); -extern int nilfs_recover_logical_segments(struct the_nilfs *, - struct nilfs_sb_info *, - struct nilfs_recovery_info *); +extern int nilfs_salvage_orphan_logs(struct the_nilfs *, + struct nilfs_sb_info *, + struct nilfs_recovery_info *); extern void nilfs_dispose_segment_list(struct list_head *); #endif /* _NILFS_SEGMENT_H */ diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 414ef68931c..26078b3407c 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -55,6 +55,8 @@ #include "nilfs.h" #include "mdt.h" #include "alloc.h" +#include "btree.h" +#include "btnode.h" #include "page.h" #include "cpfile.h" #include "ifile.h" @@ -74,6 +76,25 @@ struct kmem_cache *nilfs_btree_path_cache; static int nilfs_remount(struct super_block *sb, int *flags, char *data); +static void nilfs_set_error(struct nilfs_sb_info *sbi) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_super_block **sbp; + + down_write(&nilfs->ns_sem); + if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { + nilfs->ns_mount_state |= NILFS_ERROR_FS; + sbp = nilfs_prepare_super(sbi, 0); + if (likely(sbp)) { + sbp[0]->s_state |= cpu_to_le16(NILFS_ERROR_FS); + if (sbp[1]) + sbp[1]->s_state |= cpu_to_le16(NILFS_ERROR_FS); + nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); + } + } + up_write(&nilfs->ns_sem); +} + /** * nilfs_error() - report failure condition on a filesystem * @@ -99,16 +120,7 @@ void nilfs_error(struct super_block *sb, const char *function, va_end(args); if (!(sb->s_flags & MS_RDONLY)) { - struct the_nilfs *nilfs = sbi->s_nilfs; - - down_write(&nilfs->ns_sem); - if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) { - nilfs->ns_mount_state |= NILFS_ERROR_FS; - nilfs->ns_sbp[0]->s_state |= - cpu_to_le16(NILFS_ERROR_FS); - nilfs_commit_super(sbi, 1); - } - up_write(&nilfs->ns_sem); + nilfs_set_error(sbi); if 
(nilfs_test_opt(sbi, ERRORS_RO)) { printk(KERN_CRIT "Remounting filesystem read-only\n"); @@ -176,7 +188,7 @@ static void nilfs_clear_inode(struct inode *inode) nilfs_btnode_cache_clear(&ii->i_btnode_cache); } -static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb) +static int nilfs_sync_super(struct nilfs_sb_info *sbi, int flag) { struct the_nilfs *nilfs = sbi->s_nilfs; int err; @@ -202,12 +214,20 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb) printk(KERN_ERR "NILFS: unable to write superblock (err=%d)\n", err); if (err == -EIO && nilfs->ns_sbh[1]) { + /* + * sbp[0] points to newer log than sbp[1], + * so copy sbp[0] to sbp[1] to take over sbp[0]. + */ + memcpy(nilfs->ns_sbp[1], nilfs->ns_sbp[0], + nilfs->ns_sbsize); nilfs_fall_back_super_block(nilfs); goto retry; } } else { struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; + nilfs->ns_sbwcount++; + /* * The latest segment becomes trailable from the position * written in superblock. @@ -216,66 +236,122 @@ static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb) /* update GC protection for recent segments */ if (nilfs->ns_sbh[1]) { - sbp = NULL; - if (dupsb) { + if (flag == NILFS_SB_COMMIT_ALL) { set_buffer_dirty(nilfs->ns_sbh[1]); - if (!sync_dirty_buffer(nilfs->ns_sbh[1])) - sbp = nilfs->ns_sbp[1]; + if (sync_dirty_buffer(nilfs->ns_sbh[1]) < 0) + goto out; } + if (le64_to_cpu(nilfs->ns_sbp[1]->s_last_cno) < + le64_to_cpu(nilfs->ns_sbp[0]->s_last_cno)) + sbp = nilfs->ns_sbp[1]; } - if (sbp) { - spin_lock(&nilfs->ns_last_segment_lock); - nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq); - spin_unlock(&nilfs->ns_last_segment_lock); - } - } + spin_lock(&nilfs->ns_last_segment_lock); + nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq); + spin_unlock(&nilfs->ns_last_segment_lock); + } + out: return err; } -int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb) +void nilfs_set_log_cursor(struct nilfs_super_block *sbp, + struct the_nilfs *nilfs) +{ + sector_t nfreeblocks; 
+ + /* nilfs->ns_sem must be locked by the caller. */ + nilfs_count_free_blocks(nilfs, &nfreeblocks); + sbp->s_free_blocks_count = cpu_to_le64(nfreeblocks); + + spin_lock(&nilfs->ns_last_segment_lock); + sbp->s_last_seq = cpu_to_le64(nilfs->ns_last_seq); + sbp->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg); + sbp->s_last_cno = cpu_to_le64(nilfs->ns_last_cno); + spin_unlock(&nilfs->ns_last_segment_lock); +} + +struct nilfs_super_block **nilfs_prepare_super(struct nilfs_sb_info *sbi, + int flip) { struct the_nilfs *nilfs = sbi->s_nilfs; struct nilfs_super_block **sbp = nilfs->ns_sbp; - sector_t nfreeblocks; - time_t t; - int err; - /* nilfs->sem must be locked by the caller. */ + /* nilfs->ns_sem must be locked by the caller. */ if (sbp[0]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { - if (sbp[1] && sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) - nilfs_swap_super_block(nilfs); - else { + if (sbp[1] && + sbp[1]->s_magic == cpu_to_le16(NILFS_SUPER_MAGIC)) { + memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); + } else { printk(KERN_CRIT "NILFS: superblock broke on dev %s\n", sbi->s_super->s_id); - return -EIO; + return NULL; } + } else if (sbp[1] && + sbp[1]->s_magic != cpu_to_le16(NILFS_SUPER_MAGIC)) { + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); } - err = nilfs_count_free_blocks(nilfs, &nfreeblocks); - if (unlikely(err)) { - printk(KERN_ERR "NILFS: failed to count free blocks\n"); - return err; - } - spin_lock(&nilfs->ns_last_segment_lock); - sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq); - sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg); - sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno); - spin_unlock(&nilfs->ns_last_segment_lock); + if (flip && sbp[1]) + nilfs_swap_super_block(nilfs); + + return sbp; +} + +int nilfs_commit_super(struct nilfs_sb_info *sbi, int flag) +{ + struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + time_t t; + + /* nilfs->ns_sem must be locked by the caller. 
*/ t = get_seconds(); - nilfs->ns_sbwtime[0] = t; - sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks); + nilfs->ns_sbwtime = t; sbp[0]->s_wtime = cpu_to_le64(t); sbp[0]->s_sum = 0; sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, (unsigned char *)sbp[0], nilfs->ns_sbsize)); - if (dupsb && sbp[1]) { - memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); - nilfs->ns_sbwtime[1] = t; + if (flag == NILFS_SB_COMMIT_ALL && sbp[1]) { + sbp[1]->s_wtime = sbp[0]->s_wtime; + sbp[1]->s_sum = 0; + sbp[1]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed, + (unsigned char *)sbp[1], + nilfs->ns_sbsize)); } clear_nilfs_sb_dirty(nilfs); - return nilfs_sync_super(sbi, dupsb); + return nilfs_sync_super(sbi, flag); +} + +/** + * nilfs_cleanup_super() - write filesystem state for cleanup + * @sbi: nilfs_sb_info to be unmounted or degraded to read-only + * + * This function restores state flags in the on-disk super block. + * This will set "clean" flag (i.e. NILFS_VALID_FS) unless the + * filesystem was not clean previously. + */ +int nilfs_cleanup_super(struct nilfs_sb_info *sbi) +{ + struct nilfs_super_block **sbp; + int flag = NILFS_SB_COMMIT; + int ret = -EIO; + + sbp = nilfs_prepare_super(sbi, 0); + if (sbp) { + sbp[0]->s_state = cpu_to_le16(sbi->s_nilfs->ns_mount_state); + nilfs_set_log_cursor(sbp[0], sbi->s_nilfs); + if (sbp[1] && sbp[0]->s_last_cno == sbp[1]->s_last_cno) { + /* + * make the "clean" flag also to the opposite + * super block if both super blocks point to + * the same checkpoint. 
+ */ + sbp[1]->s_state = sbp[0]->s_state; + flag = NILFS_SB_COMMIT_ALL; + } + ret = nilfs_commit_super(sbi, flag); + } + return ret; } static void nilfs_put_super(struct super_block *sb) @@ -289,8 +365,7 @@ static void nilfs_put_super(struct super_block *sb) if (!(sb->s_flags & MS_RDONLY)) { down_write(&nilfs->ns_sem); - nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); - nilfs_commit_super(sbi, 1); + nilfs_cleanup_super(sbi); up_write(&nilfs->ns_sem); } down_write(&nilfs->ns_super_sem); @@ -311,6 +386,7 @@ static int nilfs_sync_fs(struct super_block *sb, int wait) { struct nilfs_sb_info *sbi = NILFS_SB(sb); struct the_nilfs *nilfs = sbi->s_nilfs; + struct nilfs_super_block **sbp; int err = 0; /* This function is called when super block should be written back */ @@ -318,8 +394,13 @@ static int nilfs_sync_fs(struct super_block *sb, int wait) err = nilfs_construct_segment(sb); down_write(&nilfs->ns_sem); - if (nilfs_sb_dirty(nilfs)) - nilfs_commit_super(sbi, 1); + if (nilfs_sb_dirty(nilfs)) { + sbp = nilfs_prepare_super(sbi, nilfs_sb_will_flip(nilfs)); + if (likely(sbp)) { + nilfs_set_log_cursor(sbp[0], nilfs); + nilfs_commit_super(sbi, NILFS_SB_COMMIT); + } + } up_write(&nilfs->ns_sem); return err; @@ -442,20 +523,20 @@ static int nilfs_show_options(struct seq_file *seq, struct vfsmount *vfs) struct nilfs_sb_info *sbi = NILFS_SB(sb); if (!nilfs_test_opt(sbi, BARRIER)) - seq_printf(seq, ",nobarrier"); + seq_puts(seq, ",nobarrier"); if (nilfs_test_opt(sbi, SNAPSHOT)) seq_printf(seq, ",cp=%llu", (unsigned long long int)sbi->s_snapshot_cno); if (nilfs_test_opt(sbi, ERRORS_PANIC)) - seq_printf(seq, ",errors=panic"); + seq_puts(seq, ",errors=panic"); if (nilfs_test_opt(sbi, ERRORS_CONT)) - seq_printf(seq, ",errors=continue"); + seq_puts(seq, ",errors=continue"); if (nilfs_test_opt(sbi, STRICT_ORDER)) - seq_printf(seq, ",order=strict"); + seq_puts(seq, ",order=strict"); if (nilfs_test_opt(sbi, NORECOVERY)) - seq_printf(seq, ",norecovery"); + seq_puts(seq, 
",norecovery"); if (nilfs_test_opt(sbi, DISCARD)) - seq_printf(seq, ",discard"); + seq_puts(seq, ",discard"); return 0; } @@ -524,23 +605,25 @@ static const struct export_operations nilfs_export_ops = { enum { Opt_err_cont, Opt_err_panic, Opt_err_ro, - Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, - Opt_discard, Opt_err, + Opt_barrier, Opt_nobarrier, Opt_snapshot, Opt_order, Opt_norecovery, + Opt_discard, Opt_nodiscard, Opt_err, }; static match_table_t tokens = { {Opt_err_cont, "errors=continue"}, {Opt_err_panic, "errors=panic"}, {Opt_err_ro, "errors=remount-ro"}, + {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_snapshot, "cp=%u"}, {Opt_order, "order=%s"}, {Opt_norecovery, "norecovery"}, {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_err, NULL} }; -static int parse_options(char *options, struct super_block *sb) +static int parse_options(char *options, struct super_block *sb, int is_remount) { struct nilfs_sb_info *sbi = NILFS_SB(sb); char *p; @@ -557,6 +640,9 @@ static int parse_options(char *options, struct super_block *sb) token = match_token(p, tokens, args); switch (token) { + case Opt_barrier: + nilfs_set_opt(sbi, BARRIER); + break; case Opt_nobarrier: nilfs_clear_opt(sbi, BARRIER); break; @@ -582,8 +668,26 @@ static int parse_options(char *options, struct super_block *sb) case Opt_snapshot: if (match_int(&args[0], &option) || option <= 0) return 0; - if (!(sb->s_flags & MS_RDONLY)) + if (is_remount) { + if (!nilfs_test_opt(sbi, SNAPSHOT)) { + printk(KERN_ERR + "NILFS: cannot change regular " + "mount to snapshot.\n"); + return 0; + } else if (option != sbi->s_snapshot_cno) { + printk(KERN_ERR + "NILFS: cannot remount to a " + "different snapshot.\n"); + return 0; + } + break; + } + if (!(sb->s_flags & MS_RDONLY)) { + printk(KERN_ERR "NILFS: cannot mount snapshot " + "read/write. 
A read-only option is " + "required.\n"); return 0; + } sbi->s_snapshot_cno = option; nilfs_set_opt(sbi, SNAPSHOT); break; @@ -593,6 +697,9 @@ static int parse_options(char *options, struct super_block *sb) case Opt_discard: nilfs_set_opt(sbi, DISCARD); break; + case Opt_nodiscard: + nilfs_clear_opt(sbi, DISCARD); + break; default: printk(KERN_ERR "NILFS: Unrecognized mount option \"%s\"\n", p); @@ -613,11 +720,18 @@ nilfs_set_default_options(struct nilfs_sb_info *sbi, static int nilfs_setup_super(struct nilfs_sb_info *sbi) { struct the_nilfs *nilfs = sbi->s_nilfs; - struct nilfs_super_block *sbp = nilfs->ns_sbp[0]; - int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count); - int mnt_count = le16_to_cpu(sbp->s_mnt_count); + struct nilfs_super_block **sbp; + int max_mnt_count; + int mnt_count; + + /* nilfs->ns_sem must be locked by the caller. */ + sbp = nilfs_prepare_super(sbi, 0); + if (!sbp) + return -EIO; + + max_mnt_count = le16_to_cpu(sbp[0]->s_max_mnt_count); + mnt_count = le16_to_cpu(sbp[0]->s_mnt_count); - /* nilfs->sem must be locked by the caller. 
*/ if (nilfs->ns_mount_state & NILFS_ERROR_FS) { printk(KERN_WARNING "NILFS warning: mounting fs with errors\n"); @@ -628,12 +742,15 @@ static int nilfs_setup_super(struct nilfs_sb_info *sbi) #endif } if (!max_mnt_count) - sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); - - sbp->s_mnt_count = cpu_to_le16(mnt_count + 1); - sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS); - sbp->s_mtime = cpu_to_le64(get_seconds()); - return nilfs_commit_super(sbi, 1); + sbp[0]->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT); + + sbp[0]->s_mnt_count = cpu_to_le16(mnt_count + 1); + sbp[0]->s_state = + cpu_to_le16(le16_to_cpu(sbp[0]->s_state) & ~NILFS_VALID_FS); + sbp[0]->s_mtime = cpu_to_le64(get_seconds()); + /* synchronize sbp[1] with sbp[0] */ + memcpy(sbp[1], sbp[0], nilfs->ns_sbsize); + return nilfs_commit_super(sbi, NILFS_SB_COMMIT_ALL); } struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb, @@ -670,7 +787,31 @@ int nilfs_store_magic_and_option(struct super_block *sb, sbi->s_interval = le32_to_cpu(sbp->s_c_interval); sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max); - return !parse_options(data, sb) ? -EINVAL : 0 ; + return !parse_options(data, sb, 0) ? 
-EINVAL : 0 ; +} + +int nilfs_check_feature_compatibility(struct super_block *sb, + struct nilfs_super_block *sbp) +{ + __u64 features; + + features = le64_to_cpu(sbp->s_feature_incompat) & + ~NILFS_FEATURE_INCOMPAT_SUPP; + if (features) { + printk(KERN_ERR "NILFS: couldn't mount because of unsupported " + "optional features (%llx)\n", + (unsigned long long)features); + return -EINVAL; + } + features = le64_to_cpu(sbp->s_feature_compat_ro) & + ~NILFS_FEATURE_COMPAT_RO_SUPP; + if (!(sb->s_flags & MS_RDONLY) && features) { + printk(KERN_ERR "NILFS: couldn't mount RDWR because of " + "unsupported optional features (%llx)\n", + (unsigned long long)features); + return -EINVAL; + } + return 0; } /** @@ -819,7 +960,6 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent, static int nilfs_remount(struct super_block *sb, int *flags, char *data) { struct nilfs_sb_info *sbi = NILFS_SB(sb); - struct nilfs_super_block *sbp; struct the_nilfs *nilfs = sbi->s_nilfs; unsigned long old_sb_flags; struct nilfs_mount_options old_opts; @@ -833,32 +973,17 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) old_opts.snapshot_cno = sbi->s_snapshot_cno; was_snapshot = nilfs_test_opt(sbi, SNAPSHOT); - if (!parse_options(data, sb)) { + if (!parse_options(data, sb, 1)) { err = -EINVAL; goto restore_opts; } sb->s_flags = (sb->s_flags & ~MS_POSIXACL); err = -EINVAL; - if (was_snapshot) { - if (!(*flags & MS_RDONLY)) { - printk(KERN_ERR "NILFS (device %s): cannot remount " - "snapshot read/write.\n", - sb->s_id); - goto restore_opts; - } else if (sbi->s_snapshot_cno != old_opts.snapshot_cno) { - printk(KERN_ERR "NILFS (device %s): cannot " - "remount to a different snapshot.\n", - sb->s_id); - goto restore_opts; - } - } else { - if (nilfs_test_opt(sbi, SNAPSHOT)) { - printk(KERN_ERR "NILFS (device %s): cannot change " - "a regular mount to a snapshot.\n", - sb->s_id); - goto restore_opts; - } + if (was_snapshot && !(*flags & MS_RDONLY)) { + printk(KERN_ERR 
"NILFS (device %s): cannot remount snapshot " + "read/write.\n", sb->s_id); + goto restore_opts; } if (!nilfs_valid_fs(nilfs)) { @@ -880,19 +1005,29 @@ static int nilfs_remount(struct super_block *sb, int *flags, char *data) * the RDONLY flag and then mark the partition as valid again. */ down_write(&nilfs->ns_sem); - sbp = nilfs->ns_sbp[0]; - if (!(sbp->s_state & le16_to_cpu(NILFS_VALID_FS)) && - (nilfs->ns_mount_state & NILFS_VALID_FS)) - sbp->s_state = cpu_to_le16(nilfs->ns_mount_state); - sbp->s_mtime = cpu_to_le64(get_seconds()); - nilfs_commit_super(sbi, 1); + nilfs_cleanup_super(sbi); up_write(&nilfs->ns_sem); } else { + __u64 features; + /* * Mounting a RDONLY partition read-write, so reread and * store the current valid flag. (It may have been changed * by fsck since we originally mounted the partition.) */ + down_read(&nilfs->ns_sem); + features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & + ~NILFS_FEATURE_COMPAT_RO_SUPP; + up_read(&nilfs->ns_sem); + if (features) { + printk(KERN_WARNING "NILFS (device %s): couldn't " + "remount RDWR because of unsupported optional " + "features (%llx)\n", + sb->s_id, (unsigned long long)features); + err = -EROFS; + goto restore_opts; + } + sb->s_flags &= ~MS_RDONLY; err = nilfs_attach_segment_constructor(sbi); @@ -1119,7 +1254,7 @@ static void nilfs_inode_init_once(void *obj) init_rwsem(&ii->xattr_sem); #endif nilfs_btnode_cache_init_once(&ii->i_btnode_cache); - ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union; + ii->i_bmap = &ii->i_bmap_data; inode_init_once(&ii->vfs_inode); } diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 8c1097327ab..37de1f062d8 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -38,6 +38,8 @@ static LIST_HEAD(nilfs_objects); static DEFINE_SPINLOCK(nilfs_lock); +static int nilfs_valid_sb(struct nilfs_super_block *sbp); + void nilfs_set_last_segment(struct the_nilfs *nilfs, sector_t start_blocknr, u64 seq, __u64 cno) { @@ -45,6 +47,16 @@ void 
nilfs_set_last_segment(struct the_nilfs *nilfs, nilfs->ns_last_pseg = start_blocknr; nilfs->ns_last_seq = seq; nilfs->ns_last_cno = cno; + + if (!nilfs_sb_dirty(nilfs)) { + if (nilfs->ns_prev_seq == nilfs->ns_last_seq) + goto stay_cursor; + + set_nilfs_sb_dirty(nilfs); + } + nilfs->ns_prev_seq = nilfs->ns_last_seq; + + stay_cursor: spin_unlock(&nilfs->ns_last_segment_lock); } @@ -159,8 +171,7 @@ void put_nilfs(struct the_nilfs *nilfs) kfree(nilfs); } -static int nilfs_load_super_root(struct the_nilfs *nilfs, - struct nilfs_sb_info *sbi, sector_t sr_block) +static int nilfs_load_super_root(struct the_nilfs *nilfs, sector_t sr_block) { struct buffer_head *bh_sr; struct nilfs_super_root *raw_sr; @@ -169,7 +180,7 @@ static int nilfs_load_super_root(struct the_nilfs *nilfs, unsigned inode_size; int err; - err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1); + err = nilfs_read_super_root_block(nilfs, sr_block, &bh_sr, 1); if (unlikely(err)) return err; @@ -248,6 +259,37 @@ static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri) } /** + * nilfs_store_log_cursor - load log cursor from a super block + * @nilfs: nilfs object + * @sbp: buffer storing super block to be read + * + * nilfs_store_log_cursor() reads the last position of the log + * containing a super root from a given super block, and initializes + * relevant information on the nilfs object preparatory for log + * scanning and recovery. 
+ */ +static int nilfs_store_log_cursor(struct the_nilfs *nilfs, + struct nilfs_super_block *sbp) +{ + int ret = 0; + + nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg); + nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno); + nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq); + + nilfs->ns_prev_seq = nilfs->ns_last_seq; + nilfs->ns_seg_seq = nilfs->ns_last_seq; + nilfs->ns_segnum = + nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg); + nilfs->ns_cno = nilfs->ns_last_cno + 1; + if (nilfs->ns_segnum >= nilfs->ns_nsegments) { + printk(KERN_ERR "NILFS invalid last segment number.\n"); + ret = -EINVAL; + } + return ret; +} + +/** * load_nilfs - load and recover the nilfs * @nilfs: the_nilfs structure to be released * @sbi: nilfs_sb_info used to recover past segment @@ -285,13 +327,55 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) nilfs_init_recovery_info(&ri); - err = nilfs_search_super_root(nilfs, sbi, &ri); + err = nilfs_search_super_root(nilfs, &ri); if (unlikely(err)) { - printk(KERN_ERR "NILFS: error searching super root.\n"); - goto failed; + struct nilfs_super_block **sbp = nilfs->ns_sbp; + int blocksize; + + if (err != -EINVAL) + goto scan_error; + + if (!nilfs_valid_sb(sbp[1])) { + printk(KERN_WARNING + "NILFS warning: unable to fall back to spare" + "super block\n"); + goto scan_error; + } + printk(KERN_INFO + "NILFS: try rollback from an earlier position\n"); + + /* + * restore super block with its spare and reconfigure + * relevant states of the nilfs object. 
+ */ + memcpy(sbp[0], sbp[1], nilfs->ns_sbsize); + nilfs->ns_crc_seed = le32_to_cpu(sbp[0]->s_crc_seed); + nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); + + /* verify consistency between two super blocks */ + blocksize = BLOCK_SIZE << le32_to_cpu(sbp[0]->s_log_block_size); + if (blocksize != nilfs->ns_blocksize) { + printk(KERN_WARNING + "NILFS warning: blocksize differs between " + "two super blocks (%d != %d)\n", + blocksize, nilfs->ns_blocksize); + goto scan_error; + } + + err = nilfs_store_log_cursor(nilfs, sbp[0]); + if (err) + goto scan_error; + + /* drop clean flag to allow roll-forward and recovery */ + nilfs->ns_mount_state &= ~NILFS_VALID_FS; + valid_fs = 0; + + err = nilfs_search_super_root(nilfs, &ri); + if (err) + goto scan_error; } - err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root); + err = nilfs_load_super_root(nilfs, ri.ri_super_root); if (unlikely(err)) { printk(KERN_ERR "NILFS: error loading super root.\n"); goto failed; @@ -301,11 +385,23 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) goto skip_recovery; if (s_flags & MS_RDONLY) { + __u64 features; + if (nilfs_test_opt(sbi, NORECOVERY)) { printk(KERN_INFO "NILFS: norecovery option specified. 
" "skipping roll-forward recovery\n"); goto skip_recovery; } + features = le64_to_cpu(nilfs->ns_sbp[0]->s_feature_compat_ro) & + ~NILFS_FEATURE_COMPAT_RO_SUPP; + if (features) { + printk(KERN_ERR "NILFS: couldn't proceed with " + "recovery because of unsupported optional " + "features (%llx)\n", + (unsigned long long)features); + err = -EROFS; + goto failed_unload; + } if (really_read_only) { printk(KERN_ERR "NILFS: write access " "unavailable, cannot proceed.\n"); @@ -320,14 +416,13 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) goto failed_unload; } - err = nilfs_recover_logical_segments(nilfs, sbi, &ri); + err = nilfs_salvage_orphan_logs(nilfs, sbi, &ri); if (err) goto failed_unload; down_write(&nilfs->ns_sem); - nilfs->ns_mount_state |= NILFS_VALID_FS; - nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state); - err = nilfs_commit_super(sbi, 1); + nilfs->ns_mount_state |= NILFS_VALID_FS; /* set "clean" flag */ + err = nilfs_cleanup_super(sbi); up_write(&nilfs->ns_sem); if (err) { @@ -343,6 +438,10 @@ int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) sbi->s_super->s_flags = s_flags; return 0; + scan_error: + printk(KERN_ERR "NILFS: error searching super root.\n"); + goto failed; + failed_unload: nilfs_mdt_destroy(nilfs->ns_cpfile); nilfs_mdt_destroy(nilfs->ns_sufile); @@ -515,8 +614,8 @@ static int nilfs_load_super_block(struct the_nilfs *nilfs, nilfs_swap_super_block(nilfs); } - nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime); - nilfs->ns_sbwtime[1] = valid[!swp] ? 
le64_to_cpu(sbp[1]->s_wtime) : 0; + nilfs->ns_sbwcount = 0; + nilfs->ns_sbwtime = le64_to_cpu(sbp[0]->s_wtime); nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq); *sbpp = sbp[0]; return 0; @@ -557,6 +656,10 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) if (err) goto out; + err = nilfs_check_feature_compatibility(sb, sbp); + if (err) + goto out; + blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); if (sb->s_blocksize != blocksize && !sb_set_blocksize(sb, blocksize)) { @@ -568,7 +671,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) goto out; } - blocksize = sb_min_blocksize(sb, BLOCK_SIZE); + blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE); if (!blocksize) { printk(KERN_ERR "NILFS: unable to set blocksize\n"); err = -EINVAL; @@ -582,7 +685,18 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) if (err) goto failed_sbh; + err = nilfs_check_feature_compatibility(sb, sbp); + if (err) + goto failed_sbh; + blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); + if (blocksize < NILFS_MIN_BLOCK_SIZE || + blocksize > NILFS_MAX_BLOCK_SIZE) { + printk(KERN_ERR "NILFS: couldn't mount because of unsupported " + "filesystem blocksize %d\n", blocksize); + err = -EINVAL; + goto failed_sbh; + } if (sb->s_blocksize != blocksize) { int hw_blocksize = bdev_logical_block_size(sb->s_bdev); @@ -604,6 +718,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) when reloading fails. */ } nilfs->ns_blocksize_bits = sb->s_blocksize_bits; + nilfs->ns_blocksize = blocksize; err = nilfs_store_disk_layout(nilfs, sbp); if (err) @@ -616,23 +731,9 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info; nilfs->ns_bdi = bdi ? 
: &default_backing_dev_info; - /* Finding last segment */ - nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg); - nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno); - nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq); - - nilfs->ns_seg_seq = nilfs->ns_last_seq; - nilfs->ns_segnum = - nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg); - nilfs->ns_cno = nilfs->ns_last_cno + 1; - if (nilfs->ns_segnum >= nilfs->ns_nsegments) { - printk(KERN_ERR "NILFS invalid last segment number.\n"); - err = -EINVAL; + err = nilfs_store_log_cursor(nilfs, sbp); + if (err) goto failed_sbh; - } - /* Dummy values */ - nilfs->ns_free_segments_count = - nilfs->ns_nsegments - (nilfs->ns_segnum + 1); /* Initialize gcinode cache */ err = nilfs_init_gccache(nilfs); diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index 1ab97453369..f785a7b0ab9 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -57,7 +57,8 @@ enum { * @ns_current: back pointer to current mount * @ns_sbh: buffer heads of on-disk super blocks * @ns_sbp: pointers to super block data - * @ns_sbwtime: previous write time of super blocks + * @ns_sbwtime: previous write time of super block + * @ns_sbwcount: write count of super block * @ns_sbsize: size of valid data in super block * @ns_supers: list of nilfs super block structs * @ns_seg_seq: segment sequence counter @@ -73,7 +74,7 @@ enum { * @ns_last_seq: sequence value of the latest segment * @ns_last_cno: checkpoint number of the latest segment * @ns_prot_seq: least sequence number of segments which must not be reclaimed - * @ns_free_segments_count: counter of free segments + * @ns_prev_seq: base sequence number used to decide if advance log cursor * @ns_segctor_sem: segment constructor semaphore * @ns_dat: DAT file inode * @ns_cpfile: checkpoint file inode @@ -82,6 +83,7 @@ enum { * @ns_gc_inodes: dummy inodes to keep live blocks * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks * @ns_blocksize_bits: bit length of block size + * 
@ns_blocksize: block size * @ns_nsegments: number of segments in filesystem * @ns_blocks_per_segment: number of blocks per segment * @ns_r_segments_percentage: reserved segments percentage @@ -119,7 +121,8 @@ struct the_nilfs { */ struct buffer_head *ns_sbh[2]; struct nilfs_super_block *ns_sbp[2]; - time_t ns_sbwtime[2]; + time_t ns_sbwtime; + unsigned ns_sbwcount; unsigned ns_sbsize; unsigned ns_mount_state; @@ -149,7 +152,7 @@ struct the_nilfs { u64 ns_last_seq; __u64 ns_last_cno; u64 ns_prot_seq; - unsigned long ns_free_segments_count; + u64 ns_prev_seq; struct rw_semaphore ns_segctor_sem; @@ -168,6 +171,7 @@ struct the_nilfs { /* Disk layout information (static) */ unsigned int ns_blocksize_bits; + unsigned int ns_blocksize; unsigned long ns_nsegments; unsigned long ns_blocks_per_segment; unsigned long ns_r_segments_percentage; @@ -203,20 +207,17 @@ THE_NILFS_FNS(SB_DIRTY, sb_dirty) /* Minimum interval of periodical update of superblocks (in seconds) */ #define NILFS_SB_FREQ 10 -#define NILFS_ALTSB_FREQ 60 /* spare superblock */ static inline int nilfs_sb_need_update(struct the_nilfs *nilfs) { u64 t = get_seconds(); - return t < nilfs->ns_sbwtime[0] || - t > nilfs->ns_sbwtime[0] + NILFS_SB_FREQ; + return t < nilfs->ns_sbwtime || t > nilfs->ns_sbwtime + NILFS_SB_FREQ; } -static inline int nilfs_altsb_need_update(struct the_nilfs *nilfs) +static inline int nilfs_sb_will_flip(struct the_nilfs *nilfs) { - u64 t = get_seconds(); - struct nilfs_super_block **sbp = nilfs->ns_sbp; - return sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ; + int flip_bits = nilfs->ns_sbwcount & 0x0FL; + return (flip_bits != 0x08 && flip_bits != 0x0F); } void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 625de9d7088..9b57c0350ff 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -760,13 +760,13 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb) if (osb->osb_commit_interval) commit_interval 
= osb->osb_commit_interval; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_commit_interval = commit_interval; if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 5dcd4b0c553..72c52656dc2 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -459,7 +459,6 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, } /* everything is up and running, commence */ - INIT_RCU_HEAD(&p->rcu_head); rcu_assign_pointer(ptbl->part[partno], p); /* suppress uevent if the disk supresses it */ diff --git a/fs/proc/base.c b/fs/proc/base.c index acb7ef80ea4..69254a365ce 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -63,6 +63,7 @@ #include <linux/namei.h> #include <linux/mnt_namespace.h> #include <linux/mm.h> +#include <linux/swap.h> #include <linux/rcupdate.h> #include <linux/kallsyms.h> #include <linux/stacktrace.h> @@ -427,17 +428,14 @@ static const struct file_operations proc_lstats_operations = { #endif -/* The badness from the OOM killer */ -unsigned long badness(struct task_struct *p, unsigned long uptime); static int proc_oom_score(struct task_struct *task, char *buffer) { unsigned long points = 0; - struct timespec uptime; - do_posix_clock_monotonic_gettime(&uptime); read_lock(&tasklist_lock); if (pid_alive(task)) - points = badness(task, uptime.tv_sec); + points = oom_badness(task, NULL, NULL, + totalram_pages + total_swap_pages); read_unlock(&tasklist_lock); return sprintf(buffer, "%lu\n", points); } @@ -1039,8 +1037,24 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, return -EACCES; } + /* + * Warn that /proc/pid/oom_adj is deprecated, see + * Documentation/feature-removal-schedule.txt. 
+ */ + printk_once(KERN_WARNING "%s (%d): /proc/%d/oom_adj is deprecated, " + "please use /proc/%d/oom_score_adj instead.\n", + current->comm, task_pid_nr(current), + task_pid_nr(task), task_pid_nr(task)); task->signal->oom_adj = oom_adjust; - + /* + * Scale /proc/pid/oom_score_adj appropriately ensuring that a maximum + * value is always attainable. + */ + if (task->signal->oom_adj == OOM_ADJUST_MAX) + task->signal->oom_score_adj = OOM_SCORE_ADJ_MAX; + else + task->signal->oom_score_adj = (oom_adjust * OOM_SCORE_ADJ_MAX) / + -OOM_DISABLE; unlock_task_sighand(task, &flags); put_task_struct(task); @@ -1053,6 +1067,82 @@ static const struct file_operations proc_oom_adjust_operations = { .llseek = generic_file_llseek, }; +static ssize_t oom_score_adj_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task = get_proc_task(file->f_path.dentry->d_inode); + char buffer[PROC_NUMBUF]; + int oom_score_adj = OOM_SCORE_ADJ_MIN; + unsigned long flags; + size_t len; + + if (!task) + return -ESRCH; + if (lock_task_sighand(task, &flags)) { + oom_score_adj = task->signal->oom_score_adj; + unlock_task_sighand(task, &flags); + } + put_task_struct(task); + len = snprintf(buffer, sizeof(buffer), "%d\n", oom_score_adj); + return simple_read_from_buffer(buf, count, ppos, buffer, len); +} + +static ssize_t oom_score_adj_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct task_struct *task; + char buffer[PROC_NUMBUF]; + unsigned long flags; + long oom_score_adj; + int err; + + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; + if (copy_from_user(buffer, buf, count)) + return -EFAULT; + + err = strict_strtol(strstrip(buffer), 0, &oom_score_adj); + if (err) + return -EINVAL; + if (oom_score_adj < OOM_SCORE_ADJ_MIN || + oom_score_adj > OOM_SCORE_ADJ_MAX) + return -EINVAL; + + task = get_proc_task(file->f_path.dentry->d_inode); + if (!task) + return -ESRCH; + 
if (!lock_task_sighand(task, &flags)) { + put_task_struct(task); + return -ESRCH; + } + if (oom_score_adj < task->signal->oom_score_adj && + !capable(CAP_SYS_RESOURCE)) { + unlock_task_sighand(task, &flags); + put_task_struct(task); + return -EACCES; + } + + task->signal->oom_score_adj = oom_score_adj; + /* + * Scale /proc/pid/oom_adj appropriately ensuring that OOM_DISABLE is + * always attainable. + */ + if (task->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + task->signal->oom_adj = OOM_DISABLE; + else + task->signal->oom_adj = (oom_score_adj * OOM_ADJUST_MAX) / + OOM_SCORE_ADJ_MAX; + unlock_task_sighand(task, &flags); + put_task_struct(task); + return count; +} + +static const struct file_operations proc_oom_score_adj_operations = { + .read = oom_score_adj_read, + .write = oom_score_adj_write, +}; + #ifdef CONFIG_AUDITSYSCALL #define TMPBUFLEN 21 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, @@ -2625,6 +2715,7 @@ static const struct pid_entry tgid_base_stuff[] = { #endif INF("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUGO, proc_sessionid_operations), @@ -2959,6 +3050,7 @@ static const struct pid_entry tid_base_stuff[] = { #endif INF("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adjust_operations), + REG("oom_score_adj", S_IRUGO|S_IWUSR, proc_oom_score_adj_operations), #ifdef CONFIG_AUDITSYSCALL REG("loginuid", S_IWUSR|S_IRUGO, proc_loginuid_operations), REG("sessionid", S_IRUSR, proc_sessionid_operations), diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 437d2ca2de9..ef72b169942 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -132,6 +132,22 @@ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_state_lock); __cacheline_aligned_in_smp 
DEFINE_SPINLOCK(dq_data_lock); EXPORT_SYMBOL(dq_data_lock); +void __quota_error(struct super_block *sb, const char *func, + const char *fmt, ...) +{ + va_list args; + + if (printk_ratelimit()) { + va_start(args, fmt); + printk(KERN_ERR "Quota error (device %s): %s: ", + sb->s_id, func); + vprintk(fmt, args); + printk("\n"); + va_end(args); + } +} +EXPORT_SYMBOL(__quota_error); + #if defined(CONFIG_QUOTA_DEBUG) || defined(CONFIG_PRINT_QUOTA_WARNING) static char *quotatypes[] = INITQFNAMES; #endif @@ -705,11 +721,8 @@ void dqput(struct dquot *dquot) return; #ifdef CONFIG_QUOTA_DEBUG if (!atomic_read(&dquot->dq_count)) { - printk("VFS: dqput: trying to free free dquot\n"); - printk("VFS: device %s, dquot of %s %d\n", - dquot->dq_sb->s_id, - quotatypes[dquot->dq_type], - dquot->dq_id); + quota_error(dquot->dq_sb, "trying to free free dquot of %s %d", + quotatypes[dquot->dq_type], dquot->dq_id); BUG(); } #endif @@ -732,9 +745,9 @@ we_slept: /* Commit dquot before releasing */ ret = dquot->dq_sb->dq_op->write_dquot(dquot); if (ret < 0) { - printk(KERN_ERR "VFS: cannot write quota structure on " - "device %s (error %d). Quota may get out of " - "sync!\n", dquot->dq_sb->s_id, ret); + quota_error(dquot->dq_sb, "Can't write quota structure" + " (error %d). Quota may get out of sync!", + ret); /* * We clear dirty bit anyway, so that we avoid * infinite loop here @@ -914,9 +927,9 @@ static void add_dquot_ref(struct super_block *sb, int type) #ifdef CONFIG_QUOTA_DEBUG if (reserved) { - printk(KERN_WARNING "VFS (%s): Writes happened before quota" - " was turned on thus quota information is probably " - "inconsistent. Please run quotacheck(8).\n", sb->s_id); + quota_error(sb, "Writes happened before quota was turned on " + "thus quota information is probably inconsistent. 
" + "Please run quotacheck(8)"); } #endif } @@ -947,7 +960,9 @@ static int remove_inode_dquot_ref(struct inode *inode, int type, if (dqput_blocks(dquot)) { #ifdef CONFIG_QUOTA_DEBUG if (atomic_read(&dquot->dq_count) != 1) - printk(KERN_WARNING "VFS: Adding dquot with dq_count %d to dispose list.\n", atomic_read(&dquot->dq_count)); + quota_error(inode->i_sb, "Adding dquot with " + "dq_count %d to dispose list", + atomic_read(&dquot->dq_count)); #endif spin_lock(&dq_list_lock); /* As dquot must have currently users it can't be on @@ -986,6 +1001,7 @@ static void remove_dquot_ref(struct super_block *sb, int type, struct list_head *tofree_head) { struct inode *inode; + int reserved = 0; spin_lock(&inode_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { @@ -995,10 +1011,20 @@ static void remove_dquot_ref(struct super_block *sb, int type, * only quota pointers and these have separate locking * (dqptr_sem). */ - if (!IS_NOQUOTA(inode)) + if (!IS_NOQUOTA(inode)) { + if (unlikely(inode_get_rsv_space(inode) > 0)) + reserved = 1; remove_inode_dquot_ref(inode, type, tofree_head); + } } spin_unlock(&inode_lock); +#ifdef CONFIG_QUOTA_DEBUG + if (reserved) { + printk(KERN_WARNING "VFS (%s): Writes happened after quota" + " was disabled thus quota information is probably " + "inconsistent. 
Please run quotacheck(8).\n", sb->s_id); + } +#endif } /* Gather all references from inodes and drop them */ @@ -1304,6 +1330,15 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space) return QUOTA_NL_NOWARN; } +static int dquot_active(const struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (IS_NOQUOTA(inode)) + return 0; + return sb_any_quota_loaded(sb) & ~sb_any_quota_suspended(sb); +} + /* * Initialize quota pointers in inode * @@ -1323,7 +1358,7 @@ static void __dquot_initialize(struct inode *inode, int type) /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + if (!dquot_active(inode)) return; /* First get references to structures we might need. */ @@ -1507,7 +1542,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number, int flags) * First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { + if (!dquot_active(inode)) { inode_incr_space(inode, number, reserve); goto out; } @@ -1559,7 +1594,7 @@ int dquot_alloc_inode(const struct inode *inode) /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + if (!dquot_active(inode)) return 0; for (cnt = 0; cnt < MAXQUOTAS; cnt++) warntype[cnt] = QUOTA_NL_NOWARN; @@ -1596,7 +1631,7 @@ int dquot_claim_space_nodirty(struct inode *inode, qsize_t number) { int cnt; - if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { + if (!dquot_active(inode)) { inode_claim_rsv_space(inode, number); return 0; } @@ -1629,7 +1664,7 @@ void __dquot_free_space(struct inode *inode, qsize_t number, int flags) /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and 
are already holding the mutex */ - if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) { + if (!dquot_active(inode)) { inode_decr_space(inode, number, reserve); return; } @@ -1667,7 +1702,7 @@ void dquot_free_inode(const struct inode *inode) /* First test before acquiring mutex - solves deadlocks when we * re-enter the quota code and are already holding the mutex */ - if (!sb_any_quota_active(inode->i_sb) || IS_NOQUOTA(inode)) + if (!dquot_active(inode)) return; down_read(&sb_dqopt(inode->i_sb)->dqptr_sem); @@ -1790,7 +1825,7 @@ int dquot_transfer(struct inode *inode, struct iattr *iattr) struct super_block *sb = inode->i_sb; int ret; - if (!sb_any_quota_active(sb) || IS_NOQUOTA(inode)) + if (!dquot_active(inode)) return 0; if (iattr->ia_valid & ATTR_UID && iattr->ia_uid != inode->i_uid) @@ -1957,7 +1992,7 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) truncate_inode_pages(&toputinode[cnt]->i_data, 0); mutex_unlock(&toputinode[cnt]->i_mutex); - mark_inode_dirty(toputinode[cnt]); + mark_inode_dirty_sync(toputinode[cnt]); } mutex_unlock(&dqopt->dqonoff_mutex); } @@ -2270,7 +2305,7 @@ static void do_get_dqblk(struct dquot *dquot, struct fs_disk_quota *di) memset(di, 0, sizeof(*di)); di->d_version = FS_DQUOT_VERSION; di->d_flags = dquot->dq_type == USRQUOTA ? 
- XFS_USER_QUOTA : XFS_GROUP_QUOTA; + FS_USER_QUOTA : FS_GROUP_QUOTA; di->d_id = dquot->dq_id; spin_lock(&dq_data_lock); diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c index 24f03407eeb..9e48874eabc 100644 --- a/fs/quota/quota_tree.c +++ b/fs/quota/quota_tree.c @@ -65,8 +65,7 @@ static ssize_t write_blk(struct qtree_mem_dqinfo *info, uint blk, char *buf) ret = sb->s_op->quota_write(sb, info->dqi_type, buf, info->dqi_usable_bs, blk << info->dqi_blocksize_bits); if (ret != info->dqi_usable_bs) { - q_warn(KERN_WARNING "VFS: dquota write failed on " - "dev %s\n", sb->s_id); + quota_error(sb, "dquota write failed"); if (ret >= 0) ret = -EIO; } @@ -160,9 +159,8 @@ static int remove_free_dqentry(struct qtree_mem_dqinfo *info, char *buf, dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); /* No matter whether write succeeds block is out of list */ if (write_blk(info, blk, buf) < 0) - q_warn(KERN_ERR - "VFS: Can't write block (%u) with free entries.\n", - blk); + quota_error(info->dqi_sb, "Can't write block (%u) " + "with free entries", blk); return 0; out_buf: kfree(tmpbuf); @@ -252,9 +250,8 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, if (le16_to_cpu(dh->dqdh_entries) + 1 >= qtree_dqstr_in_blk(info)) { *err = remove_free_dqentry(info, buf, blk); if (*err < 0) { - q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't " - "remove block (%u) from entry free list.\n", - blk); + quota_error(dquot->dq_sb, "Can't remove block (%u) " + "from entry free list", blk); goto out_buf; } } @@ -268,16 +265,15 @@ static uint find_free_dqentry(struct qtree_mem_dqinfo *info, } #ifdef __QUOTA_QT_PARANOIA if (i == qtree_dqstr_in_blk(info)) { - printk(KERN_ERR "VFS: find_free_dqentry(): Data block full " - "but it shouldn't.\n"); + quota_error(dquot->dq_sb, "Data block full but it shouldn't"); *err = -EIO; goto out_buf; } #endif *err = write_blk(info, blk, buf); if (*err < 0) { - q_warn(KERN_ERR "VFS: find_free_dqentry(): Can't write quota " - "data block 
%u.\n", blk); + quota_error(dquot->dq_sb, "Can't write quota data block %u", + blk); goto out_buf; } dquot->dq_off = (blk << info->dqi_blocksize_bits) + @@ -311,8 +307,8 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, } else { ret = read_blk(info, *treeblk, buf); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't read tree quota block " - "%u.\n", *treeblk); + quota_error(dquot->dq_sb, "Can't read tree quota " + "block %u", *treeblk); goto out_buf; } } @@ -323,9 +319,9 @@ static int do_insert_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, if (depth == info->dqi_qtree_depth - 1) { #ifdef __QUOTA_QT_PARANOIA if (newblk) { - printk(KERN_ERR "VFS: Inserting already present quota " - "entry (block %u).\n", - le32_to_cpu(ref[get_index(info, + quota_error(dquot->dq_sb, "Inserting already present " + "quota entry (block %u)", + le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)])); ret = -EIO; goto out_buf; @@ -373,8 +369,8 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) if (!dquot->dq_off) { ret = dq_insert_tree(info, dquot); if (ret < 0) { - q_warn(KERN_ERR "VFS: Error %zd occurred while " - "creating quota.\n", ret); + quota_error(sb, "Error %zd occurred while creating " + "quota", ret); kfree(ddquot); return ret; } @@ -385,8 +381,7 @@ int qtree_write_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) ret = sb->s_op->quota_write(sb, type, ddquot, info->dqi_entry_size, dquot->dq_off); if (ret != info->dqi_entry_size) { - q_warn(KERN_WARNING "VFS: dquota write failed on dev %s\n", - sb->s_id); + quota_error(sb, "dquota write failed"); if (ret >= 0) ret = -ENOSPC; } else { @@ -410,14 +405,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, if (!buf) return -ENOMEM; if (dquot->dq_off >> info->dqi_blocksize_bits != blk) { - q_warn(KERN_ERR "VFS: Quota structure has offset to other " - "block (%u) than it should (%u).\n", blk, - (uint)(dquot->dq_off >> 
info->dqi_blocksize_bits)); + quota_error(dquot->dq_sb, "Quota structure has offset to " + "other block (%u) than it should (%u)", blk, + (uint)(dquot->dq_off >> info->dqi_blocksize_bits)); goto out_buf; } ret = read_blk(info, blk, buf); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", blk); + quota_error(dquot->dq_sb, "Can't read quota data block %u", + blk); goto out_buf; } dh = (struct qt_disk_dqdbheader *)buf; @@ -427,8 +423,8 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, if (ret >= 0) ret = put_free_dqblk(info, buf, blk); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't move quota data block (%u) " - "to free list.\n", blk); + quota_error(dquot->dq_sb, "Can't move quota data block " + "(%u) to free list", blk); goto out_buf; } } else { @@ -440,15 +436,15 @@ static int free_dqentry(struct qtree_mem_dqinfo *info, struct dquot *dquot, /* Insert will write block itself */ ret = insert_free_dqentry(info, buf, blk); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't insert quota data " - "block (%u) to free entry list.\n", blk); + quota_error(dquot->dq_sb, "Can't insert quota " + "data block (%u) to free entry list", blk); goto out_buf; } } else { ret = write_blk(info, blk, buf); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't write quota data " - "block %u\n", blk); + quota_error(dquot->dq_sb, "Can't write quota " + "data block %u", blk); goto out_buf; } } @@ -472,7 +468,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, return -ENOMEM; ret = read_blk(info, *blk, buf); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't read quota data block %u\n", *blk); + quota_error(dquot->dq_sb, "Can't read quota data " + "block %u", blk); goto out_buf; } newblk = le32_to_cpu(ref[get_index(info, dquot->dq_id, depth)]); @@ -496,8 +493,8 @@ static int remove_tree(struct qtree_mem_dqinfo *info, struct dquot *dquot, } else { ret = write_blk(info, *blk, buf); if (ret < 0) - q_warn(KERN_ERR "VFS: Can't write quota 
tree " - "block %u.\n", *blk); + quota_error(dquot->dq_sb, "Can't write quota " + "tree block %u", blk); } } out_buf: @@ -529,7 +526,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, return -ENOMEM; ret = read_blk(info, blk, buf); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + quota_error(dquot->dq_sb, "Can't read quota tree " + "block %u", blk); goto out_buf; } ddquot = buf + sizeof(struct qt_disk_dqdbheader); @@ -539,8 +537,8 @@ static loff_t find_block_dqentry(struct qtree_mem_dqinfo *info, ddquot += info->dqi_entry_size; } if (i == qtree_dqstr_in_blk(info)) { - q_warn(KERN_ERR "VFS: Quota for id %u referenced " - "but not present.\n", dquot->dq_id); + quota_error(dquot->dq_sb, "Quota for id %u referenced " + "but not present", dquot->dq_id); ret = -EIO; goto out_buf; } else { @@ -564,7 +562,8 @@ static loff_t find_tree_dqentry(struct qtree_mem_dqinfo *info, return -ENOMEM; ret = read_blk(info, blk, buf); if (ret < 0) { - q_warn(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + quota_error(dquot->dq_sb, "Can't read quota tree block %u", + blk); goto out_buf; } ret = 0; @@ -598,7 +597,7 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) #ifdef __QUOTA_QT_PARANOIA /* Invalidated quota? */ if (!sb_dqopt(dquot->dq_sb)->files[type]) { - printk(KERN_ERR "VFS: Quota invalidated while reading!\n"); + quota_error(sb, "Quota invalidated while reading!"); return -EIO; } #endif @@ -607,8 +606,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) offset = find_dqentry(info, dquot); if (offset <= 0) { /* Entry not present? 
*/ if (offset < 0) - q_warn(KERN_ERR "VFS: Can't read quota " - "structure for id %u.\n", dquot->dq_id); + quota_error(sb, "Can't read quota structure " + "for id %u", dquot->dq_id); dquot->dq_off = 0; set_bit(DQ_FAKE_B, &dquot->dq_flags); memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); @@ -625,8 +624,8 @@ int qtree_read_dquot(struct qtree_mem_dqinfo *info, struct dquot *dquot) if (ret != info->dqi_entry_size) { if (ret >= 0) ret = -EIO; - q_warn(KERN_ERR "VFS: Error while reading quota " - "structure for id %u.\n", dquot->dq_id); + quota_error(sb, "Error while reading quota structure for id %u", + dquot->dq_id); set_bit(DQ_FAKE_B, &dquot->dq_flags); memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); kfree(ddquot); diff --git a/fs/quota/quota_tree.h b/fs/quota/quota_tree.h index ccc3e71fb1d..a1ab8db81a5 100644 --- a/fs/quota/quota_tree.h +++ b/fs/quota/quota_tree.h @@ -22,10 +22,4 @@ struct qt_disk_dqdbheader { #define QT_TREEOFF 1 /* Offset of tree in file in blocks */ -#define q_warn(fmt, args...) 
\ -do { \ - if (printk_ratelimit()) \ - printk(fmt, ## args); \ -} while(0) - #endif /* _LINUX_QUOTAIO_TREE_H */ diff --git a/fs/quota/quota_v1.c b/fs/quota/quota_v1.c index 4af344c5852..34b37a67bb1 100644 --- a/fs/quota/quota_v1.c +++ b/fs/quota/quota_v1.c @@ -95,8 +95,7 @@ static int v1_commit_dqblk(struct dquot *dquot) (char *)&dqblk, sizeof(struct v1_disk_dqblk), v1_dqoff(dquot->dq_id)); if (ret != sizeof(struct v1_disk_dqblk)) { - printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", - dquot->dq_sb->s_id); + quota_error(dquot->dq_sb, "dquota write failed"); if (ret >= 0) ret = -EIO; goto out; diff --git a/fs/quota/quota_v2.c b/fs/quota/quota_v2.c index 135206af145..65444d29406 100644 --- a/fs/quota/quota_v2.c +++ b/fs/quota/quota_v2.c @@ -63,9 +63,8 @@ static int v2_read_header(struct super_block *sb, int type, size = sb->s_op->quota_read(sb, type, (char *)dqhead, sizeof(struct v2_disk_dqheader), 0); if (size != sizeof(struct v2_disk_dqheader)) { - q_warn(KERN_WARNING "quota_v2: Failed header read:" - " expected=%zd got=%zd\n", - sizeof(struct v2_disk_dqheader), size); + quota_error(sb, "Failed header read: expected=%zd got=%zd", + sizeof(struct v2_disk_dqheader), size); return 0; } return 1; @@ -106,8 +105,7 @@ static int v2_read_file_info(struct super_block *sb, int type) size = sb->s_op->quota_read(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); if (size != sizeof(struct v2_disk_dqinfo)) { - q_warn(KERN_WARNING "quota_v2: Can't read info structure on device %s.\n", - sb->s_id); + quota_error(sb, "Can't read info structure"); return -1; } info->dqi_priv = kmalloc(sizeof(struct qtree_mem_dqinfo), GFP_NOFS); @@ -167,8 +165,7 @@ static int v2_write_file_info(struct super_block *sb, int type) size = sb->s_op->quota_write(sb, type, (char *)&dinfo, sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); if (size != sizeof(struct v2_disk_dqinfo)) { - q_warn(KERN_WARNING "Can't write info structure on device %s.\n", - sb->s_id); + quota_error(sb, 
"Can't write info structure"); return -1; } return 0; diff --git a/fs/readdir.c b/fs/readdir.c index 7723401f8d8..356f71528ad 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -4,6 +4,7 @@ * Copyright (C) 1995 Linus Torvalds */ +#include <linux/stddef.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/time.h> @@ -54,7 +55,6 @@ EXPORT_SYMBOL(vfs_readdir); * anyway. Thus the special "fillonedir()" function for that * case (the low-level handlers don't need to care about this). */ -#define NAME_OFFSET(de) ((int) ((de)->d_name - (char __user *) (de))) #ifdef __ARCH_WANT_OLD_READDIR @@ -152,7 +152,8 @@ static int filldir(void * __buf, const char * name, int namlen, loff_t offset, struct linux_dirent __user * dirent; struct getdents_callback * buf = (struct getdents_callback *) __buf; unsigned long d_ino; - int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 2, sizeof(long)); + int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2, + sizeof(long)); buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) @@ -237,7 +238,8 @@ static int filldir64(void * __buf, const char * name, int namlen, loff_t offset, { struct linux_dirent64 __user *dirent; struct getdents_callback64 * buf = (struct getdents_callback64 *) __buf; - int reclen = ALIGN(NAME_OFFSET(dirent) + namlen + 1, sizeof(u64)); + int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, + sizeof(u64)); buf->error = -EINVAL; /* only used if we fail.. */ if (reclen > buf->count) diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 1beaa739d0a..1b27b5688f6 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -593,7 +593,8 @@ EXPORT_SYMBOL_GPL(sysfs_add_file_to_group); * @mode: file permissions. 
* */ -int sysfs_chmod_file(struct kobject *kobj, struct attribute *attr, mode_t mode) +int sysfs_chmod_file(struct kobject *kobj, const struct attribute *attr, + mode_t mode) { struct sysfs_dirent *sd; struct iattr newattrs; diff --git a/fs/udf/file.c b/fs/udf/file.c index 94e06d6bddb..6e450e01a1b 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -36,7 +36,6 @@ #include <linux/pagemap.h> #include <linux/buffer_head.h> #include <linux/aio.h> -#include <linux/smp_lock.h> #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/udf/super.c b/fs/udf/super.c index 612d1e2e285..12bb651e540 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1579,9 +1579,7 @@ static int udf_load_sequence(struct super_block *sb, struct buffer_head *bh, { struct anchorVolDescPtr *anchor; long main_s, main_e, reserve_s, reserve_e; - struct udf_sb_info *sbi; - sbi = UDF_SB(sb); anchor = (struct anchorVolDescPtr *)bh->b_data; /* Locate the main sequence */ diff --git a/fs/xfs/linux-2.6/xfs_quotaops.c b/fs/xfs/linux-2.6/xfs_quotaops.c index bfd5ac9d1f6..29b9d642e93 100644 --- a/fs/xfs/linux-2.6/xfs_quotaops.c +++ b/fs/xfs/linux-2.6/xfs_quotaops.c @@ -68,15 +68,15 @@ xfs_fs_set_xstate( if (op != Q_XQUOTARM && !XFS_IS_QUOTA_RUNNING(mp)) return -ENOSYS; - if (uflags & XFS_QUOTA_UDQ_ACCT) + if (uflags & FS_QUOTA_UDQ_ACCT) flags |= XFS_UQUOTA_ACCT; - if (uflags & XFS_QUOTA_PDQ_ACCT) + if (uflags & FS_QUOTA_PDQ_ACCT) flags |= XFS_PQUOTA_ACCT; - if (uflags & XFS_QUOTA_GDQ_ACCT) + if (uflags & FS_QUOTA_GDQ_ACCT) flags |= XFS_GQUOTA_ACCT; - if (uflags & XFS_QUOTA_UDQ_ENFD) + if (uflags & FS_QUOTA_UDQ_ENFD) flags |= XFS_UQUOTA_ENFD; - if (uflags & (XFS_QUOTA_PDQ_ENFD|XFS_QUOTA_GDQ_ENFD)) + if (uflags & (FS_QUOTA_PDQ_ENFD|FS_QUOTA_GDQ_ENFD)) flags |= XFS_OQUOTA_ENFD; switch (op) { diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index d257eb8557c..45e5849df23 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -810,9 +810,9 @@ xfs_qm_export_dquot( 
} #ifdef DEBUG - if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == XFS_USER_QUOTA) || + if (((XFS_IS_UQUOTA_ENFORCED(mp) && dst->d_flags == FS_USER_QUOTA) || (XFS_IS_OQUOTA_ENFORCED(mp) && - (dst->d_flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)))) && + (dst->d_flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)))) && dst->d_id != 0) { if (((int) dst->d_bcount >= (int) dst->d_blk_softlimit) && (dst->d_blk_softlimit > 0)) { @@ -833,17 +833,17 @@ xfs_qm_export_qtype_flags( /* * Can't be more than one, or none. */ - ASSERT((flags & (XFS_PROJ_QUOTA | XFS_USER_QUOTA)) != - (XFS_PROJ_QUOTA | XFS_USER_QUOTA)); - ASSERT((flags & (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)) != - (XFS_PROJ_QUOTA | XFS_GROUP_QUOTA)); - ASSERT((flags & (XFS_USER_QUOTA | XFS_GROUP_QUOTA)) != - (XFS_USER_QUOTA | XFS_GROUP_QUOTA)); - ASSERT((flags & (XFS_PROJ_QUOTA|XFS_USER_QUOTA|XFS_GROUP_QUOTA)) != 0); + ASSERT((flags & (FS_PROJ_QUOTA | FS_USER_QUOTA)) != + (FS_PROJ_QUOTA | FS_USER_QUOTA)); + ASSERT((flags & (FS_PROJ_QUOTA | FS_GROUP_QUOTA)) != + (FS_PROJ_QUOTA | FS_GROUP_QUOTA)); + ASSERT((flags & (FS_USER_QUOTA | FS_GROUP_QUOTA)) != + (FS_USER_QUOTA | FS_GROUP_QUOTA)); + ASSERT((flags & (FS_PROJ_QUOTA|FS_USER_QUOTA|FS_GROUP_QUOTA)) != 0); return (flags & XFS_DQ_USER) ? - XFS_USER_QUOTA : (flags & XFS_DQ_PROJ) ? - XFS_PROJ_QUOTA : XFS_GROUP_QUOTA; + FS_USER_QUOTA : (flags & XFS_DQ_PROJ) ? + FS_PROJ_QUOTA : FS_GROUP_QUOTA; } STATIC uint @@ -854,16 +854,16 @@ xfs_qm_export_flags( uflags = 0; if (flags & XFS_UQUOTA_ACCT) - uflags |= XFS_QUOTA_UDQ_ACCT; + uflags |= FS_QUOTA_UDQ_ACCT; if (flags & XFS_PQUOTA_ACCT) - uflags |= XFS_QUOTA_PDQ_ACCT; + uflags |= FS_QUOTA_PDQ_ACCT; if (flags & XFS_GQUOTA_ACCT) - uflags |= XFS_QUOTA_GDQ_ACCT; + uflags |= FS_QUOTA_GDQ_ACCT; if (flags & XFS_UQUOTA_ENFD) - uflags |= XFS_QUOTA_UDQ_ENFD; + uflags |= FS_QUOTA_UDQ_ENFD; if (flags & (XFS_OQUOTA_ENFD)) { uflags |= (flags & XFS_GQUOTA_ACCT) ? 
- XFS_QUOTA_GDQ_ENFD : XFS_QUOTA_PDQ_ENFD; + FS_QUOTA_GDQ_ENFD : FS_QUOTA_PDQ_ENFD; } return (uflags); } |