diff options
Diffstat (limited to 'fs')
114 files changed, 4311 insertions, 3310 deletions
diff --git a/fs/9p/mux.c b/fs/9p/mux.c index f4407eb276c..12e1baa4508 100644 --- a/fs/9p/mux.c +++ b/fs/9p/mux.c @@ -712,7 +712,7 @@ static void v9fs_read_work(void *a) * v9fs_send_request - send 9P request * The function can sleep until the request is scheduled for sending. * The function can be interrupted. Return from the function is not - * a guarantee that the request is sent succesfully. Can return errors + * a guarantee that the request is sent successfully. Can return errors * that can be retrieved by PTR_ERR macros. * * @m: mux data diff --git a/fs/Kconfig b/fs/Kconfig index 1cdc043922d..6dc8cfd6d80 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1116,7 +1116,7 @@ config JFFS2_SUMMARY config JFFS2_FS_XATTR bool "JFFS2 XATTR support (EXPERIMENTAL)" - depends on JFFS2_FS && EXPERIMENTAL && !JFFS2_FS_WRITEBUFFER + depends on JFFS2_FS && EXPERIMENTAL default n help Extended attributes are name:value pairs associated with inodes by @@ -1490,7 +1490,12 @@ config NFSD select LOCKD select SUNRPC select EXPORTFS - select NFS_ACL_SUPPORT if NFSD_V3_ACL || NFSD_V2_ACL + select NFSD_V2_ACL if NFSD_V3_ACL + select NFS_ACL_SUPPORT if NFSD_V2_ACL + select NFSD_TCP if NFSD_V4 + select CRYPTO_MD5 if NFSD_V4 + select CRYPTO if NFSD_V4 + select FS_POSIX_ACL if NFSD_V4 help If you want your Linux box to act as an NFS *server*, so that other computers on your local network which support NFS can access certain @@ -1528,7 +1533,6 @@ config NFSD_V3 config NFSD_V3_ACL bool "Provide server support for the NFSv3 ACL protocol extension" depends on NFSD_V3 - select NFSD_V2_ACL help Implement the NFSv3 ACL protocol extension for manipulating POSIX Access Control Lists on exported file systems. NFS clients should @@ -1538,10 +1542,6 @@ config NFSD_V3_ACL config NFSD_V4 bool "Provide NFSv4 server support (EXPERIMENTAL)" depends on NFSD_V3 && EXPERIMENTAL - select NFSD_TCP - select CRYPTO_MD5 - select CRYPTO - select FS_POSIX_ACL help If you would like to include the NFSv4 server as well as the NFSv2 and NFSv3 servers, say Y here. This feature is experimental, and @@ -1722,7 +1722,7 @@ config CIFS_STATS mounted by the cifs client to be displayed in /proc/fs/cifs/Stats config CIFS_STATS2 - bool "CIFS extended statistics" + bool "Extended statistics" depends on CIFS_STATS help Enabling this option will allow more detailed statistics on SMB @@ -1735,6 +1735,32 @@ config CIFS_STATS2 Unless you are a developer or are doing network performance analysis or tuning, say N. +config CIFS_WEAK_PW_HASH + bool "Support legacy servers which use weaker LANMAN security" + depends on CIFS + help + Modern CIFS servers including Samba and most Windows versions + (since 1997) support stronger NTLM (and even NTLMv2 and Kerberos) + security mechanisms. These hash the password more securely + than the mechanisms used in the older LANMAN version of the + SMB protocol needed to establish sessions with old SMB servers. + + Enabling this option allows the cifs module to mount to older + LANMAN based servers such as OS/2 and Windows 95, but such + mounts may be less secure than mounts using NTLM or more recent + security mechanisms if you are on a public network. Unless you + have a need to access old SMB servers (and are on a private + network) you probably want to say N. Even if this support + is enabled in the kernel build, they will not be used + automatically. At runtime LANMAN mounts are disabled but + can be set to required (or optional) either in + /proc/fs/cifs (see fs/cifs/README for more detail) or via an + option on the mount command. This support is disabled by + default in order to reduce the possibility of a downgrade + attack. + + If unsure, say N. + config CIFS_XATTR bool "CIFS extended attributes" depends on CIFS @@ -1763,6 +1789,16 @@ config CIFS_POSIX (such as Samba 3.10 and later) which can negotiate CIFS POSIX ACL support. If unsure, say N. +config CIFS_DEBUG2 + bool "Enable additional CIFS debugging routines" + help + Enabling this option adds a few more debugging routines + to the cifs code which slightly increases the size of + the cifs module and can cause additional logging of debug + messages in some error paths, slowing performance. This + option can be turned off unless you are debugging + cifs problems. If unsure, say N. + config CIFS_EXPERIMENTAL bool "CIFS Experimental Features (EXPERIMENTAL)" depends on CIFS && EXPERIMENTAL @@ -1778,7 +1814,7 @@ config CIFS_EXPERIMENTAL If unsure, say N. config CIFS_UPCALL - bool "CIFS Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)" + bool "Kerberos/SPNEGO advanced session setup (EXPERIMENTAL)" depends on CIFS_EXPERIMENTAL select CONNECTOR help diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 009a9ae88d6..bfc1fd22d5b 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -413,8 +413,7 @@ int afs_server_find_by_peer(const struct rxrpc_peer *peer, /* we found it in the graveyard - resurrect it */ found_dead_server: - list_del(&server->link); - list_add_tail(&server->link, &cell->sv_list); + list_move_tail(&server->link, &cell->sv_list); afs_get_server(server); afs_kafstimod_del_timer(&server->timeout); spin_unlock(&cell->sv_gylock); diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c index 7ac07d0d47b..f09a794f248 100644 --- a/fs/afs/kafsasyncd.c +++ b/fs/afs/kafsasyncd.c @@ -136,8 +136,7 @@ static int kafsasyncd(void *arg) if (!list_empty(&kafsasyncd_async_attnq)) { op = list_entry(kafsasyncd_async_attnq.next, struct afs_async_op, link); - list_del(&op->link); - list_add_tail(&op->link, + list_move_tail(&op->link, &kafsasyncd_async_busyq); } @@ -204,8 +203,7 @@ void afs_kafsasyncd_begin_op(struct afs_async_op *op) init_waitqueue_entry(&op->waiter, kafsasyncd_task); add_wait_queue(&op->call->waitq, &op->waiter); - list_del(&op->link); - list_add_tail(&op->link, &kafsasyncd_async_busyq); + list_move_tail(&op->link, &kafsasyncd_async_busyq); spin_unlock(&kafsasyncd_async_lock); @@ -223,8 +221,7 @@ void afs_kafsasyncd_attend_op(struct afs_async_op *op) spin_lock(&kafsasyncd_async_lock); - list_del(&op->link); - list_add_tail(&op->link, &kafsasyncd_async_attnq); + list_move_tail(&op->link, &kafsasyncd_async_attnq); spin_unlock(&kafsasyncd_async_lock); diff --git a/fs/afs/server.c b/fs/afs/server.c index 62b093aa41c..22afaae1a4c 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -123,8 +123,7 @@ int afs_server_lookup(struct afs_cell *cell, const struct in_addr *addr, resurrect_server: _debug("resurrecting server"); - list_del(&zombie->link); - list_add_tail(&zombie->link, &cell->sv_list); + list_move_tail(&zombie->link, &cell->sv_list); afs_get_server(zombie); afs_kafstimod_del_timer(&zombie->timeout); spin_unlock(&cell->sv_gylock); @@ -168,8 +167,7 @@ void afs_put_server(struct afs_server *server) } spin_lock(&cell->sv_gylock); - list_del(&server->link); - list_add_tail(&server->link, &cell->sv_graveyard); + list_move_tail(&server->link, &cell->sv_graveyard); /* time out in 10 secs */ afs_kafstimod_add_timer(&server->timeout, 10 * HZ); diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index eced20618ec..331f730a1fb 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -326,8 +326,7 @@ int afs_vlocation_lookup(struct afs_cell *cell, /* found in the graveyard - resurrect */ _debug("found in graveyard"); atomic_inc(&vlocation->usage); - list_del(&vlocation->link); - list_add_tail(&vlocation->link, &cell->vl_list); + list_move_tail(&vlocation->link, &cell->vl_list); spin_unlock(&cell->vl_gylock); afs_kafstimod_del_timer(&vlocation->timeout); @@ -478,8 +477,7 @@ static void __afs_put_vlocation(struct afs_vlocation *vlocation) } /* move to graveyard queue */ - list_del(&vlocation->link); - list_add_tail(&vlocation->link,&cell->vl_graveyard); + list_move_tail(&vlocation->link,&cell->vl_graveyard); /* remove from pending timeout queue (refcounted if actually being * updated) */ diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c index 9867fef3261..cf62da5d782 100644 --- a/fs/afs/vnode.c +++ b/fs/afs/vnode.c @@ -104,8 +104,7 @@ static void afs_vnode_finalise_status_update(struct afs_vnode *vnode, vnode->cb_expiry * HZ); spin_lock(&afs_cb_hash_lock); - list_del(&vnode->cb_hash_link); - list_add_tail(&vnode->cb_hash_link, + list_move_tail(&vnode->cb_hash_link, &afs_cb_hash(server, &vnode->fid)); spin_unlock(&afs_cb_hash_lock); @@ -641,7 +641,7 @@ static inline int __queue_kicked_iocb(struct kiocb *iocb) * invoked both for initial i/o submission and * subsequent retries via the aio_kick_handler. * Expects to be invoked with iocb->ki_ctx->lock - * already held. The lock is released and reaquired + * already held. The lock is released and reacquired * as needed during processing. * * Calls the iocb retry method (already setup for the diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 4456d1daa40..8dbd44f10e9 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -376,8 +376,7 @@ next: DPRINTK("returning %p %.*s", expired, (int)expired->d_name.len, expired->d_name.name); spin_lock(&dcache_lock); - list_del(&expired->d_parent->d_subdirs); - list_add(&expired->d_parent->d_subdirs, &expired->d_u.d_child); + list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); spin_unlock(&dcache_lock); return expired; } diff --git a/fs/buffer.c b/fs/buffer.c index 373bb6292bd..f23bb647db4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -564,7 +564,7 @@ still_busy: * Completion handler for block_write_full_page() - pages which are unlocked * during I/O, and which have PageWriteback cleared upon I/O completion. */ -void end_buffer_async_write(struct buffer_head *bh, int uptodate) +static void end_buffer_async_write(struct buffer_head *bh, int uptodate) { char b[BDEVNAME_SIZE]; unsigned long flags; @@ -3166,7 +3166,6 @@ EXPORT_SYMBOL(block_sync_page); EXPORT_SYMBOL(block_truncate_page); EXPORT_SYMBOL(block_write_full_page); EXPORT_SYMBOL(cont_prepare_write); -EXPORT_SYMBOL(end_buffer_async_write); EXPORT_SYMBOL(end_buffer_read_sync); EXPORT_SYMBOL(end_buffer_write_sync); EXPORT_SYMBOL(file_fsync); diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 7271bb0257f..a61d17ed182 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,9 +1,24 @@ +Version 1.44 +------------ +Rewritten sessionsetup support, including support for legacy SMB +session setup needed for OS/2 and older servers such as Windows 95 and 98. +Fix oops on ls to OS/2 servers. Add support for level 1 FindFirst +so we can do search (ls etc.) to OS/2. Do not send NTCreateX +or recent levels of FindFirst unless server says it supports NT SMBs +(instead use legacy equivalents from LANMAN dialect). Fix to allow +NTLMv2 authentication support (now can use stronger password hashing +on mount if corresponding /proc/fs/cifs/SecurityFlags is set (0x4004). +Allow override of global cifs security flags on mount via "sec=" option(s). + Version 1.43 ------------ POSIX locking to servers which support CIFS POSIX Extensions (disabled by default controlled by proc/fs/cifs/Experimental). Handle conversion of long share names (especially Asian languages) -to Unicode during mount. +to Unicode during mount. Fix memory leak in sess struct on reconnect. +Fix rare oops after acpi suspend. Fix O_TRUNC opens to overwrite on +cifs open which helps rare case when setpathinfo fails or server does +not support it. Version 1.42 ------------ diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 58c77254a23..a26f26ed5a1 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -3,4 +3,4 @@ # obj-$(CONFIG_CIFS) += cifs.o -cifs-objs := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o readdir.o ioctl.o ntlmssp.o +cifs-objs := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o readdir.o ioctl.o sess.o diff --git a/fs/cifs/README b/fs/cifs/README index 0355003f4f0..7986d0d97ac 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -443,7 +443,10 @@ A partial list of the supported mount options follows: SFU does). In the future the bottom 9 bits of the mode mode also will be emulated using queries of the security descriptor (ACL). -sec Security mode. Allowed values are: + sign Must use packet signing (helps avoid unwanted data modification + by intermediate systems in the route). Note that signing + does not work with lanman or plaintext authentication. + sec Security mode. Allowed values are: none attempt to connection as a null user (no name) krb5 Use Kerberos version 5 authentication krb5i Use Kerberos authentication and packet signing @@ -453,6 +456,8 @@ sec Security mode. Allowed values are: server requires signing also can be the default) ntlmv2 Use NTLMv2 password hashing ntlmv2i Use NTLMv2 password hashing with packet signing + lanman (if configured in kernel config) use older + lanman hash The mount.cifs mount helper also accepts a few mount options before -o including: @@ -485,14 +490,34 @@ PacketSigningEnabled If set to one, cifs packet signing is enabled it. If set to two, cifs packet signing is required even if the server considers packet signing optional. (default 1) +SecurityFlags Flags which control security negotiation and + also packet signing. Authentication (may/must) + flags (e.g. for NTLM and/or NTLMv2) may be combined with + the signing flags. Specifying two different password + hashing mechanisms (as "must use") on the other hand + does not make much sense. Default flags are + 0x07007 + (NTLM, NTLMv2 and packet signing allowed). Maximum + allowable flags if you want to allow mounts to servers + using weaker password hashes is 0x37037 (lanman, + plaintext, ntlm, ntlmv2, signing allowed): + + may use packet signing 0x00001 + must use packet signing 0x01001 + may use NTLM (most common password hash) 0x00002 + must use NTLM 0x02002 + may use NTLMv2 0x00004 + must use NTLMv2 0x04004 + may use Kerberos security (not implemented yet) 0x00008 + must use Kerberos (not implemented yet) 0x08008 + may use lanman (weak) password hash 0x00010 + must use lanman password hash 0x10010 + may use plaintext passwords 0x00020 + must use plaintext passwords 0x20020 + (reserved for future packet encryption) 0x00040 + cifsFYI If set to one, additional debug information is logged to the system error log. (default 0) -ExtendedSecurity If set to one, SPNEGO session establishment - is allowed which enables more advanced - secure CIFS session establishment (default 0) -NTLMV2Enabled If set to one, more secure password hashes - are used when the server supports them and - when kerberos is not negotiated (default 0) traceSMB If set to one, debug information is logged to the system error log with the start of smb requests and responses (default 0) diff --git a/fs/cifs/asn1.c b/fs/cifs/asn1.c index 086ae8f4a20..031cdf29325 100644 --- a/fs/cifs/asn1.c +++ b/fs/cifs/asn1.c @@ -467,7 +467,7 @@ decode_negTokenInit(unsigned char *security_blob, int length, asn1_open(&ctx, security_blob, length); if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, ("Error decoding negTokenInit header ")); + cFYI(1, ("Error decoding negTokenInit header")); return 0; } else if ((cls != ASN1_APL) || (con != ASN1_CON) || (tag != ASN1_EOC)) { @@ -495,7 +495,7 @@ decode_negTokenInit(unsigned char *security_blob, int length, } if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, ("Error decoding negTokenInit ")); + cFYI(1, ("Error decoding negTokenInit")); return 0; } else if ((cls != ASN1_CTX) || (con != ASN1_CON) || (tag != ASN1_EOC)) { @@ -505,7 +505,7 @@ decode_negTokenInit(unsigned char *security_blob, int length, } if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, ("Error decoding negTokenInit ")); + cFYI(1, ("Error decoding negTokenInit")); return 0; } else if ((cls != ASN1_UNI) || (con != ASN1_CON) || (tag != ASN1_SEQ)) { @@ -515,7 +515,7 @@ decode_negTokenInit(unsigned char *security_blob, int length, } if (asn1_header_decode(&ctx, &end, &cls, &con, &tag) == 0) { - cFYI(1, ("Error decoding 2nd part of negTokenInit ")); + cFYI(1, ("Error decoding 2nd part of negTokenInit")); return 0; } else if ((cls != ASN1_CTX) || (con != ASN1_CON) || (tag != ASN1_EOC)) { @@ -527,7 +527,7 @@ decode_negTokenInit(unsigned char *security_blob, int length, if (asn1_header_decode (&ctx, &sequence_end, &cls, &con, &tag) == 0) { - cFYI(1, ("Error decoding 2nd part of negTokenInit ")); + cFYI(1, ("Error decoding 2nd part of negTokenInit")); return 0; } else if ((cls != ASN1_UNI) || (con != ASN1_CON) || (tag != ASN1_SEQ)) { diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index f4124a32bef..96abeb73897 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -39,7 +39,7 @@ cifs_dump_mem(char *label, void *data, int length) char *charptr = data; char buf[10], line[80]; - printk(KERN_DEBUG "%s: dump of %d bytes of data at 0x%p\n\n", + printk(KERN_DEBUG "%s: dump of %d bytes of data at 0x%p\n", label, length, data); for (i = 0; i < length; i += 16) { line[0] = 0; @@ -57,6 +57,57 @@ cifs_dump_mem(char *label, void *data, int length) } } +#ifdef CONFIG_CIFS_DEBUG2 +void cifs_dump_detail(struct smb_hdr * smb) +{ + cERROR(1,("Cmd: %d Err: 0x%x Flags: 0x%x Flgs2: 0x%x Mid: %d Pid: %d", + smb->Command, smb->Status.CifsError, + smb->Flags, smb->Flags2, smb->Mid, smb->Pid)); + cERROR(1,("smb buf %p len %d", smb, smbCalcSize_LE(smb))); +} + + +void cifs_dump_mids(struct TCP_Server_Info * server) +{ + struct list_head *tmp; + struct mid_q_entry * mid_entry; + + if(server == NULL) + return; + + cERROR(1,("Dump pending requests:")); + spin_lock(&GlobalMid_Lock); + list_for_each(tmp, &server->pending_mid_q) { + mid_entry = list_entry(tmp, struct mid_q_entry, qhead); + if(mid_entry) { + cERROR(1,("State: %d Cmd: %d Pid: %d Tsk: %p Mid %d", + mid_entry->midState, + (int)mid_entry->command, + mid_entry->pid, + mid_entry->tsk, + mid_entry->mid)); +#ifdef CONFIG_CIFS_STATS2 + cERROR(1,("IsLarge: %d buf: %p time rcv: %ld now: %ld", + mid_entry->largeBuf, + mid_entry->resp_buf, + mid_entry->when_received, + jiffies)); +#endif /* STATS2 */ + cERROR(1,("IsMult: %d IsEnd: %d", mid_entry->multiRsp, + mid_entry->multiEnd)); + if(mid_entry->resp_buf) { + cifs_dump_detail(mid_entry->resp_buf); + cifs_dump_mem("existing buf: ", + mid_entry->resp_buf, + 62 /* fixme */); + } + + } + } + spin_unlock(&GlobalMid_Lock); +} +#endif /* CONFIG_CIFS_DEBUG2 */ + #ifdef CONFIG_PROC_FS static int cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, @@ -73,7 +124,6 @@ cifs_debug_data_read(char *buf, char **beginBuffer, off_t offset, *beginBuffer = buf + offset; - length = sprintf(buf, "Display Internal CIFS Data Structures for Debugging\n" @@ -395,12 +445,12 @@ static read_proc_t traceSMB_read; static write_proc_t traceSMB_write; static read_proc_t multiuser_mount_read; static write_proc_t multiuser_mount_write; -static read_proc_t extended_security_read; -static write_proc_t extended_security_write; -static read_proc_t ntlmv2_enabled_read; +static read_proc_t security_flags_read; +static write_proc_t security_flags_write; +/* static read_proc_t ntlmv2_enabled_read; static write_proc_t ntlmv2_enabled_write; static read_proc_t packet_signing_enabled_read; -static write_proc_t packet_signing_enabled_write; +static write_proc_t packet_signing_enabled_write;*/ static read_proc_t experimEnabled_read; static write_proc_t experimEnabled_write; static read_proc_t linuxExtensionsEnabled_read; @@ -458,10 +508,10 @@ cifs_proc_init(void) pde->write_proc = multiuser_mount_write; pde = - create_proc_read_entry("ExtendedSecurity", 0, proc_fs_cifs, - extended_security_read, NULL); + create_proc_read_entry("SecurityFlags", 0, proc_fs_cifs, + security_flags_read, NULL); if (pde) - pde->write_proc = extended_security_write; + pde->write_proc = security_flags_write; pde = create_proc_read_entry("LookupCacheEnabled", 0, proc_fs_cifs, @@ -469,7 +519,7 @@ cifs_proc_init(void) if (pde) pde->write_proc = lookupFlag_write; - pde = +/* pde = create_proc_read_entry("NTLMV2Enabled", 0, proc_fs_cifs, ntlmv2_enabled_read, NULL); if (pde) @@ -479,7 +529,7 @@ cifs_proc_init(void) create_proc_read_entry("PacketSigningEnabled", 0, proc_fs_cifs, packet_signing_enabled_read, NULL); if (pde) - pde->write_proc = packet_signing_enabled_write; + pde->write_proc = packet_signing_enabled_write;*/ } void @@ -496,9 +546,9 @@ cifs_proc_clean(void) #endif remove_proc_entry("MultiuserMount", proc_fs_cifs); remove_proc_entry("OplockEnabled", proc_fs_cifs); - remove_proc_entry("NTLMV2Enabled",proc_fs_cifs); - remove_proc_entry("ExtendedSecurity",proc_fs_cifs); - remove_proc_entry("PacketSigningEnabled",proc_fs_cifs); +/* remove_proc_entry("NTLMV2Enabled",proc_fs_cifs); */ + remove_proc_entry("SecurityFlags",proc_fs_cifs); +/* remove_proc_entry("PacketSigningEnabled",proc_fs_cifs); */ remove_proc_entry("LinuxExtensionsEnabled",proc_fs_cifs); remove_proc_entry("Experimental",proc_fs_cifs); remove_proc_entry("LookupCacheEnabled",proc_fs_cifs); @@ -782,12 +832,12 @@ multiuser_mount_write(struct file *file, const char __user *buffer, } static int -extended_security_read(char *page, char **start, off_t off, +security_flags_read(char *page, char **start, off_t off, int count, int *eof, void *data) { int len; - len = sprintf(page, "%d\n", extended_security); + len = sprintf(page, "0x%x\n", extended_security); len -= off; *start = page + off; @@ -803,24 +853,52 @@ extended_security_read(char *page, char **start, off_t off, return len; } static int -extended_security_write(struct file *file, const char __user *buffer, +security_flags_write(struct file *file, const char __user *buffer, unsigned long count, void *data) { + unsigned int flags; + char flags_string[12]; char c; - int rc; - rc = get_user(c, buffer); - if (rc) - return rc; - if (c == '0' || c == 'n' || c == 'N') - extended_security = 0; - else if (c == '1' || c == 'y' || c == 'Y') - extended_security = 1; + if((count < 1) || (count > 11)) + return -EINVAL; + + memset(flags_string, 0, 12); + + if(copy_from_user(flags_string, buffer, count)) + return -EFAULT; + + if(count < 3) { + /* single char or single char followed by null */ + c = flags_string[0]; + if (c == '0' || c == 'n' || c == 'N') + extended_security = CIFSSEC_DEF; /* default */ + else if (c == '1' || c == 'y' || c == 'Y') + extended_security = CIFSSEC_MAX; + return count; + } + /* else we have a number */ + + flags = simple_strtoul(flags_string, NULL, 0); + + cFYI(1,("sec flags 0x%x", flags)); + + if(flags <= 0) { + cERROR(1,("invalid security flags %s",flags_string)); + return -EINVAL; + } + if(flags & ~CIFSSEC_MASK) { + cERROR(1,("attempt to set unsupported security flags 0x%x", + flags & ~CIFSSEC_MASK)); + return -EINVAL; + } + /* flags look ok - update the global security flags for cifs module */ + extended_security = flags; return count; } -static int +/* static int ntlmv2_enabled_read(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -855,6 +933,8 @@ ntlmv2_enabled_write(struct file *file, const char __user *buffer, ntlmv2_support = 0; else if (c == '1' || c == 'y' || c == 'Y') ntlmv2_support = 1; + else if (c == '2') + ntlmv2_support = 2; return count; } @@ -898,7 +978,7 @@ packet_signing_enabled_write(struct file *file, const char __user *buffer, sign_CIFS_PDUs = 2; return count; -} +} */ #endif diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index 4304d9dcfb6..c26cd0d2c6d 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -24,6 +24,10 @@ #define _H_CIFS_DEBUG void cifs_dump_mem(char *label, void *data, int length); +#ifdef CONFIG_CIFS_DEBUG2 +void cifs_dump_detail(struct smb_hdr *); +void cifs_dump_mids(struct TCP_Server_Info *); +#endif extern int traceSMB; /* flag which enables the function below */ void dump_smb(struct smb_hdr *, int); #define CIFS_INFO 0x01 diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index d2b12825594..d2a8b2941fc 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -22,6 +22,7 @@ #include "cifs_unicode.h" #include "cifs_uniupr.h" #include "cifspdu.h" +#include "cifsglob.h" #include "cifs_debug.h" /* diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index e7d63737e65..a89efaf78a2 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -26,6 +26,8 @@ #include "md5.h" #include "cifs_unicode.h" #include "cifsproto.h" +#include <linux/ctype.h> +#include <linux/random.h> /* Calculate and return the CIFS signature based on the mac key and the smb pdu */ /* the 16 byte signature must be allocated by the caller */ @@ -35,6 +37,8 @@ extern void mdfour(unsigned char *out, unsigned char *in, int n); extern void E_md4hash(const unsigned char *passwd, unsigned char *p16); +extern void SMBencrypt(unsigned char *passwd, unsigned char *c8, + unsigned char *p24); static int cifs_calculate_signature(const struct smb_hdr * cifs_pdu, const char * key, char * signature) @@ -45,7 +49,7 @@ static int cifs_calculate_signature(const struct smb_hdr * cifs_pdu, return -EINVAL; MD5Init(&context); - MD5Update(&context,key,CIFS_SESSION_KEY_SIZE+16); + MD5Update(&context,key,CIFS_SESS_KEY_SIZE+16); MD5Update(&context,cifs_pdu->Protocol,cifs_pdu->smb_buf_length); MD5Final(signature,&context); return 0; @@ -90,7 +94,7 @@ static int cifs_calc_signature2(const struct kvec * iov, int n_vec, return -EINVAL; MD5Init(&context); - MD5Update(&context,key,CIFS_SESSION_KEY_SIZE+16); + MD5Update(&context,key,CIFS_SESS_KEY_SIZE+16); for(i=0;i<n_vec;i++) { if(iov[i].iov_base == NULL) { cERROR(1,("null iovec entry")); @@ -204,11 +208,12 @@ int cifs_calculate_mac_key(char * key, const char * rn, const char * password) E_md4hash(password, temp_key); mdfour(key,temp_key,16); - memcpy(key+16,rn, CIFS_SESSION_KEY_SIZE); + memcpy(key+16,rn, CIFS_SESS_KEY_SIZE); return 0; } -int CalcNTLMv2_partial_mac_key(struct cifsSesInfo * ses, struct nls_table * nls_info) +int CalcNTLMv2_partial_mac_key(struct cifsSesInfo * ses, + const struct nls_table * nls_info) { char temp_hash[16]; struct HMACMD5Context ctx; @@ -225,6 +230,8 @@ int CalcNTLMv2_partial_mac_key(struct cifsSesInfo * ses, struct nls_table * nls_ user_name_len = strlen(ses->userName); if(user_name_len > MAX_USERNAME_SIZE) return -EINVAL; + if(ses->domainName == NULL) + return -EINVAL; /* BB should we use CIFS_LINUX_DOM */ dom_name_len = strlen(ses->domainName); if(dom_name_len > MAX_USERNAME_SIZE) return -EINVAL; @@ -259,16 +266,131 @@ int CalcNTLMv2_partial_mac_key(struct cifsSesInfo * ses, struct nls_table * nls_ kfree(unicode_buf); return 0; } -void CalcNTLMv2_response(const struct cifsSesInfo * ses,char * v2_session_response) + +#ifdef CONFIG_CIFS_WEAK_PW_HASH +void calc_lanman_hash(struct cifsSesInfo * ses, char * lnm_session_key) +{ + int i; + char password_with_pad[CIFS_ENCPWD_SIZE]; + + if(ses->server == NULL) + return; + + memset(password_with_pad, 0, CIFS_ENCPWD_SIZE); + strncpy(password_with_pad, ses->password, CIFS_ENCPWD_SIZE); + + if((ses->server->secMode & SECMODE_PW_ENCRYPT) == 0) + if(extended_security & CIFSSEC_MAY_PLNTXT) { + memcpy(lnm_session_key, password_with_pad, CIFS_ENCPWD_SIZE); + return; + } + + /* calculate old style session key */ + /* calling toupper is less broken than repeatedly + calling nls_toupper would be since that will never + work for UTF8, but neither handles multibyte code pages + but the only alternative would be converting to UCS-16 (Unicode) + (using a routine something like UniStrupr) then + uppercasing and then converting back from Unicode - which + would only worth doing it if we knew it were utf8. Basically + utf8 and other multibyte codepages each need their own strupper + function since a byte at a time will ont work. */ + + for(i = 0; i < CIFS_ENCPWD_SIZE; i++) { + password_with_pad[i] = toupper(password_with_pad[i]); + } + + SMBencrypt(password_with_pad, ses->server->cryptKey, lnm_session_key); + /* clear password before we return/free memory */ + memset(password_with_pad, 0, CIFS_ENCPWD_SIZE); +} +#endif /* CIFS_WEAK_PW_HASH */ + +static int calc_ntlmv2_hash(struct cifsSesInfo *ses, + const struct nls_table * nls_cp) +{ + int rc = 0; + int len; + char nt_hash[16]; + struct HMACMD5Context * pctxt; + wchar_t * user; + wchar_t * domain; + + pctxt = kmalloc(sizeof(struct HMACMD5Context), GFP_KERNEL); + + if(pctxt == NULL) + return -ENOMEM; + + /* calculate md4 hash of password */ + E_md4hash(ses->password, nt_hash); + + /* convert Domainname to unicode and uppercase */ + hmac_md5_init_limK_to_64(nt_hash, 16, pctxt); + + /* convert ses->userName to unicode and uppercase */ + len = strlen(ses->userName); + user = kmalloc(2 + (len * 2), GFP_KERNEL); + if(user == NULL) + goto calc_exit_2; + len = cifs_strtoUCS(user, ses->userName, len, nls_cp); + UniStrupr(user); + hmac_md5_update((char *)user, 2*len, pctxt); + + /* convert ses->domainName to unicode and uppercase */ + if(ses->domainName) { + len = strlen(ses->domainName); + + domain = kmalloc(2 + (len * 2), GFP_KERNEL); + if(domain == NULL) + goto calc_exit_1; + len = cifs_strtoUCS(domain, ses->domainName, len, nls_cp); + UniStrupr(domain); + + hmac_md5_update((char *)domain, 2*len, pctxt); + + kfree(domain); + } +calc_exit_1: + kfree(user); +calc_exit_2: + /* BB FIXME what about bytes 24 through 40 of the signing key? + compare with the NTLM example */ + hmac_md5_final(ses->server->mac_signing_key, pctxt); + + return rc; +} + +void setup_ntlmv2_rsp(struct cifsSesInfo * ses, char * resp_buf, + const struct nls_table * nls_cp) +{ + int rc; + struct ntlmv2_resp * buf = (struct ntlmv2_resp *)resp_buf; + + buf->blob_signature = cpu_to_le32(0x00000101); + buf->reserved = 0; + buf->time = cpu_to_le64(cifs_UnixTimeToNT(CURRENT_TIME)); + get_random_bytes(&buf->client_chal, sizeof(buf->client_chal)); + buf->reserved2 = 0; + buf->names[0].type = 0; + buf->names[0].length = 0; + + /* calculate buf->ntlmv2_hash */ + rc = calc_ntlmv2_hash(ses, nls_cp); + if(rc) + cERROR(1,("could not get v2 hash rc %d",rc)); + CalcNTLMv2_response(ses, resp_buf); +} + +void CalcNTLMv2_response(const struct cifsSesInfo * ses, char * v2_session_response) { struct HMACMD5Context context; + /* rest of v2 struct already generated */ memcpy(v2_session_response + 8, ses->server->cryptKey,8); - /* gen_blob(v2_session_response + 16); */ hmac_md5_init_limK_to_64(ses->server->mac_signing_key, 16, &context); - hmac_md5_update(ses->server->cryptKey,8,&context); -/* hmac_md5_update(v2_session_response+16)client thing,8,&context); */ /* BB fix */ + hmac_md5_update(v2_session_response+8, + sizeof(struct ntlmv2_resp) - 8, &context); hmac_md5_final(v2_session_response,&context); - cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); /* BB removeme BB */ +/* cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); */ } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 8b4de6eaabd..c28ede59994 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -56,8 +56,8 @@ unsigned int experimEnabled = 0; unsigned int linuxExtEnabled = 1; unsigned int lookupCacheEnabled = 1; unsigned int multiuser_mount = 0; -unsigned int extended_security = 0; -unsigned int ntlmv2_support = 0; +unsigned int extended_security = CIFSSEC_DEF; +/* unsigned int ntlmv2_support = 0; */ unsigned int sign_CIFS_PDUs = 1; extern struct task_struct * oplockThread; /* remove sparse warning */ struct task_struct * oplockThread = NULL; @@ -908,7 +908,7 @@ static int cifs_dnotify_thread(void * dummyarg) struct cifsSesInfo *ses; do { - if(try_to_freeze()) + if (try_to_freeze()) continue; set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(15*HZ); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index d56c0577c71..a6384d83fde 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -33,6 +33,7 @@ #endif extern struct address_space_operations cifs_addr_ops; +extern struct address_space_operations cifs_addr_ops_smallbuf; /* Functions related to super block operations */ extern struct super_operations cifs_super_ops; @@ -99,5 +100,5 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); extern int cifs_ioctl (struct inode * inode, struct file * filep, unsigned int command, unsigned long arg); -#define CIFS_VERSION "1.43" +#define CIFS_VERSION "1.44" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 006eb33bff5..6d7cf5f3bc0 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -88,7 +88,8 @@ enum statusEnum { }; enum securityEnum { - NTLM = 0, /* Legacy NTLM012 auth with NTLM hash */ + LANMAN = 0, /* Legacy LANMAN auth */ + NTLM, /* Legacy NTLM012 auth with NTLM hash */ NTLMv2, /* Legacy NTLM auth with NTLMv2 hash */ RawNTLMSSP, /* NTLMSSP without SPNEGO */ NTLMSSP, /* NTLMSSP via SPNEGO */ @@ -157,7 +158,7 @@ struct TCP_Server_Info { /* 16th byte of RFC1001 workstation name is always null */ char workstation_RFC1001_name[SERVER_NAME_LEN_WITH_NULL]; __u32 sequence_number; /* needed for CIFS PDU signature */ - char mac_signing_key[CIFS_SESSION_KEY_SIZE + 16]; + char mac_signing_key[CIFS_SESS_KEY_SIZE + 16]; }; /* @@ -179,10 +180,13 @@ struct cifsUidInfo { struct cifsSesInfo { struct list_head cifsSessionList; struct semaphore sesSem; +#if 0 struct cifsUidInfo *uidInfo; /* pointer to user info */ +#endif struct TCP_Server_Info *server; /* pointer to server info */ atomic_t inUse; /* # of mounts (tree connections) on this ses */ enum statusEnum status; + unsigned overrideSecFlg; /* if non-zero override global sec flags */ __u16 ipc_tid; /* special tid for connection to IPC share */ __u16 flags; char *serverOS; /* name of operating system underlying server */ @@ -194,7 +198,7 @@ struct cifsSesInfo { char serverName[SERVER_NAME_LEN_WITH_NULL * 2]; /* BB make bigger for TCP names - will ipv6 and sctp addresses fit? */ char userName[MAX_USERNAME_SIZE + 1]; - char domainName[MAX_USERNAME_SIZE + 1]; + char * domainName; char * password; }; /* session flags */ @@ -209,12 +213,12 @@ struct cifsTconInfo { struct list_head openFileList; struct semaphore tconSem; struct cifsSesInfo *ses; /* pointer to session associated with */ - char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource (in ASCII not UTF) */ + char treeName[MAX_TREE_SIZE + 1]; /* UNC name of resource in ASCII */ char *nativeFileSystem; __u16 tid; /* The 2 byte tree id */ __u16 Flags; /* optional support bits */ enum statusEnum tidStatus; - atomic_t useCount; /* how many mounts (explicit or implicit) to this share */ + atomic_t useCount; /* how many explicit/implicit mounts to share */ #ifdef CONFIG_CIFS_STATS atomic_t num_smbs_sent; atomic_t num_writes; @@ -254,7 +258,7 @@ struct cifsTconInfo { spinlock_t stat_lock; #endif /* CONFIG_CIFS_STATS */ FILE_SYSTEM_DEVICE_INFO fsDevInfo; - FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if file system name truncated */ + FILE_SYSTEM_ATTRIBUTE_INFO fsAttrInfo; /* ok if fs name truncated */ FILE_SYSTEM_UNIX_INFO fsUnixInfo; unsigned retry:1; unsigned nocase:1; @@ -305,7 +309,6 @@ struct cifsFileInfo { atomic_t wrtPending; /* handle in use - defer close */ struct semaphore fh_sem; /* prevents reopen race after dead ses*/ char * search_resume_name; /* BB removeme BB */ - unsigned int resume_name_length; /* BB removeme - field renamed and moved BB */ struct cifs_search_info srch_inf; }; @@ -391,9 +394,9 @@ struct mid_q_entry { struct smb_hdr *resp_buf; /* response buffer */ int midState; /* wish this were enum but can not pass to wait_event */ __u8 command; /* smb command code */ - unsigned multiPart:1; /* multiple responses to one SMB request */ unsigned largeBuf:1; /* if valid response, is pointer to large buf */ - unsigned multiResp:1; /* multiple trans2 responses for one request */ + unsigned multiRsp:1; /* multiple trans2 responses for one request */ + unsigned multiEnd:1; /* both received */ }; struct oplock_q_entry { @@ -430,15 +433,35 @@ struct dir_notify_req { #define CIFS_LARGE_BUFFER 2 #define CIFS_IOVEC 4 /* array of response buffers */ -/* Type of session setup needed */ -#define CIFS_PLAINTEXT 0 -#define CIFS_LANMAN 1 -#define CIFS_NTLM 2 -#define CIFS_NTLMSSP_NEG 3 -#define CIFS_NTLMSSP_AUTH 4 -#define CIFS_SPNEGO_INIT 5 -#define CIFS_SPNEGO_TARG 6 - +/* Security Flags: indicate type of session setup needed */ +#define CIFSSEC_MAY_SIGN 0x00001 +#define CIFSSEC_MAY_NTLM 0x00002 +#define CIFSSEC_MAY_NTLMV2 0x00004 +#define CIFSSEC_MAY_KRB5 0x00008 +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define CIFSSEC_MAY_LANMAN 0x00010 +#define CIFSSEC_MAY_PLNTXT 0x00020 +#endif /* weak passwords */ +#define CIFSSEC_MAY_SEAL 0x00040 /* not supported yet */ + +#define CIFSSEC_MUST_SIGN 0x01001 +/* note that only one of the following can be set so the +result of setting MUST flags more than once will be to +require use of the stronger protocol */ +#define CIFSSEC_MUST_NTLM 0x02002 +#define CIFSSEC_MUST_NTLMV2 0x04004 +#define CIFSSEC_MUST_KRB5 0x08008 +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define CIFSSEC_MUST_LANMAN 0x10010 +#define CIFSSEC_MUST_PLNTXT 0x20020 +#define CIFSSEC_MASK 0x37037 /* current flags supported if weak */ +#else +#define CIFSSEC_MASK 0x07007 /* flags supported if no weak config */ +#endif /* WEAK_PW_HASH */ +#define CIFSSEC_MUST_SEAL 0x40040 /* not supported yet */ + +#define CIFSSEC_DEF CIFSSEC_MAY_SIGN | CIFSSEC_MAY_NTLM | CIFSSEC_MAY_NTLMV2 +#define CIFSSEC_MAX CIFSSEC_MUST_SIGN | CIFSSEC_MUST_NTLMV2 /* ***************************************************************** * All constants go here @@ -500,16 +523,16 @@ GLOBAL_EXTERN rwlock_t GlobalSMBSeslock; /* protects list inserts on 3 above */ GLOBAL_EXTERN struct list_head GlobalOplock_Q; GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; /* Outstanding dir notify requests */ -GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q; /* Dir notify response queue */ +GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q;/* DirNotify response queue */ /* * Global transaction id (XID) information */ GLOBAL_EXTERN unsigned int GlobalCurrentXid; /* protected by GlobalMid_Sem */ -GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ +GLOBAL_EXTERN unsigned int GlobalTotalActiveXid; /* prot by GlobalMid_Sem */ GLOBAL_EXTERN unsigned int GlobalMaxActiveXid; /* prot by GlobalMid_Sem */ -GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above and list operations */ - /* on midQ entries */ +GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */ + /* on midQ entries */ GLOBAL_EXTERN char Local_System_Name[15]; /* @@ -531,7 +554,7 @@ GLOBAL_EXTERN atomic_t smBufAllocCount; GLOBAL_EXTERN atomic_t midCount; /* Misc globals */ -GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions +GLOBAL_EXTERN unsigned int multiuser_mount; /* if enabled allows new sessions to be established on existing mount if we have the uid/password or Kerberos credential or equivalent for current user */ @@ -540,8 +563,8 @@ GLOBAL_EXTERN unsigned int experimEnabled; GLOBAL_EXTERN unsigned int lookupCacheEnabled; GLOBAL_EXTERN unsigned int extended_security; /* if on, session setup sent with more secure ntlmssp2 challenge/resp */ -GLOBAL_EXTERN unsigned int ntlmv2_support; /* better optional password hash */ GLOBAL_EXTERN unsigned int sign_CIFS_PDUs; /* enable smb packet signing */ +GLOBAL_EXTERN unsigned int secFlags; GLOBAL_EXTERN unsigned int linuxExtEnabled;/*enable Linux/Unix CIFS extensions*/ GLOBAL_EXTERN unsigned int CIFSMaxBufSize; /* max size not including hdr */ GLOBAL_EXTERN unsigned int cifs_min_rcv; /* min size of big ntwrk buf pool */ diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index b2233ac05bd..86239023545 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -16,7 +16,7 @@ * * You should have received a copy of the GNU Lesser General Public License * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #ifndef _CIFSPDU_H @@ -24,8 +24,14 @@ #include <net/sock.h> +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define LANMAN_PROT 0 +#define CIFS_PROT 1 +#else #define CIFS_PROT 0 -#define BAD_PROT CIFS_PROT+1 +#endif +#define POSIX_PROT CIFS_PROT+1 +#define BAD_PROT 0xFFFF /* SMB command codes */ /* Some commands have minimal (wct=0,bcc=0), or uninteresting, responses @@ -110,7 +116,7 @@ /* * Size of the session key (crypto key encrypted with the password */ -#define CIFS_SESSION_KEY_SIZE (24) +#define CIFS_SESS_KEY_SIZE (24) /* * Maximum user name length @@ -400,6 +406,29 @@ typedef struct negotiate_req { unsigned char DialectsArray[1]; } __attribute__((packed)) NEGOTIATE_REQ; +/* Dialect index is 13 for LANMAN */ + +typedef struct lanman_neg_rsp { + struct smb_hdr hdr; /* wct = 13 */ + __le16 DialectIndex; + __le16 SecurityMode; + __le16 MaxBufSize; + __le16 MaxMpxCount; + __le16 MaxNumberVcs; + __le16 RawMode; + __le32 SessionKey; + __le32 ServerTime; + __le16 ServerTimeZone; + __le16 EncryptionKeyLength; + __le16 Reserved; + __u16 ByteCount; + unsigned char EncryptionKey[1]; +} __attribute__((packed)) LANMAN_NEG_RSP; + +#define READ_RAW_ENABLE 1 +#define WRITE_RAW_ENABLE 2 +#define RAW_ENABLE (READ_RAW_ENABLE | WRITE_RAW_ENABLE) + typedef struct negotiate_rsp { struct smb_hdr hdr; /* wct = 17 */ __le16 DialectIndex; @@ -509,7 +538,7 @@ typedef union smb_com_session_setup_andx { /* unsigned char * NativeOS; */ /* unsigned char * NativeLanMan; */ /* unsigned char * PrimaryDomain; */ - } __attribute__((packed)) resp; /* NTLM response format (with or without extended security */ + } __attribute__((packed)) resp; /* NTLM response with or without extended sec*/ struct { /* request format */ struct smb_hdr hdr; /* wct = 10 */ @@ -520,8 +549,8 @@ typedef union smb_com_session_setup_andx { __le16 MaxMpxCount; __le16 VcNumber; __u32 SessionKey; - __le16 PassswordLength; - __u32 Reserved; + __le16 PasswordLength; + __u32 Reserved; /* encrypt key len and offset */ __le16 ByteCount; unsigned char AccountPassword[1]; /* followed by */ /* STRING AccountName */ @@ -543,6 +572,26 @@ typedef union smb_com_session_setup_andx { } __attribute__((packed)) old_resp; /* pre-NTLM (LANMAN2.1) response */ } __attribute__((packed)) SESSION_SETUP_ANDX; +/* format of NLTMv2 Response ie "case sensitive password" hash when NTLMv2 */ + +struct ntlmssp2_name { + __le16 type; + __le16 length; +/* char name[length]; */ +} __attribute__((packed)); + +struct ntlmv2_resp { + char ntlmv2_hash[CIFS_ENCPWD_SIZE]; + __le32 blob_signature; + __u32 reserved; + __le64 time; + __u64 client_chal; /* random */ + __u32 reserved2; + struct ntlmssp2_name names[1]; + /* array of name entries could follow ending in minimum 4 byte struct */ +} __attribute__((packed)); + + #define CIFS_NETWORK_OPSYS "CIFS VFS Client for Linux" /* Capabilities bits (for NTLM SessSetup request) */ @@ -573,7 +622,9 @@ typedef struct smb_com_tconx_req { } __attribute__((packed)) TCONX_REQ; typedef struct smb_com_tconx_rsp { - struct smb_hdr hdr; /* wct = 3 *//* note that Win2000 has sent wct=7 in some cases on responses. Four unspecified words followed OptionalSupport */ + struct smb_hdr hdr; /* wct = 3 note that Win2000 has sent wct = 7 + in some cases on responses. Four unspecified + words followed OptionalSupport */ __u8 AndXCommand; __u8 AndXReserved; __le16 AndXOffset; @@ -1323,6 +1374,9 @@ struct smb_t2_rsp { #define SMB_FILE_MAXIMUM_INFO 0x40d /* Find File infolevels */ +#define SMB_FIND_FILE_INFO_STANDARD 0x001 +#define SMB_FIND_FILE_QUERY_EA_SIZE 0x002 +#define SMB_FIND_FILE_QUERY_EAS_FROM_LIST 0x003 #define SMB_FIND_FILE_DIRECTORY_INFO 0x101 #define SMB_FIND_FILE_FULL_DIRECTORY_INFO 0x102 #define SMB_FIND_FILE_NAMES_INFO 0x103 @@ -1844,13 +1898,13 @@ typedef struct { typedef struct { __le32 DeviceType; __le32 DeviceCharacteristics; -} __attribute__((packed)) FILE_SYSTEM_DEVICE_INFO; /* device info, level 0x104 */ +} __attribute__((packed)) FILE_SYSTEM_DEVICE_INFO; /* device info level 0x104 */ typedef struct { __le32 Attributes; __le32 MaxPathNameComponentLength; __le32 FileSystemNameLen; - char FileSystemName[52]; /* do not really need to save this - so potentially get only subset of name */ + char FileSystemName[52]; /* do not have to save this - get subset? */ } __attribute__((packed)) FILE_SYSTEM_ATTRIBUTE_INFO; /******************************************************************************/ @@ -1947,7 +2001,8 @@ typedef struct { struct file_allocation_info { __le64 AllocationSize; /* Note old Samba srvr rounds this up too much */ -} __attribute__((packed)); /* size used on disk, level 0x103 for set, 0x105 for query */ +} __attribute__((packed)); /* size used on disk, for level 0x103 for set, + 0x105 for query */ struct file_end_of_file_info { __le64 FileSize; /* offset to end of file */ @@ -2054,7 +2109,7 @@ typedef struct { __le32 ExtFileAttributes; __le32 FileNameLength; char FileName[1]; -} __attribute__((packed)) FILE_DIRECTORY_INFO; /* level 0x101 FF response data area */ +} __attribute__((packed)) FILE_DIRECTORY_INFO; /* level 0x101 FF resp data */ typedef struct { __le32 NextEntryOffset; @@ -2069,7 +2124,7 @@ typedef struct { __le32 FileNameLength; __le32 EaSize; /* length of the xattrs */ char FileName[1]; -} __attribute__((packed)) FILE_FULL_DIRECTORY_INFO; /* level 0x102 FF response data area */ +} __attribute__((packed)) FILE_FULL_DIRECTORY_INFO; /* level 0x102 rsp data */ typedef struct { __le32 NextEntryOffset; @@ -2086,7 +2141,7 @@ typedef struct { __le32 Reserved; __u64 UniqueId; /* inode num - le since Samba puts ino in low 32 bit*/ char FileName[1]; -} __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF response data area */ +} __attribute__((packed)) SEARCH_ID_FULL_DIR_INFO; /* level 0x105 FF rsp data */ typedef struct { __le32 NextEntryOffset; @@ -2104,7 +2159,22 @@ typedef struct { __u8 Reserved; __u8 ShortName[12]; char FileName[1]; -} __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO; /* level 0x104 FF response data area */ +} __attribute__((packed)) FILE_BOTH_DIRECTORY_INFO; /* level 0x104 FFrsp data */ + +typedef struct { + __u32 ResumeKey; + __le16 CreationDate; /* SMB Date */ + __le16 CreationTime; /* SMB Time */ + __le16 LastAccessDate; + __le16 LastAccessTime; + __le16 LastWriteDate; + __le16 LastWriteTime; + __le32 DataSize; /* File Size (EOF) */ + __le32 AllocationSize; + __le16 Attributes; /* verify not u32 */ + __u8 FileNameLength; + char FileName[1]; +} __attribute__((packed)) FIND_FILE_STANDARD_INFO; /* level 0x1 FF resp data */ struct win_dev { diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 310ea2f0e0b..a5ddc62d6fe 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -64,14 +64,12 @@ extern int map_smb_to_linux_error(struct smb_hdr *smb); extern void header_assemble(struct smb_hdr *, char /* command */ , const struct cifsTconInfo *, int /* length of fixed section (word count) in two byte units */); -#ifdef CONFIG_CIFS_EXPERIMENTAL extern int small_smb_init_no_tc(const int smb_cmd, const int wct, struct cifsSesInfo *ses, void ** request_buf); extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, - const int stage, int * pNTLMv2_flg, + const int stage, const struct nls_table *nls_cp); -#endif extern __u16 GetNextMid(struct TCP_Server_Info *server); extern struct oplock_q_entry * AllocOplockQEntry(struct inode *, u16, struct cifsTconInfo *); @@ -285,8 +283,14 @@ extern int cifs_sign_smb2(struct kvec *iov, int n_vec, struct TCP_Server_Info *, extern int cifs_verify_signature(struct smb_hdr *, const char * mac_key, __u32 expected_sequence_number); extern int cifs_calculate_mac_key(char * key,const char * rn,const char * pass); -extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *, struct nls_table *); -extern void CalcNTLMv2_response(const struct cifsSesInfo *,char * ); +extern int CalcNTLMv2_partial_mac_key(struct cifsSesInfo *, + const struct nls_table *); +extern void CalcNTLMv2_response(const struct cifsSesInfo *, char * ); +extern void setup_ntlmv2_rsp(struct cifsSesInfo *, char *, + const struct nls_table *); +#ifdef CONFIG_CIFS_WEAK_PW_HASH +extern void calc_lanman_hash(struct cifsSesInfo * ses, char * lnm_session_key); +#endif /* CIFS_WEAK_PW_HASH */ extern int CIFSSMBCopy(int xid, struct cifsTconInfo *source_tcon, const char *fromName, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 925881e00ff..19678c575df 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -44,8 +44,11 @@ static struct { int index; char *name; } protocols[] = { +#ifdef CONFIG_CIFS_WEAK_PW_HASH + {LANMAN_PROT, "\2LM1.2X002"}, +#endif /* weak password hashing for legacy clients */ {CIFS_PROT, "\2NT LM 0.12"}, - {CIFS_PROT, "\2POSIX 2"}, + {POSIX_PROT, "\2POSIX 2"}, {BAD_PROT, "\2"} }; #else @@ -53,11 +56,29 @@ static struct { int index; char *name; } protocols[] = { +#ifdef CONFIG_CIFS_WEAK_PW_HASH + {LANMAN_PROT, "\2LM1.2X002"}, +#endif /* weak password hashing for legacy clients */ {CIFS_PROT, "\2NT LM 0.12"}, {BAD_PROT, "\2"} }; #endif +/* define the number of elements in the cifs dialect array */ +#ifdef CONFIG_CIFS_POSIX +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define CIFS_NUM_PROT 3 +#else +#define CIFS_NUM_PROT 2 +#endif /* CIFS_WEAK_PW_HASH */ +#else /* not posix */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH +#define CIFS_NUM_PROT 2 +#else +#define CIFS_NUM_PROT 1 +#endif /* CONFIG_CIFS_WEAK_PW_HASH */ +#endif /* CIFS_POSIX */ + /* Mark as invalid, all open files on tree connections since they were closed when session to server was lost */ @@ -188,7 +209,6 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, return rc; } -#ifdef CONFIG_CIFS_EXPERIMENTAL int small_smb_init_no_tc(const int smb_command, const int wct, struct cifsSesInfo *ses, void **request_buf) @@ -214,7 +234,6 @@ small_smb_init_no_tc(const int smb_command, const int wct, return rc; } -#endif /* CONFIG_CIFS_EXPERIMENTAL */ /* If the return code is zero, this function must fill in request_buf pointer */ static int @@ -322,7 +341,8 @@ smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, /* potential retries of smb operations it turns out we can determine */ /* from the mid flags when the request buffer can be resent without */ /* having to use a second distinct buffer for the response */ - *response_buf = *request_buf; + if(response_buf) + *response_buf = *request_buf; header_assemble((struct smb_hdr *) *request_buf, smb_command, tcon, wct /*wct */ ); @@ -373,8 +393,10 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) NEGOTIATE_RSP *pSMBr; int rc = 0; int bytes_returned; + int i; struct TCP_Server_Info * server; u16 count; + unsigned int secFlags; if(ses->server) server = ses->server; @@ -386,101 +408,200 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) (void **) &pSMB, (void **) &pSMBr); if (rc) return rc; + + /* if any of auth flags (ie not sign or seal) are overriden use them */ + if(ses->overrideSecFlg & (~(CIFSSEC_MUST_SIGN | CIFSSEC_MUST_SEAL))) + secFlags = ses->overrideSecFlg; + else /* if override flags set only sign/seal OR them with global auth */ + secFlags = extended_security | ses->overrideSecFlg; + + cFYI(1,("secFlags 0x%x",secFlags)); + pSMB->hdr.Mid = GetNextMid(server); pSMB->hdr.Flags2 |= SMBFLG2_UNICODE; - if (extended_security) + if((secFlags & CIFSSEC_MUST_KRB5) == CIFSSEC_MUST_KRB5) pSMB->hdr.Flags2 |= SMBFLG2_EXT_SEC; - - count = strlen(protocols[0].name) + 1; - strncpy(pSMB->DialectsArray, protocols[0].name, 30); - /* null guaranteed to be at end of source and target buffers anyway */ - + + count = 0; + for(i=0;i<CIFS_NUM_PROT;i++) { + strncpy(pSMB->DialectsArray+count, protocols[i].name, 16); + count += strlen(protocols[i].name) + 1; + /* null at end of source and target buffers anyway */ + } pSMB->hdr.smb_buf_length += count; pSMB->ByteCount = cpu_to_le16(count); rc = SendReceive(xid, ses, (struct smb_hdr *) pSMB, (struct smb_hdr *) pSMBr, &bytes_returned, 0); - if (rc == 0) { - server->secMode = pSMBr->SecurityMode; - if((server->secMode & SECMODE_USER) == 0) - cFYI(1,("share mode security")); - server->secType = NTLM; /* BB override default for - NTLMv2 or kerberos v5 */ - /* one byte - no need to convert this or EncryptionKeyLen - from little endian */ - server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount); - /* probably no need to store and check maxvcs */ - server->maxBuf = - min(le32_to_cpu(pSMBr->MaxBufferSize), + if (rc != 0) + goto neg_err_exit; + + cFYI(1,("Dialect: %d", pSMBr->DialectIndex)); + /* Check wct = 1 error case */ + if((pSMBr->hdr.WordCount < 13) || (pSMBr->DialectIndex == BAD_PROT)) { + /* core returns wct = 1, but we do not ask for core - otherwise + small wct just comes when dialect index is -1 indicating we + could not negotiate a common dialect */ + rc = -EOPNOTSUPP; + goto neg_err_exit; +#ifdef CONFIG_CIFS_WEAK_PW_HASH + } else if((pSMBr->hdr.WordCount == 13) + && (pSMBr->DialectIndex == LANMAN_PROT)) { + struct lanman_neg_rsp * rsp = (struct lanman_neg_rsp *)pSMBr; + + if((secFlags & CIFSSEC_MAY_LANMAN) || + (secFlags & CIFSSEC_MAY_PLNTXT)) + server->secType = LANMAN; + else { + cERROR(1, ("mount failed weak security disabled" + " in /proc/fs/cifs/SecurityFlags")); + rc = -EOPNOTSUPP; + goto neg_err_exit; + } + server->secMode = (__u8)le16_to_cpu(rsp->SecurityMode); + server->maxReq = le16_to_cpu(rsp->MaxMpxCount); + server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), + (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); + GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey); + /* even though we do not use raw we might as well set this + accurately, in case we ever find a need for it */ + if((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { + server->maxRw = 0xFF00; + server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE; + } else { + server->maxRw = 0;/* we do not need to use raw anyway */ + server->capabilities = CAP_MPX_MODE; + } + server->timeZone = le16_to_cpu(rsp->ServerTimeZone); + + /* BB get server time for time conversions and add + code to use it and timezone since this is not UTC */ + + if (rsp->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { + memcpy(server->cryptKey, rsp->EncryptionKey, + CIFS_CRYPTO_KEY_SIZE); + } else if (server->secMode & SECMODE_PW_ENCRYPT) { + rc = -EIO; /* need cryptkey unless plain text */ + goto neg_err_exit; + } + + cFYI(1,("LANMAN negotiated")); + /* we will not end up setting signing flags - as no signing + was in LANMAN and server did not return the flags on */ + goto signing_check; +#else /* weak security disabled */ + } else if(pSMBr->hdr.WordCount == 13) { + cERROR(1,("mount failed, cifs module not built " + "with CIFS_WEAK_PW_HASH support")); + rc = -EOPNOTSUPP; +#endif /* WEAK_PW_HASH */ + goto neg_err_exit; + } else if(pSMBr->hdr.WordCount != 17) { + /* unknown wct */ + rc = -EOPNOTSUPP; + goto neg_err_exit; + } + /* else wct == 17 NTLM */ + server->secMode = pSMBr->SecurityMode; + if((server->secMode & SECMODE_USER) == 0) + cFYI(1,("share mode security")); + + if((server->secMode & SECMODE_PW_ENCRYPT) == 0) +#ifdef CONFIG_CIFS_WEAK_PW_HASH + if ((secFlags & CIFSSEC_MAY_PLNTXT) == 0) +#endif /* CIFS_WEAK_PW_HASH */ + cERROR(1,("Server requests plain text password" + " but client support disabled")); + + if((secFlags & CIFSSEC_MUST_NTLMV2) == CIFSSEC_MUST_NTLMV2) + server->secType = NTLMv2; + else if(secFlags & CIFSSEC_MAY_NTLM) + server->secType = NTLM; + else if(secFlags & CIFSSEC_MAY_NTLMV2) + server->secType = NTLMv2; + /* else krb5 ... any others ... */ + + /* one byte, so no need to convert this or EncryptionKeyLen from + little endian */ + server->maxReq = le16_to_cpu(pSMBr->MaxMpxCount); + /* probably no need to store and check maxvcs */ + server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); - server->maxRw = le32_to_cpu(pSMBr->MaxRawSize); - cFYI(0, ("Max buf = %d", ses->server->maxBuf)); - GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey); - server->capabilities = le32_to_cpu(pSMBr->Capabilities); - server->timeZone = le16_to_cpu(pSMBr->ServerTimeZone); - /* BB with UTC do we ever need to be using srvr timezone? */ - if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { - memcpy(server->cryptKey, pSMBr->u.EncryptionKey, - CIFS_CRYPTO_KEY_SIZE); - } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) - && (pSMBr->EncryptionKeyLength == 0)) { - /* decode security blob */ - } else - rc = -EIO; + server->maxRw = le32_to_cpu(pSMBr->MaxRawSize); + cFYI(0, ("Max buf = %d", ses->server->maxBuf)); + GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey); + server->capabilities = le32_to_cpu(pSMBr->Capabilities); + server->timeZone = le16_to_cpu(pSMBr->ServerTimeZone); + if (pSMBr->EncryptionKeyLength == CIFS_CRYPTO_KEY_SIZE) { + memcpy(server->cryptKey, pSMBr->u.EncryptionKey, + CIFS_CRYPTO_KEY_SIZE); + } else if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) + && (pSMBr->EncryptionKeyLength == 0)) { + /* decode security blob */ + } else if (server->secMode & SECMODE_PW_ENCRYPT) { + rc = -EIO; /* no crypt key only if plain text pwd */ + goto neg_err_exit; + } - /* BB might be helpful to save off the domain of server here */ + /* BB might be helpful to save off the domain of server here */ - if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) && - (server->capabilities & CAP_EXTENDED_SECURITY)) { - count = pSMBr->ByteCount; - if (count < 16) - rc = -EIO; - else if (count == 16) { - server->secType = RawNTLMSSP; - if (server->socketUseCount.counter > 1) { - if (memcmp - (server->server_GUID, - pSMBr->u.extended_response. - GUID, 16) != 0) { - cFYI(1, ("server UID changed")); - memcpy(server-> - server_GUID, - pSMBr->u. - extended_response. - GUID, 16); - } - } else + if ((pSMBr->hdr.Flags2 & SMBFLG2_EXT_SEC) && + (server->capabilities & CAP_EXTENDED_SECURITY)) { + count = pSMBr->ByteCount; + if (count < 16) + rc = -EIO; + else if (count == 16) { + server->secType = RawNTLMSSP; + if (server->socketUseCount.counter > 1) { + if (memcmp(server->server_GUID, + pSMBr->u.extended_response. + GUID, 16) != 0) { + cFYI(1, ("server UID changed")); memcpy(server->server_GUID, - pSMBr->u.extended_response. - GUID, 16); - } else { - rc = decode_negTokenInit(pSMBr->u. - extended_response. - SecurityBlob, - count - 16, - &server->secType); - if(rc == 1) { - /* BB Need to fill struct for sessetup here */ - rc = -EOPNOTSUPP; - } else { - rc = -EINVAL; + pSMBr->u.extended_response.GUID, + 16); } + } else + memcpy(server->server_GUID, + pSMBr->u.extended_response.GUID, 16); + } else { + rc = decode_negTokenInit(pSMBr->u.extended_response. + SecurityBlob, + count - 16, + &server->secType); + if(rc == 1) { + /* BB Need to fill struct for sessetup here */ + rc = -EOPNOTSUPP; + } else { + rc = -EINVAL; } - } else - server->capabilities &= ~CAP_EXTENDED_SECURITY; - if(sign_CIFS_PDUs == FALSE) { - if(server->secMode & SECMODE_SIGN_REQUIRED) - cERROR(1, - ("Server requires /proc/fs/cifs/PacketSigningEnabled")); - server->secMode &= ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); - } else if(sign_CIFS_PDUs == 1) { - if((server->secMode & SECMODE_SIGN_REQUIRED) == 0) - server->secMode &= ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); } - + } else + server->capabilities &= ~CAP_EXTENDED_SECURITY; + +#ifdef CONFIG_CIFS_WEAK_PW_HASH +signing_check: +#endif + if(sign_CIFS_PDUs == FALSE) { + if(server->secMode & SECMODE_SIGN_REQUIRED) + cERROR(1,("Server requires " + "/proc/fs/cifs/PacketSigningEnabled to be on")); + server->secMode &= + ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); + } else if(sign_CIFS_PDUs == 1) { + if((server->secMode & SECMODE_SIGN_REQUIRED) == 0) + server->secMode &= + ~(SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED); + } else if(sign_CIFS_PDUs == 2) { + if((server->secMode & + (SECMODE_SIGN_ENABLED | SECMODE_SIGN_REQUIRED)) == 0) { + cERROR(1,("signing required but server lacks support")); + } } - +neg_err_exit: cifs_buf_release(pSMB); + + cFYI(1,("negprot rc %d",rc)); return rc; } @@ -2239,7 +2360,7 @@ CIFSSMBQueryReparseLinkInfo(const int xid, struct cifsTconInfo *tcon, } symlinkinfo[buflen] = 0; /* just in case so the caller does not go off the end of the buffer */ - cFYI(1,("readlink result - %s ",symlinkinfo)); + cFYI(1,("readlink result - %s",symlinkinfo)); } } qreparse_out: diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index bae1479318d..876eb9ef85f 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -49,8 +49,6 @@ static DECLARE_COMPLETION(cifsd_complete); -extern void SMBencrypt(unsigned char *passwd, unsigned char *c8, - unsigned char *p24); extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24); @@ -70,6 +68,7 @@ struct smb_vol { gid_t linux_gid; mode_t file_mode; mode_t dir_mode; + unsigned secFlg; unsigned rw:1; unsigned retry:1; unsigned intr:1; @@ -83,12 +82,7 @@ struct smb_vol { unsigned remap:1; /* set to remap seven reserved chars in filenames */ unsigned posix_paths:1; /* unset to not ask for posix pathnames. */ unsigned sfu_emul:1; - unsigned krb5:1; - unsigned ntlm:1; - unsigned ntlmv2:1; unsigned nullauth:1; /* attempt to authenticate with null user */ - unsigned sign:1; - unsigned seal:1; /* encrypt */ unsigned nocase; /* request case insensitive filenames */ unsigned nobrl; /* disable sending byte range locks to srv */ unsigned int rsize; @@ -369,21 +363,21 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server) continue; if (bigbuf == NULL) { bigbuf = cifs_buf_get(); - if(bigbuf == NULL) { - cERROR(1,("No memory for large SMB response")); + if (!bigbuf) { + cERROR(1, ("No memory for large SMB response")); msleep(3000); /* retry will check if exiting */ continue; } - } else if(isLargeBuf) { - /* we are reusing a dirtry large buf, clear its start */ + } else if (isLargeBuf) { + /* we are reusing a dirty large buf, clear its start */ memset(bigbuf, 0, sizeof (struct smb_hdr)); } if (smallbuf == NULL) { smallbuf = cifs_small_buf_get(); - if(smallbuf == NULL) { - cERROR(1,("No memory for SMB response")); + if (!smallbuf) { + cERROR(1, ("No memory for SMB response")); msleep(1000); /* retry will check if exiting */ continue; @@ -403,12 +397,12 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server) kernel_recvmsg(csocket, &smb_msg, &iov, 1, 4, 0 /* BB see socket.h flags */); - if(server->tcpStatus == CifsExiting) { + if (server->tcpStatus == CifsExiting) { break; } else if (server->tcpStatus == CifsNeedReconnect) { - cFYI(1,("Reconnect after server stopped responding")); + cFYI(1, ("Reconnect after server stopped responding")); cifs_reconnect(server); - cFYI(1,("call to reconnect done")); + cFYI(1, ("call to reconnect done")); csocket = server->ssocket; continue; } else if ((length == -ERESTARTSYS) || (length == -EAGAIN)) { @@ -417,15 +411,15 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server) tcpStatus CifsNeedReconnect if server hung */ continue; } else if (length <= 0) { - if(server->tcpStatus == CifsNew) { - cFYI(1,("tcp session abend after SMBnegprot")); + if (server->tcpStatus == CifsNew) { + cFYI(1, ("tcp session abend after SMBnegprot")); /* some servers kill the TCP session rather than returning an SMB negprot error, in which case reconnecting here is not going to help, and so simply return error to mount */ break; } - if(length == -EINTR) { + if (!try_to_freeze() && (length == -EINTR)) { cFYI(1,("cifsd thread killed")); break; } @@ -585,9 +579,11 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server) /* merge response - fix up 1st*/ if(coalesce_t2(smb_buffer, mid_entry->resp_buf)) { + mid_entry->multiRsp = 1; break; } else { /* all parts received */ + mid_entry->multiEnd = 1; goto multi_t2_fnd; } } else { @@ -632,9 +628,14 @@ multi_t2_fnd: wake_up_process(task_to_wake); } else if ((is_valid_oplock_break(smb_buffer, server) == FALSE) && (isMultiRsp == FALSE)) { - cERROR(1, ("No task to wake, unknown frame rcvd!")); + cERROR(1, ("No task to wake, unknown frame rcvd! NumMids %d", midCount.counter)); cifs_dump_mem("Received Data is: ",(char *)smb_buffer, sizeof(struct smb_hdr)); +#ifdef CONFIG_CIFS_DEBUG2 + cifs_dump_detail(smb_buffer); + cifs_dump_mids(server); +#endif /* CIFS_DEBUG2 */ + } } /* end while !EXITING */ @@ -784,7 +785,6 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol) /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */ vol->rw = TRUE; - vol->ntlm = TRUE; /* default is always to request posix paths. */ vol->posix_paths = 1; @@ -915,30 +915,35 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol) cERROR(1,("no security value specified")); continue; } else if (strnicmp(value, "krb5i", 5) == 0) { - vol->sign = 1; - vol->krb5 = 1; + vol->secFlg |= CIFSSEC_MAY_KRB5 | + CIFSSEC_MUST_SIGN; } else if (strnicmp(value, "krb5p", 5) == 0) { - /* vol->seal = 1; - vol->krb5 = 1; */ + /* vol->secFlg |= CIFSSEC_MUST_SEAL | + CIFSSEC_MAY_KRB5; */ cERROR(1,("Krb5 cifs privacy not supported")); return 1; } else if (strnicmp(value, "krb5", 4) == 0) { - vol->krb5 = 1; + vol->secFlg |= CIFSSEC_MAY_KRB5; } else if (strnicmp(value, "ntlmv2i", 7) == 0) { - vol->ntlmv2 = 1; - vol->sign = 1; + vol->secFlg |= CIFSSEC_MAY_NTLMV2 | + CIFSSEC_MUST_SIGN; } else if (strnicmp(value, "ntlmv2", 6) == 0) { - vol->ntlmv2 = 1; + vol->secFlg |= CIFSSEC_MAY_NTLMV2; } else if (strnicmp(value, "ntlmi", 5) == 0) { - vol->ntlm = 1; - vol->sign = 1; + vol->secFlg |= CIFSSEC_MAY_NTLM | + CIFSSEC_MUST_SIGN; } else if (strnicmp(value, "ntlm", 4) == 0) { /* ntlm is default so can be turned off too */ - vol->ntlm = 1; + vol->secFlg |= CIFSSEC_MAY_NTLM; } else if (strnicmp(value, "nontlm", 6) == 0) { - vol->ntlm = 0; + /* BB is there a better way to do this? */ + vol->secFlg |= CIFSSEC_MAY_NTLMV2; +#ifdef CONFIG_CIFS_WEAK_PW_HASH + } else if (strnicmp(value, "lanman", 6) == 0) { + vol->secFlg |= CIFSSEC_MAY_LANMAN; +#endif } else if (strnicmp(value, "none", 4) == 0) { - vol->nullauth = 1; + vol->nullauth = 1; } else { cERROR(1,("bad security option: %s", value)); return 1; @@ -976,7 +981,7 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol) } /* BB are there cases in which a comma can be valid in a domain name and need special handling? */ - if (strnlen(value, 65) < 65) { + if (strnlen(value, 256) < 256) { vol->domainname = value; cFYI(1, ("Domain name set")); } else { @@ -1168,6 +1173,10 @@ cifs_parse_mount_options(char *options, const char *devname,struct smb_vol *vol) vol->no_psx_acl = 0; } else if (strnicmp(data, "noacl",5) == 0) { vol->no_psx_acl = 1; + } else if (strnicmp(data, "sign",4) == 0) { + vol->secFlg |= CIFSSEC_MUST_SIGN; +/* } else if (strnicmp(data, "seal",4) == 0) { + vol->secFlg |= CIFSSEC_MUST_SEAL; */ } else if (strnicmp(data, "direct",6) == 0) { vol->direct_io = 1; } else if (strnicmp(data, "forcedirectio",13) == 0) { @@ -1762,11 +1771,18 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, if (volume_info.username) strncpy(pSesInfo->userName, volume_info.username,MAX_USERNAME_SIZE); - if (volume_info.domainname) - strncpy(pSesInfo->domainName, - volume_info.domainname,MAX_USERNAME_SIZE); + if (volume_info.domainname) { + int len = strlen(volume_info.domainname); + pSesInfo->domainName = + kmalloc(len + 1, GFP_KERNEL); + if(pSesInfo->domainName) + strcpy(pSesInfo->domainName, + volume_info.domainname); + } pSesInfo->linux_uid = volume_info.linux_uid; + pSesInfo->overrideSecFlg = volume_info.secFlg; down(&pSesInfo->sesSem); + /* BB FIXME need to pass vol->secFlgs BB */ rc = cifs_setup_session(xid,pSesInfo, cifs_sb->local_nls); up(&pSesInfo->sesSem); if(!rc) @@ -1980,7 +1996,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, static int CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, - char session_key[CIFS_SESSION_KEY_SIZE], + char session_key[CIFS_SESS_KEY_SIZE], const struct nls_table *nls_codepage) { struct smb_hdr *smb_buffer; @@ -2038,15 +2054,15 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); pSMB->req_no_secext.CaseInsensitivePasswordLength = - cpu_to_le16(CIFS_SESSION_KEY_SIZE); + cpu_to_le16(CIFS_SESS_KEY_SIZE); pSMB->req_no_secext.CaseSensitivePasswordLength = - cpu_to_le16(CIFS_SESSION_KEY_SIZE); + cpu_to_le16(CIFS_SESS_KEY_SIZE); bcc_ptr = pByteArea(smb_buffer); - memcpy(bcc_ptr, (char *) session_key, CIFS_SESSION_KEY_SIZE); - bcc_ptr += CIFS_SESSION_KEY_SIZE; - memcpy(bcc_ptr, (char *) session_key, CIFS_SESSION_KEY_SIZE); - bcc_ptr += CIFS_SESSION_KEY_SIZE; + memcpy(bcc_ptr, (char *) session_key, CIFS_SESS_KEY_SIZE); + bcc_ptr += CIFS_SESS_KEY_SIZE; + memcpy(bcc_ptr, (char *) session_key, CIFS_SESS_KEY_SIZE); + bcc_ptr += CIFS_SESS_KEY_SIZE; if (ses->capabilities & CAP_UNICODE) { if ((long) bcc_ptr % 2) { /* must be word aligned for Unicode */ @@ -2054,7 +2070,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; } if(user == NULL) - bytes_returned = 0; /* skill null user */ + bytes_returned = 0; /* skip null user */ else bytes_returned = cifs_strtoUCS((__le16 *) bcc_ptr, user, 100, @@ -2162,8 +2178,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, if (remaining_words > 0) { len = UniStrnlen((wchar_t *)bcc_ptr, remaining_words-1); - if(ses->serverNOS) - kfree(ses->serverNOS); + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2 * (len + 1),GFP_KERNEL); if(ses->serverNOS == NULL) goto sesssetup_nomem; @@ -2203,12 +2218,10 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, /* if these kcallocs fail not much we can do, but better to not fail the sesssetup itself */ - if(ses->serverDomain) - kfree(ses->serverDomain); + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2, GFP_KERNEL); - if(ses->serverNOS) - kfree(ses->serverNOS); + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2, GFP_KERNEL); } @@ -2217,8 +2230,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, if (((long) bcc_ptr + len) - (long) pByteArea(smb_buffer_response) <= BCC(smb_buffer_response)) { - if(ses->serverOS) - kfree(ses->serverOS); + kfree(ses->serverOS); ses->serverOS = kzalloc(len + 1,GFP_KERNEL); if(ses->serverOS == NULL) goto sesssetup_nomem; @@ -2229,8 +2241,7 @@ CIFSSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; len = strnlen(bcc_ptr, 1024); - if(ses->serverNOS) - kfree(ses->serverNOS); + kfree(ses->serverNOS); ses->serverNOS = kzalloc(len + 1,GFP_KERNEL); if(ses->serverNOS == NULL) goto sesssetup_nomem; @@ -2274,292 +2285,6 @@ sesssetup_nomem: /* do not return an error on nomem for the info strings, } static int -CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses, - char *SecurityBlob,int SecurityBlobLength, - const struct nls_table *nls_codepage) -{ - struct smb_hdr *smb_buffer; - struct smb_hdr *smb_buffer_response; - SESSION_SETUP_ANDX *pSMB; - SESSION_SETUP_ANDX *pSMBr; - char *bcc_ptr; - char *user; - char *domain; - int rc = 0; - int remaining_words = 0; - int bytes_returned = 0; - int len; - __u32 capabilities; - __u16 count; - - cFYI(1, ("In spnego sesssetup ")); - if(ses == NULL) - return -EINVAL; - user = ses->userName; - domain = ses->domainName; - - smb_buffer = cifs_buf_get(); - if (smb_buffer == NULL) { - return -ENOMEM; - } - smb_buffer_response = smb_buffer; - pSMBr = pSMB = (SESSION_SETUP_ANDX *) smb_buffer; - - /* send SMBsessionSetup here */ - header_assemble(smb_buffer, SMB_COM_SESSION_SETUP_ANDX, - NULL /* no tCon exists yet */ , 12 /* wct */ ); - - smb_buffer->Mid = GetNextMid(ses->server); - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; - pSMB->req.AndXCommand = 0xFF; - if(ses->server->maxBuf > 64*1024) - ses->server->maxBuf = (64*1023); - pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); - pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); - - if(ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - smb_buffer->Flags2 |= SMBFLG2_SECURITY_SIGNATURE; - - capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS | - CAP_EXTENDED_SECURITY; - if (ses->capabilities & CAP_UNICODE) { - smb_buffer->Flags2 |= SMBFLG2_UNICODE; - capabilities |= CAP_UNICODE; - } - if (ses->capabilities & CAP_STATUS32) { - smb_buffer->Flags2 |= SMBFLG2_ERR_STATUS; - capabilities |= CAP_STATUS32; - } - if (ses->capabilities & CAP_DFS) { - smb_buffer->Flags2 |= SMBFLG2_DFS; - capabilities |= CAP_DFS; - } - pSMB->req.Capabilities = cpu_to_le32(capabilities); - - pSMB->req.SecurityBlobLength = cpu_to_le16(SecurityBlobLength); - bcc_ptr = pByteArea(smb_buffer); - memcpy(bcc_ptr, SecurityBlob, SecurityBlobLength); - bcc_ptr += SecurityBlobLength; - - if (ses->capabilities & CAP_UNICODE) { - if ((long) bcc_ptr % 2) { /* must be word aligned for Unicode strings */ - *bcc_ptr = 0; - bcc_ptr++; - } - bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, user, 100, nls_codepage); - bcc_ptr += 2 * bytes_returned; /* convert num of 16 bit words to bytes */ - bcc_ptr += 2; /* trailing null */ - if (domain == NULL) - bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, - "CIFS_LINUX_DOM", 32, nls_codepage); - else - bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, domain, 64, - nls_codepage); - bcc_ptr += 2 * bytes_returned; - bcc_ptr += 2; - bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, "Linux version ", - 32, nls_codepage); - bcc_ptr += 2 * bytes_returned; - bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, 32, - nls_codepage); - bcc_ptr += 2 * bytes_returned; - bcc_ptr += 2; - bytes_returned = - cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS, - 64, nls_codepage); - bcc_ptr += 2 * bytes_returned; - bcc_ptr += 2; - } else { - strncpy(bcc_ptr, user, 200); - bcc_ptr += strnlen(user, 200); - *bcc_ptr = 0; - bcc_ptr++; - if (domain == NULL) { - strcpy(bcc_ptr, "CIFS_LINUX_DOM"); - bcc_ptr += strlen("CIFS_LINUX_DOM") + 1; - } else { - strncpy(bcc_ptr, domain, 64); - bcc_ptr += strnlen(domain, 64); - *bcc_ptr = 0; - bcc_ptr++; - } - strcpy(bcc_ptr, "Linux version "); - bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, system_utsname.release); - bcc_ptr += strlen(system_utsname.release) + 1; - strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); - bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; - } - count = (long) bcc_ptr - (long) pByteArea(smb_buffer); - smb_buffer->smb_buf_length += count; - pSMB->req.ByteCount = cpu_to_le16(count); - - rc = SendReceive(xid, ses, smb_buffer, smb_buffer_response, - &bytes_returned, 1); - if (rc) { -/* rc = map_smb_to_linux_error(smb_buffer_response); *//* done in SendReceive now */ - } else if ((smb_buffer_response->WordCount == 3) - || (smb_buffer_response->WordCount == 4)) { - __u16 action = le16_to_cpu(pSMBr->resp.Action); - __u16 blob_len = - le16_to_cpu(pSMBr->resp.SecurityBlobLength); - if (action & GUEST_LOGIN) - cFYI(1, (" Guest login")); /* BB do we want to set anything in SesInfo struct ? */ - if (ses) { - ses->Suid = smb_buffer_response->Uid; /* UID left in wire format (le) */ - cFYI(1, ("UID = %d ", ses->Suid)); - bcc_ptr = pByteArea(smb_buffer_response); /* response can have either 3 or 4 word count - Samba sends 3 */ - - /* BB Fix below to make endian neutral !! */ - - if ((pSMBr->resp.hdr.WordCount == 3) - || ((pSMBr->resp.hdr.WordCount == 4) - && (blob_len < - pSMBr->resp.ByteCount))) { - if (pSMBr->resp.hdr.WordCount == 4) { - bcc_ptr += - blob_len; - cFYI(1, - ("Security Blob Length %d ", - blob_len)); - } - - if (smb_buffer->Flags2 & SMBFLG2_UNICODE) { - if ((long) (bcc_ptr) % 2) { - remaining_words = - (BCC(smb_buffer_response) - - 1) / 2; - bcc_ptr++; /* Unicode strings must be word aligned */ - } else { - remaining_words = - BCC - (smb_buffer_response) / 2; - } - len = - UniStrnlen((wchar_t *) bcc_ptr, - remaining_words - 1); -/* We look for obvious messed up bcc or strings in response so we do not go off - the end since (at least) WIN2K and Windows XP have a major bug in not null - terminating last Unicode string in response */ - if(ses->serverOS) - kfree(ses->serverOS); - ses->serverOS = - kzalloc(2 * (len + 1), GFP_KERNEL); - cifs_strfromUCS_le(ses->serverOS, - (__le16 *) - bcc_ptr, len, - nls_codepage); - bcc_ptr += 2 * (len + 1); - remaining_words -= len + 1; - ses->serverOS[2 * len] = 0; - ses->serverOS[1 + (2 * len)] = 0; - if (remaining_words > 0) { - len = UniStrnlen((wchar_t *)bcc_ptr, - remaining_words - - 1); - if(ses->serverNOS) - kfree(ses->serverNOS); - ses->serverNOS = - kzalloc(2 * (len + 1), - GFP_KERNEL); - cifs_strfromUCS_le(ses->serverNOS, - (__le16 *)bcc_ptr, - len, - nls_codepage); - bcc_ptr += 2 * (len + 1); - ses->serverNOS[2 * len] = 0; - ses->serverNOS[1 + (2 * len)] = 0; - remaining_words -= len + 1; - if (remaining_words > 0) { - len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); - /* last string not null terminated (e.g.Windows XP/2000) */ - if(ses->serverDomain) - kfree(ses->serverDomain); - ses->serverDomain = kzalloc(2*(len+1),GFP_KERNEL); - cifs_strfromUCS_le(ses->serverDomain, - (__le16 *)bcc_ptr, - len, nls_codepage); - bcc_ptr += 2*(len+1); - ses->serverDomain[2*len] = 0; - ses->serverDomain[1+(2*len)] = 0; - } /* else no more room so create dummy domain string */ - else { - if(ses->serverDomain) - kfree(ses->serverDomain); - ses->serverDomain = - kzalloc(2,GFP_KERNEL); - } - } else {/* no room use dummy domain&NOS */ - if(ses->serverDomain) - kfree(ses->serverDomain); - ses->serverDomain = kzalloc(2, GFP_KERNEL); - if(ses->serverNOS) - kfree(ses->serverNOS); - ses->serverNOS = kzalloc(2, GFP_KERNEL); - } - } else { /* ASCII */ - - len = strnlen(bcc_ptr, 1024); - if (((long) bcc_ptr + len) - (long) - pByteArea(smb_buffer_response) - <= BCC(smb_buffer_response)) { - if(ses->serverOS) - kfree(ses->serverOS); - ses->serverOS = kzalloc(len + 1, GFP_KERNEL); - strncpy(ses->serverOS, bcc_ptr, len); - - bcc_ptr += len; - bcc_ptr[0] = 0; /* null terminate the string */ - bcc_ptr++; - - len = strnlen(bcc_ptr, 1024); - if(ses->serverNOS) - kfree(ses->serverNOS); - ses->serverNOS = kzalloc(len + 1,GFP_KERNEL); - strncpy(ses->serverNOS, bcc_ptr, len); - bcc_ptr += len; - bcc_ptr[0] = 0; - bcc_ptr++; - - len = strnlen(bcc_ptr, 1024); - if(ses->serverDomain) - kfree(ses->serverDomain); - ses->serverDomain = kzalloc(len + 1, GFP_KERNEL); - strncpy(ses->serverDomain, bcc_ptr, len); - bcc_ptr += len; - bcc_ptr[0] = 0; - bcc_ptr++; - } else - cFYI(1, - ("Variable field of length %d extends beyond end of smb ", - len)); - } - } else { - cERROR(1, - (" Security Blob Length extends beyond end of SMB")); - } - } else { - cERROR(1, ("No session structure passed in.")); - } - } else { - cERROR(1, - (" Invalid Word count %d: ", - smb_buffer_response->WordCount)); - rc = -EIO; - } - - if (smb_buffer) - cifs_buf_release(smb_buffer); - - return rc; -} - -static int CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, struct cifsSesInfo *ses, int * pNTLMv2_flag, const struct nls_table *nls_codepage) @@ -2635,8 +2360,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, /* NTLMSSP_NEGOTIATE_ALWAYS_SIGN | */ NTLMSSP_NEGOTIATE_128; if(sign_CIFS_PDUs) negotiate_flags |= NTLMSSP_NEGOTIATE_SIGN; - if(ntlmv2_support) - negotiate_flags |= NTLMSSP_NEGOTIATE_NTLMV2; +/* if(ntlmv2_support) + negotiate_flags |= NTLMSSP_NEGOTIATE_NTLMV2;*/ /* setup pointers to domain name and workstation name */ bcc_ptr += SecurityBlobLength; @@ -2783,8 +2508,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, bcc_ptr, remaining_words - 1); - if(ses->serverNOS) - kfree(ses->serverNOS); + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2 * (len + 1), GFP_KERNEL); @@ -2802,8 +2526,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, if (remaining_words > 0) { len = UniStrnlen((wchar_t *) bcc_ptr, remaining_words); /* last string is not always null terminated (for e.g. for Windows XP & 2000) */ - if(ses->serverDomain) - kfree(ses->serverDomain); + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2 * (len + @@ -2822,19 +2545,16 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, = 0; } /* else no more room so create dummy domain string */ else { - if(ses->serverDomain) - kfree(ses->serverDomain); + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2, GFP_KERNEL); } } else { /* no room so create dummy domain and NOS string */ - if(ses->serverDomain); - kfree(ses->serverDomain); + kfree(ses->serverDomain); ses->serverDomain = kzalloc(2, GFP_KERNEL); - if(ses->serverNOS) - kfree(ses->serverNOS); + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2, GFP_KERNEL); } @@ -2856,8 +2576,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, bcc_ptr++; len = strnlen(bcc_ptr, 1024); - if(ses->serverNOS) - kfree(ses->serverNOS); + kfree(ses->serverNOS); ses->serverNOS = kzalloc(len + 1, GFP_KERNEL); @@ -2867,8 +2586,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, bcc_ptr++; len = strnlen(bcc_ptr, 1024); - if(ses->serverDomain) - kfree(ses->serverDomain); + kfree(ses->serverDomain); ses->serverDomain = kzalloc(len + 1, GFP_KERNEL); @@ -2994,14 +2712,14 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, SecurityBlob->LmChallengeResponse.Buffer = 0; SecurityBlob->NtChallengeResponse.Length = - cpu_to_le16(CIFS_SESSION_KEY_SIZE); + cpu_to_le16(CIFS_SESS_KEY_SIZE); SecurityBlob->NtChallengeResponse.MaximumLength = - cpu_to_le16(CIFS_SESSION_KEY_SIZE); - memcpy(bcc_ptr, ntlm_session_key, CIFS_SESSION_KEY_SIZE); + cpu_to_le16(CIFS_SESS_KEY_SIZE); + memcpy(bcc_ptr, ntlm_session_key, CIFS_SESS_KEY_SIZE); SecurityBlob->NtChallengeResponse.Buffer = cpu_to_le32(SecurityBlobLength); - SecurityBlobLength += CIFS_SESSION_KEY_SIZE; - bcc_ptr += CIFS_SESSION_KEY_SIZE; + SecurityBlobLength += CIFS_SESS_KEY_SIZE; + bcc_ptr += CIFS_SESS_KEY_SIZE; if (ses->capabilities & CAP_UNICODE) { if (domain == NULL) { @@ -3190,8 +2908,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr, remaining_words - 1); - if(ses->serverNOS) - kfree(ses->serverNOS); + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2 * (len + 1), GFP_KERNEL); @@ -3244,8 +2961,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, if(ses->serverDomain) kfree(ses->serverDomain); ses->serverDomain = kzalloc(2, GFP_KERNEL); - if(ses->serverNOS) - kfree(ses->serverNOS); + kfree(ses->serverNOS); ses->serverNOS = kzalloc(2, GFP_KERNEL); } } else { /* ASCII */ @@ -3263,8 +2979,7 @@ CIFSNTLMSSPAuthSessSetup(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr++; len = strnlen(bcc_ptr, 1024); - if(ses->serverNOS) - kfree(ses->serverNOS); + kfree(ses->serverNOS); ses->serverNOS = kzalloc(len+1,GFP_KERNEL); strncpy(ses->serverNOS, bcc_ptr, len); bcc_ptr += len; @@ -3340,22 +3055,33 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, bcc_ptr = &pSMB->Password[0]; if((ses->server->secMode) & SECMODE_USER) { pSMB->PasswordLength = cpu_to_le16(1); /* minimum */ + *bcc_ptr = 0; /* password is null byte */ bcc_ptr++; /* skip password */ + /* already aligned so no need to do it below */ } else { - pSMB->PasswordLength = cpu_to_le16(CIFS_SESSION_KEY_SIZE); + pSMB->PasswordLength = cpu_to_le16(CIFS_SESS_KEY_SIZE); /* BB FIXME add code to fail this if NTLMv2 or Kerberos specified as required (when that support is added to the vfs in the future) as only NTLM or the much - weaker LANMAN (which we do not send) is accepted + weaker LANMAN (which we do not send by default) is accepted by Samba (not sure whether other servers allow NTLMv2 password here) */ +#ifdef CONFIG_CIFS_WEAK_PW_HASH + if((extended_security & CIFSSEC_MAY_LANMAN) && + (ses->server->secType == LANMAN)) + calc_lanman_hash(ses, bcc_ptr); + else +#endif /* CIFS_WEAK_PW_HASH */ SMBNTencrypt(ses->password, ses->server->cryptKey, bcc_ptr); - bcc_ptr += CIFS_SESSION_KEY_SIZE; - *bcc_ptr = 0; - bcc_ptr++; /* align */ + bcc_ptr += CIFS_SESS_KEY_SIZE; + if(ses->capabilities & CAP_UNICODE) { + /* must align unicode strings */ + *bcc_ptr = 0; /* null byte password */ + bcc_ptr++; + } } if(ses->server->secMode & @@ -3429,7 +3155,10 @@ CIFSTCon(unsigned int xid, struct cifsSesInfo *ses, } /* else do not bother copying these informational fields */ } - tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport); + if(smb_buffer_response->WordCount == 3) + tcon->Flags = le16_to_cpu(pSMBr->OptionalSupport); + else + tcon->Flags = 0; cFYI(1, ("Tcon flags: 0x%x ", tcon->Flags)); } else if ((rc == 0) && tcon == NULL) { /* all we need to save for IPC$ connection */ @@ -3494,7 +3223,7 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, struct nls_table * nls_info) { int rc = 0; - char ntlm_session_key[CIFS_SESSION_KEY_SIZE]; + char ntlm_session_key[CIFS_SESS_KEY_SIZE]; int ntlmv2_flag = FALSE; int first_time = 0; @@ -3526,20 +3255,13 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, pSesInfo->server->secMode, pSesInfo->server->capabilities, pSesInfo->server->timeZone)); -#ifdef CONFIG_CIFS_EXPERIMENTAL - if(experimEnabled > 1) - rc = CIFS_SessSetup(xid, pSesInfo, CIFS_NTLM /* type */, - &ntlmv2_flag, nls_info); - else -#endif - if (extended_security + if(experimEnabled < 2) + rc = CIFS_SessSetup(xid, pSesInfo, + first_time, nls_info); + else if (extended_security && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) && (pSesInfo->server->secType == NTLMSSP)) { - cFYI(1, ("New style sesssetup")); - rc = CIFSSpnegoSessSetup(xid, pSesInfo, - NULL /* security blob */, - 0 /* blob length */, - nls_info); + rc = -EOPNOTSUPP; } else if (extended_security && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) && (pSesInfo->server->secType == RawNTLMSSP)) { diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 82315edc77d..ba4cbe9b068 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -113,7 +113,7 @@ cifs_bp_rename_retry: full_path[namelen+2] = 0; BB remove above eight lines BB */ -/* Inode operations in similar order to how they appear in the Linux file fs.h */ +/* Inode operations in similar order to how they appear in Linux file fs.h */ int cifs_create(struct inode *inode, struct dentry *direntry, int mode, @@ -178,11 +178,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, FreeXid(xid); return -ENOMEM; } - - rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, + if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS) + rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, desiredAccess, CREATE_NOT_DIR, &fileHandle, &oplock, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + else + rc = -EIO; /* no NT SMB support fall into legacy open below */ + if(rc == -EIO) { /* old server, retry the open legacy style */ rc = SMBLegacyOpen(xid, pTcon, full_path, disposition, @@ -191,7 +194,7 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); } if (rc) { - cFYI(1, ("cifs_create returned 0x%x ", rc)); + cFYI(1, ("cifs_create returned 0x%x", rc)); } else { /* If Open reported that we actually created a file then we now have to set the mode if possible */ @@ -369,6 +372,10 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, int mode, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + /* BB FIXME - add handling for backlevel servers + which need legacy open and check for all + calls to SMBOpen for fallback to + SMBLeagcyOpen */ if(!rc) { /* BB Do not bother to decode buf since no local inode yet to put timestamps in, diff --git a/fs/cifs/fcntl.c b/fs/cifs/fcntl.c index 633a9381132..d91a3d44e9e 100644 --- a/fs/cifs/fcntl.c +++ b/fs/cifs/fcntl.c @@ -91,14 +91,14 @@ int cifs_dir_notify(struct file * file, unsigned long arg) if(full_path == NULL) { rc = -ENOMEM; } else { - cERROR(1,("cifs dir notify on file %s with arg 0x%lx",full_path,arg)); /* BB removeme BB */ + cFYI(1,("dir notify on file %s Arg 0x%lx",full_path,arg)); rc = CIFSSMBOpen(xid, pTcon, full_path, FILE_OPEN, GENERIC_READ | SYNCHRONIZE, 0 /* create options */, &netfid, &oplock,NULL, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); /* BB fixme - add this handle to a notify handle list */ if(rc) { - cERROR(1,("Could not open directory for notify")); /* BB remove BB */ + cFYI(1,("Could not open directory for notify")); } else { filter = convert_to_cifs_notify_flags(arg); if(filter != 0) { diff --git a/fs/cifs/file.c b/fs/cifs/file.c index b4a18c1cab0..e9c1573f6aa 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -110,7 +110,6 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file, &pCifsInode->openFileList); } write_unlock(&GlobalSMBSeslock); - write_unlock(&file->f_owner.lock); if (pCifsInode->clientCanCacheRead) { /* we have the inode open somewhere else no need to discard cache data */ @@ -201,7 +200,7 @@ int cifs_open(struct inode *inode, struct file *file) } else { if (file->f_flags & O_EXCL) cERROR(1, ("could not find file instance for " - "new file %p ", file)); + "new file %p", file)); } } @@ -260,10 +259,15 @@ int cifs_open(struct inode *inode, struct file *file) rc = -ENOMEM; goto out; } - rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, desiredAccess, - CREATE_NOT_DIR, &netfid, &oplock, buf, + + if (cifs_sb->tcon->ses->capabilities & CAP_NT_SMBS) + rc = CIFSSMBOpen(xid, pTcon, full_path, disposition, + desiredAccess, CREATE_NOT_DIR, &netfid, &oplock, buf, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + else + rc = -EIO; /* no NT SMB support fall into legacy open below */ + if (rc == -EIO) { /* Old server, try legacy style OpenX */ rc = SMBLegacyOpen(xid, pTcon, full_path, disposition, @@ -272,7 +276,7 @@ int cifs_open(struct inode *inode, struct file *file) & CIFS_MOUNT_MAP_SPECIAL_CHR); } if (rc) { - cFYI(1, ("cifs_open returned 0x%x ", rc)); + cFYI(1, ("cifs_open returned 0x%x", rc)); goto out; } file->private_data = @@ -282,7 +286,6 @@ int cifs_open(struct inode *inode, struct file *file) goto out; } pCifsFile = cifs_init_private(file->private_data, inode, file, netfid); - write_lock(&file->f_owner.lock); write_lock(&GlobalSMBSeslock); list_add(&pCifsFile->tlist, &pTcon->openFileList); @@ -293,7 +296,6 @@ int cifs_open(struct inode *inode, struct file *file) &oplock, buf, full_path, xid); } else { write_unlock(&GlobalSMBSeslock); - write_unlock(&file->f_owner.lock); } if (oplock & CIFS_CREATE_ACTION) { @@ -409,8 +411,8 @@ static int cifs_reopen_file(struct inode *inode, struct file *file, CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) { up(&pCifsFile->fh_sem); - cFYI(1, ("cifs_open returned 0x%x ", rc)); - cFYI(1, ("oplock: %d ", oplock)); + cFYI(1, ("cifs_open returned 0x%x", rc)); + cFYI(1, ("oplock: %d", oplock)); } else { pCifsFile->netfid = netfid; pCifsFile->invalidHandle = FALSE; @@ -472,7 +474,6 @@ int cifs_close(struct inode *inode, struct file *file) pTcon = cifs_sb->tcon; if (pSMBFile) { pSMBFile->closePend = TRUE; - write_lock(&file->f_owner.lock); if (pTcon) { /* no sense reconnecting to close a file that is already closed */ @@ -487,23 +488,18 @@ int cifs_close(struct inode *inode, struct file *file) the struct would be in each open file, but this should give enough time to clear the socket */ - write_unlock(&file->f_owner.lock); cERROR(1,("close with pending writes")); msleep(timeout); - write_lock(&file->f_owner.lock); timeout *= 4; } - write_unlock(&file->f_owner.lock); rc = CIFSSMBClose(xid, pTcon, pSMBFile->netfid); - write_lock(&file->f_owner.lock); } } write_lock(&GlobalSMBSeslock); list_del(&pSMBFile->flist); list_del(&pSMBFile->tlist); write_unlock(&GlobalSMBSeslock); - write_unlock(&file->f_owner.lock); kfree(pSMBFile->search_resume_name); kfree(file->private_data); file->private_data = NULL; @@ -531,7 +527,7 @@ int cifs_closedir(struct inode *inode, struct file *file) (struct cifsFileInfo *)file->private_data; char *ptmp; - cFYI(1, ("Closedir inode = 0x%p with ", inode)); + cFYI(1, ("Closedir inode = 0x%p", inode)); xid = GetXid(); @@ -605,7 +601,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) } if (pfLock->fl_flags & FL_ACCESS) cFYI(1, ("Process suspended by mandatory locking - " - "not implemented yet ")); + "not implemented yet")); if (pfLock->fl_flags & FL_LEASE) cFYI(1, ("Lease on file - not implemented yet")); if (pfLock->fl_flags & @@ -1375,7 +1371,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) xid = GetXid(); - cFYI(1, ("Sync file - name: %s datasync: 0x%x ", + cFYI(1, ("Sync file - name: %s datasync: 0x%x", dentry->d_name.name, datasync)); rc = filemap_fdatawrite(inode->i_mapping); @@ -1404,7 +1400,7 @@ int cifs_fsync(struct file *file, struct dentry *dentry, int datasync) /* fill in rpages then result = cifs_pagein_inode(inode, index, rpages); */ /* BB finish */ -/* cFYI(1, ("rpages is %d for sync page of Index %ld ", rpages, index)); +/* cFYI(1, ("rpages is %d for sync page of Index %ld", rpages, index)); #if 0 if (rc < 0) @@ -1836,7 +1832,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page, if (rc < 0) goto io_error; else - cFYI(1, ("Bytes read %d ",rc)); + cFYI(1, ("Bytes read %d",rc)); file->f_dentry->d_inode->i_atime = current_fs_time(file->f_dentry->d_inode->i_sb); @@ -1957,3 +1953,19 @@ struct address_space_operations cifs_addr_ops = { /* .sync_page = cifs_sync_page, */ /* .direct_IO = */ }; + +/* + * cifs_readpages requires the server to support a buffer large enough to + * contain the header plus one complete page of data. Otherwise, we need + * to leave cifs_readpages out of the address space operations. + */ +struct address_space_operations cifs_addr_ops_smallbuf = { + .readpage = cifs_readpage, + .writepage = cifs_writepage, + .writepages = cifs_writepages, + .prepare_write = cifs_prepare_write, + .commit_write = cifs_commit_write, + .set_page_dirty = __set_page_dirty_nobuffers, + /* .sync_page = cifs_sync_page, */ + /* .direct_IO = */ +}; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 4093764ef46..b88147c1dc2 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -41,7 +41,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, char *tmp_path; pTcon = cifs_sb->tcon; - cFYI(1, ("Getting info on %s ", search_path)); + cFYI(1, ("Getting info on %s", search_path)); /* could have done a find first instead but this returns more info */ rc = CIFSSMBUnixQPathInfo(xid, pTcon, search_path, &findData, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & @@ -97,9 +97,9 @@ int cifs_get_inode_info_unix(struct inode **pinode, inode = *pinode; cifsInfo = CIFS_I(inode); - cFYI(1, ("Old time %ld ", cifsInfo->time)); + cFYI(1, ("Old time %ld", cifsInfo->time)); cifsInfo->time = jiffies; - cFYI(1, ("New time %ld ", cifsInfo->time)); + cFYI(1, ("New time %ld", cifsInfo->time)); /* this is ok to set on every inode revalidate */ atomic_set(&cifsInfo->inUse,1); @@ -180,11 +180,12 @@ int cifs_get_inode_info_unix(struct inode **pinode, else /* not direct, send byte range locks */ inode->i_fop = &cifs_file_ops; - inode->i_data.a_ops = &cifs_addr_ops; /* check if server can support readpages */ if(pTcon->ses->server->maxBuf < - 4096 + MAX_CIFS_HDR_SIZE) - inode->i_data.a_ops->readpages = NULL; + PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE) + inode->i_data.a_ops = &cifs_addr_ops_smallbuf; + else + inode->i_data.a_ops = &cifs_addr_ops; } else if (S_ISDIR(inode->i_mode)) { cFYI(1, ("Directory inode")); inode->i_op = &cifs_dir_inode_ops; @@ -421,23 +422,23 @@ int cifs_get_inode_info(struct inode **pinode, inode = *pinode; cifsInfo = CIFS_I(inode); cifsInfo->cifsAttrs = attr; - cFYI(1, ("Old time %ld ", cifsInfo->time)); + cFYI(1, ("Old time %ld", cifsInfo->time)); cifsInfo->time = jiffies; - cFYI(1, ("New time %ld ", cifsInfo->time)); + cFYI(1, ("New time %ld", cifsInfo->time)); /* blksize needs to be multiple of two. So safer to default to blksize and blkbits set in superblock so 2**blkbits and blksize will match rather than setting to: (pTcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE) & 0xFFFFFE00;*/ - /* Linux can not store file creation time unfortunately so we ignore it */ + /* Linux can not store file creation time so ignore it */ inode->i_atime = cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime)); inode->i_mtime = cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime)); inode->i_ctime = cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime)); - cFYI(0, ("Attributes came in as 0x%x ", attr)); + cFYI(0, ("Attributes came in as 0x%x", attr)); /* set default mode. will override for dirs below */ if (atomic_read(&cifsInfo->inUse) == 0) @@ -519,10 +520,11 @@ int cifs_get_inode_info(struct inode **pinode, else /* not direct, send byte range locks */ inode->i_fop = &cifs_file_ops; - inode->i_data.a_ops = &cifs_addr_ops; if(pTcon->ses->server->maxBuf < - 4096 + MAX_CIFS_HDR_SIZE) - inode->i_data.a_ops->readpages = NULL; + PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE) + inode->i_data.a_ops = &cifs_addr_ops_smallbuf; + else + inode->i_data.a_ops = &cifs_addr_ops; } else if (S_ISDIR(inode->i_mode)) { cFYI(1, ("Directory inode")); inode->i_op = &cifs_dir_inode_ops; @@ -731,7 +733,7 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) rc = CIFSSMBMkDir(xid, pTcon, full_path, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc) { - cFYI(1, ("cifs_mkdir returned 0x%x ", rc)); + cFYI(1, ("cifs_mkdir returned 0x%x", rc)); d_drop(direntry); } else { inode->i_nlink++; @@ -798,7 +800,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) char *full_path = NULL; struct cifsInodeInfo *cifsInode; - cFYI(1, ("cifs_rmdir, inode = 0x%p with ", inode)); + cFYI(1, ("cifs_rmdir, inode = 0x%p", inode)); xid = GetXid(); @@ -1121,7 +1123,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) xid = GetXid(); - cFYI(1, ("In cifs_setattr, name = %s attrs->iavalid 0x%x ", + cFYI(1, ("setattr on file %s attrs->iavalid 0x%x", direntry->d_name.name, attrs->ia_valid)); cifs_sb = CIFS_SB(direntry->d_inode->i_sb); @@ -1157,6 +1159,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) when the local oplock break takes longer to flush writebehind data than the SMB timeout for the SetPathInfo request would allow */ + open_file = find_writable_file(cifsInode); if (open_file) { __u16 nfid = open_file->netfid; @@ -1289,7 +1292,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) it may be useful to Windows - but we do not want to set ctime unless some other timestamp is changing */ - cFYI(1, ("CIFS - CTIME changed ")); + cFYI(1, ("CIFS - CTIME changed")); time_buf.ChangeTime = cpu_to_le64(cifs_UnixTimeToNT(attrs->ia_ctime)); } else @@ -1356,7 +1359,7 @@ cifs_setattr_exit: void cifs_delete_inode(struct inode *inode) { - cFYI(1, ("In cifs_delete_inode, inode = 0x%p ", inode)); + cFYI(1, ("In cifs_delete_inode, inode = 0x%p", inode)); /* may have to add back in if and when safe distributed caching of directories added e.g. via FindNotify */ } diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 2ec99f83314..a57f5d6e621 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -167,7 +167,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) return -ENOMEM; } - cFYI(1, ("Full path: %s ", full_path)); + cFYI(1, ("Full path: %s", full_path)); cFYI(1, ("symname is %s", symname)); /* BB what if DFS and this volume is on different share? BB */ @@ -186,8 +186,7 @@ cifs_symlink(struct inode *inode, struct dentry *direntry, const char *symname) inode->i_sb,xid); if (rc != 0) { - cFYI(1, - ("Create symlink worked but get_inode_info failed with rc = %d ", + cFYI(1, ("Create symlink ok, getinodeinfo fail rc = %d", rc)); } else { if (pTcon->nocase) @@ -289,7 +288,7 @@ cifs_readlink(struct dentry *direntry, char __user *pBuffer, int buflen) else { cFYI(1,("num referral: %d",num_referrals)); if(referrals) { - cFYI(1,("referral string: %s ",referrals)); + cFYI(1,("referral string: %s",referrals)); strncpy(tmpbuffer, referrals, len-1); } } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index fafd056426e..22c937e5884 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -101,6 +101,7 @@ sesInfoFree(struct cifsSesInfo *buf_to_free) kfree(buf_to_free->serverDomain); kfree(buf_to_free->serverNOS); kfree(buf_to_free->password); + kfree(buf_to_free->domainName); kfree(buf_to_free); } @@ -499,11 +500,12 @@ is_valid_oplock_break(struct smb_hdr *buf, struct TCP_Server_Info *srv) if(pSMBr->ByteCount > sizeof(struct file_notify_information)) { data_offset = le32_to_cpu(pSMBr->DataOffset); - pnotify = (struct file_notify_information *)((char *)&pSMBr->hdr.Protocol - + data_offset); - cFYI(1,("dnotify on %s with action: 0x%x",pnotify->FileName, + pnotify = (struct file_notify_information *) + ((char *)&pSMBr->hdr.Protocol + data_offset); + cFYI(1,("dnotify on %s Action: 0x%x",pnotify->FileName, pnotify->Action)); /* BB removeme BB */ - /* cifs_dump_mem("Received notify Data is: ",buf,sizeof(struct smb_hdr)+60); */ + /* cifs_dump_mem("Rcvd notify Data: ",buf, + sizeof(struct smb_hdr)+60); */ return TRUE; } if(pSMBr->hdr.Status.CifsError) { diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 5de74d216fd..b66eff5dc62 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -84,11 +84,11 @@ static const struct smb_to_posix_error mapping_table_ERRDOS[] = { static const struct smb_to_posix_error mapping_table_ERRSRV[] = { {ERRerror, -EIO}, - {ERRbadpw, -EPERM}, + {ERRbadpw, -EACCES}, /* was EPERM */ {ERRbadtype, -EREMOTE}, {ERRaccess, -EACCES}, {ERRinvtid, -ENXIO}, - {ERRinvnetname, -ENODEV}, + {ERRinvnetname, -ENXIO}, {ERRinvdevice, -ENXIO}, {ERRqfull, -ENOSPC}, {ERRqtoobig, -ENOSPC}, diff --git a/fs/cifs/ntlmssp.c b/fs/cifs/ntlmssp.c deleted file mode 100644 index 115359cc7a3..00000000000 --- a/fs/cifs/ntlmssp.c +++ /dev/null @@ -1,143 +0,0 @@ -/* - * fs/cifs/ntlmssp.h - * - * Copyright (c) International Business Machines Corp., 2006 - * Author(s): Steve French (sfrench@us.ibm.com) - * - * This library is free software; you can redistribute it and/or modify - * it under the terms of the GNU Lesser General Public License as published - * by the Free Software Foundation; either version 2.1 of the License, or - * (at your option) any later version. - * - * This library is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See - * the GNU Lesser General Public License for more details. - * - * You should have received a copy of the GNU Lesser General Public License - * along with this library; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - */ - -#include "cifspdu.h" -#include "cifsglob.h" -#include "cifsproto.h" -#include "cifs_unicode.h" -#include "cifs_debug.h" -#include "ntlmssp.h" -#include "nterr.h" - -#ifdef CONFIG_CIFS_EXPERIMENTAL -static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB) -{ - __u32 capabilities = 0; - - /* init fields common to all four types of SessSetup */ - /* note that header is initialized to zero in header_assemble */ - pSMB->req.AndXCommand = 0xFF; - pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); - pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); - - /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ - - /* BB verify whether signing required on neg or just on auth frame - (and NTLM case) */ - - capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS | - CAP_LARGE_WRITE_X | CAP_LARGE_READ_X; - - if(ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) - pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; - - if (ses->capabilities & CAP_UNICODE) { - pSMB->req.hdr.Flags2 |= SMBFLG2_UNICODE; - capabilities |= CAP_UNICODE; - } - if (ses->capabilities & CAP_STATUS32) { - pSMB->req.hdr.Flags2 |= SMBFLG2_ERR_STATUS; - capabilities |= CAP_STATUS32; - } - if (ses->capabilities & CAP_DFS) { - pSMB->req.hdr.Flags2 |= SMBFLG2_DFS; - capabilities |= CAP_DFS; - } - - /* BB check whether to init vcnum BB */ - return capabilities; -} -int -CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, const int type, - int * pNTLMv2_flg, const struct nls_table *nls_cp) -{ - int rc = 0; - int wct; - struct smb_hdr *smb_buffer; - char *bcc_ptr; - SESSION_SETUP_ANDX *pSMB; - __u32 capabilities; - - if(ses == NULL) - return -EINVAL; - - cFYI(1,("SStp type: %d",type)); - if(type < CIFS_NTLM) { -#ifndef CONFIG_CIFS_WEAK_PW_HASH - /* LANMAN and plaintext are less secure and off by default. - So we make this explicitly be turned on in kconfig (in the - build) and turned on at runtime (changed from the default) - in proc/fs/cifs or via mount parm. Unfortunately this is - needed for old Win (e.g. Win95), some obscure NAS and OS/2 */ - return -EOPNOTSUPP; -#endif - wct = 10; /* lanman 2 style sessionsetup */ - } else if(type < CIFS_NTLMSSP_NEG) - wct = 13; /* old style NTLM sessionsetup */ - else /* same size for negotiate or auth, NTLMSSP or extended security */ - wct = 12; - - rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, - (void **)&smb_buffer); - if(rc) - return rc; - - pSMB = (SESSION_SETUP_ANDX *)smb_buffer; - - capabilities = cifs_ssetup_hdr(ses, pSMB); - bcc_ptr = pByteArea(smb_buffer); - if(type > CIFS_NTLM) { - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; - capabilities |= CAP_EXTENDED_SECURITY; - pSMB->req.Capabilities = cpu_to_le32(capabilities); - /* BB set password lengths */ - } else if(type < CIFS_NTLM) /* lanman */ { - /* no capabilities flags in old lanman negotiation */ - /* pSMB->old_req.PasswordLength = */ /* BB fixme BB */ - } else /* type CIFS_NTLM */ { - pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); - pSMB->req_no_secext.CaseInsensitivePasswordLength = - cpu_to_le16(CIFS_SESSION_KEY_SIZE); - pSMB->req_no_secext.CaseSensitivePasswordLength = - cpu_to_le16(CIFS_SESSION_KEY_SIZE); - } - - - /* copy session key */ - - /* if Unicode, align strings to two byte boundary */ - - /* copy user name */ /* BB Do we need to special case null user name? */ - - /* copy domain name */ - - /* copy Linux version */ - - /* copy network operating system name */ - - /* update bcc and smb buffer length */ - -/* rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buf_type, 0); */ - /* SMB request buf freed in SendReceive2 */ - - return rc; -} -#endif /* CONFIG_CIFS_EXPERIMENTAL */ diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index b689c503512..03bbcb37791 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -21,6 +21,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ #include <linux/fs.h> +#include <linux/pagemap.h> #include <linux/stat.h> #include <linux/smp_lock.h> #include "cifspdu.h" @@ -31,8 +32,8 @@ #include "cifs_fs_sb.h" #include "cifsfs.h" -/* BB fixme - add debug wrappers around this function to disable it fixme BB */ -/* static void dump_cifs_file_struct(struct file *file, char *label) +#ifdef CONFIG_CIFS_DEBUG2 +static void dump_cifs_file_struct(struct file *file, char *label) { struct cifsFileInfo * cf; @@ -53,7 +54,8 @@ } } -} */ +} +#endif /* DEBUG2 */ /* Returns one if new inode created (which therefore needs to be hashed) */ /* Might check in the future if inode number changed so we can rehash inode */ @@ -107,32 +109,52 @@ static int construct_dentry(struct qstr *qstring, struct file *file, return rc; } -static void fill_in_inode(struct inode *tmp_inode, - FILE_DIRECTORY_INFO *pfindData, int *pobject_type, int isNewInode) +static void fill_in_inode(struct inode *tmp_inode, int new_buf_type, + char * buf, int *pobject_type, int isNewInode) { loff_t local_size; struct timespec local_mtime; struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode); struct cifs_sb_info *cifs_sb = CIFS_SB(tmp_inode->i_sb); - __u32 attr = le32_to_cpu(pfindData->ExtFileAttributes); - __u64 allocation_size = le64_to_cpu(pfindData->AllocationSize); - __u64 end_of_file = le64_to_cpu(pfindData->EndOfFile); - - cifsInfo->cifsAttrs = attr; - cifsInfo->time = jiffies; + __u32 attr; + __u64 allocation_size; + __u64 end_of_file; /* save mtime and size */ local_mtime = tmp_inode->i_mtime; local_size = tmp_inode->i_size; + if(new_buf_type) { + FILE_DIRECTORY_INFO *pfindData = (FILE_DIRECTORY_INFO *)buf; + + attr = le32_to_cpu(pfindData->ExtFileAttributes); + allocation_size = le64_to_cpu(pfindData->AllocationSize); + end_of_file = le64_to_cpu(pfindData->EndOfFile); + tmp_inode->i_atime = + cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime)); + tmp_inode->i_mtime = + cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime)); + tmp_inode->i_ctime = + cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime)); + } else { /* legacy, OS2 and DOS style */ + FIND_FILE_STANDARD_INFO * pfindData = + (FIND_FILE_STANDARD_INFO *)buf; + + attr = le16_to_cpu(pfindData->Attributes); + allocation_size = le32_to_cpu(pfindData->AllocationSize); + end_of_file = le32_to_cpu(pfindData->DataSize); + tmp_inode->i_atime = CURRENT_TIME; + /* tmp_inode->i_mtime = BB FIXME - add dos time handling + tmp_inode->i_ctime = 0; BB FIXME */ + + } + /* Linux can not store file creation time unfortunately so ignore it */ - tmp_inode->i_atime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime)); - tmp_inode->i_mtime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime)); - tmp_inode->i_ctime = - cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime)); + + cifsInfo->cifsAttrs = attr; + cifsInfo->time = jiffies; + /* treat dos attribute of read-only as read-only mode bit e.g. 555? */ /* 2767 perms - indicate mandatory locking */ /* BB fill in uid and gid here? with help from winbind? @@ -215,11 +237,13 @@ static void fill_in_inode(struct inode *tmp_inode, else tmp_inode->i_fop = &cifs_file_ops; - tmp_inode->i_data.a_ops = &cifs_addr_ops; if((cifs_sb->tcon) && (cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server->maxBuf < - 4096 + MAX_CIFS_HDR_SIZE)) - tmp_inode->i_data.a_ops->readpages = NULL; + PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)) + tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf; + else + tmp_inode->i_data.a_ops = &cifs_addr_ops; + if(isNewInode) return; /* No sense invalidating pages for new inode since have not started caching readahead file @@ -338,11 +362,12 @@ static void unix_fill_in_inode(struct inode *tmp_inode, else tmp_inode->i_fop = &cifs_file_ops; - tmp_inode->i_data.a_ops = &cifs_addr_ops; if((cifs_sb->tcon) && (cifs_sb->tcon->ses) && (cifs_sb->tcon->ses->server->maxBuf < - 4096 + MAX_CIFS_HDR_SIZE)) - tmp_inode->i_data.a_ops->readpages = NULL; + PAGE_CACHE_SIZE + MAX_CIFS_HDR_SIZE)) + tmp_inode->i_data.a_ops = &cifs_addr_ops_smallbuf; + else + tmp_inode->i_data.a_ops = &cifs_addr_ops; if(isNewInode) return; /* No sense invalidating pages for new inode since we @@ -415,7 +440,10 @@ static int initiate_cifs_search(const int xid, struct file *file) ffirst_retry: /* test for Unix extensions */ if (pTcon->ses->capabilities & CAP_UNIX) { - cifsFile->srch_inf.info_level = SMB_FIND_FILE_UNIX; + cifsFile->srch_inf.info_level = SMB_FIND_FILE_UNIX; + } else if ((pTcon->ses->capabilities & + (CAP_NT_SMBS | CAP_NT_FIND)) == 0) { + cifsFile->srch_inf.info_level = SMB_FIND_FILE_INFO_STANDARD; } else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { cifsFile->srch_inf.info_level = SMB_FIND_FILE_ID_FULL_DIR_INFO; } else /* not srvinos - BB fixme add check for backlevel? */ { @@ -451,12 +479,19 @@ static int cifs_unicode_bytelen(char *str) return len << 1; } -static char *nxt_dir_entry(char *old_entry, char *end_of_smb) +static char *nxt_dir_entry(char *old_entry, char *end_of_smb, int level) { char * new_entry; FILE_DIRECTORY_INFO * pDirInfo = (FILE_DIRECTORY_INFO *)old_entry; - new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset); + if(level == SMB_FIND_FILE_INFO_STANDARD) { + FIND_FILE_STANDARD_INFO * pfData; + pfData = (FIND_FILE_STANDARD_INFO *)pDirInfo; + + new_entry = old_entry + sizeof(FIND_FILE_STANDARD_INFO) + + pfData->FileNameLength; + } else + new_entry = old_entry + le32_to_cpu(pDirInfo->NextEntryOffset); cFYI(1,("new entry %p old entry %p",new_entry,old_entry)); /* validate that new_entry is not past end of SMB */ if(new_entry >= end_of_smb) { @@ -464,7 +499,10 @@ static char *nxt_dir_entry(char *old_entry, char *end_of_smb) ("search entry %p began after end of SMB %p old entry %p", new_entry, end_of_smb, old_entry)); return NULL; - } else if (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb) { + } else if(((level == SMB_FIND_FILE_INFO_STANDARD) && + (new_entry + sizeof(FIND_FILE_STANDARD_INFO) > end_of_smb)) || + ((level != SMB_FIND_FILE_INFO_STANDARD) && + (new_entry + sizeof(FILE_DIRECTORY_INFO) > end_of_smb))) { cERROR(1,("search entry %p extends after end of SMB %p", new_entry, end_of_smb)); return NULL; @@ -482,7 +520,7 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile) char * filename = NULL; int len = 0; - if(cfile->srch_inf.info_level == 0x202) { + if(cfile->srch_inf.info_level == SMB_FIND_FILE_UNIX) { FILE_UNIX_INFO * pFindData = (FILE_UNIX_INFO *)current_entry; filename = &pFindData->FileName[0]; if(cfile->srch_inf.unicode) { @@ -491,26 +529,34 @@ static int cifs_entry_is_dot(char *current_entry, struct cifsFileInfo *cfile) /* BB should we make this strnlen of PATH_MAX? */ len = strnlen(filename, 5); } - } else if(cfile->srch_inf.info_level == 0x101) { + } else if(cfile->srch_inf.info_level == SMB_FIND_FILE_DIRECTORY_INFO) { FILE_DIRECTORY_INFO * pFindData = (FILE_DIRECTORY_INFO *)current_entry; filename = &pFindData->FileName[0]; len = le32_to_cpu(pFindData->FileNameLength); - } else if(cfile->srch_inf.info_level == 0x102) { + } else if(cfile->srch_inf.info_level == + SMB_FIND_FILE_FULL_DIRECTORY_INFO) { FILE_FULL_DIRECTORY_INFO * pFindData = (FILE_FULL_DIRECTORY_INFO *)current_entry; filename = &pFindData->FileName[0]; len = le32_to_cpu(pFindData->FileNameLength); - } else if(cfile->srch_inf.info_level == 0x105) { + } else if(cfile->srch_inf.info_level == + SMB_FIND_FILE_ID_FULL_DIR_INFO) { SEARCH_ID_FULL_DIR_INFO * pFindData = (SEARCH_ID_FULL_DIR_INFO *)current_entry; filename = &pFindData->FileName[0]; len = le32_to_cpu(pFindData->FileNameLength); - } else if(cfile->srch_inf.info_level == 0x104) { + } else if(cfile->srch_inf.info_level == + SMB_FIND_FILE_BOTH_DIRECTORY_INFO) { FILE_BOTH_DIRECTORY_INFO * pFindData = (FILE_BOTH_DIRECTORY_INFO *)current_entry; filename = &pFindData->FileName[0]; len = le32_to_cpu(pFindData->FileNameLength); + } else if(cfile->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) { + FIND_FILE_STANDARD_INFO * pFindData = + (FIND_FILE_STANDARD_INFO *)current_entry; + filename = &pFindData->FileName[0]; + len = le32_to_cpu(pFindData->FileNameLength); } else { cFYI(1,("Unknown findfirst level %d",cfile->srch_inf.info_level)); } @@ -597,7 +643,9 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, . and .. for the root of a drive and for those we need to start two entries earlier */ -/* dump_cifs_file_struct(file, "In fce ");*/ +#ifdef CONFIG_CIFS_DEBUG2 + dump_cifs_file_struct(file, "In fce "); +#endif if(((index_to_find < cifsFile->srch_inf.index_of_last_entry) && is_dir_changed(file)) || (index_to_find < first_entry_in_buffer)) { @@ -644,10 +692,12 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, first_entry_in_buffer = cifsFile->srch_inf.index_of_last_entry - cifsFile->srch_inf.entries_in_buffer; pos_in_buf = index_to_find - first_entry_in_buffer; - cFYI(1,("found entry - pos_in_buf %d",pos_in_buf)); + cFYI(1,("found entry - pos_in_buf %d",pos_in_buf)); + for(i=0;(i<(pos_in_buf)) && (current_entry != NULL);i++) { /* go entry by entry figuring out which is first */ - current_entry = nxt_dir_entry(current_entry,end_of_smb); + current_entry = nxt_dir_entry(current_entry,end_of_smb, + cifsFile->srch_inf.info_level); } if((current_entry == NULL) && (i < pos_in_buf)) { /* BB fixme - check if we should flag this error */ @@ -674,7 +724,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, /* inode num, inode type and filename returned */ static int cifs_get_name_from_search_buf(struct qstr *pqst, char *current_entry, __u16 level, unsigned int unicode, - struct cifs_sb_info * cifs_sb, ino_t *pinum) + struct cifs_sb_info * cifs_sb, int max_len, ino_t *pinum) { int rc = 0; unsigned int len = 0; @@ -718,10 +768,22 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst, (FILE_BOTH_DIRECTORY_INFO *)current_entry; filename = &pFindData->FileName[0]; len = le32_to_cpu(pFindData->FileNameLength); + } else if(level == SMB_FIND_FILE_INFO_STANDARD) { + FIND_FILE_STANDARD_INFO * pFindData = + (FIND_FILE_STANDARD_INFO *)current_entry; + filename = &pFindData->FileName[0]; + /* one byte length, no name conversion */ + len = (unsigned int)pFindData->FileNameLength; } else { cFYI(1,("Unknown findfirst level %d",level)); return -EINVAL; } + + if(len > max_len) { + cERROR(1,("bad search response length %d past smb end", len)); + return -EINVAL; + } + if(unicode) { /* BB fixme - test with long names */ /* Note converted filename can be longer than in unicode */ @@ -741,7 +803,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst, } static int cifs_filldir(char *pfindEntry, struct file *file, - filldir_t filldir, void *direntry, char *scratch_buf) + filldir_t filldir, void *direntry, char *scratch_buf, int max_len) { int rc = 0; struct qstr qstring; @@ -777,6 +839,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file, rc = cifs_get_name_from_search_buf(&qstring,pfindEntry, pCifsF->srch_inf.info_level, pCifsF->srch_inf.unicode,cifs_sb, + max_len, &inum /* returned */); if(rc) @@ -798,13 +861,16 @@ static int cifs_filldir(char *pfindEntry, struct file *file, /* we pass in rc below, indicating whether it is a new inode, so we can figure out whether to invalidate the inode cached data if the file has changed */ - if(pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX) { + if(pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX) unix_fill_in_inode(tmp_inode, - (FILE_UNIX_INFO *)pfindEntry,&obj_type, rc); - } else { - fill_in_inode(tmp_inode, - (FILE_DIRECTORY_INFO *)pfindEntry,&obj_type, rc); - } + (FILE_UNIX_INFO *)pfindEntry, + &obj_type, rc); + else if(pCifsF->srch_inf.info_level == SMB_FIND_FILE_INFO_STANDARD) + fill_in_inode(tmp_inode, 0 /* old level 1 buffer type */, + pfindEntry, &obj_type, rc); + else + fill_in_inode(tmp_inode, 1 /* NT */, pfindEntry, &obj_type, rc); + rc = filldir(direntry,qstring.name,qstring.len,file->f_pos, tmp_inode->i_ino,obj_type); @@ -864,6 +930,12 @@ static int cifs_save_resume_key(const char *current_entry, filename = &pFindData->FileName[0]; len = le32_to_cpu(pFindData->FileNameLength); cifsFile->srch_inf.resume_key = pFindData->FileIndex; + } else if(level == SMB_FIND_FILE_INFO_STANDARD) { + FIND_FILE_STANDARD_INFO * pFindData = + (FIND_FILE_STANDARD_INFO *)current_entry; + filename = &pFindData->FileName[0]; + /* one byte length, no name conversion */ + len = (unsigned int)pFindData->FileNameLength; } else { cFYI(1,("Unknown findfirst level %d",level)); return -EINVAL; @@ -884,6 +956,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) int num_to_fill = 0; char * tmp_buf = NULL; char * end_of_smb; + int max_len; xid = GetXid(); @@ -909,7 +982,7 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) case 1: if (filldir(direntry, "..", 2, file->f_pos, file->f_dentry->d_parent->d_inode->i_ino, DT_DIR) < 0) { - cERROR(1, ("Filldir for parent dir failed ")); + cERROR(1, ("Filldir for parent dir failed")); rc = -ENOMEM; break; } @@ -959,10 +1032,11 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) goto rddir2_exit; } cFYI(1,("loop through %d times filling dir for net buf %p", - num_to_fill,cifsFile->srch_inf.ntwrk_buf_start)); - end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + - smbCalcSize((struct smb_hdr *) - cifsFile->srch_inf.ntwrk_buf_start); + num_to_fill,cifsFile->srch_inf.ntwrk_buf_start)); + max_len = smbCalcSize((struct smb_hdr *) + cifsFile->srch_inf.ntwrk_buf_start); + end_of_smb = cifsFile->srch_inf.ntwrk_buf_start + max_len; + /* To be safe - for UCS to UTF-8 with strings loaded with the rare long characters alloc more to account for such multibyte target UTF-8 characters. cifs_unicode.c, @@ -977,17 +1051,19 @@ int cifs_readdir(struct file *file, void *direntry, filldir_t filldir) } /* if buggy server returns . and .. late do we want to check for that here? */ - rc = cifs_filldir(current_entry, file, - filldir, direntry,tmp_buf); + rc = cifs_filldir(current_entry, file, + filldir, direntry, tmp_buf, max_len); file->f_pos++; - if(file->f_pos == cifsFile->srch_inf.index_of_last_entry) { + if(file->f_pos == + cifsFile->srch_inf.index_of_last_entry) { cFYI(1,("last entry in buf at pos %lld %s", - file->f_pos,tmp_buf)); /* BB removeme BB */ + file->f_pos,tmp_buf)); cifs_save_resume_key(current_entry,cifsFile); break; } else - current_entry = nxt_dir_entry(current_entry, - end_of_smb); + current_entry = + nxt_dir_entry(current_entry, end_of_smb, + cifsFile->srch_inf.info_level); } kfree(tmp_buf); break; diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c new file mode 100644 index 00000000000..7202d534ef0 --- /dev/null +++ b/fs/cifs/sess.c @@ -0,0 +1,538 @@ +/* + * fs/cifs/sess.c + * + * SMB/CIFS session setup handling routines + * + * Copyright (c) International Business Machines Corp., 2006 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" +#include "ntlmssp.h" +#include "nterr.h" +#include <linux/utsname.h> + +extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, + unsigned char *p24); + +static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB) +{ + __u32 capabilities = 0; + + /* init fields common to all four types of SessSetup */ + /* note that header is initialized to zero in header_assemble */ + pSMB->req.AndXCommand = 0xFF; + pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); + pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); + + /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ + + /* BB verify whether signing required on neg or just on auth frame + (and NTLM case) */ + + capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS | + CAP_LARGE_WRITE_X | CAP_LARGE_READ_X; + + if(ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + + if (ses->capabilities & CAP_UNICODE) { + pSMB->req.hdr.Flags2 |= SMBFLG2_UNICODE; + capabilities |= CAP_UNICODE; + } + if (ses->capabilities & CAP_STATUS32) { + pSMB->req.hdr.Flags2 |= SMBFLG2_ERR_STATUS; + capabilities |= CAP_STATUS32; + } + if (ses->capabilities & CAP_DFS) { + pSMB->req.hdr.Flags2 |= SMBFLG2_DFS; + capabilities |= CAP_DFS; + } + if (ses->capabilities & CAP_UNIX) { + capabilities |= CAP_UNIX; + } + + /* BB check whether to init vcnum BB */ + return capabilities; +} + +static void unicode_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses, + const struct nls_table * nls_cp) +{ + char * bcc_ptr = *pbcc_area; + int bytes_ret = 0; + + /* BB FIXME add check that strings total less + than 335 or will need to send them as arrays */ + + /* unicode strings, must be word aligned before the call */ +/* if ((long) bcc_ptr % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } */ + /* copy user */ + if(ses->userName == NULL) { + /* BB what about null user mounts - check that we do this BB */ + } else { /* 300 should be long enough for any conceivable user name */ + bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->userName, + 300, nls_cp); + } + bcc_ptr += 2 * bytes_ret; + bcc_ptr += 2; /* account for null termination */ + /* copy domain */ + if(ses->domainName == NULL) + bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, + "CIFS_LINUX_DOM", 32, nls_cp); + else + bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, ses->domainName, + 256, nls_cp); + bcc_ptr += 2 * bytes_ret; + bcc_ptr += 2; /* account for null terminator */ + + /* Copy OS version */ + bytes_ret = cifs_strtoUCS((__le16 *)bcc_ptr, "Linux version ", 32, + nls_cp); + bcc_ptr += 2 * bytes_ret; + bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, system_utsname.release, + 32, nls_cp); + bcc_ptr += 2 * bytes_ret; + bcc_ptr += 2; /* trailing null */ + + bytes_ret = cifs_strtoUCS((__le16 *) bcc_ptr, CIFS_NETWORK_OPSYS, + 32, nls_cp); + bcc_ptr += 2 * bytes_ret; + bcc_ptr += 2; /* trailing null */ + + *pbcc_area = bcc_ptr; +} + +static void ascii_ssetup_strings(char ** pbcc_area, struct cifsSesInfo *ses, + const struct nls_table * nls_cp) +{ + char * bcc_ptr = *pbcc_area; + + /* copy user */ + /* BB what about null user mounts - check that we do this BB */ + /* copy user */ + if(ses->userName == NULL) { + /* BB what about null user mounts - check that we do this BB */ + } else { /* 300 should be long enough for any conceivable user name */ + strncpy(bcc_ptr, ses->userName, 300); + } + /* BB improve check for overflow */ + bcc_ptr += strnlen(ses->userName, 300); + *bcc_ptr = 0; + bcc_ptr++; /* account for null termination */ + + /* copy domain */ + + if(ses->domainName == NULL) { + strcpy(bcc_ptr, "CIFS_LINUX_DOM"); + bcc_ptr += 14; /* strlen(CIFS_LINUX_DOM) */ + } else { + strncpy(bcc_ptr, ses->domainName, 256); + bcc_ptr += strnlen(ses->domainName, 256); + } + *bcc_ptr = 0; + bcc_ptr++; + + /* BB check for overflow here */ + + strcpy(bcc_ptr, "Linux version "); + bcc_ptr += strlen("Linux version "); + strcpy(bcc_ptr, system_utsname.release); + bcc_ptr += strlen(system_utsname.release) + 1; + + strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); + bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; + + *pbcc_area = bcc_ptr; +} + +static int decode_unicode_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo *ses, + const struct nls_table * nls_cp) +{ + int rc = 0; + int words_left, len; + char * data = *pbcc_area; + + + + cFYI(1,("bleft %d",bleft)); + + + /* word align, if bytes remaining is not even */ + if(bleft % 2) { + bleft--; + data++; + } + words_left = bleft / 2; + + /* save off server operating system */ + len = UniStrnlen((wchar_t *) data, words_left); + +/* We look for obvious messed up bcc or strings in response so we do not go off + the end since (at least) WIN2K and Windows XP have a major bug in not null + terminating last Unicode string in response */ + if(len >= words_left) + return rc; + + if(ses->serverOS) + kfree(ses->serverOS); + /* UTF-8 string will not grow more than four times as big as UCS-16 */ + ses->serverOS = kzalloc(4 * len, GFP_KERNEL); + if(ses->serverOS != NULL) { + cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, + nls_cp); + } + data += 2 * (len + 1); + words_left -= len + 1; + + /* save off server network operating system */ + len = UniStrnlen((wchar_t *) data, words_left); + + if(len >= words_left) + return rc; + + if(ses->serverNOS) + kfree(ses->serverNOS); + ses->serverNOS = kzalloc(4 * len, GFP_KERNEL); /* BB this is wrong length FIXME BB */ + if(ses->serverNOS != NULL) { + cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len, + nls_cp); + if(strncmp(ses->serverNOS, "NT LAN Manager 4",16) == 0) { + cFYI(1,("NT4 server")); + ses->flags |= CIFS_SES_NT4; + } + } + data += 2 * (len + 1); + words_left -= len + 1; + + /* save off server domain */ + len = UniStrnlen((wchar_t *) data, words_left); + + if(len > words_left) + return rc; + + if(ses->serverDomain) + kfree(ses->serverDomain); + ses->serverDomain = kzalloc(2 * (len + 1), GFP_KERNEL); /* BB FIXME wrong length */ + if(ses->serverDomain != NULL) { + cifs_strfromUCS_le(ses->serverDomain, (__le16 *)data, len, + nls_cp); + ses->serverDomain[2*len] = 0; + ses->serverDomain[(2*len) + 1] = 0; + } + data += 2 * (len + 1); + words_left -= len + 1; + + cFYI(1,("words left: %d",words_left)); + + return rc; +} + +static int decode_ascii_ssetup(char ** pbcc_area, int bleft, struct cifsSesInfo *ses, + const struct nls_table * nls_cp) +{ + int rc = 0; + int len; + char * bcc_ptr = *pbcc_area; + + cFYI(1,("decode sessetup ascii. bleft %d", bleft)); + + len = strnlen(bcc_ptr, bleft); + if(len >= bleft) + return rc; + + if(ses->serverOS) + kfree(ses->serverOS); + + ses->serverOS = kzalloc(len + 1, GFP_KERNEL); + if(ses->serverOS) + strncpy(ses->serverOS, bcc_ptr, len); + + bcc_ptr += len + 1; + bleft -= len + 1; + + len = strnlen(bcc_ptr, bleft); + if(len >= bleft) + return rc; + + if(ses->serverNOS) + kfree(ses->serverNOS); + + ses->serverNOS = kzalloc(len + 1, GFP_KERNEL); + if(ses->serverNOS) + strncpy(ses->serverNOS, bcc_ptr, len); + + bcc_ptr += len + 1; + bleft -= len + 1; + + len = strnlen(bcc_ptr, bleft); + if(len > bleft) + return rc; + + if(ses->serverDomain) + kfree(ses->serverDomain); + + ses->serverDomain = kzalloc(len + 1, GFP_KERNEL); + if(ses->serverOS) + strncpy(ses->serverOS, bcc_ptr, len); + + bcc_ptr += len + 1; + bleft -= len + 1; + + cFYI(1,("ascii: bytes left %d",bleft)); + + return rc; +} + +int +CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, int first_time, + const struct nls_table *nls_cp) +{ + int rc = 0; + int wct; + struct smb_hdr *smb_buf; + char *bcc_ptr; + char *str_area; + SESSION_SETUP_ANDX *pSMB; + __u32 capabilities; + int count; + int resp_buf_type = 0; + struct kvec iov[2]; + enum securityEnum type; + __u16 action; + int bytes_remaining; + + if(ses == NULL) + return -EINVAL; + + type = ses->server->secType; + + cFYI(1,("sess setup type %d",type)); + if(type == LANMAN) { +#ifndef CONFIG_CIFS_WEAK_PW_HASH + /* LANMAN and plaintext are less secure and off by default. + So we make this explicitly be turned on in kconfig (in the + build) and turned on at runtime (changed from the default) + in proc/fs/cifs or via mount parm. Unfortunately this is + needed for old Win (e.g. Win95), some obscure NAS and OS/2 */ + return -EOPNOTSUPP; +#endif + wct = 10; /* lanman 2 style sessionsetup */ + } else if((type == NTLM) || (type == NTLMv2)) { + /* For NTLMv2 failures eventually may need to retry NTLM */ + wct = 13; /* old style NTLM sessionsetup */ + } else /* same size for negotiate or auth, NTLMSSP or extended security */ + wct = 12; + + rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, + (void **)&smb_buf); + if(rc) + return rc; + + pSMB = (SESSION_SETUP_ANDX *)smb_buf; + + capabilities = cifs_ssetup_hdr(ses, pSMB); + + /* we will send the SMB in two pieces, + a fixed length beginning part, and a + second part which will include the strings + and rest of bcc area, in order to avoid having + to do a large buffer 17K allocation */ + iov[0].iov_base = (char *)pSMB; + iov[0].iov_len = smb_buf->smb_buf_length + 4; + + /* 2000 big enough to fit max user, domain, NOS name etc. */ + str_area = kmalloc(2000, GFP_KERNEL); + bcc_ptr = str_area; + + if(type == LANMAN) { +#ifdef CONFIG_CIFS_WEAK_PW_HASH + char lnm_session_key[CIFS_SESS_KEY_SIZE]; + + /* no capabilities flags in old lanman negotiation */ + + pSMB->old_req.PasswordLength = CIFS_SESS_KEY_SIZE; + /* BB calculate hash with password */ + /* and copy into bcc */ + + calc_lanman_hash(ses, lnm_session_key); + +/* #ifdef CONFIG_CIFS_DEBUG2 + cifs_dump_mem("cryptkey: ",ses->server->cryptKey, + CIFS_SESS_KEY_SIZE); +#endif */ + memcpy(bcc_ptr, (char *)lnm_session_key, CIFS_SESS_KEY_SIZE); + bcc_ptr += CIFS_SESS_KEY_SIZE; + + /* can not sign if LANMAN negotiated so no need + to calculate signing key? but what if server + changed to do higher than lanman dialect and + we reconnected would we ever calc signing_key? */ + + cFYI(1,("Negotiating LANMAN setting up strings")); + /* Unicode not allowed for LANMAN dialects */ + ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); +#endif + } else if (type == NTLM) { + char ntlm_session_key[CIFS_SESS_KEY_SIZE]; + + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + pSMB->req_no_secext.CaseInsensitivePasswordLength = + cpu_to_le16(CIFS_SESS_KEY_SIZE); + pSMB->req_no_secext.CaseSensitivePasswordLength = + cpu_to_le16(CIFS_SESS_KEY_SIZE); + + /* calculate session key */ + SMBNTencrypt(ses->password, ses->server->cryptKey, + ntlm_session_key); + + if(first_time) /* should this be moved into common code + with similar ntlmv2 path? */ + cifs_calculate_mac_key(ses->server->mac_signing_key, + ntlm_session_key, ses->password); + /* copy session key */ + + memcpy(bcc_ptr, (char *)ntlm_session_key,CIFS_SESS_KEY_SIZE); + bcc_ptr += CIFS_SESS_KEY_SIZE; + memcpy(bcc_ptr, (char *)ntlm_session_key,CIFS_SESS_KEY_SIZE); + bcc_ptr += CIFS_SESS_KEY_SIZE; + if(ses->capabilities & CAP_UNICODE) { + /* unicode strings must be word aligned */ + if (iov[0].iov_len % 2) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); + } else + ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); + } else if (type == NTLMv2) { + char * v2_sess_key = + kmalloc(sizeof(struct ntlmv2_resp), GFP_KERNEL); + + /* BB FIXME change all users of v2_sess_key to + struct ntlmv2_resp */ + + if(v2_sess_key == NULL) { + cifs_small_buf_release(smb_buf); + return -ENOMEM; + } + + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + + /* LM2 password would be here if we supported it */ + pSMB->req_no_secext.CaseInsensitivePasswordLength = 0; + /* cpu_to_le16(LM2_SESS_KEY_SIZE); */ + + pSMB->req_no_secext.CaseSensitivePasswordLength = + cpu_to_le16(sizeof(struct ntlmv2_resp)); + + /* calculate session key */ + setup_ntlmv2_rsp(ses, v2_sess_key, nls_cp); + if(first_time) /* should this be moved into common code + with similar ntlmv2 path? */ + /* cifs_calculate_ntlmv2_mac_key(ses->server->mac_signing_key, + response BB FIXME, v2_sess_key); */ + + /* copy session key */ + + /* memcpy(bcc_ptr, (char *)ntlm_session_key,LM2_SESS_KEY_SIZE); + bcc_ptr += LM2_SESS_KEY_SIZE; */ + memcpy(bcc_ptr, (char *)v2_sess_key, sizeof(struct ntlmv2_resp)); + bcc_ptr += sizeof(struct ntlmv2_resp); + kfree(v2_sess_key); + if(ses->capabilities & CAP_UNICODE) { + if(iov[0].iov_len % 2) { + *bcc_ptr = 0; + } bcc_ptr++; + unicode_ssetup_strings(&bcc_ptr, ses, nls_cp); + } else + ascii_ssetup_strings(&bcc_ptr, ses, nls_cp); + } else /* NTLMSSP or SPNEGO */ { + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities = cpu_to_le32(capabilities); + /* BB set password lengths */ + } + + count = (long) bcc_ptr - (long) str_area; + smb_buf->smb_buf_length += count; + + BCC_LE(smb_buf) = cpu_to_le16(count); + + iov[1].iov_base = str_area; + iov[1].iov_len = count; + rc = SendReceive2(xid, ses, iov, 2 /* num_iovecs */, &resp_buf_type, 0); + /* SMB request buf freed in SendReceive2 */ + + cFYI(1,("ssetup rc from sendrecv2 is %d",rc)); + if(rc) + goto ssetup_exit; + + pSMB = (SESSION_SETUP_ANDX *)iov[0].iov_base; + smb_buf = (struct smb_hdr *)iov[0].iov_base; + + if((smb_buf->WordCount != 3) && (smb_buf->WordCount != 4)) { + rc = -EIO; + cERROR(1,("bad word count %d", smb_buf->WordCount)); + goto ssetup_exit; + } + action = le16_to_cpu(pSMB->resp.Action); + if (action & GUEST_LOGIN) + cFYI(1, ("Guest login")); /* BB mark SesInfo struct? */ + ses->Suid = smb_buf->Uid; /* UID left in wire format (le) */ + cFYI(1, ("UID = %d ", ses->Suid)); + /* response can have either 3 or 4 word count - Samba sends 3 */ + /* and lanman response is 3 */ + bytes_remaining = BCC(smb_buf); + bcc_ptr = pByteArea(smb_buf); + + if(smb_buf->WordCount == 4) { + __u16 blob_len; + blob_len = le16_to_cpu(pSMB->resp.SecurityBlobLength); + bcc_ptr += blob_len; + if(blob_len > bytes_remaining) { + cERROR(1,("bad security blob length %d", blob_len)); + rc = -EINVAL; + goto ssetup_exit; + } + bytes_remaining -= blob_len; + } + + /* BB check if Unicode and decode strings */ + if(smb_buf->Flags2 & SMBFLG2_UNICODE) + rc = decode_unicode_ssetup(&bcc_ptr, bytes_remaining, + ses, nls_cp); + else + rc = decode_ascii_ssetup(&bcc_ptr, bytes_remaining, ses,nls_cp); + +ssetup_exit: + kfree(str_area); + if(resp_buf_type == CIFS_SMALL_BUFFER) { + cFYI(1,("ssetup freeing small buf %p", iov[0].iov_base)); + cifs_small_buf_release(iov[0].iov_base); + } else if(resp_buf_type == CIFS_LARGE_BUFFER) + cifs_buf_release(iov[0].iov_base); + + return rc; +} diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 6103bcdfb16..f518c5e4503 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -30,6 +30,7 @@ #include <linux/random.h> #include "cifs_unicode.h" #include "cifspdu.h" +#include "cifsglob.h" #include "md5.h" #include "cifs_debug.h" #include "cifsencrypt.h" diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 3da80409466..17ba329e2b3 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -654,8 +654,7 @@ SendReceive(const unsigned int xid, struct cifsSesInfo *ses, if (in_buf->smb_buf_length > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) { up(&ses->server->tcpSem); - cERROR(1, - ("Illegal length, greater than maximum frame, %d ", + cERROR(1, ("Illegal length, greater than maximum frame, %d", in_buf->smb_buf_length)); DeleteMidQEntry(midQ); /* If not lock req, update # of requests on wire to server */ diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 6c6771db36d..7caee8d8ea3 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -259,7 +259,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf, /* If request was not a signal, enqueue and don't free */ if (!(req->uc_flags & REQ_ASYNC)) { req->uc_flags |= REQ_READ; - list_add(&(req->uc_chain), vcp->vc_processing.prev); + list_add_tail(&(req->uc_chain), &vcp->vc_processing); goto out; } diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index b040eba13a7..a5b5e631ba6 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -725,7 +725,7 @@ static int coda_upcall(struct coda_sb_info *sbi, ((union inputArgs *)buffer)->ih.unique = req->uc_unique; /* Append msg to pending queue and poke Venus. */ - list_add(&(req->uc_chain), vcommp->vc_pending.prev); + list_add_tail(&(req->uc_chain), &vcommp->vc_pending); wake_up_interruptible(&vcommp->vc_waitq); /* We can be interrupted while we wait for Venus to process diff --git a/fs/compat.c b/fs/compat.c index 7e7e5bc4f3c..e31e9cf9664 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -55,6 +55,20 @@ extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat); +int compat_log = 1; + +int compat_printk(const char *fmt, ...) +{ + va_list ap; + int ret; + if (!compat_log) + return 0; + va_start(ap, fmt); + ret = vprintk(fmt, ap); + va_end(ap); + return ret; +} + /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. @@ -359,7 +373,7 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd, sprintf(buf,"'%c'", (cmd>>24) & 0x3f); if (!isprint(buf[1])) sprintf(buf, "%02x", buf[1]); - printk("ioctl32(%s:%d): Unknown cmd fd(%d) " + compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) " "cmd(%08x){%s} arg(%08x) on %s\n", current->comm, current->pid, (int)fd, (unsigned int)cmd, buf, diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 9eb9824dd33..d8ecfedef18 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -80,6 +80,7 @@ #include <net/bluetooth/rfcomm.h> #include <linux/capi.h> +#include <linux/gigaset_dev.h> #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 5f952187fc5..207f8006fd6 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1009,8 +1009,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir /* fallthrough */ default: if (filp->f_pos == 2) { - list_del(q); - list_add(q, &parent_sd->s_children); + list_move(q, &parent_sd->s_children); } for (p=q->next; p!= &parent_sd->s_children; p=p->next) { struct configfs_dirent *next; @@ -1033,8 +1032,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir dt_type(next)) < 0) return 0; - list_del(q); - list_add(q, p); + list_move(q, p); p = q; filp->f_pos++; } diff --git a/fs/dcache.c b/fs/dcache.c index b85fda36053..48b44a714b3 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -522,8 +522,7 @@ void shrink_dcache_sb(struct super_block * sb) dentry = list_entry(tmp, struct dentry, d_lru); if (dentry->d_sb != sb) continue; - list_del(tmp); - list_add(tmp, &dentry_unused); + list_move(tmp, &dentry_unused); } /* @@ -638,7 +637,7 @@ resume: * of the unused list for prune_dcache */ if (!atomic_read(&dentry->d_count)) { - list_add(&dentry->d_lru, dentry_unused.prev); + list_add_tail(&dentry->d_lru, &dentry_unused); dentry_stat.nr_unused++; found++; } diff --git a/fs/dquot.c b/fs/dquot.c index 81d87a413c6..0122a279106 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -250,7 +250,7 @@ static inline struct dquot *find_dquot(unsigned int hashent, struct super_block /* Add a dquot to the tail of the free list */ static inline void put_dquot_last(struct dquot *dquot) { - list_add(&dquot->dq_free, free_dquots.prev); + list_add_tail(&dquot->dq_free, &free_dquots); dqstats.free_dquots++; } @@ -266,7 +266,7 @@ static inline void put_inuse(struct dquot *dquot) { /* We add to the back of inuse list so we don't have to restart * when traversing this list and we block */ - list_add(&dquot->dq_inuse, inuse_list.prev); + list_add_tail(&dquot->dq_inuse, &inuse_list); dqstats.allocated_dquots++; } diff --git a/fs/exec.c b/fs/exec.c index 0b88bf64614..c8494f513ea 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -666,8 +666,6 @@ static int de_thread(struct task_struct *tsk) * and to assume its PID: */ if (!thread_group_leader(current)) { - struct dentry *proc_dentry1, *proc_dentry2; - /* * Wait for the thread group leader to be a zombie. * It should already be zombie at this point, most @@ -689,10 +687,6 @@ static int de_thread(struct task_struct *tsk) */ current->start_time = leader->start_time; - spin_lock(&leader->proc_lock); - spin_lock(¤t->proc_lock); - proc_dentry1 = proc_pid_unhash(current); - proc_dentry2 = proc_pid_unhash(leader); write_lock_irq(&tasklist_lock); BUG_ON(leader->tgid != current->tgid); @@ -713,7 +707,7 @@ static int de_thread(struct task_struct *tsk) attach_pid(current, PIDTYPE_PID, current->pid); attach_pid(current, PIDTYPE_PGID, current->signal->pgrp); attach_pid(current, PIDTYPE_SID, current->signal->session); - list_add_tail_rcu(¤t->tasks, &init_task.tasks); + list_replace_rcu(&leader->tasks, ¤t->tasks); current->group_leader = current; leader->group_leader = current; @@ -721,7 +715,6 @@ static int de_thread(struct task_struct *tsk) /* Reduce leader to a thread */ detach_pid(leader, PIDTYPE_PGID); detach_pid(leader, PIDTYPE_SID); - list_del_init(&leader->tasks); current->exit_signal = SIGCHLD; @@ -729,10 +722,6 @@ static int de_thread(struct task_struct *tsk) leader->exit_state = EXIT_DEAD; write_unlock_irq(&tasklist_lock); - spin_unlock(&leader->proc_lock); - spin_unlock(¤t->proc_lock); - proc_pid_flush(proc_dentry1); - proc_pid_flush(proc_dentry2); } /* @@ -1379,67 +1368,102 @@ static void format_corename(char *corename, const char *pattern, long signr) *out_ptr = 0; } -static void zap_threads (struct mm_struct *mm) +static void zap_process(struct task_struct *start) { - struct task_struct *g, *p; - struct task_struct *tsk = current; - struct completion *vfork_done = tsk->vfork_done; - int traced = 0; + struct task_struct *t; - /* - * Make sure nobody is waiting for us to release the VM, - * otherwise we can deadlock when we wait on each other - */ - if (vfork_done) { - tsk->vfork_done = NULL; - complete(vfork_done); - } + start->signal->flags = SIGNAL_GROUP_EXIT; + start->signal->group_stop_count = 0; - read_lock(&tasklist_lock); - do_each_thread(g,p) - if (mm == p->mm && p != tsk) { - force_sig_specific(SIGKILL, p); - mm->core_waiters++; - if (unlikely(p->ptrace) && - unlikely(p->parent->mm == mm)) - traced = 1; + t = start; + do { + if (t != current && t->mm) { + t->mm->core_waiters++; + sigaddset(&t->pending.signal, SIGKILL); + signal_wake_up(t, 1); } - while_each_thread(g,p); + } while ((t = next_thread(t)) != start); +} - read_unlock(&tasklist_lock); +static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, + int exit_code) +{ + struct task_struct *g, *p; + unsigned long flags; + int err = -EAGAIN; + + spin_lock_irq(&tsk->sighand->siglock); + if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) { + tsk->signal->group_exit_code = exit_code; + zap_process(tsk); + err = 0; + } + spin_unlock_irq(&tsk->sighand->siglock); + if (err) + return err; - if (unlikely(traced)) { - /* - * We are zapping a thread and the thread it ptraces. - * If the tracee went into a ptrace stop for exit tracing, - * we could deadlock since the tracer is waiting for this - * coredump to finish. Detach them so they can both die. - */ - write_lock_irq(&tasklist_lock); - do_each_thread(g,p) { - if (mm == p->mm && p != tsk && - p->ptrace && p->parent->mm == mm) { - __ptrace_detach(p, 0); + if (atomic_read(&mm->mm_users) == mm->core_waiters + 1) + goto done; + + rcu_read_lock(); + for_each_process(g) { + if (g == tsk->group_leader) + continue; + + p = g; + do { + if (p->mm) { + if (p->mm == mm) { + /* + * p->sighand can't disappear, but + * may be changed by de_thread() + */ + lock_task_sighand(p, &flags); + zap_process(p); + unlock_task_sighand(p, &flags); + } + break; } - } while_each_thread(g,p); - write_unlock_irq(&tasklist_lock); + } while ((p = next_thread(p)) != g); } + rcu_read_unlock(); +done: + return mm->core_waiters; } -static void coredump_wait(struct mm_struct *mm) +static int coredump_wait(int exit_code) { - DECLARE_COMPLETION(startup_done); + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + struct completion startup_done; + struct completion *vfork_done; int core_waiters; + init_completion(&mm->core_done); + init_completion(&startup_done); mm->core_startup_done = &startup_done; - zap_threads(mm); - core_waiters = mm->core_waiters; + core_waiters = zap_threads(tsk, mm, exit_code); up_write(&mm->mmap_sem); + if (unlikely(core_waiters < 0)) + goto fail; + + /* + * Make sure nobody is waiting for us to release the VM, + * otherwise we can deadlock when we wait on each other + */ + vfork_done = tsk->vfork_done; + if (vfork_done) { + tsk->vfork_done = NULL; + complete(vfork_done); + } + if (core_waiters) wait_for_completion(&startup_done); +fail: BUG_ON(mm->core_waiters); + return core_waiters; } int do_coredump(long signr, int exit_code, struct pt_regs * regs) @@ -1473,22 +1497,9 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) } mm->dumpable = 0; - retval = -EAGAIN; - spin_lock_irq(¤t->sighand->siglock); - if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) { - current->signal->flags = SIGNAL_GROUP_EXIT; - current->signal->group_exit_code = exit_code; - current->signal->group_stop_count = 0; - retval = 0; - } - spin_unlock_irq(¤t->sighand->siglock); - if (retval) { - up_write(&mm->mmap_sem); + retval = coredump_wait(exit_code); + if (retval < 0) goto fail; - } - - init_completion(&mm->core_done); - coredump_wait(mm); /* * Clear any false indication of pending signals that might diff --git a/fs/ext3/super.c b/fs/ext3/super.c index b2891cc29db..b7483360a2d 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -630,7 +630,7 @@ enum { Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, @@ -666,6 +666,7 @@ static match_table_t tokens = { {Opt_noreservation, "noreservation"}, {Opt_noload, "noload"}, {Opt_nobh, "nobh"}, + {Opt_bh, "bh"}, {Opt_commit, "commit=%u"}, {Opt_journal_update, "journal=update"}, {Opt_journal_inum, "journal=%u"}, @@ -1014,6 +1015,9 @@ clear_qf_name: case Opt_nobh: set_opt(sbi->s_mount_opt, NOBH); break; + case Opt_bh: + clear_opt(sbi->s_mount_opt, NOBH); + break; default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 7f96b5cb678..8c9b28dff11 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -34,6 +34,7 @@ #include <linux/suspend.h> #include <linux/pagemap.h> #include <linux/kthread.h> +#include <linux/poison.h> #include <linux/proc_fs.h> #include <asm/uaccess.h> @@ -1675,7 +1676,7 @@ static void journal_free_journal_head(struct journal_head *jh) { #ifdef CONFIG_JBD_DEBUG atomic_dec(&nr_journal_heads); - memset(jh, 0x5b, sizeof(*jh)); + memset(jh, JBD_POISON_FREE, sizeof(*jh)); #endif kmem_cache_free(journal_head_cache, jh); } diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c index 320dd48b834..9c2077e7e08 100644 --- a/fs/jffs2/acl.c +++ b/fs/jffs2/acl.c @@ -267,6 +267,8 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl) } rc = do_jffs2_setxattr(inode, xprefix, "", value, size, 0); + if (!value && rc == -ENODATA) + rc = 0; if (value) kfree(value); if (!rc) { diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 1862e8bc101..ad0121088dd 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -53,8 +53,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, if (!instr) { printk(KERN_WARNING "kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n"); spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->erasing_size -= c->sector_size; c->dirty_size += c->sector_size; jeb->dirty_size = c->sector_size; @@ -86,8 +85,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, /* Erase failed immediately. Refile it on the list */ D1(printk(KERN_DEBUG "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", jeb->offset, ret)); spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->erasing_size -= c->sector_size; c->dirty_size += c->sector_size; jeb->dirty_size = c->sector_size; @@ -161,8 +159,7 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo { D1(printk(KERN_DEBUG "Erase completed successfully at 0x%08x\n", jeb->offset)); spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add_tail(&jeb->list, &c->erase_complete_list); + list_move_tail(&jeb->list, &c->erase_complete_list); spin_unlock(&c->erase_completion_lock); /* Ensure that kupdated calls us again to mark them clean */ jffs2_erase_pending_trigger(c); @@ -178,8 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { /* We'd like to give this block another try. */ spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->erasing_size -= c->sector_size; c->dirty_size += c->sector_size; jeb->dirty_size = c->sector_size; @@ -191,8 +187,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock spin_lock(&c->erase_completion_lock); c->erasing_size -= c->sector_size; c->bad_size += c->sector_size; - list_del(&jeb->list); - list_add(&jeb->list, &c->bad_list); + list_move(&jeb->list, &c->bad_list); c->nr_erasing_blocks--; spin_unlock(&c->erase_completion_lock); wake_up(&c->erase_wait); @@ -230,7 +225,6 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c, at the end of the linked list. Stash it and continue from the beginning of the list */ ic = (struct jffs2_inode_cache *)(*prev); - BUG_ON(ic->class != RAWNODE_CLASS_INODE_CACHE); prev = &ic->nodes; continue; } @@ -254,7 +248,8 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c, /* PARANOIA */ if (!ic) { - printk(KERN_WARNING "inode_cache not found in remove_node_refs()!!\n"); + JFFS2_WARNING("inode_cache/xattr_datum/xattr_ref" + " not found in remove_node_refs()!!\n"); return; } @@ -279,8 +274,19 @@ static inline void jffs2_remove_node_refs_from_ino_list(struct jffs2_sb_info *c, printk("\n"); }); - if (ic->nodes == (void *)ic && ic->nlink == 0) - jffs2_del_ino_cache(c, ic); + switch (ic->class) { +#ifdef CONFIG_JFFS2_FS_XATTR + case RAWNODE_CLASS_XATTR_DATUM: + jffs2_release_xattr_datum(c, (struct jffs2_xattr_datum *)ic); + break; + case RAWNODE_CLASS_XATTR_REF: + jffs2_release_xattr_ref(c, (struct jffs2_xattr_ref *)ic); + break; +#endif + default: + if (ic->nodes == (void *)ic && ic->nlink == 0) + jffs2_del_ino_cache(c, ic); + } } void jffs2_free_jeb_node_refs(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb) diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 2900ec3ec3a..97caa77d60c 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -227,8 +227,6 @@ void jffs2_clear_inode (struct inode *inode) struct jffs2_inode_info *f = JFFS2_INODE_INFO(inode); D1(printk(KERN_DEBUG "jffs2_clear_inode(): ino #%lu mode %o\n", inode->i_ino, inode->i_mode)); - - jffs2_xattr_delete_inode(c, f->inocache); jffs2_do_clear_inode(c, f); } diff --git a/fs/jffs2/gc.c b/fs/jffs2/gc.c index 477c526d638..daff3341ff9 100644 --- a/fs/jffs2/gc.c +++ b/fs/jffs2/gc.c @@ -165,6 +165,7 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) D1(printk(KERN_DEBUG "Skipping check of ino #%d with nlink zero\n", ic->ino)); spin_unlock(&c->inocache_lock); + jffs2_xattr_delete_inode(c, ic); continue; } switch(ic->state) { @@ -275,13 +276,12 @@ int jffs2_garbage_collect_pass(struct jffs2_sb_info *c) * We can decide whether this node is inode or xattr by ic->class. */ if (ic->class == RAWNODE_CLASS_XATTR_DATUM || ic->class == RAWNODE_CLASS_XATTR_REF) { - BUG_ON(raw->next_in_ino != (void *)ic); spin_unlock(&c->erase_completion_lock); if (ic->class == RAWNODE_CLASS_XATTR_DATUM) { - ret = jffs2_garbage_collect_xattr_datum(c, (struct jffs2_xattr_datum *)ic); + ret = jffs2_garbage_collect_xattr_datum(c, (struct jffs2_xattr_datum *)ic, raw); } else { - ret = jffs2_garbage_collect_xattr_ref(c, (struct jffs2_xattr_ref *)ic); + ret = jffs2_garbage_collect_xattr_ref(c, (struct jffs2_xattr_ref *)ic, raw); } goto release_sem; } diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index 935fec1b120..b98594992ee 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -119,8 +119,11 @@ struct jffs2_sb_info { #ifdef CONFIG_JFFS2_FS_XATTR #define XATTRINDEX_HASHSIZE (57) uint32_t highest_xid; + uint32_t highest_xseqno; struct list_head xattrindex[XATTRINDEX_HASHSIZE]; struct list_head xattr_unchecked; + struct list_head xattr_dead_list; + struct jffs2_xattr_ref *xref_dead_list; struct jffs2_xattr_ref *xref_temp; struct rw_semaphore xattr_sem; uint32_t xdatum_mem_usage; diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c index 4889d0700c0..8310c95478e 100644 --- a/fs/jffs2/malloc.c +++ b/fs/jffs2/malloc.c @@ -291,6 +291,7 @@ struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void) memset(xd, 0, sizeof(struct jffs2_xattr_datum)); xd->class = RAWNODE_CLASS_XATTR_DATUM; + xd->node = (void *)xd; INIT_LIST_HEAD(&xd->xindex); return xd; } @@ -309,6 +310,7 @@ struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void) memset(ref, 0, sizeof(struct jffs2_xattr_ref)); ref->class = RAWNODE_CLASS_XATTR_REF; + ref->node = (void *)ref; return ref; } diff --git a/fs/jffs2/nodelist.c b/fs/jffs2/nodelist.c index 927dfe42ba7..7675b33396c 100644 --- a/fs/jffs2/nodelist.c +++ b/fs/jffs2/nodelist.c @@ -906,6 +906,9 @@ void jffs2_del_ino_cache(struct jffs2_sb_info *c, struct jffs2_inode_cache *old) { struct jffs2_inode_cache **prev; +#ifdef CONFIG_JFFS2_FS_XATTR + BUG_ON(old->xref); +#endif dbg_inocache("del %p (ino #%u)\n", old, old->ino); spin_lock(&c->inocache_lock); diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index 8bedfd2ff68..d88376992ed 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -211,8 +211,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c) struct jffs2_eraseblock *ejeb; ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); - list_del(&ejeb->list); - list_add_tail(&ejeb->list, &c->erase_pending_list); + list_move_tail(&ejeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; jffs2_erase_pending_trigger(c); D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", @@ -684,19 +683,26 @@ void jffs2_mark_node_obsolete(struct jffs2_sb_info *c, struct jffs2_raw_node_ref spin_lock(&c->erase_completion_lock); ic = jffs2_raw_ref_to_ic(ref); - /* It seems we should never call jffs2_mark_node_obsolete() for - XATTR nodes.... yet. Make sure we notice if/when we change - that :) */ - BUG_ON(ic->class != RAWNODE_CLASS_INODE_CACHE); for (p = &ic->nodes; (*p) != ref; p = &((*p)->next_in_ino)) ; *p = ref->next_in_ino; ref->next_in_ino = NULL; - if (ic->nodes == (void *)ic && ic->nlink == 0) - jffs2_del_ino_cache(c, ic); - + switch (ic->class) { +#ifdef CONFIG_JFFS2_FS_XATTR + case RAWNODE_CLASS_XATTR_DATUM: + jffs2_release_xattr_datum(c, (struct jffs2_xattr_datum *)ic); + break; + case RAWNODE_CLASS_XATTR_REF: + jffs2_release_xattr_ref(c, (struct jffs2_xattr_ref *)ic); + break; +#endif + default: + if (ic->nodes == (void *)ic && ic->nlink == 0) + jffs2_del_ino_cache(c, ic); + break; + } spin_unlock(&c->erase_completion_lock); } diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 5fec012b02e..cc1899268c4 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -968,6 +968,7 @@ void jffs2_do_clear_inode(struct jffs2_sb_info *c, struct jffs2_inode_info *f) struct jffs2_full_dirent *fd, *fds; int deleted; + jffs2_xattr_delete_inode(c, f->inocache); down(&f->sem); deleted = f->inocache && !f->inocache->nlink; diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index 61618080b86..2bfdc33752d 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -317,20 +317,23 @@ static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_erasebloc struct jffs2_summary *s) { struct jffs2_xattr_datum *xd; - uint32_t totlen, crc; + uint32_t xid, version, totlen, crc; int err; crc = crc32(0, rx, sizeof(struct jffs2_raw_xattr) - 4); if (crc != je32_to_cpu(rx->node_crc)) { - if (je32_to_cpu(rx->node_crc) != 0xffffffff) - JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", - ofs, je32_to_cpu(rx->node_crc), crc); + JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", + ofs, je32_to_cpu(rx->node_crc), crc); if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(rx->totlen)))) return err; return 0; } - totlen = PAD(sizeof(*rx) + rx->name_len + 1 + je16_to_cpu(rx->value_len)); + xid = je32_to_cpu(rx->xid); + version = je32_to_cpu(rx->version); + + totlen = PAD(sizeof(struct jffs2_raw_xattr) + + rx->name_len + 1 + je16_to_cpu(rx->value_len)); if (totlen != je32_to_cpu(rx->totlen)) { JFFS2_WARNING("node length mismatch at %#08x, read=%u, calc=%u\n", ofs, je32_to_cpu(rx->totlen), totlen); @@ -339,22 +342,24 @@ static int jffs2_scan_xattr_node(struct jffs2_sb_info *c, struct jffs2_erasebloc return 0; } - xd = jffs2_setup_xattr_datum(c, je32_to_cpu(rx->xid), je32_to_cpu(rx->version)); - if (IS_ERR(xd)) { - if (PTR_ERR(xd) == -EEXIST) { - if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rx->totlen))))) - return err; - return 0; - } + xd = jffs2_setup_xattr_datum(c, xid, version); + if (IS_ERR(xd)) return PTR_ERR(xd); - } - xd->xprefix = rx->xprefix; - xd->name_len = rx->name_len; - xd->value_len = je16_to_cpu(rx->value_len); - xd->data_crc = je32_to_cpu(rx->data_crc); - xd->node = jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, NULL); - /* FIXME */ xd->node->next_in_ino = (void *)xd; + if (xd->version > version) { + struct jffs2_raw_node_ref *raw + = jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, NULL); + raw->next_in_ino = xd->node->next_in_ino; + xd->node->next_in_ino = raw; + } else { + xd->version = version; + xd->xprefix = rx->xprefix; + xd->name_len = rx->name_len; + xd->value_len = je16_to_cpu(rx->value_len); + xd->data_crc = je32_to_cpu(rx->data_crc); + + jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, totlen, (void *)xd); + } if (jffs2_sum_active()) jffs2_sum_add_xattr_mem(s, rx, ofs - jeb->offset); @@ -373,9 +378,8 @@ static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock crc = crc32(0, rr, sizeof(*rr) - 4); if (crc != je32_to_cpu(rr->node_crc)) { - if (je32_to_cpu(rr->node_crc) != 0xffffffff) - JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", - ofs, je32_to_cpu(rr->node_crc), crc); + JFFS2_WARNING("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", + ofs, je32_to_cpu(rr->node_crc), crc); if ((err = jffs2_scan_dirty_space(c, jeb, PAD(je32_to_cpu(rr->totlen))))) return err; return 0; @@ -395,6 +399,7 @@ static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock return -ENOMEM; /* BEFORE jffs2_build_xattr_subsystem() called, + * and AFTER xattr_ref is marked as a dead xref, * ref->xid is used to store 32bit xid, xd is not used * ref->ino is used to store 32bit inode-number, ic is not used * Thoes variables are declared as union, thus using those @@ -404,11 +409,13 @@ static int jffs2_scan_xref_node(struct jffs2_sb_info *c, struct jffs2_eraseblock */ ref->ino = je32_to_cpu(rr->ino); ref->xid = je32_to_cpu(rr->xid); + ref->xseqno = je32_to_cpu(rr->xseqno); + if (ref->xseqno > c->highest_xseqno) + c->highest_xseqno = (ref->xseqno & ~XREF_DELETE_MARKER); ref->next = c->xref_temp; c->xref_temp = ref; - ref->node = jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(rr->totlen)), NULL); - /* FIXME */ ref->node->next_in_ino = (void *)ref; + jffs2_link_node_ref(c, jeb, ofs | REF_PRISTINE, PAD(je32_to_cpu(rr->totlen)), (void *)ref); if (jffs2_sum_active()) jffs2_sum_add_xref_mem(s, rr, ofs - jeb->offset); diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c index 0b02fc79e4d..c19bd476e8e 100644 --- a/fs/jffs2/summary.c +++ b/fs/jffs2/summary.c @@ -5,7 +5,7 @@ * Zoltan Sogor <weth@inf.u-szeged.hu>, * Patrik Kluba <pajko@halom.u-szeged.hu>, * University of Szeged, Hungary - * 2005 KaiGai Kohei <kaigai@ak.jp.nec.com> + * 2006 KaiGai Kohei <kaigai@ak.jp.nec.com> * * For licensing information, see the file 'LICENCE' in this directory. * @@ -43,7 +43,7 @@ int jffs2_sum_init(struct jffs2_sb_info *c) return -ENOMEM; } - dbg_summary("returned succesfully\n"); + dbg_summary("returned successfully\n"); return 0; } @@ -310,8 +310,6 @@ int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs, #ifdef CONFIG_JFFS2_FS_XATTR case JFFS2_NODETYPE_XATTR: { struct jffs2_sum_xattr_mem *temp; - if (je32_to_cpu(node->x.version) == 0xffffffff) - return 0; temp = kmalloc(sizeof(struct jffs2_sum_xattr_mem), GFP_KERNEL); if (!temp) goto no_mem; @@ -327,10 +325,6 @@ int jffs2_sum_add_kvec(struct jffs2_sb_info *c, const struct kvec *invecs, } case JFFS2_NODETYPE_XREF: { struct jffs2_sum_xref_mem *temp; - - if (je32_to_cpu(node->r.ino) == 0xffffffff - && je32_to_cpu(node->r.xid) == 0xffffffff) - return 0; temp = kmalloc(sizeof(struct jffs2_sum_xref_mem), GFP_KERNEL); if (!temp) goto no_mem; @@ -483,22 +477,20 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras xd = jffs2_setup_xattr_datum(c, je32_to_cpu(spx->xid), je32_to_cpu(spx->version)); - if (IS_ERR(xd)) { - if (PTR_ERR(xd) == -EEXIST) { - /* a newer version of xd exists */ - if ((err = jffs2_scan_dirty_space(c, jeb, je32_to_cpu(spx->totlen)))) - return err; - sp += JFFS2_SUMMARY_XATTR_SIZE; - break; - } - JFFS2_NOTICE("allocation of xattr_datum failed\n"); + if (IS_ERR(xd)) return PTR_ERR(xd); + if (xd->version > je32_to_cpu(spx->version)) { + /* node is not the newest one */ + struct jffs2_raw_node_ref *raw + = sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED, + PAD(je32_to_cpu(spx->totlen)), NULL); + raw->next_in_ino = xd->node->next_in_ino; + xd->node->next_in_ino = raw; + } else { + xd->version = je32_to_cpu(spx->version); + sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED, + PAD(je32_to_cpu(spx->totlen)), (void *)xd); } - - xd->node = sum_link_node_ref(c, jeb, je32_to_cpu(spx->offset) | REF_UNCHECKED, - PAD(je32_to_cpu(spx->totlen)), NULL); - /* FIXME */ xd->node->next_in_ino = (void *)xd; - *pseudo_random += je32_to_cpu(spx->xid); sp += JFFS2_SUMMARY_XATTR_SIZE; @@ -519,14 +511,11 @@ static int jffs2_sum_process_sum_data(struct jffs2_sb_info *c, struct jffs2_eras JFFS2_NOTICE("allocation of xattr_datum failed\n"); return -ENOMEM; } - ref->ino = 0xfffffffe; - ref->xid = 0xfffffffd; ref->next = c->xref_temp; c->xref_temp = ref; - ref->node = sum_link_node_ref(c, jeb, je32_to_cpu(spr->offset) | REF_UNCHECKED, - PAD(sizeof(struct jffs2_raw_xref)), NULL); - /* FIXME */ ref->node->next_in_ino = (void *)ref; + sum_link_node_ref(c, jeb, je32_to_cpu(spr->offset) | REF_UNCHECKED, + PAD(sizeof(struct jffs2_raw_xref)), (void *)ref); *pseudo_random += ref->node->flash_offset; sp += JFFS2_SUMMARY_XREF_SIZE; diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index a7f153f79ec..b9b700730df 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -495,8 +495,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) /* Fix up the original jeb now it's on the bad_list */ if (first_raw == jeb->first_node) { D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; jffs2_erase_pending_trigger(c); } diff --git a/fs/jffs2/xattr.c b/fs/jffs2/xattr.c index 2d82e250be3..18e66dbf23b 100644 --- a/fs/jffs2/xattr.c +++ b/fs/jffs2/xattr.c @@ -23,18 +23,15 @@ * xattr_datum_hashkey(xprefix, xname, xvalue, xsize) * is used to calcurate xdatum hashkey. The reminder of hashkey into XATTRINDEX_HASHSIZE is * the index of the xattr name/value pair cache (c->xattrindex). + * is_xattr_datum_unchecked(c, xd) + * returns 1, if xdatum contains any unchecked raw nodes. if all raw nodes are not + * unchecked, it returns 0. * unload_xattr_datum(c, xd) * is used to release xattr name/value pair and detach from c->xattrindex. * reclaim_xattr_datum(c) * is used to reclaim xattr name/value pairs on the xattr name/value pair cache when * memory usage by cache is over c->xdatum_mem_threshold. Currentry, this threshold * is hard coded as 32KiB. - * delete_xattr_datum_node(c, xd) - * is used to delete a jffs2 node is dominated by xdatum. When EBS(Erase Block Summary) is - * enabled, it overwrites the obsolete node by myself. - * delete_xattr_datum(c, xd) - * is used to delete jffs2_xattr_datum object. It must be called with 0-value of reference - * counter. (It means how many jffs2_xattr_ref object refers this xdatum.) * do_verify_xattr_datum(c, xd) * is used to load the xdatum informations without name/value pair from the medium. * It's necessary once, because those informations are not collected during mounting @@ -53,8 +50,10 @@ * is used to write xdatum to medium. xd->version will be incremented. * create_xattr_datum(c, xprefix, xname, xvalue, xsize) * is used to create new xdatum and write to medium. + * delete_xattr_datum(c, xd) + * is used to delete a xdatum. It marks xd JFFS2_XFLAGS_DEAD, and allows + * GC to reclaim those physical nodes. * -------------------------------------------------- */ - static uint32_t xattr_datum_hashkey(int xprefix, const char *xname, const char *xvalue, int xsize) { int name_len = strlen(xname); @@ -62,6 +61,22 @@ static uint32_t xattr_datum_hashkey(int xprefix, const char *xname, const char * return crc32(xprefix, xname, name_len) ^ crc32(xprefix, xvalue, xsize); } +static int is_xattr_datum_unchecked(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + struct jffs2_raw_node_ref *raw; + int rc = 0; + + spin_lock(&c->erase_completion_lock); + for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) { + if (ref_flags(raw) == REF_UNCHECKED) { + rc = 1; + break; + } + } + spin_unlock(&c->erase_completion_lock); + return rc; +} + static void unload_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) { /* must be called under down_write(xattr_sem) */ @@ -107,77 +122,33 @@ static void reclaim_xattr_datum(struct jffs2_sb_info *c) before, c->xdatum_mem_usage, before - c->xdatum_mem_usage); } -static void delete_xattr_datum_node(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) -{ - /* must be called under down_write(xattr_sem) */ - struct jffs2_raw_xattr rx; - size_t length; - int rc; - - if (!xd->node) { - JFFS2_WARNING("xdatum (xid=%u) is removed twice.\n", xd->xid); - return; - } - if (jffs2_sum_active()) { - memset(&rx, 0xff, sizeof(struct jffs2_raw_xattr)); - rc = jffs2_flash_read(c, ref_offset(xd->node), - sizeof(struct jffs2_unknown_node), - &length, (char *)&rx); - if (rc || length != sizeof(struct jffs2_unknown_node)) { - JFFS2_ERROR("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n", - rc, sizeof(struct jffs2_unknown_node), - length, ref_offset(xd->node)); - } - rc = jffs2_flash_write(c, ref_offset(xd->node), sizeof(rx), - &length, (char *)&rx); - if (rc || length != sizeof(struct jffs2_raw_xattr)) { - JFFS2_ERROR("jffs2_flash_write()=%d, req=%zu, wrote=%zu ar %#08x\n", - rc, sizeof(rx), length, ref_offset(xd->node)); - } - } - spin_lock(&c->erase_completion_lock); - xd->node->next_in_ino = NULL; - spin_unlock(&c->erase_completion_lock); - jffs2_mark_node_obsolete(c, xd->node); - xd->node = NULL; -} - -static void delete_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) -{ - /* must be called under down_write(xattr_sem) */ - BUG_ON(xd->refcnt); - - unload_xattr_datum(c, xd); - if (xd->node) { - delete_xattr_datum_node(c, xd); - xd->node = NULL; - } - jffs2_free_xattr_datum(xd); -} - static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) { /* must be called under down_write(xattr_sem) */ struct jffs2_eraseblock *jeb; + struct jffs2_raw_node_ref *raw; struct jffs2_raw_xattr rx; size_t readlen; - uint32_t crc, totlen; + uint32_t crc, offset, totlen; int rc; - BUG_ON(!xd->node); - BUG_ON(ref_flags(xd->node) != REF_UNCHECKED); + spin_lock(&c->erase_completion_lock); + offset = ref_offset(xd->node); + if (ref_flags(xd->node) == REF_PRISTINE) + goto complete; + spin_unlock(&c->erase_completion_lock); - rc = jffs2_flash_read(c, ref_offset(xd->node), sizeof(rx), &readlen, (char *)&rx); + rc = jffs2_flash_read(c, offset, sizeof(rx), &readlen, (char *)&rx); if (rc || readlen != sizeof(rx)) { JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n", - rc, sizeof(rx), readlen, ref_offset(xd->node)); + rc, sizeof(rx), readlen, offset); return rc ? rc : -EIO; } crc = crc32(0, &rx, sizeof(rx) - 4); if (crc != je32_to_cpu(rx.node_crc)) { - if (je32_to_cpu(rx.node_crc) != 0xffffffff) - JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", - ref_offset(xd->node), je32_to_cpu(rx.hdr_crc), crc); + JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", + offset, je32_to_cpu(rx.hdr_crc), crc); + xd->flags |= JFFS2_XFLAGS_INVALID; return EIO; } totlen = PAD(sizeof(rx) + rx.name_len + 1 + je16_to_cpu(rx.value_len)); @@ -188,11 +159,12 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat || je32_to_cpu(rx.version) != xd->version) { JFFS2_ERROR("inconsistent xdatum at %#08x, magic=%#04x/%#04x, " "nodetype=%#04x/%#04x, totlen=%u/%u, xid=%u/%u, version=%u/%u\n", - ref_offset(xd->node), je16_to_cpu(rx.magic), JFFS2_MAGIC_BITMASK, + offset, je16_to_cpu(rx.magic), JFFS2_MAGIC_BITMASK, je16_to_cpu(rx.nodetype), JFFS2_NODETYPE_XATTR, je32_to_cpu(rx.totlen), totlen, je32_to_cpu(rx.xid), xd->xid, je32_to_cpu(rx.version), xd->version); + xd->flags |= JFFS2_XFLAGS_INVALID; return EIO; } xd->xprefix = rx.xprefix; @@ -200,14 +172,17 @@ static int do_verify_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_dat xd->value_len = je16_to_cpu(rx.value_len); xd->data_crc = je32_to_cpu(rx.data_crc); - /* This JFFS2_NODETYPE_XATTR node is checked */ - jeb = &c->blocks[ref_offset(xd->node) / c->sector_size]; - totlen = PAD(je32_to_cpu(rx.totlen)); - spin_lock(&c->erase_completion_lock); - c->unchecked_size -= totlen; c->used_size += totlen; - jeb->unchecked_size -= totlen; jeb->used_size += totlen; - xd->node->flash_offset = ref_offset(xd->node) | REF_PRISTINE; + complete: + for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) { + jeb = &c->blocks[ref_offset(raw) / c->sector_size]; + totlen = PAD(ref_totlen(c, jeb, raw)); + if (ref_flags(raw) == REF_UNCHECKED) { + c->unchecked_size -= totlen; c->used_size += totlen; + jeb->unchecked_size -= totlen; jeb->used_size += totlen; + } + raw->flash_offset = ref_offset(raw) | ((xd->node==raw) ? REF_PRISTINE : REF_NORMAL); + } spin_unlock(&c->erase_completion_lock); /* unchecked xdatum is chained with c->xattr_unchecked */ @@ -227,7 +202,6 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum uint32_t crc, length; int i, ret, retry = 0; - BUG_ON(!xd->node); BUG_ON(ref_flags(xd->node) != REF_PRISTINE); BUG_ON(!list_empty(&xd->xindex)); retry: @@ -253,6 +227,7 @@ static int do_load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum " at %#08x, read: 0x%08x calculated: 0x%08x\n", ref_offset(xd->node), xd->data_crc, crc); kfree(data); + xd->flags |= JFFS2_XFLAGS_INVALID; return EIO; } @@ -286,16 +261,14 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x * rc > 0 : Unrecoverable error, this node should be deleted. */ int rc = 0; - BUG_ON(xd->xname); - if (!xd->node) + + BUG_ON(xd->flags & JFFS2_XFLAGS_DEAD); + if (xd->xname) + return 0; + if (xd->flags & JFFS2_XFLAGS_INVALID) return EIO; - if (unlikely(ref_flags(xd->node) != REF_PRISTINE)) { + if (unlikely(is_xattr_datum_unchecked(c, xd))) rc = do_verify_xattr_datum(c, xd); - if (rc > 0) { - list_del_init(&xd->xindex); - delete_xattr_datum_node(c, xd); - } - } if (!rc) rc = do_load_xattr_datum(c, xd); return rc; @@ -304,7 +277,6 @@ static int load_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) { /* must be called under down_write(xattr_sem) */ - struct jffs2_raw_node_ref *raw; struct jffs2_raw_xattr rx; struct kvec vecs[2]; size_t length; @@ -312,14 +284,16 @@ static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x uint32_t phys_ofs = write_ofs(c); BUG_ON(!xd->xname); + BUG_ON(xd->flags & (JFFS2_XFLAGS_DEAD|JFFS2_XFLAGS_INVALID)); vecs[0].iov_base = ℞ - vecs[0].iov_len = PAD(sizeof(rx)); + vecs[0].iov_len = sizeof(rx); vecs[1].iov_base = xd->xname; vecs[1].iov_len = xd->name_len + 1 + xd->value_len; totlen = vecs[0].iov_len + vecs[1].iov_len; /* Setup raw-xattr */ + memset(&rx, 0, sizeof(rx)); rx.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); rx.nodetype = cpu_to_je16(JFFS2_NODETYPE_XATTR); rx.totlen = cpu_to_je32(PAD(totlen)); @@ -343,14 +317,8 @@ static int save_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *x return rc; } - /* success */ - raw = jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(totlen), NULL); - /* FIXME */ raw->next_in_ino = (void *)xd; - - if (xd->node) - delete_xattr_datum_node(c, xd); - xd->node = raw; + jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(totlen), (void *)xd); dbg_xattr("success on saving xdatum (xid=%u, version=%u, xprefix=%u, xname='%s')\n", xd->xid, xd->version, xd->xprefix, xd->xname); @@ -377,7 +345,7 @@ static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c, && xd->value_len==xsize && !strcmp(xd->xname, xname) && !memcmp(xd->xvalue, xvalue, xsize)) { - xd->refcnt++; + atomic_inc(&xd->refcnt); return xd; } } @@ -397,7 +365,7 @@ static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c, strcpy(data, xname); memcpy(data + name_len + 1, xvalue, xsize); - xd->refcnt = 1; + atomic_set(&xd->refcnt, 1); xd->xid = ++c->highest_xid; xd->flags |= JFFS2_XFLAGS_HOT; xd->xprefix = xprefix; @@ -426,20 +394,36 @@ static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c, return xd; } +static void delete_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under down_write(xattr_sem) */ + BUG_ON(atomic_read(&xd->refcnt)); + + unload_xattr_datum(c, xd); + xd->flags |= JFFS2_XFLAGS_DEAD; + spin_lock(&c->erase_completion_lock); + if (xd->node == (void *)xd) { + BUG_ON(!(xd->flags & JFFS2_XFLAGS_INVALID)); + jffs2_free_xattr_datum(xd); + } else { + list_add(&xd->xindex, &c->xattr_dead_list); + } + spin_unlock(&c->erase_completion_lock); + dbg_xattr("xdatum(xid=%u, version=%u) was removed.\n", xd->xid, xd->version); +} + /* -------- xref related functions ------------------ * verify_xattr_ref(c, ref) * is used to load xref information from medium. Because summary data does not * contain xid/ino, it's necessary to verify once while mounting process. - * delete_xattr_ref_node(c, ref) - * is used to delete a jffs2 node is dominated by xref. When EBS is enabled, - * it overwrites the obsolete node by myself. - * delete_xattr_ref(c, ref) - * is used to delete jffs2_xattr_ref object. If the reference counter of xdatum - * is refered by this xref become 0, delete_xattr_datum() is called later. * save_xattr_ref(c, ref) - * is used to write xref to medium. + * is used to write xref to medium. If delete marker is marked, it write + * a delete marker of xref into medium. * create_xattr_ref(c, ic, xd) * is used to create a new xref and write to medium. + * delete_xattr_ref(c, ref) + * is used to delete jffs2_xattr_ref. It marks xref XREF_DELETE_MARKER, + * and allows GC to reclaim those physical nodes. * jffs2_xattr_delete_inode(c, ic) * is called to remove xrefs related to obsolete inode when inode is unlinked. * jffs2_xattr_free_inode(c, ic) @@ -450,25 +434,29 @@ static struct jffs2_xattr_datum *create_xattr_datum(struct jffs2_sb_info *c, static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) { struct jffs2_eraseblock *jeb; + struct jffs2_raw_node_ref *raw; struct jffs2_raw_xref rr; size_t readlen; - uint32_t crc, totlen; + uint32_t crc, offset, totlen; int rc; - BUG_ON(ref_flags(ref->node) != REF_UNCHECKED); + spin_lock(&c->erase_completion_lock); + if (ref_flags(ref->node) != REF_UNCHECKED) + goto complete; + offset = ref_offset(ref->node); + spin_unlock(&c->erase_completion_lock); - rc = jffs2_flash_read(c, ref_offset(ref->node), sizeof(rr), &readlen, (char *)&rr); + rc = jffs2_flash_read(c, offset, sizeof(rr), &readlen, (char *)&rr); if (rc || sizeof(rr) != readlen) { JFFS2_WARNING("jffs2_flash_read()=%d, req=%zu, read=%zu, at %#08x\n", - rc, sizeof(rr), readlen, ref_offset(ref->node)); + rc, sizeof(rr), readlen, offset); return rc ? rc : -EIO; } /* obsolete node */ crc = crc32(0, &rr, sizeof(rr) - 4); if (crc != je32_to_cpu(rr.node_crc)) { - if (je32_to_cpu(rr.node_crc) != 0xffffffff) - JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", - ref_offset(ref->node), je32_to_cpu(rr.node_crc), crc); + JFFS2_ERROR("node CRC failed at %#08x, read=%#08x, calc=%#08x\n", + offset, je32_to_cpu(rr.node_crc), crc); return EIO; } if (je16_to_cpu(rr.magic) != JFFS2_MAGIC_BITMASK @@ -476,22 +464,28 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref || je32_to_cpu(rr.totlen) != PAD(sizeof(rr))) { JFFS2_ERROR("inconsistent xref at %#08x, magic=%#04x/%#04x, " "nodetype=%#04x/%#04x, totlen=%u/%zu\n", - ref_offset(ref->node), je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK, + offset, je16_to_cpu(rr.magic), JFFS2_MAGIC_BITMASK, je16_to_cpu(rr.nodetype), JFFS2_NODETYPE_XREF, je32_to_cpu(rr.totlen), PAD(sizeof(rr))); return EIO; } ref->ino = je32_to_cpu(rr.ino); ref->xid = je32_to_cpu(rr.xid); - - /* fixup superblock/eraseblock info */ - jeb = &c->blocks[ref_offset(ref->node) / c->sector_size]; - totlen = PAD(sizeof(rr)); + ref->xseqno = je32_to_cpu(rr.xseqno); + if (ref->xseqno > c->highest_xseqno) + c->highest_xseqno = (ref->xseqno & ~XREF_DELETE_MARKER); spin_lock(&c->erase_completion_lock); - c->unchecked_size -= totlen; c->used_size += totlen; - jeb->unchecked_size -= totlen; jeb->used_size += totlen; - ref->node->flash_offset = ref_offset(ref->node) | REF_PRISTINE; + complete: + for (raw=ref->node; raw != (void *)ref; raw=raw->next_in_ino) { + jeb = &c->blocks[ref_offset(raw) / c->sector_size]; + totlen = PAD(ref_totlen(c, jeb, raw)); + if (ref_flags(raw) == REF_UNCHECKED) { + c->unchecked_size -= totlen; c->used_size += totlen; + jeb->unchecked_size -= totlen; jeb->used_size += totlen; + } + raw->flash_offset = ref_offset(raw) | ((ref->node==raw) ? REF_PRISTINE : REF_NORMAL); + } spin_unlock(&c->erase_completion_lock); dbg_xattr("success on verifying xref (ino=%u, xid=%u) at %#08x\n", @@ -499,58 +493,12 @@ static int verify_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref return 0; } -static void delete_xattr_ref_node(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) -{ - struct jffs2_raw_xref rr; - size_t length; - int rc; - - if (jffs2_sum_active()) { - memset(&rr, 0xff, sizeof(rr)); - rc = jffs2_flash_read(c, ref_offset(ref->node), - sizeof(struct jffs2_unknown_node), - &length, (char *)&rr); - if (rc || length != sizeof(struct jffs2_unknown_node)) { - JFFS2_ERROR("jffs2_flash_read()=%d, req=%zu, read=%zu at %#08x\n", - rc, sizeof(struct jffs2_unknown_node), - length, ref_offset(ref->node)); - } - rc = jffs2_flash_write(c, ref_offset(ref->node), sizeof(rr), - &length, (char *)&rr); - if (rc || length != sizeof(struct jffs2_raw_xref)) { - JFFS2_ERROR("jffs2_flash_write()=%d, req=%zu, wrote=%zu at %#08x\n", - rc, sizeof(rr), length, ref_offset(ref->node)); - } - } - spin_lock(&c->erase_completion_lock); - ref->node->next_in_ino = NULL; - spin_unlock(&c->erase_completion_lock); - jffs2_mark_node_obsolete(c, ref->node); - ref->node = NULL; -} - -static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) -{ - /* must be called under down_write(xattr_sem) */ - struct jffs2_xattr_datum *xd; - - BUG_ON(!ref->node); - delete_xattr_ref_node(c, ref); - - xd = ref->xd; - xd->refcnt--; - if (!xd->refcnt) - delete_xattr_datum(c, xd); - jffs2_free_xattr_ref(ref); -} - static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) { /* must be called under down_write(xattr_sem) */ - struct jffs2_raw_node_ref *raw; struct jffs2_raw_xref rr; size_t length; - uint32_t phys_ofs = write_ofs(c); + uint32_t xseqno, phys_ofs = write_ofs(c); int ret; rr.magic = cpu_to_je16(JFFS2_MAGIC_BITMASK); @@ -558,8 +506,16 @@ static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) rr.totlen = cpu_to_je32(PAD(sizeof(rr))); rr.hdr_crc = cpu_to_je32(crc32(0, &rr, sizeof(struct jffs2_unknown_node) - 4)); - rr.ino = cpu_to_je32(ref->ic->ino); - rr.xid = cpu_to_je32(ref->xd->xid); + xseqno = (c->highest_xseqno += 2); + if (is_xattr_ref_dead(ref)) { + xseqno |= XREF_DELETE_MARKER; + rr.ino = cpu_to_je32(ref->ino); + rr.xid = cpu_to_je32(ref->xid); + } else { + rr.ino = cpu_to_je32(ref->ic->ino); + rr.xid = cpu_to_je32(ref->xd->xid); + } + rr.xseqno = cpu_to_je32(xseqno); rr.node_crc = cpu_to_je32(crc32(0, &rr, sizeof(rr) - 4)); ret = jffs2_flash_write(c, phys_ofs, sizeof(rr), &length, (char *)&rr); @@ -572,12 +528,9 @@ static int save_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) return ret; } - - raw = jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(sizeof(rr)), NULL); - /* FIXME */ raw->next_in_ino = (void *)ref; - if (ref->node) - delete_xattr_ref_node(c, ref); - ref->node = raw; + /* success */ + ref->xseqno = xseqno; + jffs2_add_physical_node_ref(c, phys_ofs | REF_PRISTINE, PAD(sizeof(rr)), (void *)ref); dbg_xattr("success on saving xref (ino=%u, xid=%u)\n", ref->ic->ino, ref->xd->xid); @@ -610,6 +563,27 @@ static struct jffs2_xattr_ref *create_xattr_ref(struct jffs2_sb_info *c, struct return ref; /* success */ } +static void delete_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) +{ + /* must be called under down_write(xattr_sem) */ + struct jffs2_xattr_datum *xd; + + xd = ref->xd; + ref->xseqno |= XREF_DELETE_MARKER; + ref->ino = ref->ic->ino; + ref->xid = ref->xd->xid; + spin_lock(&c->erase_completion_lock); + ref->next = c->xref_dead_list; + c->xref_dead_list = ref; + spin_unlock(&c->erase_completion_lock); + + dbg_xattr("xref(ino=%u, xid=%u, xseqno=%u) was removed.\n", + ref->ino, ref->xid, ref->xseqno); + + if (atomic_dec_and_test(&xd->refcnt)) + delete_xattr_datum(c, xd); +} + void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic) { /* It's called from jffs2_clear_inode() on inode removing. @@ -638,8 +612,7 @@ void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *i for (ref = ic->xref; ref; ref = _ref) { _ref = ref->next; xd = ref->xd; - xd->refcnt--; - if (!xd->refcnt) { + if (atomic_dec_and_test(&xd->refcnt)) { unload_xattr_datum(c, xd); jffs2_free_xattr_datum(xd); } @@ -655,7 +628,7 @@ static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cac * duplicate name/value pairs. If duplicate name/value pair would be found, * one will be removed. */ - struct jffs2_xattr_ref *ref, *cmp, **pref; + struct jffs2_xattr_ref *ref, *cmp, **pref, **pcmp; int rc = 0; if (likely(ic->flags & INO_FLAGS_XATTR_CHECKED)) @@ -673,13 +646,13 @@ static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cac } else if (unlikely(rc < 0)) goto out; } - for (cmp=ref->next, pref=&ref->next; cmp; pref=&cmp->next, cmp=cmp->next) { + for (cmp=ref->next, pcmp=&ref->next; cmp; pcmp=&cmp->next, cmp=cmp->next) { if (!cmp->xd->xname) { ref->xd->flags |= JFFS2_XFLAGS_BIND; rc = load_xattr_datum(c, cmp->xd); ref->xd->flags &= ~JFFS2_XFLAGS_BIND; if (unlikely(rc > 0)) { - *pref = cmp->next; + *pcmp = cmp->next; delete_xattr_ref(c, cmp); goto retry; } else if (unlikely(rc < 0)) @@ -687,8 +660,13 @@ static int check_xattr_ref_inode(struct jffs2_sb_info *c, struct jffs2_inode_cac } if (ref->xd->xprefix == cmp->xd->xprefix && !strcmp(ref->xd->xname, cmp->xd->xname)) { - *pref = cmp->next; - delete_xattr_ref(c, cmp); + if (ref->xseqno > cmp->xseqno) { + *pcmp = cmp->next; + delete_xattr_ref(c, cmp); + } else { + *pref = ref->next; + delete_xattr_ref(c, ref); + } goto retry; } } @@ -719,9 +697,13 @@ void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c) for (i=0; i < XATTRINDEX_HASHSIZE; i++) INIT_LIST_HEAD(&c->xattrindex[i]); INIT_LIST_HEAD(&c->xattr_unchecked); + INIT_LIST_HEAD(&c->xattr_dead_list); + c->xref_dead_list = NULL; c->xref_temp = NULL; init_rwsem(&c->xattr_sem); + c->highest_xid = 0; + c->highest_xseqno = 0; c->xdatum_mem_usage = 0; c->xdatum_mem_threshold = 32 * 1024; /* Default 32KB */ } @@ -751,7 +733,11 @@ void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c) _ref = ref->next; jffs2_free_xattr_ref(ref); } - c->xref_temp = NULL; + + for (ref=c->xref_dead_list; ref; ref = _ref) { + _ref = ref->next; + jffs2_free_xattr_ref(ref); + } for (i=0; i < XATTRINDEX_HASHSIZE; i++) { list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) { @@ -761,100 +747,143 @@ void jffs2_clear_xattr_subsystem(struct jffs2_sb_info *c) jffs2_free_xattr_datum(xd); } } + + list_for_each_entry_safe(xd, _xd, &c->xattr_dead_list, xindex) { + list_del(&xd->xindex); + jffs2_free_xattr_datum(xd); + } } +#define XREF_TMPHASH_SIZE (128) void jffs2_build_xattr_subsystem(struct jffs2_sb_info *c) { struct jffs2_xattr_ref *ref, *_ref; + struct jffs2_xattr_ref *xref_tmphash[XREF_TMPHASH_SIZE]; struct jffs2_xattr_datum *xd, *_xd; struct jffs2_inode_cache *ic; - int i, xdatum_count =0, xdatum_unchecked_count = 0, xref_count = 0; + struct jffs2_raw_node_ref *raw; + int i, xdatum_count = 0, xdatum_unchecked_count = 0, xref_count = 0; + int xdatum_orphan_count = 0, xref_orphan_count = 0, xref_dead_count = 0; BUG_ON(!(c->flags & JFFS2_SB_FLAG_BUILDING)); - /* Phase.1 */ + /* Phase.1 : Merge same xref */ + for (i=0; i < XREF_TMPHASH_SIZE; i++) + xref_tmphash[i] = NULL; for (ref=c->xref_temp; ref; ref=_ref) { + struct jffs2_xattr_ref *tmp; + _ref = ref->next; - /* checking REF_UNCHECKED nodes */ if (ref_flags(ref->node) != REF_PRISTINE) { if (verify_xattr_ref(c, ref)) { - delete_xattr_ref_node(c, ref); + BUG_ON(ref->node->next_in_ino != (void *)ref); + ref->node->next_in_ino = NULL; + jffs2_mark_node_obsolete(c, ref->node); jffs2_free_xattr_ref(ref); continue; } } - /* At this point, ref->xid and ref->ino contain XID and inode number. - ref->xd and ref->ic are not valid yet. */ - xd = jffs2_find_xattr_datum(c, ref->xid); - ic = jffs2_get_ino_cache(c, ref->ino); - if (!xd || !ic) { - if (ref_flags(ref->node) != REF_UNCHECKED) - JFFS2_WARNING("xref(ino=%u, xid=%u) is orphan. \n", - ref->ino, ref->xid); - delete_xattr_ref_node(c, ref); + + i = (ref->ino ^ ref->xid) % XREF_TMPHASH_SIZE; + for (tmp=xref_tmphash[i]; tmp; tmp=tmp->next) { + if (tmp->ino == ref->ino && tmp->xid == ref->xid) + break; + } + if (tmp) { + raw = ref->node; + if (ref->xseqno > tmp->xseqno) { + tmp->xseqno = ref->xseqno; + raw->next_in_ino = tmp->node; + tmp->node = raw; + } else { + raw->next_in_ino = tmp->node->next_in_ino; + tmp->node->next_in_ino = raw; + } jffs2_free_xattr_ref(ref); continue; + } else { + ref->next = xref_tmphash[i]; + xref_tmphash[i] = ref; } - ref->xd = xd; - ref->ic = ic; - xd->refcnt++; - ref->next = ic->xref; - ic->xref = ref; - xref_count++; } c->xref_temp = NULL; - /* After this, ref->xid/ino are NEVER used. */ - /* Phase.2 */ + /* Phase.2 : Bind xref with inode_cache and xattr_datum */ + for (i=0; i < XREF_TMPHASH_SIZE; i++) { + for (ref=xref_tmphash[i]; ref; ref=_ref) { + xref_count++; + _ref = ref->next; + if (is_xattr_ref_dead(ref)) { + ref->next = c->xref_dead_list; + c->xref_dead_list = ref; + xref_dead_count++; + continue; + } + /* At this point, ref->xid and ref->ino contain XID and inode number. + ref->xd and ref->ic are not valid yet. */ + xd = jffs2_find_xattr_datum(c, ref->xid); + ic = jffs2_get_ino_cache(c, ref->ino); + if (!xd || !ic) { + dbg_xattr("xref(ino=%u, xid=%u, xseqno=%u) is orphan.\n", + ref->ino, ref->xid, ref->xseqno); + ref->xseqno |= XREF_DELETE_MARKER; + ref->next = c->xref_dead_list; + c->xref_dead_list = ref; + xref_orphan_count++; + continue; + } + ref->xd = xd; + ref->ic = ic; + atomic_inc(&xd->refcnt); + ref->next = ic->xref; + ic->xref = ref; + } + } + + /* Phase.3 : Link unchecked xdatum to xattr_unchecked list */ for (i=0; i < XATTRINDEX_HASHSIZE; i++) { list_for_each_entry_safe(xd, _xd, &c->xattrindex[i], xindex) { + xdatum_count++; list_del_init(&xd->xindex); - if (!xd->refcnt) { - if (ref_flags(xd->node) != REF_UNCHECKED) - JFFS2_WARNING("orphan xdatum(xid=%u, version=%u) at %#08x\n", - xd->xid, xd->version, ref_offset(xd->node)); - delete_xattr_datum(c, xd); + if (!atomic_read(&xd->refcnt)) { + dbg_xattr("xdatum(xid=%u, version=%u) is orphan.\n", + xd->xid, xd->version); + xd->flags |= JFFS2_XFLAGS_DEAD; + list_add(&xd->xindex, &c->xattr_unchecked); + xdatum_orphan_count++; continue; } - if (ref_flags(xd->node) != REF_PRISTINE) { - dbg_xattr("unchecked xdatum(xid=%u) at %#08x\n", - xd->xid, ref_offset(xd->node)); + if (is_xattr_datum_unchecked(c, xd)) { + dbg_xattr("unchecked xdatum(xid=%u, version=%u)\n", + xd->xid, xd->version); list_add(&xd->xindex, &c->xattr_unchecked); xdatum_unchecked_count++; } - xdatum_count++; } } /* build complete */ - JFFS2_NOTICE("complete building xattr subsystem, %u of xdatum (%u unchecked) and " - "%u of xref found.\n", xdatum_count, xdatum_unchecked_count, xref_count); + JFFS2_NOTICE("complete building xattr subsystem, %u of xdatum" + " (%u unchecked, %u orphan) and " + "%u of xref (%u dead, %u orphan) found.\n", + xdatum_count, xdatum_unchecked_count, xdatum_orphan_count, + xref_count, xref_dead_count, xref_orphan_count); } struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c, uint32_t xid, uint32_t version) { - struct jffs2_xattr_datum *xd, *_xd; + struct jffs2_xattr_datum *xd; - _xd = jffs2_find_xattr_datum(c, xid); - if (_xd) { - dbg_xattr("duplicate xdatum (xid=%u, version=%u/%u) at %#08x\n", - xid, version, _xd->version, ref_offset(_xd->node)); - if (version < _xd->version) - return ERR_PTR(-EEXIST); - } - xd = jffs2_alloc_xattr_datum(); - if (!xd) - return ERR_PTR(-ENOMEM); - xd->xid = xid; - xd->version = version; - if (xd->xid > c->highest_xid) - c->highest_xid = xd->xid; - list_add_tail(&xd->xindex, &c->xattrindex[xid % XATTRINDEX_HASHSIZE]); - - if (_xd) { - list_del_init(&_xd->xindex); - delete_xattr_datum_node(c, _xd); - jffs2_free_xattr_datum(_xd); + xd = jffs2_find_xattr_datum(c, xid); + if (!xd) { + xd = jffs2_alloc_xattr_datum(); + if (!xd) + return ERR_PTR(-ENOMEM); + xd->xid = xid; + xd->version = version; + if (xd->xid > c->highest_xid) + c->highest_xid = xd->xid; + list_add_tail(&xd->xindex, &c->xattrindex[xid % XATTRINDEX_HASHSIZE]); } return xd; } @@ -1080,9 +1109,23 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, goto out; } if (!buffer) { - *pref = ref->next; - delete_xattr_ref(c, ref); - rc = 0; + ref->ino = ic->ino; + ref->xid = xd->xid; + ref->xseqno |= XREF_DELETE_MARKER; + rc = save_xattr_ref(c, ref); + if (!rc) { + *pref = ref->next; + spin_lock(&c->erase_completion_lock); + ref->next = c->xref_dead_list; + c->xref_dead_list = ref; + spin_unlock(&c->erase_completion_lock); + if (atomic_dec_and_test(&xd->refcnt)) + delete_xattr_datum(c, xd); + } else { + ref->ic = ic; + ref->xd = xd; + ref->xseqno &= ~XREF_DELETE_MARKER; + } goto out; } goto found; @@ -1094,7 +1137,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, goto out; } if (!buffer) { - rc = -EINVAL; + rc = -ENODATA; goto out; } found: @@ -1110,16 +1153,14 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, request = PAD(sizeof(struct jffs2_raw_xref)); rc = jffs2_reserve_space(c, request, &length, ALLOC_NORMAL, JFFS2_SUMMARY_XREF_SIZE); + down_write(&c->xattr_sem); if (rc) { JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, request); - down_write(&c->xattr_sem); - xd->refcnt--; - if (!xd->refcnt) + if (atomic_dec_and_test(&xd->refcnt)) delete_xattr_datum(c, xd); up_write(&c->xattr_sem); return rc; } - down_write(&c->xattr_sem); if (ref) *pref = ref->next; newref = create_xattr_ref(c, ic, xd); @@ -1129,8 +1170,7 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, ic->xref = ref; } rc = PTR_ERR(newref); - xd->refcnt--; - if (!xd->refcnt) + if (atomic_dec_and_test(&xd->refcnt)) delete_xattr_datum(c, xd); } else if (ref) { delete_xattr_ref(c, ref); @@ -1142,38 +1182,40 @@ int do_jffs2_setxattr(struct inode *inode, int xprefix, const char *xname, } /* -------- garbage collector functions ------------- - * jffs2_garbage_collect_xattr_datum(c, xd) + * jffs2_garbage_collect_xattr_datum(c, xd, raw) * is used to move xdatum into new node. - * jffs2_garbage_collect_xattr_ref(c, ref) + * jffs2_garbage_collect_xattr_ref(c, ref, raw) * is used to move xref into new node. * jffs2_verify_xattr(c) * is used to call do_verify_xattr_datum() before garbage collecting. + * jffs2_release_xattr_datum(c, xd) + * is used to release an in-memory object of xdatum. + * jffs2_release_xattr_ref(c, ref) + * is used to release an in-memory object of xref. * -------------------------------------------------- */ -int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd, + struct jffs2_raw_node_ref *raw) { uint32_t totlen, length, old_ofs; - int rc = -EINVAL; + int rc = 0; down_write(&c->xattr_sem); - BUG_ON(!xd->node); - - old_ofs = ref_offset(xd->node); - totlen = ref_totlen(c, c->gcblock, xd->node); - if (totlen < sizeof(struct jffs2_raw_xattr)) + if (xd->node != raw) + goto out; + if (xd->flags & (JFFS2_XFLAGS_DEAD|JFFS2_XFLAGS_INVALID)) goto out; - if (!xd->xname) { - rc = load_xattr_datum(c, xd); - if (unlikely(rc > 0)) { - delete_xattr_datum_node(c, xd); - rc = 0; - goto out; - } else if (unlikely(rc < 0)) - goto out; + rc = load_xattr_datum(c, xd); + if (unlikely(rc)) { + rc = (rc > 0) ? 0 : rc; + goto out; } + old_ofs = ref_offset(xd->node); + totlen = PAD(sizeof(struct jffs2_raw_xattr) + + xd->name_len + 1 + xd->value_len); rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XATTR_SIZE); - if (rc || length < totlen) { - JFFS2_WARNING("jffs2_reserve_space()=%d, request=%u\n", rc, totlen); + if (rc) { + JFFS2_WARNING("jffs2_reserve_space_gc()=%d, request=%u\n", rc, totlen); rc = rc ? rc : -EBADFD; goto out; } @@ -1182,27 +1224,32 @@ int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xatt dbg_xattr("xdatum (xid=%u, version=%u) GC'ed from %#08x to %08x\n", xd->xid, xd->version, old_ofs, ref_offset(xd->node)); out: + if (!rc) + jffs2_mark_node_obsolete(c, raw); up_write(&c->xattr_sem); return rc; } - -int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) +int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref, + struct jffs2_raw_node_ref *raw) { uint32_t totlen, length, old_ofs; - int rc = -EINVAL; + int rc = 0; down_write(&c->xattr_sem); BUG_ON(!ref->node); + if (ref->node != raw) + goto out; + if (is_xattr_ref_dead(ref) && (raw->next_in_ino == (void *)ref)) + goto out; + old_ofs = ref_offset(ref->node); totlen = ref_totlen(c, c->gcblock, ref->node); - if (totlen != sizeof(struct jffs2_raw_xref)) - goto out; rc = jffs2_reserve_space_gc(c, totlen, &length, JFFS2_SUMMARY_XREF_SIZE); - if (rc || length < totlen) { - JFFS2_WARNING("%s: jffs2_reserve_space() = %d, request = %u\n", + if (rc) { + JFFS2_WARNING("%s: jffs2_reserve_space_gc() = %d, request = %u\n", __FUNCTION__, rc, totlen); rc = rc ? rc : -EBADFD; goto out; @@ -1212,6 +1259,8 @@ int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ dbg_xattr("xref (ino=%u, xid=%u) GC'ed from %#08x to %08x\n", ref->ic->ino, ref->xd->xid, old_ofs, ref_offset(ref->node)); out: + if (!rc) + jffs2_mark_node_obsolete(c, raw); up_write(&c->xattr_sem); return rc; } @@ -1219,20 +1268,59 @@ int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ int jffs2_verify_xattr(struct jffs2_sb_info *c) { struct jffs2_xattr_datum *xd, *_xd; + struct jffs2_eraseblock *jeb; + struct jffs2_raw_node_ref *raw; + uint32_t totlen; int rc; down_write(&c->xattr_sem); list_for_each_entry_safe(xd, _xd, &c->xattr_unchecked, xindex) { rc = do_verify_xattr_datum(c, xd); - if (rc == 0) { - list_del_init(&xd->xindex); - break; - } else if (rc > 0) { - list_del_init(&xd->xindex); - delete_xattr_datum_node(c, xd); + if (rc < 0) + continue; + list_del_init(&xd->xindex); + spin_lock(&c->erase_completion_lock); + for (raw=xd->node; raw != (void *)xd; raw=raw->next_in_ino) { + if (ref_flags(raw) != REF_UNCHECKED) + continue; + jeb = &c->blocks[ref_offset(raw) / c->sector_size]; + totlen = PAD(ref_totlen(c, jeb, raw)); + c->unchecked_size -= totlen; c->used_size += totlen; + jeb->unchecked_size -= totlen; jeb->used_size += totlen; + raw->flash_offset = ref_offset(raw) + | ((xd->node == (void *)raw) ? REF_PRISTINE : REF_NORMAL); } + if (xd->flags & JFFS2_XFLAGS_DEAD) + list_add(&xd->xindex, &c->xattr_dead_list); + spin_unlock(&c->erase_completion_lock); } up_write(&c->xattr_sem); - return list_empty(&c->xattr_unchecked) ? 1 : 0; } + +void jffs2_release_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd) +{ + /* must be called under spin_lock(&c->erase_completion_lock) */ + if (atomic_read(&xd->refcnt) || xd->node != (void *)xd) + return; + + list_del(&xd->xindex); + jffs2_free_xattr_datum(xd); +} + +void jffs2_release_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref) +{ + /* must be called under spin_lock(&c->erase_completion_lock) */ + struct jffs2_xattr_ref *tmp, **ptmp; + + if (ref->node != (void *)ref) + return; + + for (tmp=c->xref_dead_list, ptmp=&c->xref_dead_list; tmp; ptmp=&tmp->next, tmp=tmp->next) { + if (ref == tmp) { + *ptmp = tmp->next; + break; + } + } + jffs2_free_xattr_ref(ref); +} diff --git a/fs/jffs2/xattr.h b/fs/jffs2/xattr.h index 2c199856c58..06a5c69dcf8 100644 --- a/fs/jffs2/xattr.h +++ b/fs/jffs2/xattr.h @@ -16,6 +16,8 @@ #define JFFS2_XFLAGS_HOT (0x01) /* This datum is HOT */ #define JFFS2_XFLAGS_BIND (0x02) /* This datum is not reclaimed */ +#define JFFS2_XFLAGS_DEAD (0x40) /* This datum is already dead */ +#define JFFS2_XFLAGS_INVALID (0x80) /* This datum contains crc error */ struct jffs2_xattr_datum { @@ -23,10 +25,10 @@ struct jffs2_xattr_datum struct jffs2_raw_node_ref *node; uint8_t class; uint8_t flags; - uint16_t xprefix; /* see JFFS2_XATTR_PREFIX_* */ + uint16_t xprefix; /* see JFFS2_XATTR_PREFIX_* */ struct list_head xindex; /* chained from c->xattrindex[n] */ - uint32_t refcnt; /* # of xattr_ref refers this */ + atomic_t refcnt; /* # of xattr_ref refers this */ uint32_t xid; uint32_t version; @@ -47,6 +49,7 @@ struct jffs2_xattr_ref uint8_t flags; /* Currently unused */ u16 unused; + uint32_t xseqno; union { struct jffs2_inode_cache *ic; /* reference to jffs2_inode_cache */ uint32_t ino; /* only used in scanning/building */ @@ -58,6 +61,12 @@ struct jffs2_xattr_ref struct jffs2_xattr_ref *next; /* chained from ic->xref_list */ }; +#define XREF_DELETE_MARKER (0x00000001) +static inline int is_xattr_ref_dead(struct jffs2_xattr_ref *ref) +{ + return ((ref->xseqno & XREF_DELETE_MARKER) != 0); +} + #ifdef CONFIG_JFFS2_FS_XATTR extern void jffs2_init_xattr_subsystem(struct jffs2_sb_info *c); @@ -70,9 +79,13 @@ extern struct jffs2_xattr_datum *jffs2_setup_xattr_datum(struct jffs2_sb_info *c extern void jffs2_xattr_delete_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic); extern void jffs2_xattr_free_inode(struct jffs2_sb_info *c, struct jffs2_inode_cache *ic); -extern int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd); -extern int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref); +extern int jffs2_garbage_collect_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd, + struct jffs2_raw_node_ref *raw); +extern int jffs2_garbage_collect_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref, + struct jffs2_raw_node_ref *raw); extern int jffs2_verify_xattr(struct jffs2_sb_info *c); +extern void jffs2_release_xattr_datum(struct jffs2_sb_info *c, struct jffs2_xattr_datum *xd); +extern void jffs2_release_xattr_ref(struct jffs2_sb_info *c, struct jffs2_xattr_ref *ref); extern int do_jffs2_getxattr(struct inode *inode, int xprefix, const char *xname, char *buffer, size_t size); diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index 5549378358b..4d52593a5fc 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -126,7 +126,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr) /* allocate the disk blocks for the extent. initially, extBalloc() * will try to allocate disk blocks for the requested size (xlen). - * if this fails (xlen contigious free blocks not avaliable), it'll + * if this fails (xlen contiguous free blocks not avaliable), it'll * try to allocate a smaller number of blocks (producing a smaller * extent), with this smaller number of blocks consisting of the * requested number of blocks rounded down to the next smaller @@ -493,7 +493,7 @@ int extFill(struct inode *ip, xad_t * xp) * * initially, we will try to allocate disk blocks for the * requested size (nblocks). if this fails (nblocks - * contigious free blocks not avaliable), we'll try to allocate + * contiguous free blocks not avaliable), we'll try to allocate * a smaller number of blocks (producing a smaller extent), with * this smaller number of blocks consisting of the requested * number of blocks rounded down to the next smaller power of 2 @@ -529,7 +529,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) /* get the number of blocks to initially attempt to allocate. * we'll first try the number of blocks requested unless this - * number is greater than the maximum number of contigious free + * number is greater than the maximum number of contiguous free * blocks in the map. in that case, we'll start off with the * maximum free. */ @@ -586,7 +586,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) * in place. if this fails, we'll try to move the extent * to a new set of blocks. if moving the extent, we initially * will try to allocate disk blocks for the requested size - * (nnew). if this fails (nnew contigious free blocks not + * (nnew). if this fails (new contiguous free blocks not * avaliable), we'll try to allocate a smaller number of * blocks (producing a smaller extent), with this smaller * number of blocks consisting of the requested number of diff --git a/fs/libfs.c b/fs/libfs.c index fc785d8befb..ac02ea602c3 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -149,10 +149,9 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) /* fallthrough */ default: spin_lock(&dcache_lock); - if (filp->f_pos == 2) { - list_del(q); - list_add(q, &dentry->d_subdirs); - } + if (filp->f_pos == 2) + list_move(q, &dentry->d_subdirs); + for (p=q->next; p != &dentry->d_subdirs; p=p->next) { struct dentry *next; next = list_entry(p, struct dentry, d_u.d_child); @@ -164,8 +163,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) return 0; spin_lock(&dcache_lock); /* next is still alive */ - list_del(q); - list_add(q, p); + list_move(q, p); p = q; filp->f_pos++; } diff --git a/fs/namespace.c b/fs/namespace.c index 866430bb024..b3ed212ea41 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -526,10 +526,8 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) { struct vfsmount *p; - for (p = mnt; p; p = next_mnt(p, mnt)) { - list_del(&p->mnt_hash); - list_add(&p->mnt_hash, kill); - } + for (p = mnt; p; p = next_mnt(p, mnt)) + list_move(&p->mnt_hash, kill); if (propagate) propagate_umount(kill); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 402005c35ab..8ca9707be6c 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -909,7 +909,7 @@ int __init nfs_init_directcache(void) * nfs_destroy_directcache - destroy the slab cache for nfs_direct_req structures * */ -void __exit nfs_destroy_directcache(void) +void nfs_destroy_directcache(void) { if (kmem_cache_destroy(nfs_direct_cachep)) printk(KERN_INFO "nfs_direct_cache: not all structures were freed\n"); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 51bc88b662f..c5b916605fb 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1132,7 +1132,7 @@ static int __init nfs_init_inodecache(void) return 0; } -static void __exit nfs_destroy_inodecache(void) +static void nfs_destroy_inodecache(void) { if (kmem_cache_destroy(nfs_inode_cachep)) printk(KERN_INFO "nfs_inode_cache: not all structures were freed\n"); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index bd2815e2dec..4fe51c1292b 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -31,15 +31,15 @@ extern struct svc_version nfs4_callback_version1; /* pagelist.c */ extern int __init nfs_init_nfspagecache(void); -extern void __exit nfs_destroy_nfspagecache(void); +extern void nfs_destroy_nfspagecache(void); extern int __init nfs_init_readpagecache(void); -extern void __exit nfs_destroy_readpagecache(void); +extern void nfs_destroy_readpagecache(void); extern int __init nfs_init_writepagecache(void); -extern void __exit nfs_destroy_writepagecache(void); +extern void nfs_destroy_writepagecache(void); #ifdef CONFIG_NFS_DIRECTIO extern int __init nfs_init_directcache(void); -extern void __exit nfs_destroy_directcache(void); +extern void nfs_destroy_directcache(void); #else #define nfs_init_directcache() (0) #define nfs_destroy_directcache() do {} while(0) diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index ef9429643eb..d89f6fb3b3a 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -390,7 +390,7 @@ int __init nfs_init_nfspagecache(void) return 0; } -void __exit nfs_destroy_nfspagecache(void) +void nfs_destroy_nfspagecache(void) { if (kmem_cache_destroy(nfs_page_cachep)) printk(KERN_INFO "nfs_page: not all structures were freed\n"); diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 41c2ffee24f..32cf3773af0 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -711,7 +711,7 @@ int __init nfs_init_readpagecache(void) return 0; } -void __exit nfs_destroy_readpagecache(void) +void nfs_destroy_readpagecache(void) { mempool_destroy(nfs_rdata_mempool); if (kmem_cache_destroy(nfs_rdata_cachep)) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index b383fdd3a15..8fccb9cb173 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1551,7 +1551,7 @@ int __init nfs_init_writepagecache(void) return 0; } -void __exit nfs_destroy_writepagecache(void) +void nfs_destroy_writepagecache(void) { mempool_destroy(nfs_commit_mempool); mempool_destroy(nfs_wdata_mempool); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 96c7578cbe1..7c7d01672d3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -123,7 +123,7 @@ static void release_stateid(struct nfs4_stateid *stp, int flags); */ /* recall_lock protects the del_recall_lru */ -static spinlock_t recall_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(recall_lock); static struct list_head del_recall_lru; static void @@ -529,8 +529,7 @@ move_to_confirmed(struct nfs4_client *clp) dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); list_del_init(&clp->cl_strhash); - list_del_init(&clp->cl_idhash); - list_add(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); + list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); strhashval = clientstr_hashval(clp->cl_recdir); list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); renew_client(clp); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index d852ebb538e..fdf7cf3dfad 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -103,8 +103,7 @@ nfsd_cache_shutdown(void) static void lru_put_end(struct svc_cacherep *rp) { - list_del(&rp->c_lru); - list_add_tail(&rp->c_lru, &lru_head); + list_move_tail(&rp->c_lru, &lru_head); } /* diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 21f38accd03..1d26cfcd9f8 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -54,7 +54,7 @@ static DECLARE_RWSEM(o2hb_callback_sem); * multiple hb threads are watching multiple regions. A node is live * whenever any of the threads sees activity from the node in its region. */ -static spinlock_t o2hb_live_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(o2hb_live_lock); static struct list_head o2hb_live_slots[O2NM_MAX_NODES]; static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; static LIST_HEAD(o2hb_node_events); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 0f60cc0d398..1591eb37a72 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -108,7 +108,7 @@ ##args); \ } while (0) -static rwlock_t o2net_handler_lock = RW_LOCK_UNLOCKED; +static DEFINE_RWLOCK(o2net_handler_lock); static struct rb_root o2net_handler_tree = RB_ROOT; static struct o2net_node o2net_nodes[O2NM_MAX_NODES]; diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 355593dd8ef..42775e2bbe2 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -197,12 +197,14 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, lock->ml.node == dlm->node_num ? "master" : "remote"); memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN); - } else if (lksb->flags & DLM_LKSB_PUT_LVB) { - mlog(0, "setting lvb from lockres for %s node\n", - lock->ml.node == dlm->node_num ? "master" : - "remote"); - memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN); } + /* Do nothing for lvb put requests - they should be done in + * place when the lock is downconverted - otherwise we risk + * racing gets and puts which could result in old lvb data + * being propagated. We leave the put flag set and clear it + * here. In the future we might want to clear it at the time + * the put is actually done. + */ spin_unlock(&res->spinlock); } @@ -381,8 +383,7 @@ do_ast: ret = DLM_NORMAL; if (past->type == DLM_AST) { /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); mlog(0, "ast: adding to granted list... type=%d, " "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); if (lock->ml.convert_type != LKM_IVMODE) { diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 88cc43df18f..9bdc9cf6599 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -37,7 +37,17 @@ #define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes #define DLM_THREAD_MS 200 // flush at least every 200 ms -#define DLM_HASH_BUCKETS (PAGE_SIZE / sizeof(struct hlist_head)) +#define DLM_HASH_SIZE_DEFAULT (1 << 14) +#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE +# define DLM_HASH_PAGES 1 +#else +# define DLM_HASH_PAGES (DLM_HASH_SIZE_DEFAULT / PAGE_SIZE) +#endif +#define DLM_BUCKETS_PER_PAGE (PAGE_SIZE / sizeof(struct hlist_head)) +#define DLM_HASH_BUCKETS (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE) + +/* Intended to make it easier for us to switch out hash functions */ +#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) enum dlm_ast_type { DLM_AST = 0, @@ -61,7 +71,8 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len) return 0; } -#define DLM_RECO_STATE_ACTIVE 0x0001 +#define DLM_RECO_STATE_ACTIVE 0x0001 +#define DLM_RECO_STATE_FINALIZE 0x0002 struct dlm_recovery_ctxt { @@ -85,7 +96,7 @@ enum dlm_ctxt_state { struct dlm_ctxt { struct list_head list; - struct hlist_head *lockres_hash; + struct hlist_head **lockres_hash; struct list_head dirty_list; struct list_head purge_list; struct list_head pending_asts; @@ -120,6 +131,7 @@ struct dlm_ctxt struct o2hb_callback_func dlm_hb_down; struct task_struct *dlm_thread_task; struct task_struct *dlm_reco_thread_task; + struct workqueue_struct *dlm_worker; wait_queue_head_t dlm_thread_wq; wait_queue_head_t dlm_reco_thread_wq; wait_queue_head_t ast_wq; @@ -132,6 +144,11 @@ struct dlm_ctxt struct list_head dlm_eviction_callbacks; }; +static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned i) +{ + return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE); +} + /* these keventd work queue items are for less-frequently * called functions that cannot be directly called from the * net message handlers for some reason, usually because @@ -216,20 +233,29 @@ struct dlm_lock_resource /* WARNING: Please see the comment in dlm_init_lockres before * adding fields here. */ struct hlist_node hash_node; + struct qstr lockname; struct kref refs; - /* please keep these next 3 in this order - * some funcs want to iterate over all lists */ + /* + * Please keep granted, converting, and blocked in this order, + * as some funcs want to iterate over all lists. + * + * All four lists are protected by the hash's reference. + */ struct list_head granted; struct list_head converting; struct list_head blocked; + struct list_head purge; + /* + * These two lists require you to hold an additional reference + * while they are on the list. + */ struct list_head dirty; struct list_head recovering; // dlm_recovery_ctxt.resources list /* unused lock resources have their last_used stamped and are * put on a list for the dlm thread to run. */ - struct list_head purge; unsigned long last_used; unsigned migration_pending:1; @@ -238,7 +264,6 @@ struct dlm_lock_resource wait_queue_head_t wq; u8 owner; //node which owns the lock resource, or unknown u16 state; - struct qstr lockname; char lvb[DLM_LVB_LEN]; }; @@ -300,6 +325,15 @@ enum dlm_lockres_list { DLM_BLOCKED_LIST }; +static inline int dlm_lvb_is_empty(char *lvb) +{ + int i; + for (i=0; i<DLM_LVB_LEN; i++) + if (lvb[i]) + return 0; + return 1; +} + static inline struct list_head * dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx) { @@ -609,7 +643,8 @@ struct dlm_finalize_reco { u8 node_idx; u8 dead_node; - __be16 pad1; + u8 flags; + u8 pad1; __be32 pad2; }; @@ -676,6 +711,7 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm); void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); +int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); void dlm_put(struct dlm_ctxt *dlm); struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); @@ -687,14 +723,20 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres); -void dlm_lockres_get(struct dlm_lock_resource *res); +static inline void dlm_lockres_get(struct dlm_lock_resource *res) +{ + /* This is called on every lookup, so it might be worth + * inlining. */ + kref_get(&res->refs); +} void dlm_lockres_put(struct dlm_lock_resource *res); void __dlm_unhash_lockres(struct dlm_lock_resource *res); void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, - unsigned int len); + unsigned int len, + unsigned int hash); struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, unsigned int len); @@ -819,6 +861,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node); int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); +int __dlm_lockres_unused(struct dlm_lock_resource *res); static inline const char * dlm_lock_mode_name(int mode) { diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 8285228d9e3..c764dc8e40a 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -214,6 +214,9 @@ grant: if (lock->ml.node == dlm->node_num) mlog(0, "doing in-place convert for nonlocal lock\n"); lock->ml.type = type; + if (lock->lksb->flags & DLM_LKSB_PUT_LVB) + memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); + status = DLM_NORMAL; *call_ast = 1; goto unlock_exit; @@ -231,8 +234,7 @@ switch_queues: lock->ml.convert_type = type; /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->converting); + list_move_tail(&lock->list, &res->converting); unlock_exit: spin_unlock(&lock->spinlock); @@ -248,8 +250,7 @@ void dlm_revert_pending_convert(struct dlm_lock_resource *res, struct dlm_lock *lock) { /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); lock->ml.convert_type = LKM_IVMODE; lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); } @@ -294,8 +295,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, res->state |= DLM_LOCK_RES_IN_PROGRESS; /* move lock to local convert queue */ /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->converting); + list_move_tail(&lock->list, &res->converting); lock->convert_pending = 1; lock->ml.convert_type = type; @@ -464,6 +464,12 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) } spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + if (status != DLM_NORMAL) { + spin_unlock(&res->spinlock); + dlm_error(status); + goto leave; + } list_for_each(iter, &res->granted) { lock = list_entry(iter, struct dlm_lock, list); if (lock->ml.cookie == cnv->cookie && @@ -473,6 +479,21 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) } lock = NULL; } + if (!lock) { + __dlm_print_one_lock_resource(res); + list_for_each(iter, &res->granted) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock->ml.node == cnv->node_idx) { + mlog(ML_ERROR, "There is something here " + "for node %u, lock->ml.cookie=%llu, " + "cnv->cookie=%llu\n", cnv->node_idx, + (unsigned long long)lock->ml.cookie, + (unsigned long long)cnv->cookie); + break; + } + } + lock = NULL; + } spin_unlock(&res->spinlock); if (!lock) { status = DLM_IVLOCKID; diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index c7eae5d3324..3f6c8d88f7a 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -37,10 +37,8 @@ #include "dlmapi.h" #include "dlmcommon.h" -#include "dlmdebug.h" #include "dlmdomain.h" -#include "dlmdebug.h" #define MLOG_MASK_PREFIX ML_DLM #include "cluster/masklog.h" @@ -120,6 +118,7 @@ void dlm_print_one_lock(struct dlm_lock *lockid) } EXPORT_SYMBOL_GPL(dlm_print_one_lock); +#if 0 void dlm_dump_lock_resources(struct dlm_ctxt *dlm) { struct dlm_lock_resource *res; @@ -136,12 +135,13 @@ void dlm_dump_lock_resources(struct dlm_ctxt *dlm) spin_lock(&dlm->spinlock); for (i=0; i<DLM_HASH_BUCKETS; i++) { - bucket = &(dlm->lockres_hash[i]); + bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, iter, bucket, hash_node) dlm_print_one_lock_resource(res); } spin_unlock(&dlm->spinlock); } +#endif /* 0 */ static const char *dlm_errnames[] = { [DLM_NORMAL] = "DLM_NORMAL", diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h deleted file mode 100644 index 6858510c3cc..00000000000 --- a/fs/ocfs2/dlm/dlmdebug.h +++ /dev/null @@ -1,30 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmdebug.h - * - * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * - */ - -#ifndef DLMDEBUG_H -#define DLMDEBUG_H - -void dlm_dump_lock_resources(struct dlm_ctxt *dlm); - -#endif diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 8f3a9e3106f..b8c23f7ba67 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -41,7 +41,6 @@ #include "dlmapi.h" #include "dlmcommon.h" -#include "dlmdebug.h" #include "dlmdomain.h" #include "dlmver.h" @@ -49,6 +48,33 @@ #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) #include "cluster/masklog.h" +static void dlm_free_pagevec(void **vec, int pages) +{ + while (pages--) + free_page((unsigned long)vec[pages]); + kfree(vec); +} + +static void **dlm_alloc_pagevec(int pages) +{ + void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL); + int i; + + if (!vec) + return NULL; + + for (i = 0; i < pages; i++) + if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL))) + goto out_free; + + mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n", + pages, DLM_HASH_PAGES, (unsigned long)DLM_BUCKETS_PER_PAGE); + return vec; +out_free: + dlm_free_pagevec(vec, i); + return NULL; +} + /* * * spinlock lock ordering: if multiple locks are needed, obey this ordering: @@ -62,7 +88,7 @@ * */ -spinlock_t dlm_domain_lock = SPIN_LOCK_UNLOCKED; +DEFINE_SPINLOCK(dlm_domain_lock); LIST_HEAD(dlm_domains); static DECLARE_WAIT_QUEUE_HEAD(dlm_domain_events); @@ -90,8 +116,7 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm, assert_spin_locked(&dlm->spinlock); q = &res->lockname; - q->hash = full_name_hash(q->name, q->len); - bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]); + bucket = dlm_lockres_hash(dlm, q->hash); /* get a reference for our hashtable */ dlm_lockres_get(res); @@ -100,34 +125,32 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm, } struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, - const char *name, - unsigned int len) + const char *name, + unsigned int len, + unsigned int hash) { - unsigned int hash; - struct hlist_node *iter; - struct dlm_lock_resource *tmpres=NULL; struct hlist_head *bucket; + struct hlist_node *list; mlog_entry("%.*s\n", len, name); assert_spin_locked(&dlm->spinlock); - hash = full_name_hash(name, len); - - bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]); - - /* check for pre-existing lock */ - hlist_for_each(iter, bucket) { - tmpres = hlist_entry(iter, struct dlm_lock_resource, hash_node); - if (tmpres->lockname.len == len && - memcmp(tmpres->lockname.name, name, len) == 0) { - dlm_lockres_get(tmpres); - break; - } + bucket = dlm_lockres_hash(dlm, hash); - tmpres = NULL; + hlist_for_each(list, bucket) { + struct dlm_lock_resource *res = hlist_entry(list, + struct dlm_lock_resource, hash_node); + if (res->lockname.name[0] != name[0]) + continue; + if (unlikely(res->lockname.len != len)) + continue; + if (memcmp(res->lockname.name + 1, name + 1, len - 1)) + continue; + dlm_lockres_get(res); + return res; } - return tmpres; + return NULL; } struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, @@ -135,9 +158,10 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, unsigned int len) { struct dlm_lock_resource *res; + unsigned int hash = dlm_lockid_hash(name, len); spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, name, len); + res = __dlm_lookup_lockres(dlm, name, len, hash); spin_unlock(&dlm->spinlock); return res; } @@ -194,7 +218,7 @@ static int dlm_wait_on_domain_helper(const char *domain) static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) { if (dlm->lockres_hash) - free_page((unsigned long) dlm->lockres_hash); + dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); if (dlm->name) kfree(dlm->name); @@ -278,11 +302,21 @@ int dlm_domain_fully_joined(struct dlm_ctxt *dlm) return ret; } +static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_worker) { + flush_workqueue(dlm->dlm_worker); + destroy_workqueue(dlm->dlm_worker); + dlm->dlm_worker = NULL; + } +} + static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) { dlm_unregister_domain_handlers(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); + dlm_destroy_dlm_worker(dlm); /* We've left the domain. Now we can take ourselves out of the * list and allow the kref stuff to help us free the @@ -304,8 +338,8 @@ static void dlm_migrate_all_locks(struct dlm_ctxt *dlm) restart: spin_lock(&dlm->spinlock); for (i = 0; i < DLM_HASH_BUCKETS; i++) { - while (!hlist_empty(&dlm->lockres_hash[i])) { - res = hlist_entry(dlm->lockres_hash[i].first, + while (!hlist_empty(dlm_lockres_hash(dlm, i))) { + res = hlist_entry(dlm_lockres_hash(dlm, i)->first, struct dlm_lock_resource, hash_node); /* need reference when manually grabbing lockres */ dlm_lockres_get(res); @@ -1126,6 +1160,13 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } + dlm->dlm_worker = create_singlethread_workqueue("dlm_wq"); + if (!dlm->dlm_worker) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + do { unsigned int backoff; status = dlm_try_to_join_domain(dlm); @@ -1166,6 +1207,7 @@ bail: dlm_unregister_domain_handlers(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); + dlm_destroy_dlm_worker(dlm); } return status; @@ -1191,7 +1233,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, goto leave; } - dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL); + dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES); if (!dlm->lockres_hash) { mlog_errno(-ENOMEM); kfree(dlm->name); @@ -1200,8 +1242,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, goto leave; } - for (i=0; i<DLM_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&dlm->lockres_hash[i]); + for (i = 0; i < DLM_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i)); strcpy(dlm->name, domain); dlm->key = key; @@ -1231,6 +1273,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->dlm_thread_task = NULL; dlm->dlm_reco_thread_task = NULL; + dlm->dlm_worker = NULL; init_waitqueue_head(&dlm->dlm_thread_wq); init_waitqueue_head(&dlm->dlm_reco_thread_wq); init_waitqueue_head(&dlm->reco.event); diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 7273d9fa6ba..033ad170123 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -116,7 +116,7 @@ static int dlmfs_file_open(struct inode *inode, * doesn't make sense for LVB writes. */ file->f_flags &= ~O_APPEND; - fp = kmalloc(sizeof(*fp), GFP_KERNEL); + fp = kmalloc(sizeof(*fp), GFP_NOFS); if (!fp) { status = -ENOMEM; goto bail; @@ -196,7 +196,7 @@ static ssize_t dlmfs_file_read(struct file *filp, else readlen = count - *ppos; - lvb_buf = kmalloc(readlen, GFP_KERNEL); + lvb_buf = kmalloc(readlen, GFP_NOFS); if (!lvb_buf) return -ENOMEM; @@ -240,7 +240,7 @@ static ssize_t dlmfs_file_write(struct file *filp, else writelen = count - *ppos; - lvb_buf = kmalloc(writelen, GFP_KERNEL); + lvb_buf = kmalloc(writelen, GFP_NOFS); if (!lvb_buf) return -ENOMEM; diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 6fea28318d6..5ca57ec650c 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -53,7 +53,7 @@ #define MLOG_MASK_PREFIX ML_DLM #include "cluster/masklog.h" -static spinlock_t dlm_cookie_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(dlm_cookie_lock); static u64 dlm_next_cookie = 1; static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, @@ -201,6 +201,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, struct dlm_lock *lock, int flags) { enum dlm_status status = DLM_DENIED; + int lockres_changed = 1; mlog_entry("type=%d\n", lock->ml.type); mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len, @@ -226,8 +227,25 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, res->state &= ~DLM_LOCK_RES_IN_PROGRESS; lock->lock_pending = 0; if (status != DLM_NORMAL) { - if (status != DLM_NOTQUEUED) + if (status == DLM_RECOVERING && + dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + /* recovery lock was mastered by dead node. + * we need to have calc_usage shoot down this + * lockres and completely remaster it. */ + mlog(0, "%s: recovery lock was owned by " + "dead node %u, remaster it now.\n", + dlm->name, res->owner); + } else if (status != DLM_NOTQUEUED) { + /* + * DO NOT call calc_usage, as this would unhash + * the remote lockres before we ever get to use + * it. treat as if we never made any change to + * the lockres. + */ + lockres_changed = 0; dlm_error(status); + } dlm_revert_pending_lock(res, lock); dlm_lock_put(lock); } else if (dlm_is_recovery_lock(res->lockname.name, @@ -239,12 +257,12 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, mlog(0, "%s: $RECOVERY lock for this node (%u) is " "mastered by %u; got lock, manually granting (no ast)\n", dlm->name, dlm->node_num, res->owner); - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); } spin_unlock(&res->spinlock); - dlm_lockres_calc_usage(dlm, res); + if (lockres_changed) + dlm_lockres_calc_usage(dlm, res); wake_up(&res->wq); return status; @@ -281,6 +299,14 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, if (tmpret >= 0) { // successfully sent and received ret = status; // this is already a dlm_status + if (ret == DLM_REJECTED) { + mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres " + "no longer owned by %u. that node is coming back " + "up currently.\n", dlm->name, create.namelen, + create.name, res->owner); + dlm_print_one_lock_resource(res); + BUG(); + } } else { mlog_errno(tmpret); if (dlm_is_host_down(tmpret)) { @@ -382,13 +408,13 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, struct dlm_lock *lock; int kernel_allocated = 0; - lock = kcalloc(1, sizeof(*lock), GFP_KERNEL); + lock = kcalloc(1, sizeof(*lock), GFP_NOFS); if (!lock) return NULL; if (!lksb) { /* zero memory only if kernel-allocated */ - lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL); + lksb = kcalloc(1, sizeof(*lksb), GFP_NOFS); if (!lksb) { kfree(lock); return NULL; @@ -429,11 +455,16 @@ int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data) if (!dlm_grab(dlm)) return DLM_REJECTED; - mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), - "Domain %s not fully joined!\n", dlm->name); - name = create->name; namelen = create->namelen; + status = DLM_REJECTED; + if (!dlm_domain_fully_joined(dlm)) { + mlog(ML_ERROR, "Domain %s not fully joined, but node %u is " + "sending a create_lock message for lock %.*s!\n", + dlm->name, create->node_idx, namelen, name); + dlm_error(status); + goto leave; + } status = DLM_IVBUFLEN; if (namelen > DLM_LOCKID_NAME_MAX) { @@ -669,18 +700,22 @@ retry_lock: msleep(100); /* no waiting for dlm_reco_thread */ if (recovery) { - if (status == DLM_RECOVERING) { - mlog(0, "%s: got RECOVERING " - "for $REOCVERY lock, master " - "was %u\n", dlm->name, - res->owner); - dlm_wait_for_node_death(dlm, res->owner, - DLM_NODE_DEATH_WAIT_MAX); - } + if (status != DLM_RECOVERING) + goto retry_lock; + + mlog(0, "%s: got RECOVERING " + "for $RECOVERY lock, master " + "was %u\n", dlm->name, + res->owner); + /* wait to see the node go down, then + * drop down and allow the lockres to + * get cleaned up. need to remaster. */ + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); } else { dlm_wait_for_recovery(dlm); + goto retry_lock; } - goto retry_lock; } if (status != DLM_NORMAL) { diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 940be4c13b1..1b8346dd057 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -47,7 +47,6 @@ #include "dlmapi.h" #include "dlmcommon.h" -#include "dlmdebug.h" #include "dlmdomain.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) @@ -74,6 +73,7 @@ struct dlm_master_list_entry wait_queue_head_t wq; atomic_t woken; struct kref mle_refs; + int inuse; unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; @@ -127,18 +127,30 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm, return 1; } -#if 0 -/* Code here is included but defined out as it aids debugging */ +#define dlm_print_nodemap(m) _dlm_print_nodemap(m,#m) +static void _dlm_print_nodemap(unsigned long *map, const char *mapname) +{ + int i; + printk("%s=[ ", mapname); + for (i=0; i<O2NM_MAX_NODES; i++) + if (test_bit(i, map)) + printk("%d ", i); + printk("]"); +} -void dlm_print_one_mle(struct dlm_master_list_entry *mle) +static void dlm_print_one_mle(struct dlm_master_list_entry *mle) { - int i = 0, refs; + int refs; char *type; char attached; u8 master; unsigned int namelen; const char *name; struct kref *k; + unsigned long *maybe = mle->maybe_map, + *vote = mle->vote_map, + *resp = mle->response_map, + *node = mle->node_map; k = &mle->mle_refs; if (mle->type == DLM_MLE_BLOCK) @@ -159,18 +171,29 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle) name = mle->u.res->lockname.name; } - mlog(ML_NOTICE, " #%3d: %3s %3d %3u %3u %c (%d)%.*s\n", - i, type, refs, master, mle->new_master, attached, - namelen, namelen, name); + mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ", + namelen, name, type, refs, master, mle->new_master, attached, + mle->inuse); + dlm_print_nodemap(maybe); + printk(", "); + dlm_print_nodemap(vote); + printk(", "); + dlm_print_nodemap(resp); + printk(", "); + dlm_print_nodemap(node); + printk(", "); + printk("\n"); } +#if 0 +/* Code here is included but defined out as it aids debugging */ + static void dlm_dump_mles(struct dlm_ctxt *dlm) { struct dlm_master_list_entry *mle; struct list_head *iter; mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); - mlog(ML_NOTICE, " ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n"); spin_lock(&dlm->master_lock); list_for_each(iter, &dlm->master_list) { mle = list_entry(iter, struct dlm_master_list_entry, list); @@ -314,6 +337,31 @@ static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, spin_unlock(&dlm->spinlock); } +static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + mle->inuse++; + kref_get(&mle->mle_refs); +} + +static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + mle->inuse--; + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + +} + /* remove from list and free */ static void __dlm_put_mle(struct dlm_master_list_entry *mle) { @@ -322,9 +370,14 @@ static void __dlm_put_mle(struct dlm_master_list_entry *mle) assert_spin_locked(&dlm->spinlock); assert_spin_locked(&dlm->master_lock); - BUG_ON(!atomic_read(&mle->mle_refs.refcount)); - - kref_put(&mle->mle_refs, dlm_mle_release); + if (!atomic_read(&mle->mle_refs.refcount)) { + /* this may or may not crash, but who cares. + * it's a BUG. */ + mlog(ML_ERROR, "bad mle: %p\n", mle); + dlm_print_one_mle(mle); + BUG(); + } else + kref_put(&mle->mle_refs, dlm_mle_release); } @@ -367,6 +420,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, memset(mle->response_map, 0, sizeof(mle->response_map)); mle->master = O2NM_MAX_NODES; mle->new_master = O2NM_MAX_NODES; + mle->inuse = 0; if (mle->type == DLM_MLE_MASTER) { BUG_ON(!res); @@ -564,6 +618,28 @@ static void dlm_lockres_release(struct kref *kref) mlog(0, "destroying lockres %.*s\n", res->lockname.len, res->lockname.name); + if (!hlist_unhashed(&res->hash_node) || + !list_empty(&res->granted) || + !list_empty(&res->converting) || + !list_empty(&res->blocked) || + !list_empty(&res->dirty) || + !list_empty(&res->recovering) || + !list_empty(&res->purge)) { + mlog(ML_ERROR, + "Going to BUG for resource %.*s." + " We're on a list! [%c%c%c%c%c%c%c]\n", + res->lockname.len, res->lockname.name, + !hlist_unhashed(&res->hash_node) ? 'H' : ' ', + !list_empty(&res->granted) ? 'G' : ' ', + !list_empty(&res->converting) ? 'C' : ' ', + !list_empty(&res->blocked) ? 'B' : ' ', + !list_empty(&res->dirty) ? 'D' : ' ', + !list_empty(&res->recovering) ? 'R' : ' ', + !list_empty(&res->purge) ? 'P' : ' '); + + dlm_print_one_lock_resource(res); + } + /* By the time we're ready to blow this guy away, we shouldn't * be on any lists. */ BUG_ON(!hlist_unhashed(&res->hash_node)); @@ -579,11 +655,6 @@ static void dlm_lockres_release(struct kref *kref) kfree(res); } -void dlm_lockres_get(struct dlm_lock_resource *res) -{ - kref_get(&res->refs); -} - void dlm_lockres_put(struct dlm_lock_resource *res) { kref_put(&res->refs, dlm_lockres_release); @@ -603,7 +674,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, memcpy(qname, name, namelen); res->lockname.len = namelen; - res->lockname.hash = full_name_hash(name, namelen); + res->lockname.hash = dlm_lockid_hash(name, namelen); init_waitqueue_head(&res->wq); spin_lock_init(&res->spinlock); @@ -637,11 +708,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, { struct dlm_lock_resource *res; - res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); + res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS); if (!res) return NULL; - res->lockname.name = kmalloc(namelen, GFP_KERNEL); + res->lockname.name = kmalloc(namelen, GFP_NOFS); if (!res->lockname.name) { kfree(res); return NULL; @@ -677,19 +748,20 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, int blocked = 0; int ret, nodenum; struct dlm_node_iter iter; - unsigned int namelen; + unsigned int namelen, hash; int tries = 0; int bit, wait_on_recovery = 0; BUG_ON(!lockid); namelen = strlen(lockid); + hash = dlm_lockid_hash(lockid, namelen); mlog(0, "get lockres %s (len %d)\n", lockid, namelen); lookup: spin_lock(&dlm->spinlock); - tmpres = __dlm_lookup_lockres(dlm, lockid, namelen); + tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash); if (tmpres) { spin_unlock(&dlm->spinlock); mlog(0, "found in hash!\n"); @@ -704,7 +776,7 @@ lookup: mlog(0, "allocating a new resource\n"); /* nothing found and we need to allocate one. */ alloc_mle = (struct dlm_master_list_entry *) - kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); + kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!alloc_mle) goto leave; res = dlm_new_lockres(dlm, lockid, namelen); @@ -790,10 +862,11 @@ lookup: * if so, the creator of the BLOCK may try to put the last * ref at this time in the assert master handler, so we * need an extra one to keep from a bad ptr deref. */ - dlm_get_mle(mle); + dlm_get_mle_inuse(mle); spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); +redo_request: while (wait_on_recovery) { /* any cluster changes that occurred after dropping the * dlm spinlock would be detectable be a change on the mle, @@ -812,7 +885,7 @@ lookup: } dlm_kick_recovery_thread(dlm); - msleep(100); + msleep(1000); dlm_wait_for_recovery(dlm); spin_lock(&dlm->spinlock); @@ -825,13 +898,15 @@ lookup: } else wait_on_recovery = 0; spin_unlock(&dlm->spinlock); + + if (wait_on_recovery) + dlm_wait_for_node_recovery(dlm, bit, 10000); } /* must wait for lock to be mastered elsewhere */ if (blocked) goto wait; -redo_request: ret = -EINVAL; dlm_node_iter_init(mle->vote_map, &iter); while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { @@ -856,6 +931,7 @@ wait: /* keep going until the response map includes all nodes */ ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); if (ret < 0) { + wait_on_recovery = 1; mlog(0, "%s:%.*s: node map changed, redo the " "master request now, blocked=%d\n", dlm->name, res->lockname.len, @@ -866,7 +942,7 @@ wait: dlm->name, res->lockname.len, res->lockname.name, blocked); dlm_print_one_lock_resource(res); - /* dlm_print_one_mle(mle); */ + dlm_print_one_mle(mle); tries = 0; } goto redo_request; @@ -880,7 +956,7 @@ wait: dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); /* put the extra ref */ - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); wake_waiters: spin_lock(&res->spinlock); @@ -921,12 +997,14 @@ recheck: spin_unlock(&res->spinlock); /* this will cause the master to re-assert across * the whole cluster, freeing up mles */ - ret = dlm_do_master_request(mle, res->owner); - if (ret < 0) { - /* give recovery a chance to run */ - mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); - msleep(500); - goto recheck; + if (res->owner != dlm->node_num) { + ret = dlm_do_master_request(mle, res->owner); + if (ret < 0) { + /* give recovery a chance to run */ + mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); + msleep(500); + goto recheck; + } } ret = 0; goto leave; @@ -962,6 +1040,12 @@ recheck: "rechecking now\n", dlm->name, res->lockname.len, res->lockname.name); goto recheck; + } else { + if (!voting_done) { + mlog(0, "map not changed and voting not done " + "for %s:%.*s\n", dlm->name, res->lockname.len, + res->lockname.name); + } } if (m != O2NM_MAX_NODES) { @@ -1129,18 +1213,6 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, set_bit(node, mle->vote_map); } else { mlog(ML_ERROR, "node down! %d\n", node); - - /* if the node wasn't involved in mastery skip it, - * but clear it out from the maps so that it will - * not affect mastery of this lockres */ - clear_bit(node, mle->response_map); - clear_bit(node, mle->vote_map); - if (!test_bit(node, mle->maybe_map)) - goto next; - - /* if we're already blocked on lock mastery, and the - * dead node wasn't the expected master, or there is - * another node in the maybe_map, keep waiting */ if (blocked) { int lowest = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); @@ -1148,54 +1220,53 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, /* act like it was never there */ clear_bit(node, mle->maybe_map); - if (node != lowest) - goto next; - - mlog(ML_ERROR, "expected master %u died while " - "this node was blocked waiting on it!\n", - node); - lowest = find_next_bit(mle->maybe_map, - O2NM_MAX_NODES, - lowest+1); - if (lowest < O2NM_MAX_NODES) { - mlog(0, "still blocked. waiting " - "on %u now\n", lowest); - goto next; + if (node == lowest) { + mlog(0, "expected master %u died" + " while this node was blocked " + "waiting on it!\n", node); + lowest = find_next_bit(mle->maybe_map, + O2NM_MAX_NODES, + lowest+1); + if (lowest < O2NM_MAX_NODES) { + mlog(0, "%s:%.*s:still " + "blocked. waiting on %u " + "now\n", dlm->name, + res->lockname.len, + res->lockname.name, + lowest); + } else { + /* mle is an MLE_BLOCK, but + * there is now nothing left to + * block on. we need to return + * all the way back out and try + * again with an MLE_MASTER. + * dlm_do_local_recovery_cleanup + * has already run, so the mle + * refcount is ok */ + mlog(0, "%s:%.*s: no " + "longer blocking. try to " + "master this here\n", + dlm->name, + res->lockname.len, + res->lockname.name); + mle->type = DLM_MLE_MASTER; + mle->u.res = res; + } } - - /* mle is an MLE_BLOCK, but there is now - * nothing left to block on. we need to return - * all the way back out and try again with - * an MLE_MASTER. dlm_do_local_recovery_cleanup - * has already run, so the mle refcount is ok */ - mlog(0, "no longer blocking. we can " - "try to master this here\n"); - mle->type = DLM_MLE_MASTER; - memset(mle->maybe_map, 0, - sizeof(mle->maybe_map)); - memset(mle->response_map, 0, - sizeof(mle->maybe_map)); - memcpy(mle->vote_map, mle->node_map, - sizeof(mle->node_map)); - mle->u.res = res; - set_bit(dlm->node_num, mle->maybe_map); - - ret = -EAGAIN; - goto next; } - clear_bit(node, mle->maybe_map); - if (node > dlm->node_num) - goto next; - - mlog(0, "dead node in map!\n"); - /* yuck. go back and re-contact all nodes - * in the vote_map, removing this node. */ - memset(mle->response_map, 0, - sizeof(mle->response_map)); + /* now blank out everything, as if we had never + * contacted anyone */ + memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + memset(mle->response_map, 0, sizeof(mle->response_map)); + /* reset the vote_map to the current node_map */ + memcpy(mle->vote_map, mle->node_map, + sizeof(mle->node_map)); + /* put myself into the maybe map */ + if (mle->type != DLM_MLE_BLOCK) + set_bit(dlm->node_num, mle->maybe_map); } ret = -EAGAIN; -next: node = dlm_bitmap_diff_iter_next(&bdi, &sc); } return ret; @@ -1316,7 +1387,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; char *name; - unsigned int namelen; + unsigned int namelen, hash; int found, ret; int set_maybe; int dispatch_assert = 0; @@ -1331,6 +1402,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) name = request->name; namelen = request->namelen; + hash = dlm_lockid_hash(name, namelen); if (namelen > DLM_LOCKID_NAME_MAX) { response = DLM_IVBUFLEN; @@ -1339,7 +1411,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) way_up_top: spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, name, namelen); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); if (res) { spin_unlock(&dlm->spinlock); @@ -1459,21 +1531,18 @@ way_up_top: spin_unlock(&dlm->spinlock); mle = (struct dlm_master_list_entry *) - kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); + kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!mle) { response = DLM_MASTER_RESP_ERROR; mlog_errno(-ENOMEM); goto send_response; } - spin_lock(&dlm->spinlock); - dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, - name, namelen); - spin_unlock(&dlm->spinlock); goto way_up_top; } // mlog(0, "this is second time thru, already allocated, " // "add the block.\n"); + dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); set_bit(request->node_idx, mle->maybe_map); list_add(&mle->list, &dlm->master_list); response = DLM_MASTER_RESP_NO; @@ -1556,6 +1625,8 @@ again: dlm_node_iter_init(nodemap, &iter); while ((to = dlm_node_iter_next(&iter)) >= 0) { int r = 0; + struct dlm_master_list_entry *mle = NULL; + mlog(0, "sending assert master to %d (%.*s)\n", to, namelen, lockname); memset(&assert, 0, sizeof(assert)); @@ -1567,20 +1638,28 @@ again: tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, &assert, sizeof(assert), to, &r); if (tmpret < 0) { - mlog(ML_ERROR, "assert_master returned %d!\n", tmpret); + mlog(0, "assert_master returned %d!\n", tmpret); if (!dlm_is_host_down(tmpret)) { - mlog(ML_ERROR, "unhandled error!\n"); + mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); BUG(); } /* a node died. finish out the rest of the nodes. */ - mlog(ML_ERROR, "link to %d went down!\n", to); + mlog(0, "link to %d went down!\n", to); /* any nonzero status return will do */ ret = tmpret; } else if (r < 0) { /* ok, something horribly messed. kill thyself. */ mlog(ML_ERROR,"during assert master of %.*s to %u, " "got %d.\n", namelen, lockname, to, r); - dlm_dump_lock_resources(dlm); + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + if (dlm_find_mle(dlm, &mle, (char *)lockname, + namelen)) { + dlm_print_one_mle(mle); + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); BUG(); } else if (r == EAGAIN) { mlog(0, "%.*s: node %u create mles on other " @@ -1612,7 +1691,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; struct dlm_lock_resource *res = NULL; char *name; - unsigned int namelen; + unsigned int namelen, hash; u32 flags; int master_request = 0; int ret = 0; @@ -1622,6 +1701,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) name = assert->name; namelen = assert->namelen; + hash = dlm_lockid_hash(name, namelen); flags = be32_to_cpu(assert->flags); if (namelen > DLM_LOCKID_NAME_MAX) { @@ -1646,7 +1726,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) if (bit >= O2NM_MAX_NODES) { /* not necessarily an error, though less likely. * could be master just re-asserting. */ - mlog(ML_ERROR, "no bits set in the maybe_map, but %u " + mlog(0, "no bits set in the maybe_map, but %u " "is asserting! (%.*s)\n", assert->node_idx, namelen, name); } else if (bit != assert->node_idx) { @@ -1658,19 +1738,36 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) * number winning the mastery will respond * YES to mastery requests, but this node * had no way of knowing. let it pass. */ - mlog(ML_ERROR, "%u is the lowest node, " + mlog(0, "%u is the lowest node, " "%u is asserting. (%.*s) %u must " "have begun after %u won.\n", bit, assert->node_idx, namelen, name, bit, assert->node_idx); } } + if (mle->type == DLM_MLE_MIGRATION) { + if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { + mlog(0, "%s:%.*s: got cleanup assert" + " from %u for migration\n", + dlm->name, namelen, name, + assert->node_idx); + } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) { + mlog(0, "%s:%.*s: got unrelated assert" + " from %u for migration, ignoring\n", + dlm->name, namelen, name, + assert->node_idx); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + goto done; + } + } } spin_unlock(&dlm->master_lock); /* ok everything checks out with the MLE * now check to see if there is a lockres */ - res = __dlm_lookup_lockres(dlm, name, namelen); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); if (res) { spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { @@ -1679,7 +1776,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) goto kill; } if (!mle) { - if (res->owner != assert->node_idx) { + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && + res->owner != assert->node_idx) { mlog(ML_ERROR, "assert_master from " "%u, but current owner is " "%u! (%.*s)\n", @@ -1732,6 +1830,7 @@ ok: if (mle) { int extra_ref = 0; int nn = -1; + int rr, err = 0; spin_lock(&mle->spinlock); if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) @@ -1751,27 +1850,64 @@ ok: wake_up(&mle->wq); spin_unlock(&mle->spinlock); - if (mle->type == DLM_MLE_MIGRATION && res) { - mlog(0, "finishing off migration of lockres %.*s, " - "from %u to %u\n", - res->lockname.len, res->lockname.name, - dlm->node_num, mle->new_master); + if (res) { spin_lock(&res->spinlock); - res->state &= ~DLM_LOCK_RES_MIGRATING; - dlm_change_lockres_owner(dlm, res, mle->new_master); - BUG_ON(res->state & DLM_LOCK_RES_DIRTY); + if (mle->type == DLM_MLE_MIGRATION) { + mlog(0, "finishing off migration of lockres %.*s, " + "from %u to %u\n", + res->lockname.len, res->lockname.name, + dlm->node_num, mle->new_master); + res->state &= ~DLM_LOCK_RES_MIGRATING; + dlm_change_lockres_owner(dlm, res, mle->new_master); + BUG_ON(res->state & DLM_LOCK_RES_DIRTY); + } else { + dlm_change_lockres_owner(dlm, res, mle->master); + } spin_unlock(&res->spinlock); } - /* master is known, detach if not already detached */ - dlm_mle_detach_hb_events(dlm, mle); - dlm_put_mle(mle); - + + /* master is known, detach if not already detached. + * ensures that only one assert_master call will happen + * on this mle. */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + + rr = atomic_read(&mle->mle_refs.refcount); + if (mle->inuse > 0) { + if (extra_ref && rr < 3) + err = 1; + else if (!extra_ref && rr < 2) + err = 1; + } else { + if (extra_ref && rr < 2) + err = 1; + else if (!extra_ref && rr < 1) + err = 1; + } + if (err) { + mlog(ML_ERROR, "%s:%.*s: got assert master from %u " + "that will mess up this node, refs=%d, extra=%d, " + "inuse=%d\n", dlm->name, namelen, name, + assert->node_idx, rr, extra_ref, mle->inuse); + dlm_print_one_mle(mle); + } + list_del_init(&mle->list); + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); if (extra_ref) { /* the assert master message now balances the extra * ref given by the master / migration request message. * if this is the last put, it will be removed * from the list. */ - dlm_put_mle(mle); + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + } else if (res) { + if (res->owner != assert->node_idx) { + mlog(0, "assert_master from %u, but current " + "owner is %u (%.*s), no mle\n", assert->node_idx, + res->owner, namelen, name); } } @@ -1788,12 +1924,12 @@ done: kill: /* kill the caller! */ + mlog(ML_ERROR, "Bad message received from another node. Dumping state " + "and killing the other node now! This node is OK and can continue.\n"); + __dlm_print_one_lock_resource(res); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); dlm_lockres_put(res); - mlog(ML_ERROR, "Bad message received from another node. Dumping state " - "and killing the other node now! This node is OK and can continue.\n"); - dlm_dump_lock_resources(dlm); dlm_put(dlm); return -EINVAL; } @@ -1803,7 +1939,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, int ignore_higher, u8 request_from, u32 flags) { struct dlm_work_item *item; - item = kcalloc(1, sizeof(*item), GFP_KERNEL); + item = kcalloc(1, sizeof(*item), GFP_NOFS); if (!item) return -ENOMEM; @@ -1825,7 +1961,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); - schedule_work(&dlm->dispatched_work); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); return 0; } @@ -1866,6 +2002,23 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) } } + /* + * If we're migrating this lock to someone else, we are no + * longer allowed to assert out own mastery. OTOH, we need to + * prevent migration from starting while we're still asserting + * our dominance. The reserved ast delays migration. + */ + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_MIGRATING) { + mlog(0, "Someone asked us to assert mastery, but we're " + "in the middle of migration. Skipping assert, " + "the new master will handle that.\n"); + spin_unlock(&res->spinlock); + goto put; + } else + __dlm_lockres_reserve_ast(res); + spin_unlock(&res->spinlock); + /* this call now finishes out the nodemap * even if one or more nodes die */ mlog(0, "worker about to master %.*s here, this=%u\n", @@ -1875,9 +2028,14 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) nodemap, flags); if (ret < 0) { /* no need to restart, we are done */ - mlog_errno(ret); + if (!dlm_is_host_down(ret)) + mlog_errno(ret); } + /* Ok, we've asserted ourselves. Let's let migration start. */ + dlm_lockres_release_ast(dlm, res); + +put: dlm_lockres_put(res); mlog(0, "finished with dlm_assert_master_worker\n"); @@ -1916,6 +2074,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, BUG(); /* host is down, so answer for that node would be * DLM_LOCK_RES_OWNER_UNKNOWN. continue. */ + ret = 0; } if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { @@ -2016,14 +2175,14 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, */ ret = -ENOMEM; - mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL); + mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); if (!mres) { mlog_errno(ret); goto leave; } mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, - GFP_KERNEL); + GFP_NOFS); if (!mle) { mlog_errno(ret); goto leave; @@ -2117,7 +2276,7 @@ fail: * take both dlm->spinlock and dlm->master_lock */ spin_lock(&dlm->spinlock); spin_lock(&dlm->master_lock); - dlm_get_mle(mle); + dlm_get_mle_inuse(mle); spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); @@ -2134,7 +2293,10 @@ fail: /* migration failed, detach and clean up mle */ dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); goto leave; } @@ -2164,8 +2326,8 @@ fail: /* avoid hang during shutdown when migrating lockres * to a node which also goes down */ if (dlm_is_node_dead(dlm, target)) { - mlog(0, "%s:%.*s: expected migration target %u " - "is no longer up. restarting.\n", + mlog(0, "%s:%.*s: expected migration " + "target %u is no longer up, restarting\n", dlm->name, res->lockname.len, res->lockname.name, target); ret = -ERESTARTSYS; @@ -2175,7 +2337,10 @@ fail: /* migration failed, detach and clean up mle */ dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); goto leave; } /* TODO: if node died: stop, clean up, return error */ @@ -2191,7 +2356,7 @@ fail: /* master is known, detach if not already detached */ dlm_mle_detach_hb_events(dlm, mle); - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); ret = 0; dlm_lockres_calc_usage(dlm, res); @@ -2462,7 +2627,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; const char *name; - unsigned int namelen; + unsigned int namelen, hash; int ret = 0; if (!dlm_grab(dlm)) @@ -2470,10 +2635,11 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) name = migrate->name; namelen = migrate->namelen; + hash = dlm_lockid_hash(name, namelen); /* preallocate.. if this fails, abort */ mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, - GFP_KERNEL); + GFP_NOFS); if (!mle) { ret = -ENOMEM; @@ -2482,7 +2648,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) /* check for pre-existing lock */ spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, name, namelen); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); spin_lock(&dlm->master_lock); if (res) { @@ -2580,6 +2746,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, /* remove it from the list so that only one * mle will be found */ list_del_init(&tmp->list); + __dlm_mle_detach_hb_events(dlm, mle); } spin_unlock(&tmp->spinlock); } @@ -2601,6 +2768,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) struct list_head *iter, *iter2; struct dlm_master_list_entry *mle; struct dlm_lock_resource *res; + unsigned int hash; mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); top: @@ -2640,7 +2808,7 @@ top: * may result in the mle being unlinked and * freed, but there may still be a process * waiting in the dlmlock path which is fine. */ - mlog(ML_ERROR, "node %u was expected master\n", + mlog(0, "node %u was expected master\n", dead_node); atomic_set(&mle->woken, 1); spin_unlock(&mle->spinlock); @@ -2673,19 +2841,21 @@ top: /* remove from the list early. NOTE: unlinking * list_head while in list_for_each_safe */ + __dlm_mle_detach_hb_events(dlm, mle); spin_lock(&mle->spinlock); list_del_init(&mle->list); atomic_set(&mle->woken, 1); spin_unlock(&mle->spinlock); wake_up(&mle->wq); - mlog(0, "node %u died during migration from " - "%u to %u!\n", dead_node, + mlog(0, "%s: node %u died during migration from " + "%u to %u!\n", dlm->name, dead_node, mle->master, mle->new_master); /* if there is a lockres associated with this * mle, find it and set its owner to UNKNOWN */ + hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len); res = __dlm_lookup_lockres(dlm, mle->u.name.name, - mle->u.name.len); + mle->u.name.len, hash); if (res) { /* unfortunately if we hit this rare case, our * lock ordering is messed. we need to drop diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 805cbabac05..29b2845f370 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -98,8 +98,8 @@ static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data); static u64 dlm_get_next_mig_cookie(void); -static spinlock_t dlm_reco_state_lock = SPIN_LOCK_UNLOCKED; -static spinlock_t dlm_mig_cookie_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(dlm_reco_state_lock); +static DEFINE_SPINLOCK(dlm_mig_cookie_lock); static u64 dlm_mig_cookie = 1; static u64 dlm_get_next_mig_cookie(void) @@ -115,12 +115,37 @@ static u64 dlm_get_next_mig_cookie(void) return c; } +static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm, + u8 dead_node) +{ + assert_spin_locked(&dlm->spinlock); + if (dlm->reco.dead_node != dead_node) + mlog(0, "%s: changing dead_node from %u to %u\n", + dlm->name, dlm->reco.dead_node, dead_node); + dlm->reco.dead_node = dead_node; +} + +static inline void dlm_set_reco_master(struct dlm_ctxt *dlm, + u8 master) +{ + assert_spin_locked(&dlm->spinlock); + mlog(0, "%s: changing new_master from %u to %u\n", + dlm->name, dlm->reco.new_master, master); + dlm->reco.new_master = master; +} + +static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm) +{ + assert_spin_locked(&dlm->spinlock); + clear_bit(dlm->reco.dead_node, dlm->recovery_map); + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); + dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); +} + static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) { spin_lock(&dlm->spinlock); - clear_bit(dlm->reco.dead_node, dlm->recovery_map); - dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; - dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + __dlm_reset_recovery(dlm); spin_unlock(&dlm->spinlock); } @@ -132,12 +157,21 @@ void dlm_dispatch_work(void *data) struct list_head *iter, *iter2; struct dlm_work_item *item; dlm_workfunc_t *workfunc; + int tot=0; + + if (!dlm_joined(dlm)) + return; spin_lock(&dlm->work_lock); list_splice_init(&dlm->work_list, &tmp_list); spin_unlock(&dlm->work_lock); list_for_each_safe(iter, iter2, &tmp_list) { + tot++; + } + mlog(0, "%s: work thread has %d work items\n", dlm->name, tot); + + list_for_each_safe(iter, iter2, &tmp_list) { item = list_entry(iter, struct dlm_work_item, list); workfunc = item->func; list_del_init(&item->list); @@ -220,6 +254,52 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm) * */ +static void dlm_print_reco_node_status(struct dlm_ctxt *dlm) +{ + struct dlm_reco_node_data *ndata; + struct dlm_lock_resource *res; + + mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n", + dlm->name, dlm->dlm_reco_thread_task->pid, + dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive", + dlm->reco.dead_node, dlm->reco.new_master); + + list_for_each_entry(ndata, &dlm->reco.node_data, list) { + char *st = "unknown"; + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + st = "init"; + break; + case DLM_RECO_NODE_DATA_REQUESTING: + st = "requesting"; + break; + case DLM_RECO_NODE_DATA_DEAD: + st = "dead"; + break; + case DLM_RECO_NODE_DATA_RECEIVING: + st = "receiving"; + break; + case DLM_RECO_NODE_DATA_REQUESTED: + st = "requested"; + break; + case DLM_RECO_NODE_DATA_DONE: + st = "done"; + break; + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + st = "finalize-sent"; + break; + default: + st = "bad"; + break; + } + mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n", + dlm->name, ndata->node_num, st); + } + list_for_each_entry(res, &dlm->reco.resources, recovering) { + mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n", + dlm->name, res->lockname.len, res->lockname.name); + } +} #define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) @@ -267,11 +347,23 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node) { int dead; spin_lock(&dlm->spinlock); - dead = test_bit(node, dlm->domain_map); + dead = !test_bit(node, dlm->domain_map); spin_unlock(&dlm->spinlock); return dead; } +/* returns true if node is no longer in the domain + * could be dead or just not joined */ +static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node) +{ + int recovered; + spin_lock(&dlm->spinlock); + recovered = !test_bit(node, dlm->recovery_map); + spin_unlock(&dlm->spinlock); + return recovered; +} + + int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) { if (timeout) { @@ -290,6 +382,24 @@ int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) return 0; } +int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) +{ + if (timeout) { + mlog(0, "%s: waiting %dms for notification of " + "recovery of node %u\n", dlm->name, timeout, node); + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_recovered(dlm, node), + msecs_to_jiffies(timeout)); + } else { + mlog(0, "%s: waiting indefinitely for notification " + "of recovery of node %u\n", dlm->name, node); + wait_event(dlm->dlm_reco_thread_wq, + dlm_is_node_recovered(dlm, node)); + } + /* for now, return 0 */ + return 0; +} + /* callers of the top-level api calls (dlmlock/dlmunlock) should * block on the dlm->reco.event when recovery is in progress. * the dlm recovery thread will set this state when it begins @@ -308,6 +418,13 @@ static int dlm_in_recovery(struct dlm_ctxt *dlm) void dlm_wait_for_recovery(struct dlm_ctxt *dlm) { + if (dlm_in_recovery(dlm)) { + mlog(0, "%s: reco thread %d in recovery: " + "state=%d, master=%u, dead=%u\n", + dlm->name, dlm->dlm_reco_thread_task->pid, + dlm->reco.state, dlm->reco.new_master, + dlm->reco.dead_node); + } wait_event(dlm->reco.event, !dlm_in_recovery(dlm)); } @@ -341,7 +458,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) mlog(0, "new master %u died while recovering %u!\n", dlm->reco.new_master, dlm->reco.dead_node); /* unset the new_master, leave dead_node */ - dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); } /* select a target to recover */ @@ -350,14 +467,14 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0); if (bit >= O2NM_MAX_NODES || bit < 0) - dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); else - dlm->reco.dead_node = bit; + dlm_set_reco_dead_node(dlm, bit); } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) { /* BUG? */ mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n", dlm->reco.dead_node); - dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); } if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { @@ -366,7 +483,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) /* return to main thread loop and sleep. */ return 0; } - mlog(0, "recovery thread found node %u in the recovery map!\n", + mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", + dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.dead_node); spin_unlock(&dlm->spinlock); @@ -389,8 +507,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) } mlog(0, "another node will master this recovery session.\n"); } - mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n", - dlm->name, dlm->reco.new_master, + mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", + dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.new_master, dlm->node_num, dlm->reco.dead_node); /* it is safe to start everything back up here @@ -402,11 +520,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) return 0; master_here: - mlog(0, "mastering recovery of %s:%u here(this=%u)!\n", + mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n", + dlm->dlm_reco_thread_task->pid, dlm->name, dlm->reco.dead_node, dlm->node_num); status = dlm_remaster_locks(dlm, dlm->reco.dead_node); if (status < 0) { + /* we should never hit this anymore */ mlog(ML_ERROR, "error %d remastering locks for node %u, " "retrying.\n", status, dlm->reco.dead_node); /* yield a bit to allow any final network messages @@ -433,9 +553,16 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) int destroy = 0; int pass = 0; - status = dlm_init_recovery_area(dlm, dead_node); - if (status < 0) - goto leave; + do { + /* we have become recovery master. there is no escaping + * this, so just keep trying until we get it. */ + status = dlm_init_recovery_area(dlm, dead_node); + if (status < 0) { + mlog(ML_ERROR, "%s: failed to alloc recovery area, " + "retrying\n", dlm->name); + msleep(1000); + } + } while (status != 0); /* safe to access the node data list without a lock, since this * process is the only one to change the list */ @@ -452,16 +579,36 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) continue; } - status = dlm_request_all_locks(dlm, ndata->node_num, dead_node); - if (status < 0) { - mlog_errno(status); - if (dlm_is_host_down(status)) - ndata->state = DLM_RECO_NODE_DATA_DEAD; - else { - destroy = 1; - goto leave; + do { + status = dlm_request_all_locks(dlm, ndata->node_num, + dead_node); + if (status < 0) { + mlog_errno(status); + if (dlm_is_host_down(status)) { + /* node died, ignore it for recovery */ + status = 0; + ndata->state = DLM_RECO_NODE_DATA_DEAD; + /* wait for the domain map to catch up + * with the network state. */ + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, + ndata->node_num), + msecs_to_jiffies(1000)); + mlog(0, "waited 1 sec for %u, " + "dead? %s\n", ndata->node_num, + dlm_is_node_dead(dlm, ndata->node_num) ? + "yes" : "no"); + } else { + /* -ENOMEM on the other node */ + mlog(0, "%s: node %u returned " + "%d during recovery, retrying " + "after a short wait\n", + dlm->name, ndata->node_num, + status); + msleep(100); + } } - } + } while (status != 0); switch (ndata->state) { case DLM_RECO_NODE_DATA_INIT: @@ -473,10 +620,9 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) mlog(0, "node %u died after requesting " "recovery info for node %u\n", ndata->node_num, dead_node); - // start all over - destroy = 1; - status = -EAGAIN; - goto leave; + /* fine. don't need this node's info. + * continue without it. */ + break; case DLM_RECO_NODE_DATA_REQUESTING: ndata->state = DLM_RECO_NODE_DATA_REQUESTED; mlog(0, "now receiving recovery data from " @@ -520,35 +666,26 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) BUG(); break; case DLM_RECO_NODE_DATA_DEAD: - mlog(ML_NOTICE, "node %u died after " + mlog(0, "node %u died after " "requesting recovery info for " "node %u\n", ndata->node_num, dead_node); - spin_unlock(&dlm_reco_state_lock); - // start all over - destroy = 1; - status = -EAGAIN; - /* instead of spinning like crazy here, - * wait for the domain map to catch up - * with the network state. otherwise this - * can be hit hundreds of times before - * the node is really seen as dead. */ - wait_event_timeout(dlm->dlm_reco_thread_wq, - dlm_is_node_dead(dlm, - ndata->node_num), - msecs_to_jiffies(1000)); - mlog(0, "waited 1 sec for %u, " - "dead? %s\n", ndata->node_num, - dlm_is_node_dead(dlm, ndata->node_num) ? - "yes" : "no"); - goto leave; + break; case DLM_RECO_NODE_DATA_RECEIVING: case DLM_RECO_NODE_DATA_REQUESTED: + mlog(0, "%s: node %u still in state %s\n", + dlm->name, ndata->node_num, + ndata->state==DLM_RECO_NODE_DATA_RECEIVING ? + "receiving" : "requested"); all_nodes_done = 0; break; case DLM_RECO_NODE_DATA_DONE: + mlog(0, "%s: node %u state is done\n", + dlm->name, ndata->node_num); break; case DLM_RECO_NODE_DATA_FINALIZE_SENT: + mlog(0, "%s: node %u state is finalize\n", + dlm->name, ndata->node_num); break; } } @@ -578,7 +715,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) jiffies, dlm->reco.dead_node, dlm->node_num, dlm->reco.new_master); destroy = 1; - status = ret; + status = 0; /* rescan everything marked dirty along the way */ dlm_kick_thread(dlm, NULL); break; @@ -591,7 +728,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) } -leave: if (destroy) dlm_destroy_recovery_area(dlm, dead_node); @@ -617,7 +753,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) } BUG_ON(num == dead_node); - ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL); + ndata = kcalloc(1, sizeof(*ndata), GFP_NOFS); if (!ndata) { dlm_destroy_recovery_area(dlm, dead_node); return -ENOMEM; @@ -691,16 +827,25 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) if (!dlm_grab(dlm)) return -EINVAL; + if (lr->dead_node != dlm->reco.dead_node) { + mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local " + "dead_node is %u\n", dlm->name, lr->node_idx, + lr->dead_node, dlm->reco.dead_node); + dlm_print_reco_node_status(dlm); + /* this is a hack */ + dlm_put(dlm); + return -ENOMEM; + } BUG_ON(lr->dead_node != dlm->reco.dead_node); - item = kcalloc(1, sizeof(*item), GFP_KERNEL); + item = kcalloc(1, sizeof(*item), GFP_NOFS); if (!item) { dlm_put(dlm); return -ENOMEM; } /* this will get freed by dlm_request_all_locks_worker */ - buf = (char *) __get_free_page(GFP_KERNEL); + buf = (char *) __get_free_page(GFP_NOFS); if (!buf) { kfree(item); dlm_put(dlm); @@ -715,7 +860,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&dlm->work_lock); list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); - schedule_work(&dlm->dispatched_work); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); dlm_put(dlm); return 0; @@ -730,32 +875,34 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) struct list_head *iter; int ret; u8 dead_node, reco_master; + int skip_all_done = 0; dlm = item->dlm; dead_node = item->u.ral.dead_node; reco_master = item->u.ral.reco_master; mres = (struct dlm_migratable_lockres *)data; + mlog(0, "%s: recovery worker started, dead=%u, master=%u\n", + dlm->name, dead_node, reco_master); + if (dead_node != dlm->reco.dead_node || reco_master != dlm->reco.new_master) { - /* show extra debug info if the recovery state is messed */ - mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), " - "request(dead=%u, master=%u)\n", - dlm->name, dlm->reco.dead_node, dlm->reco.new_master, - dead_node, reco_master); - mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u " - "entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n", - dlm->name, mres->lockname_len, mres->lockname, mres->master, - mres->num_locks, mres->total_locks, mres->flags, - dlm_get_lock_cookie_node(mres->ml[0].cookie), - dlm_get_lock_cookie_seq(mres->ml[0].cookie), - mres->ml[0].list, mres->ml[0].flags, - mres->ml[0].type, mres->ml[0].convert_type, - mres->ml[0].highest_blocked, mres->ml[0].node); - BUG(); + /* worker could have been created before the recovery master + * died. if so, do not continue, but do not error. */ + if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { + mlog(ML_NOTICE, "%s: will not send recovery state, " + "recovery master %u died, thread=(dead=%u,mas=%u)" + " current=(dead=%u,mas=%u)\n", dlm->name, + reco_master, dead_node, reco_master, + dlm->reco.dead_node, dlm->reco.new_master); + } else { + mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, " + "master=%u), request(dead=%u, master=%u)\n", + dlm->name, dlm->reco.dead_node, + dlm->reco.new_master, dead_node, reco_master); + } + goto leave; } - BUG_ON(dead_node != dlm->reco.dead_node); - BUG_ON(reco_master != dlm->reco.new_master); /* lock resources should have already been moved to the * dlm->reco.resources list. now move items from that list @@ -766,12 +913,20 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) dlm_move_reco_locks_to_list(dlm, &resources, dead_node); /* now we can begin blasting lockreses without the dlm lock */ + + /* any errors returned will be due to the new_master dying, + * the dlm_reco_thread should detect this */ list_for_each(iter, &resources) { res = list_entry (iter, struct dlm_lock_resource, recovering); ret = dlm_send_one_lockres(dlm, res, mres, reco_master, DLM_MRES_RECOVERY); - if (ret < 0) - mlog_errno(ret); + if (ret < 0) { + mlog(ML_ERROR, "%s: node %u went down while sending " + "recovery state for dead node %u, ret=%d\n", dlm->name, + reco_master, dead_node, ret); + skip_all_done = 1; + break; + } } /* move the resources back to the list */ @@ -779,10 +934,15 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) list_splice_init(&resources, &dlm->reco.resources); spin_unlock(&dlm->spinlock); - ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); - if (ret < 0) - mlog_errno(ret); - + if (!skip_all_done) { + ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); + if (ret < 0) { + mlog(ML_ERROR, "%s: node %u went down while sending " + "recovery all-done for dead node %u, ret=%d\n", + dlm->name, reco_master, dead_node, ret); + } + } +leave: free_page((unsigned long)data); } @@ -801,8 +961,14 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, sizeof(done_msg), send_to, &tmpret); - /* negative status is ignored by the caller */ - if (ret >= 0) + if (ret < 0) { + if (!dlm_is_host_down(ret)) { + mlog_errno(ret); + mlog(ML_ERROR, "%s: unknown error sending data-done " + "to %u\n", dlm->name, send_to); + BUG(); + } + } else ret = tmpret; return ret; } @@ -822,7 +988,11 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data) mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, " "node_idx=%u, this node=%u\n", done->dead_node, dlm->reco.dead_node, done->node_idx, dlm->node_num); - BUG_ON(done->dead_node != dlm->reco.dead_node); + + mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node), + "Got DATA DONE: dead_node=%u, reco.dead_node=%u, " + "node_idx=%u, this node=%u\n", done->dead_node, + dlm->reco.dead_node, done->node_idx, dlm->node_num); spin_lock(&dlm_reco_state_lock); list_for_each(iter, &dlm->reco.node_data) { @@ -905,13 +1075,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, mlog(0, "found lockres owned by dead node while " "doing recovery for node %u. sending it.\n", dead_node); - list_del_init(&res->recovering); - list_add_tail(&res->recovering, list); + list_move_tail(&res->recovering, list); } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { mlog(0, "found UNKNOWN owner while doing recovery " "for node %u. sending it.\n", dead_node); - list_del_init(&res->recovering); - list_add_tail(&res->recovering, list); + list_move_tail(&res->recovering, list); } } spin_unlock(&dlm->spinlock); @@ -1023,8 +1191,9 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock, ml->type == LKM_PRMODE) { /* if it is already set, this had better be a PR * and it has to match */ - if (mres->lvb[0] && (ml->type == LKM_EXMODE || - memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { + if (!dlm_lvb_is_empty(mres->lvb) && + (ml->type == LKM_EXMODE || + memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { mlog(ML_ERROR, "mismatched lvbs!\n"); __dlm_print_one_lock_resource(lock->lockres); BUG(); @@ -1083,22 +1252,25 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, * we must send it immediately. */ ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); - if (ret < 0) { - // TODO - mlog(ML_ERROR, "dlm_send_mig_lockres_msg " - "returned %d, TODO\n", ret); - BUG(); - } + if (ret < 0) + goto error; } } /* flush any remaining locks */ ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); - if (ret < 0) { - // TODO - mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, " - "TODO\n", ret); + if (ret < 0) + goto error; + return ret; + +error: + mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n", + dlm->name, ret); + if (!dlm_is_host_down(ret)) BUG(); - } + mlog(0, "%s: node %u went down while sending %s " + "lockres %.*s\n", dlm->name, send_to, + flags & DLM_MRES_RECOVERY ? "recovery" : "migration", + res->lockname.len, res->lockname.name); return ret; } @@ -1146,8 +1318,8 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) mlog(0, "all done flag. all lockres data received!\n"); ret = -ENOMEM; - buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL); - item = kcalloc(1, sizeof(*item), GFP_KERNEL); + buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS); + item = kcalloc(1, sizeof(*item), GFP_NOFS); if (!buf || !item) goto leave; @@ -1238,7 +1410,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&dlm->work_lock); list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); - schedule_work(&dlm->dispatched_work); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); leave: dlm_put(dlm); @@ -1406,6 +1578,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_ctxt *dlm = data; struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; struct dlm_lock_resource *res = NULL; + unsigned int hash; int master = DLM_LOCK_RES_OWNER_UNKNOWN; u32 flags = DLM_ASSERT_MASTER_REQUERY; @@ -1415,8 +1588,10 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data) return master; } + hash = dlm_lockid_hash(req->name, req->namelen); + spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, req->name, req->namelen); + res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash); if (res) { spin_lock(&res->spinlock); master = res->owner; @@ -1483,7 +1658,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; int ret = 0; - int i; + int i, bad; struct list_head *iter; struct dlm_lock *lock = NULL; @@ -1529,8 +1704,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, /* move the lock to its proper place */ /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, queue); + list_move_tail(&lock->list, queue); spin_unlock(&res->spinlock); mlog(0, "just reordered a local lock!\n"); @@ -1553,28 +1727,48 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, } lksb->flags |= (ml->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); - - if (mres->lvb[0]) { + + if (ml->type == LKM_NLMODE) + goto skip_lvb; + + if (!dlm_lvb_is_empty(mres->lvb)) { if (lksb->flags & DLM_LKSB_PUT_LVB) { /* other node was trying to update * lvb when node died. recreate the * lksb with the updated lvb. */ memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN); + /* the lock resource lvb update must happen + * NOW, before the spinlock is dropped. + * we no longer wait for the AST to update + * the lvb. */ + memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); } else { /* otherwise, the node is sending its * most recent valid lvb info */ BUG_ON(ml->type != LKM_EXMODE && ml->type != LKM_PRMODE); - if (res->lvb[0] && (ml->type == LKM_EXMODE || - memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { - mlog(ML_ERROR, "received bad lvb!\n"); - __dlm_print_one_lock_resource(res); - BUG(); + if (!dlm_lvb_is_empty(res->lvb) && + (ml->type == LKM_EXMODE || + memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { + int i; + mlog(ML_ERROR, "%s:%.*s: received bad " + "lvb! type=%d\n", dlm->name, + res->lockname.len, + res->lockname.name, ml->type); + printk("lockres lvb=["); + for (i=0; i<DLM_LVB_LEN; i++) + printk("%02x", res->lvb[i]); + printk("]\nmigrated lvb=["); + for (i=0; i<DLM_LVB_LEN; i++) + printk("%02x", mres->lvb[i]); + printk("]\n"); + dlm_print_one_lock_resource(res); + BUG(); } memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); } } - +skip_lvb: /* NOTE: * wrt lock queue ordering and recovery: @@ -1592,9 +1786,33 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, * relative to each other, but clearly *not* * preserved relative to locks from other nodes. */ + bad = 0; spin_lock(&res->spinlock); - dlm_lock_get(newlock); - list_add_tail(&newlock->list, queue); + list_for_each_entry(lock, queue, list) { + if (lock->ml.cookie == ml->cookie) { + u64 c = lock->ml.cookie; + mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " + "exists on this lockres!\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(c), + dlm_get_lock_cookie_seq(c)); + + mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, " + "node=%u, cookie=%u:%llu, queue=%d\n", + ml->type, ml->convert_type, ml->node, + dlm_get_lock_cookie_node(ml->cookie), + dlm_get_lock_cookie_seq(ml->cookie), + ml->list); + + __dlm_print_one_lock_resource(res); + bad = 1; + break; + } + } + if (!bad) { + dlm_lock_get(newlock); + list_add_tail(&newlock->list, queue); + } spin_unlock(&res->spinlock); } mlog(0, "done running all the locks\n"); @@ -1618,8 +1836,14 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, struct dlm_lock *lock; res->state |= DLM_LOCK_RES_RECOVERING; - if (!list_empty(&res->recovering)) + if (!list_empty(&res->recovering)) { + mlog(0, + "Recovering res %s:%.*s, is already on recovery list!\n", + dlm->name, res->lockname.len, res->lockname.name); list_del_init(&res->recovering); + } + /* We need to hold a reference while on the recovery list */ + dlm_lockres_get(res); list_add_tail(&res->recovering, &dlm->reco.resources); /* find any pending locks and put them back on proper list */ @@ -1708,9 +1932,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); dlm_change_lockres_owner(dlm, res, new_master); res->state &= ~DLM_LOCK_RES_RECOVERING; - __dlm_dirty_lockres(dlm, res); + if (!__dlm_lockres_unused(res)) + __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); + dlm_lockres_put(res); } } @@ -1719,7 +1945,7 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, * the RECOVERING state and set the owner * if necessary */ for (i = 0; i < DLM_HASH_BUCKETS; i++) { - bucket = &(dlm->lockres_hash[i]); + bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, hash_iter, bucket, hash_node) { if (res->state & DLM_LOCK_RES_RECOVERING) { if (res->owner == dead_node) { @@ -1743,11 +1969,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, dlm->name, res->lockname.len, res->lockname.name, res->owner); list_del_init(&res->recovering); + dlm_lockres_put(res); } spin_lock(&res->spinlock); dlm_change_lockres_owner(dlm, res, new_master); res->state &= ~DLM_LOCK_RES_RECOVERING; - __dlm_dirty_lockres(dlm, res); + if (!__dlm_lockres_unused(res)) + __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); } @@ -1884,7 +2112,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) * need to be fired as a result. */ for (i = 0; i < DLM_HASH_BUCKETS; i++) { - bucket = &(dlm->lockres_hash[i]); + bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, iter, bucket, hash_node) { /* always prune any $RECOVERY entries for dead nodes, * otherwise hangs can occur during later recovery */ @@ -1924,6 +2152,20 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) { assert_spin_locked(&dlm->spinlock); + if (dlm->reco.new_master == idx) { + mlog(0, "%s: recovery master %d just died\n", + dlm->name, idx); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + /* finalize1 was reached, so it is safe to clear + * the new_master and dead_node. that recovery + * is complete. */ + mlog(0, "%s: dead master %d had reached " + "finalize1 state, clearing\n", dlm->name, idx); + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + __dlm_reset_recovery(dlm); + } + } + /* check to see if the node is already considered dead */ if (!test_bit(idx, dlm->live_nodes_map)) { mlog(0, "for domain %s, node %d is already dead. " @@ -2087,7 +2329,7 @@ again: /* set the new_master to this node */ spin_lock(&dlm->spinlock); - dlm->reco.new_master = dlm->node_num; + dlm_set_reco_master(dlm, dlm->node_num); spin_unlock(&dlm->spinlock); } @@ -2125,6 +2367,10 @@ again: mlog(0, "%s: reco master %u is ready to recover %u\n", dlm->name, dlm->reco.new_master, dlm->reco.dead_node); status = -EEXIST; + } else if (ret == DLM_RECOVERING) { + mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n", + dlm->name, dlm->node_num); + goto again; } else { struct dlm_lock_resource *res; @@ -2156,7 +2402,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) mlog_entry("%u\n", dead_node); - mlog(0, "dead node is %u\n", dead_node); + mlog(0, "%s: dead node is %u\n", dlm->name, dead_node); spin_lock(&dlm->spinlock); dlm_node_iter_init(dlm->domain_map, &iter); @@ -2214,6 +2460,14 @@ retry: * another ENOMEM */ msleep(100); goto retry; + } else if (ret == EAGAIN) { + mlog(0, "%s: trying to start recovery of node " + "%u, but node %u is waiting for last recovery " + "to complete, backoff for a bit\n", dlm->name, + dead_node, nodenum); + /* TODO Look into replacing msleep with cond_resched() */ + msleep(100); + goto retry; } } @@ -2229,8 +2483,20 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) if (!dlm_grab(dlm)) return 0; - mlog(0, "node %u wants to recover node %u\n", - br->node_idx, br->dead_node); + spin_lock(&dlm->spinlock); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + mlog(0, "%s: node %u wants to recover node %u (%u:%u) " + "but this node is in finalize state, waiting on finalize2\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + spin_unlock(&dlm->spinlock); + return EAGAIN; + } + spin_unlock(&dlm->spinlock); + + mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); @@ -2252,8 +2518,8 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) "node %u changing it to %u\n", dlm->name, dlm->reco.dead_node, br->node_idx, br->dead_node); } - dlm->reco.new_master = br->node_idx; - dlm->reco.dead_node = br->dead_node; + dlm_set_reco_master(dlm, br->node_idx); + dlm_set_reco_dead_node(dlm, br->dead_node); if (!test_bit(br->dead_node, dlm->recovery_map)) { mlog(0, "recovery master %u sees %u as dead, but this " "node has not yet. marking %u as dead\n", @@ -2272,10 +2538,16 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) spin_unlock(&dlm->spinlock); dlm_kick_recovery_thread(dlm); + + mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + dlm_put(dlm); return 0; } +#define DLM_FINALIZE_STAGE2 0x01 static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) { int ret = 0; @@ -2283,25 +2555,31 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) struct dlm_node_iter iter; int nodenum; int status; + int stage = 1; - mlog(0, "finishing recovery for node %s:%u\n", - dlm->name, dlm->reco.dead_node); + mlog(0, "finishing recovery for node %s:%u, " + "stage %d\n", dlm->name, dlm->reco.dead_node, stage); spin_lock(&dlm->spinlock); dlm_node_iter_init(dlm->domain_map, &iter); spin_unlock(&dlm->spinlock); +stage2: memset(&fr, 0, sizeof(fr)); fr.node_idx = dlm->node_num; fr.dead_node = dlm->reco.dead_node; + if (stage == 2) + fr.flags |= DLM_FINALIZE_STAGE2; while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { if (nodenum == dlm->node_num) continue; ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key, &fr, sizeof(fr), nodenum, &status); - if (ret >= 0) { + if (ret >= 0) ret = status; + if (ret < 0) { + mlog_errno(ret); if (dlm_is_host_down(ret)) { /* this has no effect on this recovery * session, so set the status to zero to @@ -2309,13 +2587,17 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) mlog(ML_ERROR, "node %u went down after this " "node finished recovery.\n", nodenum); ret = 0; + continue; } - } - if (ret < 0) { - mlog_errno(ret); break; } } + if (stage == 1) { + /* reset the node_iter back to the top and send finalize2 */ + iter.curnode = -1; + stage = 2; + goto stage2; + } return ret; } @@ -2324,14 +2606,19 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) { struct dlm_ctxt *dlm = data; struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; + int stage = 1; /* ok to return 0, domain has gone away */ if (!dlm_grab(dlm)) return 0; - mlog(0, "node %u finalizing recovery of node %u\n", - fr->node_idx, fr->dead_node); + if (fr->flags & DLM_FINALIZE_STAGE2) + stage = 2; + mlog(0, "%s: node %u finalizing recovery stage%d of " + "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage, + fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master); + spin_lock(&dlm->spinlock); if (dlm->reco.new_master != fr->node_idx) { @@ -2347,13 +2634,41 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) BUG(); } - dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); - - spin_unlock(&dlm->spinlock); + switch (stage) { + case 1: + dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + mlog(ML_ERROR, "%s: received finalize1 from " + "new master %u for dead node %u, but " + "this node has already received it!\n", + dlm->name, fr->node_idx, fr->dead_node); + dlm_print_reco_node_status(dlm); + BUG(); + } + dlm->reco.state |= DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + break; + case 2: + if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) { + mlog(ML_ERROR, "%s: received finalize2 from " + "new master %u for dead node %u, but " + "this node did not have finalize1!\n", + dlm->name, fr->node_idx, fr->dead_node); + dlm_print_reco_node_status(dlm); + BUG(); + } + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + dlm_reset_recovery(dlm); + dlm_kick_recovery_thread(dlm); + break; + default: + BUG(); + } - dlm_reset_recovery(dlm); + mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", + dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master); - dlm_kick_recovery_thread(dlm); dlm_put(dlm); return 0; } diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 5be9d14f12c..0c822f3ffb0 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -39,6 +39,7 @@ #include <linux/inet.h> #include <linux/timer.h> #include <linux/kthread.h> +#include <linux/delay.h> #include "cluster/heartbeat.h" @@ -53,6 +54,8 @@ #include "cluster/masklog.h" static int dlm_thread(void *data); +static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, + struct dlm_lock_resource *lockres); static void dlm_flush_asts(struct dlm_ctxt *dlm); @@ -80,7 +83,7 @@ repeat: } -static int __dlm_lockres_unused(struct dlm_lock_resource *res) +int __dlm_lockres_unused(struct dlm_lock_resource *res) { if (list_empty(&res->granted) && list_empty(&res->converting) && @@ -103,6 +106,20 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, assert_spin_locked(&res->spinlock); if (__dlm_lockres_unused(res)){ + /* For now, just keep any resource we master */ + if (res->owner == dlm->node_num) + { + if (!list_empty(&res->purge)) { + mlog(0, "we master %s:%.*s, but it is on " + "the purge list. Removing\n", + dlm->name, res->lockname.len, + res->lockname.name); + list_del_init(&res->purge); + dlm->purge_count--; + } + return; + } + if (list_empty(&res->purge)) { mlog(0, "putting lockres %.*s from purge list\n", res->lockname.len, res->lockname.name); @@ -110,10 +127,23 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, res->last_used = jiffies; list_add_tail(&res->purge, &dlm->purge_list); dlm->purge_count++; + + /* if this node is not the owner, there is + * no way to keep track of who the owner could be. + * unhash it to avoid serious problems. */ + if (res->owner != dlm->node_num) { + mlog(0, "%s:%.*s: doing immediate " + "purge of lockres owned by %u\n", + dlm->name, res->lockname.len, + res->lockname.name, res->owner); + + dlm_purge_lockres_now(dlm, res); + } } } else if (!list_empty(&res->purge)) { - mlog(0, "removing lockres %.*s from purge list\n", - res->lockname.len, res->lockname.name); + mlog(0, "removing lockres %.*s from purge list, " + "owner=%u\n", res->lockname.len, res->lockname.name, + res->owner); list_del_init(&res->purge); dlm->purge_count--; @@ -165,6 +195,7 @@ again: } else if (ret < 0) { mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n", lockres->lockname.len, lockres->lockname.name); + msleep(100); goto again; } @@ -178,6 +209,24 @@ finish: __dlm_unhash_lockres(lockres); } +/* make an unused lockres go away immediately. + * as soon as the dlm spinlock is dropped, this lockres + * will not be found. kfree still happens on last put. */ +static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, + struct dlm_lock_resource *lockres) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&lockres->spinlock); + + BUG_ON(!__dlm_lockres_unused(lockres)); + + if (!list_empty(&lockres->purge)) { + list_del_init(&lockres->purge); + dlm->purge_count--; + } + __dlm_unhash_lockres(lockres); +} + static void dlm_run_purge_list(struct dlm_ctxt *dlm, int purge_now) { @@ -318,8 +367,7 @@ converting: target->ml.type = target->ml.convert_type; target->ml.convert_type = LKM_IVMODE; - list_del_init(&target->list); - list_add_tail(&target->list, &res->granted); + list_move_tail(&target->list, &res->granted); BUG_ON(!target->lksb); target->lksb->status = DLM_NORMAL; @@ -380,8 +428,7 @@ blocked: target->ml.type, target->ml.node); // target->ml.type is already correct - list_del_init(&target->list); - list_add_tail(&target->list, &res->granted); + list_move_tail(&target->list, &res->granted); BUG_ON(!target->lksb); target->lksb->status = DLM_NORMAL; @@ -422,6 +469,8 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) /* don't shuffle secondary queues */ if ((res->owner == dlm->node_num) && !(res->state & DLM_LOCK_RES_DIRTY)) { + /* ref for dirty_list */ + dlm_lockres_get(res); list_add_tail(&res->dirty, &dlm->dirty_list); res->state |= DLM_LOCK_RES_DIRTY; } @@ -606,6 +655,8 @@ static int dlm_thread(void *data) list_del_init(&res->dirty); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); + /* Drop dirty_list ref */ + dlm_lockres_put(res); /* lockres can be re-dirtied/re-added to the * dirty_list in this gap, but that is ok */ @@ -642,8 +693,9 @@ static int dlm_thread(void *data) * spinlock and do NOT have the dlm lock. * safe to reserve/queue asts and run the lists. */ - mlog(0, "calling dlm_shuffle_lists with dlm=%p, " - "res=%p\n", dlm, res); + mlog(0, "calling dlm_shuffle_lists with dlm=%s, " + "res=%.*s\n", dlm->name, + res->lockname.len, res->lockname.name); /* called while holding lockres lock */ dlm_shuffle_lists(dlm, res); @@ -657,6 +709,8 @@ in_progress: /* if the lock was in-progress, stick * it on the back of the list */ if (delay) { + /* ref for dirty_list */ + dlm_lockres_get(res); spin_lock(&res->spinlock); list_add_tail(&res->dirty, &dlm->dirty_list); res->state |= DLM_LOCK_RES_DIRTY; @@ -677,7 +731,7 @@ in_progress: /* yield and continue right away if there is more work to do */ if (!n) { - yield(); + cond_resched(); continue; } diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 7b1a2754267..b0c3134f4f7 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -271,8 +271,7 @@ void dlm_commit_pending_unlock(struct dlm_lock_resource *res, void dlm_commit_pending_cancel(struct dlm_lock_resource *res, struct dlm_lock *lock) { - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); lock->ml.convert_type = LKM_IVMODE; } @@ -319,6 +318,16 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); + if (owner == dlm->node_num) { + /* ended up trying to contact ourself. this means + * that the lockres had been remote but became local + * via a migration. just retry it, now as local */ + mlog(0, "%s:%.*s: this node became the master due to a " + "migration, re-evaluate now\n", dlm->name, + res->lockname.len, res->lockname.name); + return DLM_FORWARD; + } + memset(&unlock, 0, sizeof(unlock)); unlock.node_idx = dlm->node_num; unlock.flags = cpu_to_be32(flags); diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index 74ca4e5f976..e641b084b34 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -672,7 +672,7 @@ struct dlm_ctxt *user_dlm_register_context(struct qstr *name) u32 dlm_key; char *domain; - domain = kmalloc(name->len + 1, GFP_KERNEL); + domain = kmalloc(name->len + 1, GFP_NOFS); if (!domain) { mlog_errno(-ENOMEM); return ERR_PTR(-ENOMEM); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 64cd52860c8..4acd37286bd 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -242,7 +242,7 @@ static void ocfs2_build_lock_name(enum ocfs2_lock_type type, mlog_exit_void(); } -static spinlock_t ocfs2_dlm_tracking_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(ocfs2_dlm_tracking_lock); static void ocfs2_add_lockres_tracking(struct ocfs2_lock_res *res, struct ocfs2_dlm_debug *dlm_debug) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index eebc3cfa6be..910a601b2e9 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -49,7 +49,7 @@ #include "buffer_head_io.h" -spinlock_t trans_inc_lock = SPIN_LOCK_UNLOCKED; +DEFINE_SPINLOCK(trans_inc_lock); static int ocfs2_force_read_journal(struct inode *inode); static int ocfs2_recover_node(struct ocfs2_super *osb, @@ -222,8 +222,7 @@ void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle, BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list)); OCFS2_I(inode)->ip_handle = handle; - list_del(&(OCFS2_I(inode)->ip_handle_list)); - list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list)); + list_move_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list)); } static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle) diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c index ee42765a855..cf70fe2075b 100644 --- a/fs/ocfs2/vote.c +++ b/fs/ocfs2/vote.c @@ -988,9 +988,7 @@ int ocfs2_request_mount_vote(struct ocfs2_super *osb) } bail: - if (request) - kfree(request); - + kfree(request); return status; } @@ -1021,9 +1019,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb) } bail: - if (request) - kfree(request); - + kfree(request); return status; } diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index efc7c91128a..93a56bd4a2b 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -1,5 +1,4 @@ -/* $Id: inode.c,v 1.15 2001/11/12 09:43:39 davem Exp $ - * openpromfs.c: /proc/openprom handling routines +/* inode.c: /proc/openprom handling routines * * Copyright (C) 1996-1999 Jakub Jelinek (jakub@redhat.com) * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) @@ -12,762 +11,245 @@ #include <linux/openprom_fs.h> #include <linux/init.h> #include <linux/slab.h> -#include <linux/smp_lock.h> +#include <linux/seq_file.h> #include <asm/openprom.h> #include <asm/oplib.h> +#include <asm/prom.h> #include <asm/uaccess.h> -#define ALIASES_NNODES 64 - -typedef struct { - u16 parent; - u16 next; - u16 child; - u16 first_prop; - u32 node; -} openpromfs_node; - -typedef struct { -#define OPP_STRING 0x10 -#define OPP_STRINGLIST 0x20 -#define OPP_BINARY 0x40 -#define OPP_HEXSTRING 0x80 -#define OPP_DIRTY 0x01 -#define OPP_QUOTED 0x02 -#define OPP_NOTQUOTED 0x04 -#define OPP_ASCIIZ 0x08 - u32 flag; - u32 alloclen; - u32 len; - char *value; - char name[8]; -} openprom_property; - -static openpromfs_node *nodes; -static int alloced; -static u16 last_node; -static u16 first_prop; -static u16 options = 0xffff; -static u16 aliases = 0xffff; -static int aliases_nodes; -static char *alias_names [ALIASES_NNODES]; - -#define OPENPROM_ROOT_INO 16 -#define OPENPROM_FIRST_INO OPENPROM_ROOT_INO -#define NODE(ino) nodes[ino - OPENPROM_FIRST_INO] -#define NODE2INO(node) (node + OPENPROM_FIRST_INO) -#define NODEP2INO(no) (no + OPENPROM_FIRST_INO + last_node) - -static int openpromfs_create (struct inode *, struct dentry *, int, struct nameidata *); -static int openpromfs_readdir(struct file *, void *, filldir_t); -static struct dentry *openpromfs_lookup(struct inode *, struct dentry *dentry, struct nameidata *nd); -static int openpromfs_unlink (struct inode *, struct dentry *dentry); +static DEFINE_MUTEX(op_mutex); -static inline u16 ptr_nod(void *p) -{ - return (long)p & 0xFFFF; -} +#define OPENPROM_ROOT_INO 0 -static ssize_t nodenum_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +enum op_inode_type { + op_inode_node, + op_inode_prop, +}; + +union op_inode_data { + struct device_node *node; + struct property *prop; +}; + +struct op_inode_info { + struct inode vfs_inode; + enum op_inode_type type; + union op_inode_data u; +}; + +static inline struct op_inode_info *OP_I(struct inode *inode) { - struct inode *inode = file->f_dentry->d_inode; - char buffer[10]; - - if (count < 0 || !inode->u.generic_ip) - return -EINVAL; - sprintf (buffer, "%8.8lx\n", (long)inode->u.generic_ip); - if (file->f_pos >= 9) - return 0; - if (count > 9 - file->f_pos) - count = 9 - file->f_pos; - if (copy_to_user(buf, buffer + file->f_pos, count)) - return -EFAULT; - *ppos += count; - return count; + return container_of(inode, struct op_inode_info, vfs_inode); } -static ssize_t property_read(struct file *filp, char __user *buf, - size_t count, loff_t *ppos) +static int is_string(unsigned char *p, int len) { - struct inode *inode = filp->f_dentry->d_inode; - int i, j, k; - u32 node; - char *p, *s; - u32 *q; - openprom_property *op; - char buffer[64]; - - if (!filp->private_data) { - node = nodes[ptr_nod(inode->u.generic_ip)].node; - i = ((u32)(long)inode->u.generic_ip) >> 16; - if (ptr_nod(inode->u.generic_ip) == aliases) { - if (i >= aliases_nodes) - p = NULL; - else - p = alias_names [i]; - } else - for (p = prom_firstprop (node, buffer); - i && p && *p; - p = prom_nextprop (node, p, buffer), i--) - /* nothing */ ; - if (!p || !*p) - return -EIO; - i = prom_getproplen (node, p); - if (i < 0) { - if (ptr_nod(inode->u.generic_ip) == aliases) - i = 0; - else - return -EIO; - } - k = i; - if (i < 64) i = 64; - filp->private_data = kmalloc (sizeof (openprom_property) - + (j = strlen (p)) + 2 * i, - GFP_KERNEL); - if (!filp->private_data) - return -ENOMEM; - op = filp->private_data; - op->flag = 0; - op->alloclen = 2 * i; - strcpy (op->name, p); - op->value = (char *)(((unsigned long)(op->name + j + 4)) & ~3); - op->len = k; - if (k && prom_getproperty (node, p, op->value, i) < 0) - return -EIO; - op->value [k] = 0; - if (k) { - for (s = NULL, p = op->value; p < op->value + k; p++) { - if ((*p >= ' ' && *p <= '~') || *p == '\n') { - op->flag |= OPP_STRING; - s = p; - continue; - } - if (p > op->value && !*p && s == p - 1) { - if (p < op->value + k - 1) - op->flag |= OPP_STRINGLIST; - else - op->flag |= OPP_ASCIIZ; - continue; - } - if (k == 1 && !*p) { - op->flag |= (OPP_STRING|OPP_ASCIIZ); - break; - } - op->flag &= ~(OPP_STRING|OPP_STRINGLIST); - if (k & 3) - op->flag |= OPP_HEXSTRING; - else - op->flag |= OPP_BINARY; - break; - } - if (op->flag & OPP_STRINGLIST) - op->flag &= ~(OPP_STRING); - if (op->flag & OPP_ASCIIZ) - op->len--; - } - } else - op = filp->private_data; - if (!count || !(op->len || (op->flag & OPP_ASCIIZ))) - return 0; - if (*ppos >= 0xffffff || count >= 0xffffff) - return -EINVAL; - if (op->flag & OPP_STRINGLIST) { - for (k = 0, p = op->value; p < op->value + op->len; p++) - if (!*p) - k++; - i = op->len + 4 * k + 3; - } else if (op->flag & OPP_STRING) { - i = op->len + 3; - } else if (op->flag & OPP_BINARY) { - i = (op->len * 9) >> 2; - } else { - i = (op->len << 1) + 1; - } - k = *ppos; - if (k >= i) return 0; - if (count > i - k) count = i - k; - if (op->flag & OPP_STRING) { - if (!k) { - if (put_user('\'', buf)) - return -EFAULT; - k++; - count--; - } + int i; - if (k + count >= i - 2) - j = i - 2 - k; - else - j = count; - - if (j >= 0) { - if (copy_to_user(buf + k - *ppos, - op->value + k - 1, j)) - return -EFAULT; - count -= j; - k += j; - } + for (i = 0; i < len; i++) { + unsigned char val = p[i]; - if (count) { - if (put_user('\'', &buf [k++ - *ppos])) - return -EFAULT; - } - if (count > 1) { - if (put_user('\n', &buf [k++ - *ppos])) - return -EFAULT; - } - } else if (op->flag & OPP_STRINGLIST) { - char *tmp; - - tmp = kmalloc (i, GFP_KERNEL); - if (!tmp) - return -ENOMEM; - - s = tmp; - *s++ = '\''; - for (p = op->value; p < op->value + op->len; p++) { - if (!*p) { - strcpy(s, "' + '"); - s += 5; - continue; - } - *s++ = *p; - } - strcpy(s, "'\n"); - - if (copy_to_user(buf, tmp + k, count)) - return -EFAULT; - - kfree(tmp); - k += count; - - } else if (op->flag & OPP_BINARY) { - char buffer[10]; - u32 *first, *last; - int first_off, last_cnt; - - first = ((u32 *)op->value) + k / 9; - first_off = k % 9; - last = ((u32 *)op->value) + (k + count - 1) / 9; - last_cnt = (k + count) % 9; - if (!last_cnt) last_cnt = 9; - - if (first == last) { - sprintf (buffer, "%08x.", *first); - if (copy_to_user(buf, buffer + first_off, - last_cnt - first_off)) - return -EFAULT; - buf += last_cnt - first_off; - } else { - for (q = first; q <= last; q++) { - sprintf (buffer, "%08x.", *q); - if (q == first) { - if (copy_to_user(buf, buffer + first_off, - 9 - first_off)) - return -EFAULT; - buf += 9 - first_off; - } else if (q == last) { - if (copy_to_user(buf, buffer, last_cnt)) - return -EFAULT; - buf += last_cnt; - } else { - if (copy_to_user(buf, buffer, 9)) - return -EFAULT; - buf += 9; - } - } - } + if ((i && !val) || + (val >= ' ' && val <= '~')) + continue; - if (last == (u32 *)(op->value + op->len - 4) && last_cnt == 9) { - if (put_user('\n', (buf - 1))) - return -EFAULT; - } + return 0; + } - k += count; + return 1; +} - } else if (op->flag & OPP_HEXSTRING) { - char buffer[3]; +static int property_show(struct seq_file *f, void *v) +{ + struct property *prop = f->private; + void *pval; + int len; - if ((k < i - 1) && (k & 1)) { - sprintf (buffer, "%02x", - (unsigned char) *(op->value + (k >> 1)) & 0xff); - if (put_user(buffer[1], &buf[k++ - *ppos])) - return -EFAULT; - count--; - } + len = prop->length; + pval = prop->value; - for (; (count > 1) && (k < i - 1); k += 2) { - sprintf (buffer, "%02x", - (unsigned char) *(op->value + (k >> 1)) & 0xff); - if (copy_to_user(buf + k - *ppos, buffer, 2)) - return -EFAULT; - count -= 2; - } + if (is_string(pval, len)) { + while (len > 0) { + int n = strlen(pval); - if (count && (k < i - 1)) { - sprintf (buffer, "%02x", - (unsigned char) *(op->value + (k >> 1)) & 0xff); - if (put_user(buffer[0], &buf[k++ - *ppos])) - return -EFAULT; - count--; - } + seq_printf(f, "%s", (char *) pval); - if (count) { - if (put_user('\n', &buf [k++ - *ppos])) - return -EFAULT; - } - } - count = k - *ppos; - *ppos = k; - return count; -} + /* Skip over the NULL byte too. */ + pval += n + 1; + len -= n + 1; -static ssize_t property_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) -{ - int i, j, k; - char *p; - u32 *q; - void *b; - openprom_property *op; - - if (*ppos >= 0xffffff || count >= 0xffffff) - return -EINVAL; - if (!filp->private_data) { - i = property_read (filp, NULL, 0, NULL); - if (i) - return i; - } - k = *ppos; - op = filp->private_data; - if (!(op->flag & OPP_STRING)) { - u32 *first, *last; - int first_off, last_cnt; - u32 mask, mask2; - char tmp [9]; - int forcelen = 0; - - j = k % 9; - for (i = 0; i < count; i++, j++) { - if (j == 9) j = 0; - if (!j) { - char ctmp; - if (get_user(ctmp, &buf[i])) - return -EFAULT; - if (ctmp != '.') { - if (ctmp != '\n') { - if (op->flag & OPP_BINARY) - return -EINVAL; - else - goto write_try_string; - } else { - count = i + 1; - forcelen = 1; - break; - } - } - } else { - char ctmp; - if (get_user(ctmp, &buf[i])) - return -EFAULT; - if (ctmp < '0' || - (ctmp > '9' && ctmp < 'A') || - (ctmp > 'F' && ctmp < 'a') || - ctmp > 'f') { - if (op->flag & OPP_BINARY) - return -EINVAL; - else - goto write_try_string; - } - } - } - op->flag |= OPP_BINARY; - tmp [8] = 0; - i = ((count + k + 8) / 9) << 2; - if (op->alloclen <= i) { - b = kmalloc (sizeof (openprom_property) + 2 * i, - GFP_KERNEL); - if (!b) - return -ENOMEM; - memcpy (b, filp->private_data, - sizeof (openprom_property) - + strlen (op->name) + op->alloclen); - memset (b + sizeof (openprom_property) - + strlen (op->name) + op->alloclen, - 0, 2 * i - op->alloclen); - op = b; - op->alloclen = 2*i; - b = filp->private_data; - filp->private_data = op; - kfree (b); + if (len > 0) + seq_printf(f, " + "); } - first = ((u32 *)op->value) + (k / 9); - first_off = k % 9; - last = (u32 *)(op->value + i); - last_cnt = (k + count) % 9; - if (first + 1 == last) { - memset (tmp, '0', 8); - if (copy_from_user(tmp + first_off, buf, - (count + first_off > 8) ? - 8 - first_off : count)) - return -EFAULT; - mask = 0xffffffff; - mask2 = 0xffffffff; - for (j = 0; j < first_off; j++) - mask >>= 1; - for (j = 8 - count - first_off; j > 0; j--) - mask2 <<= 1; - mask &= mask2; - if (mask) { - *first &= ~mask; - *first |= simple_strtoul (tmp, NULL, 16); - op->flag |= OPP_DIRTY; + } else { + if (len & 3) { + while (len) { + len--; + if (len) + seq_printf(f, "%02x.", + *(unsigned char *) pval); + else + seq_printf(f, "%02x", + *(unsigned char *) pval); + pval++; } } else { - op->flag |= OPP_DIRTY; - for (q = first; q < last; q++) { - if (q == first) { - if (first_off < 8) { - memset (tmp, '0', 8); - if (copy_from_user(tmp + first_off, - buf, - 8 - first_off)) - return -EFAULT; - mask = 0xffffffff; - for (j = 0; j < first_off; j++) - mask >>= 1; - *q &= ~mask; - *q |= simple_strtoul (tmp,NULL,16); - } - buf += 9; - } else if ((q == last - 1) && last_cnt - && (last_cnt < 8)) { - memset (tmp, '0', 8); - if (copy_from_user(tmp, buf, last_cnt)) - return -EFAULT; - mask = 0xffffffff; - for (j = 0; j < 8 - last_cnt; j++) - mask <<= 1; - *q &= ~mask; - *q |= simple_strtoul (tmp, NULL, 16); - buf += last_cnt; - } else { - char tchars[2 * sizeof(long) + 1]; - - if (copy_from_user(tchars, buf, sizeof(tchars) - 1)) - return -EFAULT; - tchars[sizeof(tchars) - 1] = '\0'; - *q = simple_strtoul (tchars, NULL, 16); - buf += 9; - } - } - } - if (!forcelen) { - if (op->len < i) - op->len = i; - } else - op->len = i; - *ppos += count; - } -write_try_string: - if (!(op->flag & OPP_BINARY)) { - if (!(op->flag & (OPP_QUOTED | OPP_NOTQUOTED))) { - char ctmp; - - /* No way, if somebody starts writing from the middle, - * we don't know whether he uses quotes around or not - */ - if (k > 0) - return -EINVAL; - if (get_user(ctmp, buf)) - return -EFAULT; - if (ctmp == '\'') { - op->flag |= OPP_QUOTED; - buf++; - count--; - (*ppos)++; - if (!count) { - op->flag |= OPP_STRING; - return 1; - } - } else - op->flag |= OPP_NOTQUOTED; - } - op->flag |= OPP_STRING; - if (op->alloclen <= count + *ppos) { - b = kmalloc (sizeof (openprom_property) - + 2 * (count + *ppos), GFP_KERNEL); - if (!b) - return -ENOMEM; - memcpy (b, filp->private_data, - sizeof (openprom_property) - + strlen (op->name) + op->alloclen); - memset (b + sizeof (openprom_property) - + strlen (op->name) + op->alloclen, - 0, 2*(count - *ppos) - op->alloclen); - op = b; - op->alloclen = 2*(count + *ppos); - b = filp->private_data; - filp->private_data = op; - kfree (b); - } - p = op->value + *ppos - ((op->flag & OPP_QUOTED) ? 1 : 0); - if (copy_from_user(p, buf, count)) - return -EFAULT; - op->flag |= OPP_DIRTY; - for (i = 0; i < count; i++, p++) - if (*p == '\n') { - *p = 0; - break; + while (len >= 4) { + len -= 4; + + if (len) + seq_printf(f, "%08x.", + *(unsigned int *) pval); + else + seq_printf(f, "%08x", + *(unsigned int *) pval); + pval += 4; } - if (i < count) { - op->len = p - op->value; - *ppos += i + 1; - if ((p > op->value) && (op->flag & OPP_QUOTED) - && (*(p - 1) == '\'')) - op->len--; - } else { - if (p - op->value > op->len) - op->len = p - op->value; - *ppos += count; } } - return *ppos - k; + seq_printf(f, "\n"); + + return 0; } -int property_release (struct inode *inode, struct file *filp) +static void *property_start(struct seq_file *f, loff_t *pos) { - openprom_property *op = filp->private_data; - int error; - u32 node; - - if (!op) - return 0; - lock_kernel(); - node = nodes[ptr_nod(inode->u.generic_ip)].node; - if (ptr_nod(inode->u.generic_ip) == aliases) { - if ((op->flag & OPP_DIRTY) && (op->flag & OPP_STRING)) { - char *p = op->name; - int i = (op->value - op->name) - strlen (op->name) - 1; - op->value [op->len] = 0; - *(op->value - 1) = ' '; - if (i) { - for (p = op->value - i - 2; p >= op->name; p--) - p[i] = *p; - p = op->name + i; - } - memcpy (p - 8, "nvalias ", 8); - prom_feval (p - 8); - } - } else if (op->flag & OPP_DIRTY) { - if (op->flag & OPP_STRING) { - op->value [op->len] = 0; - error = prom_setprop (node, op->name, - op->value, op->len + 1); - if (error <= 0) - printk (KERN_WARNING "openpromfs: " - "Couldn't write property %s\n", - op->name); - } else if ((op->flag & OPP_BINARY) || !op->len) { - error = prom_setprop (node, op->name, - op->value, op->len); - if (error <= 0) - printk (KERN_WARNING "openpromfs: " - "Couldn't write property %s\n", - op->name); - } else { - printk (KERN_WARNING "openpromfs: " - "Unknown property type of %s\n", - op->name); - } + if (*pos == 0) + return pos; + return NULL; +} + +static void *property_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + return NULL; +} + +static void property_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static struct seq_operations property_op = { + .start = property_start, + .next = property_next, + .stop = property_stop, + .show = property_show +}; + +static int property_open(struct inode *inode, struct file *file) +{ + struct op_inode_info *oi = OP_I(inode); + int ret; + + BUG_ON(oi->type != op_inode_prop); + + ret = seq_open(file, &property_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = oi->u.prop; } - unlock_kernel(); - kfree (filp->private_data); - return 0; + return ret; } static const struct file_operations openpromfs_prop_ops = { - .read = property_read, - .write = property_write, - .release = property_release, + .open = property_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; -static const struct file_operations openpromfs_nodenum_ops = { - .read = nodenum_read, -}; +static int openpromfs_readdir(struct file *, void *, filldir_t); static const struct file_operations openprom_operations = { .read = generic_read_dir, .readdir = openpromfs_readdir, }; -static struct inode_operations openprom_alias_inode_operations = { - .create = openpromfs_create, - .lookup = openpromfs_lookup, - .unlink = openpromfs_unlink, -}; +static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, struct nameidata *); static struct inode_operations openprom_inode_operations = { .lookup = openpromfs_lookup, }; -static int lookup_children(u16 n, const char * name, int len) +static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { - int ret; - u16 node; - for (; n != 0xffff; n = nodes[n].next) { - node = nodes[n].child; - if (node != 0xffff) { - char buffer[128]; - int i; - char *p; - - while (node != 0xffff) { - if (prom_getname (nodes[node].node, - buffer, 128) >= 0) { - i = strlen (buffer); - if ((len == i) - && !strncmp (buffer, name, len)) - return NODE2INO(node); - p = strchr (buffer, '@'); - if (p && (len == p - buffer) - && !strncmp (buffer, name, len)) - return NODE2INO(node); - } - node = nodes[node].next; - } - } else - continue; - ret = lookup_children (nodes[n].child, name, len); - if (ret) return ret; - } - return 0; -} - -static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) -{ - int ino = 0; -#define OPFSL_DIR 0 -#define OPFSL_PROPERTY 1 -#define OPFSL_NODENUM 2 - int type = 0; - char buffer[128]; - char *p; + struct op_inode_info *ent_oi, *oi = OP_I(dir); + struct device_node *dp, *child; + struct property *prop; + enum op_inode_type ent_type; + union op_inode_data ent_data; const char *name; - u32 n; - u16 dirnode; - unsigned int len; - int i; struct inode *inode; - char buffer2[64]; + unsigned int ino; + int len; - inode = NULL; + BUG_ON(oi->type != op_inode_node); + + dp = oi->u.node; + name = dentry->d_name.name; len = dentry->d_name.len; - lock_kernel(); - if (name [0] == '.' && len == 5 && !strncmp (name + 1, "node", 4)) { - ino = NODEP2INO(NODE(dir->i_ino).first_prop); - type = OPFSL_NODENUM; - } - if (!ino) { - u16 node = NODE(dir->i_ino).child; - while (node != 0xffff) { - if (prom_getname (nodes[node].node, buffer, 128) >= 0) { - i = strlen (buffer); - if (len == i && !strncmp (buffer, name, len)) { - ino = NODE2INO(node); - type = OPFSL_DIR; - break; - } - p = strchr (buffer, '@'); - if (p && (len == p - buffer) - && !strncmp (buffer, name, len)) { - ino = NODE2INO(node); - type = OPFSL_DIR; - break; - } - } - node = nodes[node].next; - } - } - n = NODE(dir->i_ino).node; - dirnode = dir->i_ino - OPENPROM_FIRST_INO; - if (!ino) { - int j = NODEP2INO(NODE(dir->i_ino).first_prop); - if (dirnode != aliases) { - for (p = prom_firstprop (n, buffer2); - p && *p; - p = prom_nextprop (n, p, buffer2)) { - j++; - if ((len == strlen (p)) - && !strncmp (p, name, len)) { - ino = j; - type = OPFSL_PROPERTY; - break; - } - } - } else { - int k; - for (k = 0; k < aliases_nodes; k++) { - j++; - if (alias_names [k] - && (len == strlen (alias_names [k])) - && !strncmp (alias_names [k], name, len)) { - ino = j; - type = OPFSL_PROPERTY; - break; - } - } + + mutex_lock(&op_mutex); + + child = dp->child; + while (child) { + int n = strlen(child->path_component_name); + + if (len == n && + !strncmp(child->path_component_name, name, len)) { + ent_type = op_inode_node; + ent_data.node = child; + ino = child->unique_id; + goto found; } + child = child->sibling; } - if (!ino) { - ino = lookup_children (NODE(dir->i_ino).child, name, len); - if (ino) - type = OPFSL_DIR; - else { - unlock_kernel(); - return ERR_PTR(-ENOENT); + + prop = dp->properties; + while (prop) { + int n = strlen(prop->name); + + if (len == n && !strncmp(prop->name, name, len)) { + ent_type = op_inode_prop; + ent_data.prop = prop; + ino = prop->unique_id; + goto found; } + + prop = prop->next; } - inode = iget (dir->i_sb, ino); - unlock_kernel(); + + mutex_unlock(&op_mutex); + return ERR_PTR(-ENOENT); + +found: + inode = iget(dir->i_sb, ino); + mutex_unlock(&op_mutex); if (!inode) return ERR_PTR(-EINVAL); - switch (type) { - case OPFSL_DIR: + ent_oi = OP_I(inode); + ent_oi->type = ent_type; + ent_oi->u = ent_data; + + switch (ent_type) { + case op_inode_node: inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; - if (ino == OPENPROM_FIRST_INO + aliases) { - inode->i_mode |= S_IWUSR; - inode->i_op = &openprom_alias_inode_operations; - } else - inode->i_op = &openprom_inode_operations; + inode->i_op = &openprom_inode_operations; inode->i_fop = &openprom_operations; inode->i_nlink = 2; break; - case OPFSL_NODENUM: - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &openpromfs_nodenum_ops; - inode->i_nlink = 1; - inode->u.generic_ip = (void *)(long)(n); - break; - case OPFSL_PROPERTY: - if ((dirnode == options) && (len == 17) - && !strncmp (name, "security-password", 17)) + case op_inode_prop: + if (!strcmp(dp->name, "options") && (len == 17) && + !strncmp (name, "security-password", 17)) inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; - else { + else inode->i_mode = S_IFREG | S_IRUGO; - if (dirnode == options || dirnode == aliases) { - if (len != 4 || strncmp (name, "name", 4)) - inode->i_mode |= S_IWUSR; - } - } inode->i_fop = &openpromfs_prop_ops; inode->i_nlink = 1; - if (inode->i_size < 0) - inode->i_size = 0; - inode->u.generic_ip = (void *)(long)(((u16)dirnode) | - (((u16)(ino - NODEP2INO(NODE(dir->i_ino).first_prop) - 1)) << 16)); + inode->i_size = ent_oi->u.prop->length; break; } @@ -781,237 +263,89 @@ static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentr static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir) { struct inode *inode = filp->f_dentry->d_inode; + struct op_inode_info *oi = OP_I(inode); + struct device_node *dp = oi->u.node; + struct device_node *child; + struct property *prop; unsigned int ino; - u32 n; - int i, j; - char buffer[128]; - u16 node; - char *p; - char buffer2[64]; - - lock_kernel(); + int i; + + mutex_lock(&op_mutex); ino = inode->i_ino; i = filp->f_pos; switch (i) { case 0: - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) goto out; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; i++; filp->f_pos++; /* fall thru */ case 1: - if (filldir(dirent, "..", 2, i, - (NODE(ino).parent == 0xffff) ? - OPENPROM_ROOT_INO : NODE2INO(NODE(ino).parent), DT_DIR) < 0) + if (filldir(dirent, "..", 2, i, + (dp->parent == NULL ? + OPENPROM_ROOT_INO : + dp->parent->unique_id), DT_DIR) < 0) goto out; i++; filp->f_pos++; /* fall thru */ default: i -= 2; - node = NODE(ino).child; - while (i && node != 0xffff) { - node = nodes[node].next; + + /* First, the children nodes as directories. */ + child = dp->child; + while (i && child) { + child = child->sibling; i--; } - while (node != 0xffff) { - if (prom_getname (nodes[node].node, buffer, 128) < 0) - goto out; - if (filldir(dirent, buffer, strlen(buffer), - filp->f_pos, NODE2INO(node), DT_DIR) < 0) + while (child) { + if (filldir(dirent, + child->path_component_name, + strlen(child->path_component_name), + filp->f_pos, child->unique_id, DT_DIR) < 0) goto out; + filp->f_pos++; - node = nodes[node].next; + child = child->sibling; } - j = NODEP2INO(NODE(ino).first_prop); - if (!i) { - if (filldir(dirent, ".node", 5, filp->f_pos, j, DT_REG) < 0) + + /* Next, the properties as files. */ + prop = dp->properties; + while (i && prop) { + prop = prop->next; + i--; + } + while (prop) { + if (filldir(dirent, prop->name, strlen(prop->name), + filp->f_pos, prop->unique_id, DT_REG) < 0) goto out; + filp->f_pos++; - } else - i--; - n = NODE(ino).node; - if (ino == OPENPROM_FIRST_INO + aliases) { - for (j++; i < aliases_nodes; i++, j++) { - if (alias_names [i]) { - if (filldir (dirent, alias_names [i], - strlen (alias_names [i]), - filp->f_pos, j, DT_REG) < 0) goto out; - filp->f_pos++; - } - } - } else { - for (p = prom_firstprop (n, buffer2); - p && *p; - p = prom_nextprop (n, p, buffer2)) { - j++; - if (i) i--; - else { - if (filldir(dirent, p, strlen(p), - filp->f_pos, j, DT_REG) < 0) - goto out; - filp->f_pos++; - } - } + prop = prop->next; } } out: - unlock_kernel(); - return 0; -} - -static int openpromfs_create (struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) -{ - char *p; - struct inode *inode; - - if (!dir) - return -ENOENT; - if (dentry->d_name.len > 256) - return -EINVAL; - p = kmalloc (dentry->d_name.len + 1, GFP_KERNEL); - if (!p) - return -ENOMEM; - strncpy (p, dentry->d_name.name, dentry->d_name.len); - p [dentry->d_name.len] = 0; - lock_kernel(); - if (aliases_nodes == ALIASES_NNODES) { - kfree(p); - unlock_kernel(); - return -EIO; - } - alias_names [aliases_nodes++] = p; - inode = iget (dir->i_sb, - NODEP2INO(NODE(dir->i_ino).first_prop) + aliases_nodes); - if (!inode) { - unlock_kernel(); - return -EINVAL; - } - inode->i_mode = S_IFREG | S_IRUGO | S_IWUSR; - inode->i_fop = &openpromfs_prop_ops; - inode->i_nlink = 1; - if (inode->i_size < 0) inode->i_size = 0; - inode->u.generic_ip = (void *)(long)(((u16)aliases) | - (((u16)(aliases_nodes - 1)) << 16)); - unlock_kernel(); - d_instantiate(dentry, inode); + mutex_unlock(&op_mutex); return 0; } -static int openpromfs_unlink (struct inode *dir, struct dentry *dentry) -{ - unsigned int len; - char *p; - const char *name; - int i; - - name = dentry->d_name.name; - len = dentry->d_name.len; - lock_kernel(); - for (i = 0; i < aliases_nodes; i++) - if ((strlen (alias_names [i]) == len) - && !strncmp (name, alias_names[i], len)) { - char buffer[512]; - - p = alias_names [i]; - alias_names [i] = NULL; - kfree (p); - strcpy (buffer, "nvunalias "); - memcpy (buffer + 10, name, len); - buffer [10 + len] = 0; - prom_feval (buffer); - } - unlock_kernel(); - return 0; -} +static kmem_cache_t *op_inode_cachep; -/* {{{ init section */ -static int __init check_space (u16 n) +static struct inode *openprom_alloc_inode(struct super_block *sb) { - unsigned long pages; + struct op_inode_info *oi; - if ((1 << alloced) * PAGE_SIZE < (n + 2) * sizeof(openpromfs_node)) { - pages = __get_free_pages (GFP_KERNEL, alloced + 1); - if (!pages) - return -1; + oi = kmem_cache_alloc(op_inode_cachep, SLAB_KERNEL); + if (!oi) + return NULL; - if (nodes) { - memcpy ((char *)pages, nodes, - (1 << alloced) * PAGE_SIZE); - free_pages ((unsigned long)nodes, alloced); - } - alloced++; - nodes = (openpromfs_node *)pages; - } - return 0; + return &oi->vfs_inode; } -static u16 __init get_nodes (u16 parent, u32 node) +static void openprom_destroy_inode(struct inode *inode) { - char *p; - u16 n = last_node++, i; - char buffer[64]; - - if (check_space (n) < 0) - return 0xffff; - nodes[n].parent = parent; - nodes[n].node = node; - nodes[n].next = 0xffff; - nodes[n].child = 0xffff; - nodes[n].first_prop = first_prop++; - if (!parent) { - char buffer[8]; - int j; - - if ((j = prom_getproperty (node, "name", buffer, 8)) >= 0) { - buffer[j] = 0; - if (!strcmp (buffer, "options")) - options = n; - else if (!strcmp (buffer, "aliases")) - aliases = n; - } - } - if (n != aliases) - for (p = prom_firstprop (node, buffer); - p && p != (char *)-1 && *p; - p = prom_nextprop (node, p, buffer)) - first_prop++; - else { - char *q; - for (p = prom_firstprop (node, buffer); - p && p != (char *)-1 && *p; - p = prom_nextprop (node, p, buffer)) { - if (aliases_nodes == ALIASES_NNODES) - break; - for (i = 0; i < aliases_nodes; i++) - if (!strcmp (p, alias_names [i])) - break; - if (i < aliases_nodes) - continue; - q = kmalloc (strlen (p) + 1, GFP_KERNEL); - if (!q) - return 0xffff; - strcpy (q, p); - alias_names [aliases_nodes++] = q; - } - first_prop += ALIASES_NNODES; - } - node = prom_getchild (node); - if (node) { - parent = get_nodes (n, node); - if (parent == 0xffff) - return 0xffff; - nodes[n].child = parent; - while ((node = prom_getsibling (node)) != 0) { - i = get_nodes (n, node); - if (i == 0xffff) - return 0xffff; - nodes[parent].next = i; - parent = i; - } - } - return n; + kmem_cache_free(op_inode_cachep, OP_I(inode)); } static void openprom_read_inode(struct inode * inode) @@ -1031,6 +365,8 @@ static int openprom_remount(struct super_block *sb, int *flags, char *data) } static struct super_operations openprom_sops = { + .alloc_inode = openprom_alloc_inode, + .destroy_inode = openprom_destroy_inode, .read_inode = openprom_read_inode, .statfs = simple_statfs, .remount_fs = openprom_remount, @@ -1038,7 +374,8 @@ static struct super_operations openprom_sops = { static int openprom_fill_super(struct super_block *s, void *data, int silent) { - struct inode * root_inode; + struct inode *root_inode; + struct op_inode_info *oi; s->s_flags |= MS_NOATIME; s->s_blocksize = 1024; @@ -1049,6 +386,11 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent) root_inode = iget(s, OPENPROM_ROOT_INO); if (!root_inode) goto out_no_root; + + oi = OP_I(root_inode); + oi->type = op_inode_node; + oi->u.node = of_find_node_by_path("/"); + s->s_root = d_alloc_root(root_inode); if (!s->s_root) goto out_no_root; @@ -1073,29 +415,39 @@ static struct file_system_type openprom_fs_type = { .kill_sb = kill_anon_super, }; +static void op_inode_init_once(void *data, kmem_cache_t * cachep, unsigned long flags) +{ + struct op_inode_info *oi = (struct op_inode_info *) data; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(&oi->vfs_inode); +} + static int __init init_openprom_fs(void) { - nodes = (openpromfs_node *)__get_free_pages(GFP_KERNEL, 0); - if (!nodes) { - printk (KERN_WARNING "openpromfs: can't get free page\n"); - return -EIO; - } - if (get_nodes (0xffff, prom_root_node) == 0xffff) { - printk (KERN_WARNING "openpromfs: couldn't setup tree\n"); - return -EIO; - } - nodes[last_node].first_prop = first_prop; - return register_filesystem(&openprom_fs_type); + int err; + + op_inode_cachep = kmem_cache_create("op_inode_cache", + sizeof(struct op_inode_info), + 0, + (SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD), + op_inode_init_once, NULL); + if (!op_inode_cachep) + return -ENOMEM; + + err = register_filesystem(&openprom_fs_type); + if (err) + kmem_cache_destroy(op_inode_cachep); + + return err; } static void __exit exit_openprom_fs(void) { - int i; unregister_filesystem(&openprom_fs_type); - free_pages ((unsigned long)nodes, alloced); - for (i = 0; i < aliases_nodes; i++) - kfree (alias_names [i]); - nodes = NULL; + kmem_cache_destroy(op_inode_cachep); } module_init(init_openprom_fs) diff --git a/fs/pnode.c b/fs/pnode.c index 37b568ed0e0..da42ee61c1d 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -53,8 +53,7 @@ static int do_make_slave(struct vfsmount *mnt) if (master) { list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) slave_mnt->mnt_master = master; - list_del(&mnt->mnt_slave); - list_add(&mnt->mnt_slave, &master->mnt_slave_list); + list_move(&mnt->mnt_slave, &master->mnt_slave_list); list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); INIT_LIST_HEAD(&mnt->mnt_slave_list); } else { @@ -283,10 +282,8 @@ static void __propagate_umount(struct vfsmount *mnt) * umount the child only if the child has no * other children */ - if (child && list_empty(&child->mnt_mounts)) { - list_del(&child->mnt_hash); - list_add_tail(&child->mnt_hash, &mnt->mnt_hash); - } + if (child && list_empty(&child->mnt_mounts)) + list_move_tail(&child->mnt_hash, &mnt->mnt_hash); } } diff --git a/fs/proc/base.c b/fs/proc/base.c index 6afff725a8c..6ba7785319d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -74,6 +74,16 @@ #include <linux/poll.h> #include "internal.h" +/* NOTE: + * Implementing inode permission operations in /proc is almost + * certainly an error. Permission checks need to happen during + * each system call not at open time. The reason is that most of + * what we wish to check for permissions in /proc varies at runtime. + * + * The classic example of a problem is opening file descriptors + * in /proc for a task before it execs a suid executable. + */ + /* * For hysterical raisins we keep the same inumbers as in the old procfs. * Feel free to change the macro below - just keep the range distinct from @@ -121,6 +131,8 @@ enum pid_directory_inos { PROC_TGID_ATTR_PREV, PROC_TGID_ATTR_EXEC, PROC_TGID_ATTR_FSCREATE, + PROC_TGID_ATTR_KEYCREATE, + PROC_TGID_ATTR_SOCKCREATE, #endif #ifdef CONFIG_AUDITSYSCALL PROC_TGID_LOGINUID, @@ -162,6 +174,8 @@ enum pid_directory_inos { PROC_TID_ATTR_PREV, PROC_TID_ATTR_EXEC, PROC_TID_ATTR_FSCREATE, + PROC_TID_ATTR_KEYCREATE, + PROC_TID_ATTR_SOCKCREATE, #endif #ifdef CONFIG_AUDITSYSCALL PROC_TID_LOGINUID, @@ -173,6 +187,9 @@ enum pid_directory_inos { PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */ }; +/* Worst case buffer size needed for holding an integer. */ +#define PROC_NUMBUF 10 + struct pid_entry { int type; int len; @@ -275,6 +292,8 @@ static struct pid_entry tgid_attr_stuff[] = { E(PROC_TGID_ATTR_PREV, "prev", S_IFREG|S_IRUGO), E(PROC_TGID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO), E(PROC_TGID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TGID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TGID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO), {0,0,NULL,0} }; static struct pid_entry tid_attr_stuff[] = { @@ -282,6 +301,8 @@ static struct pid_entry tid_attr_stuff[] = { E(PROC_TID_ATTR_PREV, "prev", S_IFREG|S_IRUGO), E(PROC_TID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO), E(PROC_TID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO), {0,0,NULL,0} }; #endif @@ -290,12 +311,15 @@ static struct pid_entry tid_attr_stuff[] = { static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct task_struct *task = proc_task(inode); - struct files_struct *files; + struct task_struct *task = get_proc_task(inode); + struct files_struct *files = NULL; struct file *file; - int fd = proc_type(inode) - PROC_TID_FD_DIR; + int fd = proc_fd(inode); - files = get_files_struct(task); + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } if (files) { /* * We are not taking a ref to the file structure, so we must @@ -327,29 +351,33 @@ static struct fs_struct *get_fs_struct(struct task_struct *task) return fs; } -static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) +static int get_nr_threads(struct task_struct *tsk) { - struct fs_struct *fs = get_fs_struct(proc_task(inode)); - int result = -ENOENT; - if (fs) { - read_lock(&fs->lock); - *mnt = mntget(fs->pwdmnt); - *dentry = dget(fs->pwd); - read_unlock(&fs->lock); - result = 0; - put_fs_struct(fs); + /* Must be called with the rcu_read_lock held */ + unsigned long flags; + int count = 0; + + if (lock_task_sighand(tsk, &flags)) { + count = atomic_read(&tsk->signal->count); + unlock_task_sighand(tsk, &flags); } - return result; + return count; } -static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) +static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct fs_struct *fs = get_fs_struct(proc_task(inode)); + struct task_struct *task = get_proc_task(inode); + struct fs_struct *fs = NULL; int result = -ENOENT; + + if (task) { + fs = get_fs_struct(task); + put_task_struct(task); + } if (fs) { read_lock(&fs->lock); - *mnt = mntget(fs->rootmnt); - *dentry = dget(fs->root); + *mnt = mntget(fs->pwdmnt); + *dentry = dget(fs->pwd); read_unlock(&fs->lock); result = 0; put_fs_struct(fs); @@ -357,42 +385,16 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf return result; } - -/* Same as proc_root_link, but this addionally tries to get fs from other - * threads in the group */ -static int proc_task_root_link(struct inode *inode, struct dentry **dentry, - struct vfsmount **mnt) +static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct fs_struct *fs; + struct task_struct *task = get_proc_task(inode); + struct fs_struct *fs = NULL; int result = -ENOENT; - struct task_struct *leader = proc_task(inode); - task_lock(leader); - fs = leader->fs; - if (fs) { - atomic_inc(&fs->count); - task_unlock(leader); - } else { - /* Try to get fs from other threads */ - task_unlock(leader); - read_lock(&tasklist_lock); - if (pid_alive(leader)) { - struct task_struct *task = leader; - - while ((task = next_thread(task)) != leader) { - task_lock(task); - fs = task->fs; - if (fs) { - atomic_inc(&fs->count); - task_unlock(task); - break; - } - task_unlock(task); - } - } - read_unlock(&tasklist_lock); + if (task) { + fs = get_fs_struct(task); + put_task_struct(task); } - if (fs) { read_lock(&fs->lock); *mnt = mntget(fs->rootmnt); @@ -404,7 +406,6 @@ static int proc_task_root_link(struct inode *inode, struct dentry **dentry, return result; } - #define MAY_PTRACE(task) \ (task == current || \ (task->parent == current && \ @@ -535,142 +536,22 @@ static int proc_oom_score(struct task_struct *task, char *buffer) /************************************************************************/ /* permission checks */ - -/* If the process being read is separated by chroot from the reading process, - * don't let the reader access the threads. - * - * note: this does dput(root) and mntput(vfsmnt) on exit. - */ -static int proc_check_chroot(struct dentry *root, struct vfsmount *vfsmnt) -{ - struct dentry *de, *base; - struct vfsmount *our_vfsmnt, *mnt; - int res = 0; - - read_lock(¤t->fs->lock); - our_vfsmnt = mntget(current->fs->rootmnt); - base = dget(current->fs->root); - read_unlock(¤t->fs->lock); - - spin_lock(&vfsmount_lock); - de = root; - mnt = vfsmnt; - - while (mnt != our_vfsmnt) { - if (mnt == mnt->mnt_parent) - goto out; - de = mnt->mnt_mountpoint; - mnt = mnt->mnt_parent; - } - - if (!is_subdir(de, base)) - goto out; - spin_unlock(&vfsmount_lock); - -exit: - dput(base); - mntput(our_vfsmnt); - dput(root); - mntput(vfsmnt); - return res; -out: - spin_unlock(&vfsmount_lock); - res = -EACCES; - goto exit; -} - -static int proc_check_root(struct inode *inode) -{ - struct dentry *root; - struct vfsmount *vfsmnt; - - if (proc_root_link(inode, &root, &vfsmnt)) /* Ewww... */ - return -ENOENT; - return proc_check_chroot(root, vfsmnt); -} - -static int proc_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - if (generic_permission(inode, mask, NULL) != 0) - return -EACCES; - return proc_check_root(inode); -} - -static int proc_task_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - struct dentry *root; - struct vfsmount *vfsmnt; - - if (generic_permission(inode, mask, NULL) != 0) - return -EACCES; - - if (proc_task_root_link(inode, &root, &vfsmnt)) - return -ENOENT; - - return proc_check_chroot(root, vfsmnt); -} - -extern struct seq_operations proc_pid_maps_op; -static int maps_open(struct inode *inode, struct file *file) -{ - struct task_struct *task = proc_task(inode); - int ret = seq_open(file, &proc_pid_maps_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = task; - } - return ret; -} - -static struct file_operations proc_maps_operations = { - .open = maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#ifdef CONFIG_NUMA -extern struct seq_operations proc_pid_numa_maps_op; -static int numa_maps_open(struct inode *inode, struct file *file) -{ - struct task_struct *task = proc_task(inode); - int ret = seq_open(file, &proc_pid_numa_maps_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = task; - } - return ret; -} - -static struct file_operations proc_numa_maps_operations = { - .open = numa_maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - -#ifdef CONFIG_MMU -extern struct seq_operations proc_pid_smaps_op; -static int smaps_open(struct inode *inode, struct file *file) +static int proc_fd_access_allowed(struct inode *inode) { - struct task_struct *task = proc_task(inode); - int ret = seq_open(file, &proc_pid_smaps_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = task; + struct task_struct *task; + int allowed = 0; + /* Allow access to a task's file descriptors if it is us or we + * may use ptrace attach to the process and find out that + * information. + */ + task = get_proc_task(inode); + if (task) { + allowed = ptrace_may_attach(task); + put_task_struct(task); } - return ret; + return allowed; } -static struct file_operations proc_smaps_operations = { - .open = smaps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - extern struct seq_operations mounts_op; struct proc_mounts { struct seq_file m; @@ -679,16 +560,19 @@ struct proc_mounts { static int mounts_open(struct inode *inode, struct file *file) { - struct task_struct *task = proc_task(inode); - struct namespace *namespace; + struct task_struct *task = get_proc_task(inode); + struct namespace *namespace = NULL; struct proc_mounts *p; int ret = -EINVAL; - task_lock(task); - namespace = task->namespace; - if (namespace) - get_namespace(namespace); - task_unlock(task); + if (task) { + task_lock(task); + namespace = task->namespace; + if (namespace) + get_namespace(namespace); + task_unlock(task); + put_task_struct(task); + } if (namespace) { ret = -ENOMEM; @@ -745,17 +629,21 @@ static struct file_operations proc_mounts_operations = { extern struct seq_operations mountstats_op; static int mountstats_open(struct inode *inode, struct file *file) { - struct task_struct *task = proc_task(inode); int ret = seq_open(file, &mountstats_op); if (!ret) { struct seq_file *m = file->private_data; - struct namespace *namespace; - task_lock(task); - namespace = task->namespace; - if (namespace) - get_namespace(namespace); - task_unlock(task); + struct namespace *namespace = NULL; + struct task_struct *task = get_proc_task(inode); + + if (task) { + task_lock(task); + namespace = task->namespace; + if (namespace) + get_namespace(namespace); + task_unlock(task); + put_task_struct(task); + } if (namespace) m->private = namespace; @@ -782,18 +670,27 @@ static ssize_t proc_info_read(struct file * file, char __user * buf, struct inode * inode = file->f_dentry->d_inode; unsigned long page; ssize_t length; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PROC_BLOCK_SIZE) count = PROC_BLOCK_SIZE; + + length = -ENOMEM; if (!(page = __get_free_page(GFP_KERNEL))) - return -ENOMEM; + goto out; length = PROC_I(inode)->op.proc_read(task, (char*)page); if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); free_page(page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -810,12 +707,15 @@ static int mem_open(struct inode* inode, struct file* file) static ssize_t mem_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); char *page; unsigned long src = *ppos; int ret = -ESRCH; struct mm_struct *mm; + if (!task) + goto out_no_task; + if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) goto out; @@ -865,6 +765,8 @@ out_put: out_free: free_page((unsigned long) page); out: + put_task_struct(task); +out_no_task: return ret; } @@ -877,15 +779,20 @@ static ssize_t mem_write(struct file * file, const char * buf, { int copied = 0; char *page; - struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); unsigned long dst = *ppos; + copied = -ESRCH; + if (!task) + goto out_no_task; + if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) - return -ESRCH; + goto out; + copied = -ENOMEM; page = (char *)__get_free_page(GFP_USER); if (!page) - return -ENOMEM; + goto out; while (count > 0) { int this_len, retval; @@ -908,6 +815,9 @@ static ssize_t mem_write(struct file * file, const char * buf, } *ppos = dst; free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: return copied; } #endif @@ -938,13 +848,18 @@ static struct file_operations proc_mem_operations = { static ssize_t oom_adjust_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); - char buffer[8]; + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + char buffer[PROC_NUMBUF]; size_t len; - int oom_adjust = task->oomkilladj; + int oom_adjust; loff_t __ppos = *ppos; - len = sprintf(buffer, "%i\n", oom_adjust); + if (!task) + return -ESRCH; + oom_adjust = task->oomkilladj; + put_task_struct(task); + + len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); if (__ppos >= len) return 0; if (count > len-__ppos) @@ -958,15 +873,15 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, static ssize_t oom_adjust_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); - char buffer[8], *end; + struct task_struct *task; + char buffer[PROC_NUMBUF], *end; int oom_adjust; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - memset(buffer, 0, 8); - if (count > 6) - count = 6; + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; oom_adjust = simple_strtol(buffer, &end, 0); @@ -974,7 +889,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, return -EINVAL; if (*end == '\n') end++; + task = get_proc_task(file->f_dentry->d_inode); + if (!task) + return -ESRCH; task->oomkilladj = oom_adjust; + put_task_struct(task); if (end - buffer == 0) return -EIO; return end - buffer; @@ -985,22 +904,21 @@ static struct file_operations proc_oom_adjust_operations = { .write = oom_adjust_write, }; -static struct inode_operations proc_mem_inode_operations = { - .permission = proc_permission, -}; - #ifdef CONFIG_AUDITSYSCALL #define TMPBUFLEN 21 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file->f_dentry->d_inode; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; + if (!task) + return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", audit_get_loginuid(task->audit_context)); + put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } @@ -1010,13 +928,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, struct inode * inode = file->f_dentry->d_inode; char *page, *tmp; ssize_t length; - struct task_struct *task = proc_task(inode); uid_t loginuid; if (!capable(CAP_AUDIT_CONTROL)) return -EPERM; - if (current != task) + if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) return -EPERM; if (count >= PAGE_SIZE) @@ -1040,7 +957,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - length = audit_set_loginuid(task, loginuid); + length = audit_set_loginuid(current, loginuid); if (likely(length == 0)) length = count; @@ -1059,13 +976,16 @@ static struct file_operations proc_loginuid_operations = { static ssize_t seccomp_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *tsk = proc_task(file->f_dentry->d_inode); + struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode); char __buf[20]; loff_t __ppos = *ppos; size_t len; + if (!tsk) + return -ESRCH; /* no need to print the trailing zero, so use only len */ len = sprintf(__buf, "%u\n", tsk->seccomp.mode); + put_task_struct(tsk); if (__ppos >= len) return 0; if (count > len - __ppos) @@ -1079,29 +999,43 @@ static ssize_t seccomp_read(struct file *file, char __user *buf, static ssize_t seccomp_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *tsk = proc_task(file->f_dentry->d_inode); + struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode); char __buf[20], *end; unsigned int seccomp_mode; + ssize_t result; + + result = -ESRCH; + if (!tsk) + goto out_no_task; /* can set it only once to be even more secure */ + result = -EPERM; if (unlikely(tsk->seccomp.mode)) - return -EPERM; + goto out; + result = -EFAULT; memset(__buf, 0, sizeof(__buf)); count = min(count, sizeof(__buf) - 1); if (copy_from_user(__buf, buf, count)) - return -EFAULT; + goto out; + seccomp_mode = simple_strtoul(__buf, &end, 0); if (*end == '\n') end++; + result = -EINVAL; if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { tsk->seccomp.mode = seccomp_mode; set_tsk_thread_flag(tsk, TIF_SECCOMP); } else - return -EINVAL; + goto out; + result = -EIO; if (unlikely(!(end - __buf))) - return -EIO; - return end - __buf; + goto out; + result = end - __buf; +out: + put_task_struct(tsk); +out_no_task: + return result; } static struct file_operations proc_seccomp_operations = { @@ -1118,10 +1052,8 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) /* We don't need a base pointer in the /proc filesystem */ path_release(nd); - if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE)) - goto out; - error = proc_check_root(inode); - if (error) + /* Are we allowed to snoop on the tasks file descriptors? */ + if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &nd->dentry, &nd->mnt); @@ -1163,12 +1095,8 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b struct dentry *de; struct vfsmount *mnt = NULL; - lock_kernel(); - - if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE)) - goto out; - error = proc_check_root(inode); - if (error) + /* Are we allowed to snoop on the tasks file descriptors? */ + if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &de, &mnt); @@ -1179,7 +1107,6 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b dput(de); mntput(mnt); out: - unlock_kernel(); return error; } @@ -1188,21 +1115,20 @@ static struct inode_operations proc_pid_link_inode_operations = { .follow_link = proc_pid_follow_link }; -#define NUMBUF 10 - static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; - struct task_struct *p = proc_task(inode); + struct dentry *dentry = filp->f_dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); unsigned int fd, tid, ino; int retval; - char buf[NUMBUF]; + char buf[PROC_NUMBUF]; struct files_struct * files; struct fdtable *fdt; retval = -ENOENT; - if (!pid_alive(p)) - goto out; + if (!p) + goto out_no_task; retval = 0; tid = p->pid; @@ -1213,7 +1139,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) goto out; filp->f_pos++; case 1: - ino = fake_ino(tid, PROC_TID_INO); + ino = parent_ino(dentry); if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) goto out; filp->f_pos++; @@ -1232,7 +1158,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) continue; rcu_read_unlock(); - j = NUMBUF; + j = PROC_NUMBUF; i = fd; do { j--; @@ -1241,7 +1167,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) } while (i); ino = fake_ino(tid, PROC_TID_FD_DIR + fd); - if (filldir(dirent, buf+j, NUMBUF-j, fd+2, ino, DT_LNK) < 0) { + if (filldir(dirent, buf+j, PROC_NUMBUF-j, fd+2, ino, DT_LNK) < 0) { rcu_read_lock(); break; } @@ -1251,6 +1177,8 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) put_files_struct(files); } out: + put_task_struct(p); +out_no_task: return retval; } @@ -1262,16 +1190,18 @@ static int proc_pident_readdir(struct file *filp, int pid; struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); struct pid_entry *p; ino_t ino; int ret; ret = -ENOENT; - if (!pid_alive(proc_task(inode))) + if (!task) goto out; ret = 0; - pid = proc_task(inode)->pid; + pid = task->pid; + put_task_struct(task); i = filp->f_pos; switch (i) { case 0: @@ -1354,22 +1284,19 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st /* Common stuff */ ei = PROC_I(inode); - ei->task = NULL; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_ino = fake_ino(task->pid, ino); - if (!pid_alive(task)) - goto out_unlock; - /* * grab the reference to task. */ - get_task_struct(task); - ei->task = task; - ei->type = ino; + ei->pid = get_pid(task->pids[PIDTYPE_PID].pid); + if (!ei->pid) + goto out_unlock; + inode->i_uid = 0; inode->i_gid = 0; - if (ino == PROC_TGID_INO || ino == PROC_TID_INO || task_dumpable(task)) { + if (task_dumpable(task)) { inode->i_uid = task->euid; inode->i_gid = task->egid; } @@ -1379,7 +1306,6 @@ out: return inode; out_unlock: - ei->pde = NULL; iput(inode); return NULL; } @@ -1393,13 +1319,21 @@ out_unlock: * * Rewrite the inode's ownerships here because the owning task may have * performed a setuid(), etc. + * + * Before the /proc/pid/status file was created the only way to read + * the effective uid of a /process was to stat /proc/pid. Reading + * /proc/pid/status is slow enough that procps and other packages + * kept stating /proc/pid. To keep the rules in /proc simple I have + * made this apply to all per process world readable and executable + * directories. */ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - struct task_struct *task = proc_task(inode); - if (pid_alive(task)) { - if (proc_type(inode) == PROC_TGID_INO || proc_type(inode) == PROC_TID_INO || task_dumpable(task)) { + struct task_struct *task = get_proc_task(inode); + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { inode->i_uid = task->euid; inode->i_gid = task->egid; } else { @@ -1407,59 +1341,75 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = 0; } security_task_to_inode(task, inode); + put_task_struct(task); return 1; } d_drop(dentry); return 0; } +static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *task; + generic_fillattr(inode, stat); + + rcu_read_lock(); + stat->uid = 0; + stat->gid = 0; + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + stat->uid = task->euid; + stat->gid = task->egid; + } + } + rcu_read_unlock(); + return 0; +} + static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - struct task_struct *task = proc_task(inode); - int fd = proc_type(inode) - PROC_TID_FD_DIR; + struct task_struct *task = get_proc_task(inode); + int fd = proc_fd(inode); struct files_struct *files; - files = get_files_struct(task); - if (files) { - rcu_read_lock(); - if (fcheck_files(files, fd)) { + if (task) { + files = get_files_struct(task); + if (files) { + rcu_read_lock(); + if (fcheck_files(files, fd)) { + rcu_read_unlock(); + put_files_struct(files); + if (task_dumpable(task)) { + inode->i_uid = task->euid; + inode->i_gid = task->egid; + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + security_task_to_inode(task, inode); + put_task_struct(task); + return 1; + } rcu_read_unlock(); put_files_struct(files); - if (task_dumpable(task)) { - inode->i_uid = task->euid; - inode->i_gid = task->egid; - } else { - inode->i_uid = 0; - inode->i_gid = 0; - } - security_task_to_inode(task, inode); - return 1; } - rcu_read_unlock(); - put_files_struct(files); + put_task_struct(task); } d_drop(dentry); return 0; } -static void pid_base_iput(struct dentry *dentry, struct inode *inode) -{ - struct task_struct *task = proc_task(inode); - spin_lock(&task->proc_lock); - if (task->proc_dentry == dentry) - task->proc_dentry = NULL; - spin_unlock(&task->proc_lock); - iput(inode); -} - static int pid_delete_dentry(struct dentry * dentry) { /* Is the task we represent dead? * If so, then don't put the dentry on the lru list, * kill it immediately. */ - return !pid_alive(proc_task(dentry->d_inode)); + return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; } static struct dentry_operations tid_fd_dentry_operations = @@ -1474,13 +1424,6 @@ static struct dentry_operations pid_dentry_operations = .d_delete = pid_delete_dentry, }; -static struct dentry_operations pid_base_dentry_operations = -{ - .d_revalidate = pid_revalidate, - .d_iput = pid_base_iput, - .d_delete = pid_delete_dentry, -}; - /* Lookups */ static unsigned name_to_int(struct dentry *dentry) @@ -1508,22 +1451,24 @@ out: /* SMP-safe */ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd) { - struct task_struct *task = proc_task(dir); + struct task_struct *task = get_proc_task(dir); unsigned fd = name_to_int(dentry); + struct dentry *result = ERR_PTR(-ENOENT); struct file * file; struct files_struct * files; struct inode *inode; struct proc_inode *ei; + if (!task) + goto out_no_task; if (fd == ~0U) goto out; - if (!pid_alive(task)) - goto out; inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd); if (!inode) goto out; ei = PROC_I(inode); + ei->fd = fd; files = get_files_struct(task); if (!files) goto out_unlock; @@ -1548,19 +1493,25 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, ei->op.proc_get_link = proc_fd_link; dentry->d_op = &tid_fd_dentry_operations; d_add(dentry, inode); - return NULL; + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, NULL)) + result = NULL; +out: + put_task_struct(task); +out_no_task: + return result; out_unlock2: spin_unlock(&files->file_lock); put_files_struct(files); out_unlock: iput(inode); -out: - return ERR_PTR(-ENOENT); + goto out; } static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir); static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd); +static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); static struct file_operations proc_fd_operations = { .read = generic_read_dir, @@ -1577,12 +1528,11 @@ static struct file_operations proc_task_operations = { */ static struct inode_operations proc_fd_inode_operations = { .lookup = proc_lookupfd, - .permission = proc_permission, }; static struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, - .permission = proc_task_permission, + .getattr = proc_task_getattr, }; #ifdef CONFIG_SECURITY @@ -1592,12 +1542,17 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, struct inode * inode = file->f_dentry->d_inode; unsigned long page; ssize_t length; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PAGE_SIZE) count = PAGE_SIZE; + length = -ENOMEM; if (!(page = __get_free_page(GFP_KERNEL))) - return -ENOMEM; + goto out; length = security_getprocattr(task, (char*)file->f_dentry->d_name.name, @@ -1605,6 +1560,9 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); free_page(page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -1614,26 +1572,36 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, struct inode * inode = file->f_dentry->d_inode; char *page; ssize_t length; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PAGE_SIZE) count = PAGE_SIZE; - if (*ppos != 0) { - /* No partial writes. */ - return -EINVAL; - } + + /* No partial writes. */ + length = -EINVAL; + if (*ppos != 0) + goto out; + + length = -ENOMEM; page = (char*)__get_free_page(GFP_USER); if (!page) - return -ENOMEM; + goto out; + length = -EFAULT; if (copy_from_user(page, buf, count)) - goto out; + goto out_free; length = security_setprocattr(task, (char*)file->f_dentry->d_name.name, (void*)page, count); -out: +out_free: free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -1648,24 +1616,22 @@ static struct file_operations proc_tgid_attr_operations; static struct inode_operations proc_tgid_attr_inode_operations; #endif -static int get_tid_list(int index, unsigned int *tids, struct inode *dir); - /* SMP-safe */ static struct dentry *proc_pident_lookup(struct inode *dir, struct dentry *dentry, struct pid_entry *ents) { struct inode *inode; - int error; - struct task_struct *task = proc_task(dir); + struct dentry *error; + struct task_struct *task = get_proc_task(dir); struct pid_entry *p; struct proc_inode *ei; - error = -ENOENT; + error = ERR_PTR(-ENOENT); inode = NULL; - if (!pid_alive(task)) - goto out; + if (!task) + goto out_no_task; for (p = ents; p->name; p++) { if (p->len != dentry->d_name.len) @@ -1676,7 +1642,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir, if (!p->name) goto out; - error = -EINVAL; + error = ERR_PTR(-EINVAL); inode = proc_pid_make_inode(dir->i_sb, task, p->type); if (!inode) goto out; @@ -1689,7 +1655,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir, */ switch(p->type) { case PROC_TGID_TASK: - inode->i_nlink = 2 + get_tid_list(2, NULL, dir); + inode->i_nlink = 2; inode->i_op = &proc_task_inode_operations; inode->i_fop = &proc_task_operations; break; @@ -1759,7 +1725,6 @@ static struct dentry *proc_pident_lookup(struct inode *dir, #endif case PROC_TID_MEM: case PROC_TGID_MEM: - inode->i_op = &proc_mem_inode_operations; inode->i_fop = &proc_mem_operations; break; #ifdef CONFIG_SECCOMP @@ -1801,6 +1766,10 @@ static struct dentry *proc_pident_lookup(struct inode *dir, case PROC_TGID_ATTR_EXEC: case PROC_TID_ATTR_FSCREATE: case PROC_TGID_ATTR_FSCREATE: + case PROC_TID_ATTR_KEYCREATE: + case PROC_TGID_ATTR_KEYCREATE: + case PROC_TID_ATTR_SOCKCREATE: + case PROC_TGID_ATTR_SOCKCREATE: inode->i_fop = &proc_pid_attr_operations; break; #endif @@ -1842,14 +1811,18 @@ static struct dentry *proc_pident_lookup(struct inode *dir, default: printk("procfs: impossible type (%d)",p->type); iput(inode); - return ERR_PTR(-EINVAL); + error = ERR_PTR(-EINVAL); + goto out; } dentry->d_op = &pid_dentry_operations; d_add(dentry, inode); - return NULL; - + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; out: - return ERR_PTR(error); + put_task_struct(task); +out_no_task: + return error; } static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ @@ -1872,10 +1845,12 @@ static struct file_operations proc_tid_base_operations = { static struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, + .getattr = pid_getattr, }; static struct inode_operations proc_tid_base_inode_operations = { .lookup = proc_tid_base_lookup, + .getattr = pid_getattr, }; #ifdef CONFIG_SECURITY @@ -1917,10 +1892,12 @@ static struct dentry *proc_tid_attr_lookup(struct inode *dir, static struct inode_operations proc_tgid_attr_inode_operations = { .lookup = proc_tgid_attr_lookup, + .getattr = pid_getattr, }; static struct inode_operations proc_tid_attr_inode_operations = { .lookup = proc_tid_attr_lookup, + .getattr = pid_getattr, }; #endif @@ -1930,14 +1907,14 @@ static struct inode_operations proc_tid_attr_inode_operations = { static int proc_self_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - char tmp[30]; + char tmp[PROC_NUMBUF]; sprintf(tmp, "%d", current->tgid); return vfs_readlink(dentry,buffer,buflen,tmp); } static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) { - char tmp[30]; + char tmp[PROC_NUMBUF]; sprintf(tmp, "%d", current->tgid); return ERR_PTR(vfs_follow_link(nd,tmp)); } @@ -1948,67 +1925,80 @@ static struct inode_operations proc_self_inode_operations = { }; /** - * proc_pid_unhash - Unhash /proc/@pid entry from the dcache. - * @p: task that should be flushed. + * proc_flush_task - Remove dcache entries for @task from the /proc dcache. + * + * @task: task that should be flushed. + * + * Looks in the dcache for + * /proc/@pid + * /proc/@tgid/task/@pid + * if either directory is present flushes it and all of it'ts children + * from the dcache. * - * Drops the /proc/@pid dcache entry from the hash chains. + * It is safe and reasonable to cache /proc entries for a task until + * that task exits. After that they just clog up the dcache with + * useless entries, possibly causing useful dcache entries to be + * flushed instead. This routine is proved to flush those useless + * dcache entries at process exit time. * - * Dropping /proc/@pid entries and detach_pid must be synchroneous, - * otherwise e.g. /proc/@pid/exe might point to the wrong executable, - * if the pid value is immediately reused. This is enforced by - * - caller must acquire spin_lock(p->proc_lock) - * - must be called before detach_pid() - * - proc_pid_lookup acquires proc_lock, and checks that - * the target is not dead by looking at the attach count - * of PIDTYPE_PID. + * NOTE: This routine is just an optimization so it does not guarantee + * that no dcache entries will exist at process exit time it + * just makes it very unlikely that any will persist. */ - -struct dentry *proc_pid_unhash(struct task_struct *p) +void proc_flush_task(struct task_struct *task) { - struct dentry *proc_dentry; + struct dentry *dentry, *leader, *dir; + char buf[PROC_NUMBUF]; + struct qstr name; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", task->pid); + dentry = d_hash_and_lookup(proc_mnt->mnt_root, &name); + if (dentry) { + shrink_dcache_parent(dentry); + d_drop(dentry); + dput(dentry); + } - proc_dentry = p->proc_dentry; - if (proc_dentry != NULL) { + if (thread_group_leader(task)) + goto out; - spin_lock(&dcache_lock); - spin_lock(&proc_dentry->d_lock); - if (!d_unhashed(proc_dentry)) { - dget_locked(proc_dentry); - __d_drop(proc_dentry); - spin_unlock(&proc_dentry->d_lock); - } else { - spin_unlock(&proc_dentry->d_lock); - proc_dentry = NULL; - } - spin_unlock(&dcache_lock); - } - return proc_dentry; -} + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", task->tgid); + leader = d_hash_and_lookup(proc_mnt->mnt_root, &name); + if (!leader) + goto out; -/** - * proc_pid_flush - recover memory used by stale /proc/@pid/x entries - * @proc_dentry: directoy to prune. - * - * Shrink the /proc directory that was used by the just killed thread. - */ - -void proc_pid_flush(struct dentry *proc_dentry) -{ - might_sleep(); - if(proc_dentry != NULL) { - shrink_dcache_parent(proc_dentry); - dput(proc_dentry); + name.name = "task"; + name.len = strlen(name.name); + dir = d_hash_and_lookup(leader, &name); + if (!dir) + goto out_put_leader; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", task->pid); + dentry = d_hash_and_lookup(dir, &name); + if (dentry) { + shrink_dcache_parent(dentry); + d_drop(dentry); + dput(dentry); } + + dput(dir); +out_put_leader: + dput(leader); +out: + return; } /* SMP-safe */ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { + struct dentry *result = ERR_PTR(-ENOENT); struct task_struct *task; struct inode *inode; struct proc_inode *ei; unsigned tgid; - int died; if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)) { inode = new_inode(dir->i_sb); @@ -2029,21 +2019,18 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct if (tgid == ~0U) goto out; - read_lock(&tasklist_lock); + rcu_read_lock(); task = find_task_by_pid(tgid); if (task) get_task_struct(task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!task) goto out; inode = proc_pid_make_inode(dir->i_sb, task, PROC_TGID_INO); + if (!inode) + goto out_put_task; - - if (!inode) { - put_task_struct(task); - goto out; - } inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; @@ -2054,45 +2041,40 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct inode->i_nlink = 4; #endif - dentry->d_op = &pid_base_dentry_operations; + dentry->d_op = &pid_dentry_operations; - died = 0; d_add(dentry, inode); - spin_lock(&task->proc_lock); - task->proc_dentry = dentry; - if (!pid_alive(task)) { - dentry = proc_pid_unhash(task); - died = 1; - } - spin_unlock(&task->proc_lock); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + result = NULL; +out_put_task: put_task_struct(task); - if (died) { - proc_pid_flush(dentry); - goto out; - } - return NULL; out: - return ERR_PTR(-ENOENT); + return result; } /* SMP-safe */ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { + struct dentry *result = ERR_PTR(-ENOENT); struct task_struct *task; - struct task_struct *leader = proc_task(dir); + struct task_struct *leader = get_proc_task(dir); struct inode *inode; unsigned tid; + if (!leader) + goto out_no_task; + tid = name_to_int(dentry); if (tid == ~0U) goto out; - read_lock(&tasklist_lock); + rcu_read_lock(); task = find_task_by_pid(tid); if (task) get_task_struct(task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!task) goto out; if (leader->tgid != task->tgid) @@ -2113,101 +2095,95 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry inode->i_nlink = 3; #endif - dentry->d_op = &pid_base_dentry_operations; + dentry->d_op = &pid_dentry_operations; d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + result = NULL; - put_task_struct(task); - return NULL; out_drop_task: put_task_struct(task); out: - return ERR_PTR(-ENOENT); + put_task_struct(leader); +out_no_task: + return result; } -#define PROC_NUMBUF 10 -#define PROC_MAXPIDS 20 - /* - * Get a few tgid's to return for filldir - we need to hold the - * tasklist lock while doing this, and we must release it before - * we actually do the filldir itself, so we use a temp buffer.. + * Find the first tgid to return to user space. + * + * Usually this is just whatever follows &init_task, but if the users + * buffer was too small to hold the full list or there was a seek into + * the middle of the directory we have more work to do. + * + * In the case of a short read we start with find_task_by_pid. + * + * In the case of a seek we start with &init_task and walk nr + * threads past it. */ -static int get_tgid_list(int index, unsigned long version, unsigned int *tgids) -{ - struct task_struct *p; - int nr_tgids = 0; - - index--; - read_lock(&tasklist_lock); - p = NULL; - if (version) { - p = find_task_by_pid(version); - if (p && !thread_group_leader(p)) - p = NULL; +static struct task_struct *first_tgid(int tgid, unsigned int nr) +{ + struct task_struct *pos; + rcu_read_lock(); + if (tgid && nr) { + pos = find_task_by_pid(tgid); + if (pos && thread_group_leader(pos)) + goto found; } + /* If nr exceeds the number of processes get out quickly */ + pos = NULL; + if (nr && nr >= nr_processes()) + goto done; - if (p) - index = 0; - else - p = next_task(&init_task); - - for ( ; p != &init_task; p = next_task(p)) { - int tgid = p->pid; - if (!pid_alive(p)) - continue; - if (--index >= 0) - continue; - tgids[nr_tgids] = tgid; - nr_tgids++; - if (nr_tgids >= PROC_MAXPIDS) - break; + /* If we haven't found our starting place yet start with + * the init_task and walk nr tasks forward. + */ + for (pos = next_task(&init_task); nr > 0; --nr) { + pos = next_task(pos); + if (pos == &init_task) { + pos = NULL; + goto done; + } } - read_unlock(&tasklist_lock); - return nr_tgids; +found: + get_task_struct(pos); +done: + rcu_read_unlock(); + return pos; } /* - * Get a few tid's to return for filldir - we need to hold the - * tasklist lock while doing this, and we must release it before - * we actually do the filldir itself, so we use a temp buffer.. + * Find the next task in the task list. + * Return NULL if we loop or there is any error. + * + * The reference to the input task_struct is released. */ -static int get_tid_list(int index, unsigned int *tids, struct inode *dir) -{ - struct task_struct *leader_task = proc_task(dir); - struct task_struct *task = leader_task; - int nr_tids = 0; - - index -= 2; - read_lock(&tasklist_lock); - /* - * The starting point task (leader_task) might be an already - * unlinked task, which cannot be used to access the task-list - * via next_thread(). - */ - if (pid_alive(task)) do { - int tid = task->pid; - - if (--index >= 0) - continue; - if (tids != NULL) - tids[nr_tids] = tid; - nr_tids++; - if (nr_tids >= PROC_MAXPIDS) - break; - } while ((task = next_thread(task)) != leader_task); - read_unlock(&tasklist_lock); - return nr_tids; +static struct task_struct *next_tgid(struct task_struct *start) +{ + struct task_struct *pos; + rcu_read_lock(); + pos = start; + if (pid_alive(start)) + pos = next_task(start); + if (pid_alive(pos) && (pos != &init_task)) { + get_task_struct(pos); + goto done; + } + pos = NULL; +done: + rcu_read_unlock(); + put_task_struct(start); + return pos; } /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int tgid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; - unsigned int nr_tgids, i; - int next_tgid; + struct task_struct *task; + int tgid; if (!nr) { ino_t ino = fake_ino(0,PROC_TGID_INO); @@ -2216,63 +2192,116 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) filp->f_pos++; nr++; } + nr -= 1; /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. */ - next_tgid = filp->f_version; + tgid = filp->f_version; filp->f_version = 0; - for (;;) { - nr_tgids = get_tgid_list(nr, next_tgid, tgid_array); - if (!nr_tgids) { - /* no more entries ! */ + for (task = first_tgid(tgid, nr); + task; + task = next_tgid(task), filp->f_pos++) { + int len; + ino_t ino; + tgid = task->pid; + len = snprintf(buf, sizeof(buf), "%d", tgid); + ino = fake_ino(tgid, PROC_TGID_INO); + if (filldir(dirent, buf, len, filp->f_pos, ino, DT_DIR) < 0) { + /* returning this tgid failed, save it as the first + * pid for the next readir call */ + filp->f_version = tgid; + put_task_struct(task); break; } - next_tgid = 0; + } + return 0; +} - /* do not use the last found pid, reserve it for next_tgid */ - if (nr_tgids == PROC_MAXPIDS) { - nr_tgids--; - next_tgid = tgid_array[nr_tgids]; - } +/* + * Find the first tid of a thread group to return to user space. + * + * Usually this is just the thread group leader, but if the users + * buffer was too small or there was a seek into the middle of the + * directory we have more work todo. + * + * In the case of a short read we start with find_task_by_pid. + * + * In the case of a seek we start with the leader and walk nr + * threads past it. + */ +static struct task_struct *first_tid(struct task_struct *leader, + int tid, int nr) +{ + struct task_struct *pos; - for (i=0;i<nr_tgids;i++) { - int tgid = tgid_array[i]; - ino_t ino = fake_ino(tgid,PROC_TGID_INO); - unsigned long j = PROC_NUMBUF; + rcu_read_lock(); + /* Attempt to start with the pid of a thread */ + if (tid && (nr > 0)) { + pos = find_task_by_pid(tid); + if (pos && (pos->group_leader == leader)) + goto found; + } - do - buf[--j] = '0' + (tgid % 10); - while ((tgid /= 10) != 0); + /* If nr exceeds the number of threads there is nothing todo */ + pos = NULL; + if (nr && nr >= get_nr_threads(leader)) + goto out; - if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) { - /* returning this tgid failed, save it as the first - * pid for the next readir call */ - filp->f_version = tgid_array[i]; - goto out; - } - filp->f_pos++; - nr++; + /* If we haven't found our starting place yet start + * with the leader and walk nr threads forward. + */ + for (pos = leader; nr > 0; --nr) { + pos = next_thread(pos); + if (pos == leader) { + pos = NULL; + goto out; } } +found: + get_task_struct(pos); out: - return 0; + rcu_read_unlock(); + return pos; +} + +/* + * Find the next thread in the thread list. + * Return NULL if there is an error or no next thread. + * + * The reference to the input task_struct is released. + */ +static struct task_struct *next_tid(struct task_struct *start) +{ + struct task_struct *pos = NULL; + rcu_read_lock(); + if (pid_alive(start)) { + pos = next_thread(start); + if (thread_group_leader(pos)) + pos = NULL; + else + get_task_struct(pos); + } + rcu_read_unlock(); + put_task_struct(start); + return pos; } /* for the /proc/TGID/task/ directories */ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int tid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; - unsigned int nr_tids, i; struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; + struct task_struct *leader = get_proc_task(inode); + struct task_struct *task; int retval = -ENOENT; ino_t ino; + int tid; unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */ - if (!pid_alive(proc_task(inode))) - goto out; + if (!leader) + goto out_no_task; retval = 0; switch (pos) { @@ -2290,24 +2319,45 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi /* fall through */ } - nr_tids = get_tid_list(pos, tid_array, inode); - inode->i_nlink = pos + nr_tids; - - for (i = 0; i < nr_tids; i++) { - unsigned long j = PROC_NUMBUF; - int tid = tid_array[i]; - - ino = fake_ino(tid,PROC_TID_INO); - - do - buf[--j] = '0' + (tid % 10); - while ((tid /= 10) != 0); - - if (filldir(dirent, buf+j, PROC_NUMBUF-j, pos, ino, DT_DIR) < 0) + /* f_version caches the tgid value that the last readdir call couldn't + * return. lseek aka telldir automagically resets f_version to 0. + */ + tid = filp->f_version; + filp->f_version = 0; + for (task = first_tid(leader, tid, pos - 2); + task; + task = next_tid(task), pos++) { + int len; + tid = task->pid; + len = snprintf(buf, sizeof(buf), "%d", tid); + ino = fake_ino(tid, PROC_TID_INO); + if (filldir(dirent, buf, len, pos, ino, DT_DIR < 0)) { + /* returning this tgid failed, save it as the first + * pid for the next readir call */ + filp->f_version = tid; + put_task_struct(task); break; - pos++; + } } out: filp->f_pos = pos; + put_task_struct(leader); +out_no_task: return retval; } + +static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); + generic_fillattr(inode, stat); + + if (p) { + rcu_read_lock(); + stat->nlink += get_nr_threads(p); + rcu_read_unlock(); + put_task_struct(p); + } + + return 0; +} diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 722b9c46311..6dcef089e18 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -58,14 +58,11 @@ static void de_put(struct proc_dir_entry *de) static void proc_delete_inode(struct inode *inode) { struct proc_dir_entry *de; - struct task_struct *tsk; truncate_inode_pages(&inode->i_data, 0); - /* Let go of any associated process */ - tsk = PROC_I(inode)->task; - if (tsk) - put_task_struct(tsk); + /* Stop tracking associated processes */ + put_pid(PROC_I(inode)->pid); /* Let go of any associated proc directory entry */ de = PROC_I(inode)->pde; @@ -94,8 +91,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL); if (!ei) return NULL; - ei->task = NULL; - ei->type = 0; + ei->pid = NULL; + ei->fd = 0; ei->op.proc_get_link = NULL; ei->pde = NULL; inode = &ei->vfs_inode; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 0502f17b860..146a434ba94 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -37,16 +37,30 @@ extern int proc_tgid_stat(struct task_struct *, char *); extern int proc_pid_status(struct task_struct *, char *); extern int proc_pid_statm(struct task_struct *, char *); +extern struct file_operations proc_maps_operations; +extern struct file_operations proc_numa_maps_operations; +extern struct file_operations proc_smaps_operations; + +extern struct file_operations proc_maps_operations; +extern struct file_operations proc_numa_maps_operations; +extern struct file_operations proc_smaps_operations; + + void free_proc_entry(struct proc_dir_entry *de); int proc_init_inodecache(void); -static inline struct task_struct *proc_task(struct inode *inode) +static inline struct pid *proc_pid(struct inode *inode) +{ + return PROC_I(inode)->pid; +} + +static inline struct task_struct *get_proc_task(struct inode *inode) { - return PROC_I(inode)->task; + return get_pid_task(proc_pid(inode), PIDTYPE_PID); } -static inline int proc_type(struct inode *inode) +static inline int proc_fd(struct inode *inode) { - return PROC_I(inode)->type; + return PROC_I(inode)->fd; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 91b7c15ab37..0a163a4f776 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -75,9 +75,13 @@ int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount * { struct vm_area_struct * vma; int result = -ENOENT; - struct task_struct *task = proc_task(inode); - struct mm_struct * mm = get_task_mm(task); + struct task_struct *task = get_proc_task(inode); + struct mm_struct * mm = NULL; + if (task) { + mm = get_task_mm(task); + put_task_struct(task); + } if (!mm) goto out; down_read(&mm->mmap_sem); @@ -118,9 +122,15 @@ struct mem_size_stats unsigned long private_dirty; }; +__attribute__((weak)) const char *arch_vma_name(struct vm_area_struct *vma) +{ + return NULL; +} + static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; + struct task_struct *task = priv->task; struct vm_area_struct *vma = v; struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; @@ -153,22 +163,23 @@ static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats pad_len_spaces(m, len); seq_path(m, file->f_vfsmnt, file->f_dentry, "\n"); } else { - if (mm) { - if (vma->vm_start <= mm->start_brk && + const char *name = arch_vma_name(vma); + if (!name) { + if (mm) { + if (vma->vm_start <= mm->start_brk && vma->vm_end >= mm->brk) { - pad_len_spaces(m, len); - seq_puts(m, "[heap]"); - } else { - if (vma->vm_start <= mm->start_stack && - vma->vm_end >= mm->start_stack) { - - pad_len_spaces(m, len); - seq_puts(m, "[stack]"); + name = "[heap]"; + } else if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + name = "[stack]"; } + } else { + name = "[vdso]"; } - } else { + } + if (name) { pad_len_spaces(m, len); - seq_puts(m, "[vdso]"); + seq_puts(m, name); } } seq_putc(m, '\n'); @@ -295,12 +306,16 @@ static int show_smap(struct seq_file *m, void *v) static void *m_start(struct seq_file *m, loff_t *pos) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; unsigned long last_addr = m->version; struct mm_struct *mm; - struct vm_area_struct *vma, *tail_vma; + struct vm_area_struct *vma, *tail_vma = NULL; loff_t l = *pos; + /* Clear the per syscall fields in priv */ + priv->task = NULL; + priv->tail_vma = NULL; + /* * We remember last_addr rather than next_addr to hit with * mmap_cache most of the time. We have zero last_addr at @@ -311,11 +326,15 @@ static void *m_start(struct seq_file *m, loff_t *pos) if (last_addr == -1UL) return NULL; - mm = get_task_mm(task); + priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + if (!priv->task) + return NULL; + + mm = get_task_mm(priv->task); if (!mm) return NULL; - tail_vma = get_gate_vma(task); + priv->tail_vma = tail_vma = get_gate_vma(priv->task); down_read(&mm->mmap_sem); /* Start with last addr hint */ @@ -350,11 +369,9 @@ out: return tail_vma; } -static void m_stop(struct seq_file *m, void *v) +static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) { - struct task_struct *task = m->private; - struct vm_area_struct *vma = v; - if (vma && vma != get_gate_vma(task)) { + if (vma && vma != priv->tail_vma) { struct mm_struct *mm = vma->vm_mm; up_read(&mm->mmap_sem); mmput(mm); @@ -363,38 +380,103 @@ static void m_stop(struct seq_file *m, void *v) static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; - struct vm_area_struct *tail_vma = get_gate_vma(task); + struct vm_area_struct *tail_vma = priv->tail_vma; (*pos)++; if (vma && (vma != tail_vma) && vma->vm_next) return vma->vm_next; - m_stop(m, v); + vma_stop(priv, vma); return (vma != tail_vma)? tail_vma: NULL; } -struct seq_operations proc_pid_maps_op = { +static void m_stop(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + + vma_stop(priv, vma); + if (priv->task) + put_task_struct(priv->task); +} + +static struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_map }; -struct seq_operations proc_pid_smaps_op = { +static struct seq_operations proc_pid_smaps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_smap }; +static int do_maps_open(struct inode *inode, struct file *file, + struct seq_operations *ops) +{ + struct proc_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->pid = proc_pid(inode); + ret = seq_open(file, ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; +} + +static int maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_maps_op); +} + +struct file_operations proc_maps_operations = { + .open = maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + #ifdef CONFIG_NUMA extern int show_numa_map(struct seq_file *m, void *v); -struct seq_operations proc_pid_numa_maps_op = { +static struct seq_operations proc_pid_numa_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_numa_map }; + +static int numa_maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_numa_maps_op); +} + +struct file_operations proc_numa_maps_operations = { + .open = numa_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; #endif + +static int smaps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_smaps_op); +} + +struct file_operations proc_smaps_operations = { + .open = smaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 8f68827ed10..af69f28277b 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -156,9 +156,28 @@ static void *m_next(struct seq_file *m, void *v, loff_t *pos) { return NULL; } -struct seq_operations proc_pid_maps_op = { +static struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_map }; + +static int maps_open(struct inode *inode, struct file *file) +{ + int ret; + ret = seq_open(file, &proc_pid_maps_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = NULL; + } + return ret; +} + +struct file_operations proc_maps_operations = { + .open = maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index cf6e1cf4035..752cea12e30 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -1560,12 +1560,6 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t return res; } -static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf, - size_t count, loff_t pos) -{ - return generic_file_aio_write(iocb, buf, count, pos); -} - const struct file_operations reiserfs_file_operations = { .read = generic_file_read, .write = reiserfs_file_write, @@ -1575,7 +1569,7 @@ const struct file_operations reiserfs_file_operations = { .fsync = reiserfs_sync_file, .sendfile = generic_file_sendfile, .aio_read = generic_file_aio_read, - .aio_write = reiserfs_aio_write, + .aio_write = generic_file_aio_write, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, }; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 1b73529b809..49d1a53dbef 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -834,8 +834,7 @@ static int write_ordered_buffers(spinlock_t * lock, get_bh(bh); if (test_set_buffer_locked(bh)) { if (!buffer_dirty(bh)) { - list_del_init(&jh->list); - list_add(&jh->list, &tmp); + list_move(&jh->list, &tmp); goto loop_next; } spin_unlock(lock); @@ -855,8 +854,7 @@ static int write_ordered_buffers(spinlock_t * lock, ret = -EIO; } if (buffer_dirty(bh)) { - list_del_init(&jh->list); - list_add(&jh->list, &tmp); + list_move(&jh->list, &tmp); add_to_chunk(&chunk, bh, lock, write_ordered_chunk); } else { reiserfs_free_jh(bh); diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c index c71dd2760d3..c8e96195b96 100644 --- a/fs/smbfs/request.c +++ b/fs/smbfs/request.c @@ -400,8 +400,7 @@ static int smb_request_send_req(struct smb_request *req) if (!(req->rq_flags & SMB_REQ_TRANSMITTED)) goto out; - list_del_init(&req->rq_queue); - list_add_tail(&req->rq_queue, &server->recvq); + list_move_tail(&req->rq_queue, &server->recvq); result = 1; out: return result; @@ -435,8 +434,7 @@ int smb_request_send_server(struct smb_sb_info *server) result = smb_request_send_req(req); if (result < 0) { server->conn_error = result; - list_del_init(&req->rq_queue); - list_add(&req->rq_queue, &server->xmitq); + list_move(&req->rq_queue, &server->xmitq); result = -EIO; goto out; } diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c index 3f71384020c..24577e2c489 100644 --- a/fs/smbfs/smbiod.c +++ b/fs/smbfs/smbiod.c @@ -193,8 +193,7 @@ int smbiod_retry(struct smb_sb_info *server) if (req->rq_flags & SMB_REQ_RETRY) { /* must move the request to the xmitq */ VERBOSE("retrying request %p on recvq\n", req); - list_del(&req->rq_queue); - list_add(&req->rq_queue, &server->xmitq); + list_move(&req->rq_queue, &server->xmitq); continue; } #endif diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 610b5bdbe75..61c42430cba 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -430,10 +430,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) i++; /* fallthrough */ default: - if (filp->f_pos == 2) { - list_del(q); - list_add(q, &parent_sd->s_children); - } + if (filp->f_pos == 2) + list_move(q, &parent_sd->s_children); + for (p=q->next; p!= &parent_sd->s_children; p=p->next) { struct sysfs_dirent *next; const char * name; @@ -455,8 +454,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) dt_type(next)) < 0) return 0; - list_del(q); - list_add(q, p); + list_move(q, p); p = q; filp->f_pos++; } diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index f2dbdf5a876..259bd196099 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -605,39 +605,12 @@ static void ufs_set_inode_ops(struct inode *inode) ufs_get_inode_dev(inode->i_sb, UFS_I(inode))); } -void ufs_read_inode (struct inode * inode) +static void ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) { struct ufs_inode_info *ufsi = UFS_I(inode); - struct super_block * sb; - struct ufs_sb_private_info * uspi; - struct ufs_inode * ufs_inode; - struct ufs2_inode *ufs2_inode; - struct buffer_head * bh; + struct super_block *sb = inode->i_sb; mode_t mode; unsigned i; - unsigned flags; - - UFSD("ENTER, ino %lu\n", inode->i_ino); - - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - flags = UFS_SB(sb)->s_flags; - - if (inode->i_ino < UFS_ROOTINO || - inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) { - ufs_warning (sb, "ufs_read_inode", "bad inode number (%lu)\n", inode->i_ino); - goto bad_inode; - } - - bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino)); - if (!bh) { - ufs_warning (sb, "ufs_read_inode", "unable to read inode %lu\n", inode->i_ino); - goto bad_inode; - } - if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) - goto ufs2_inode; - - ufs_inode = (struct ufs_inode *) (bh->b_data + sizeof(struct ufs_inode) * ufs_inotofsbo(inode->i_ino)); /* * Copy data to the in-core inode. @@ -661,14 +634,11 @@ void ufs_read_inode (struct inode * inode) inode->i_atime.tv_nsec = 0; inode->i_ctime.tv_nsec = 0; inode->i_blocks = fs32_to_cpu(sb, ufs_inode->ui_blocks); - inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size (for stat) */ - inode->i_version++; ufsi->i_flags = fs32_to_cpu(sb, ufs_inode->ui_flags); ufsi->i_gen = fs32_to_cpu(sb, ufs_inode->ui_gen); ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); - ufsi->i_lastfrag = (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; - ufsi->i_dir_start_lookup = 0; + if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++) @@ -677,24 +647,16 @@ void ufs_read_inode (struct inode * inode) for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) ufsi->i_u1.i_symlink[i] = ufs_inode->ui_u2.ui_symlink[i]; } - ufsi->i_osync = 0; - - ufs_set_inode_ops(inode); - - brelse (bh); - - UFSD("EXIT\n"); - return; +} -bad_inode: - make_bad_inode(inode); - return; +static void ufs2_read_inode(struct inode *inode, struct ufs2_inode *ufs2_inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + mode_t mode; + unsigned i; -ufs2_inode : UFSD("Reading ufs2 inode, ino %lu\n", inode->i_ino); - - ufs2_inode = (struct ufs2_inode *)(bh->b_data + sizeof(struct ufs2_inode) * ufs_inotofsbo(inode->i_ino)); - /* * Copy data to the in-core inode. */ @@ -717,26 +679,64 @@ ufs2_inode : inode->i_atime.tv_nsec = 0; inode->i_ctime.tv_nsec = 0; inode->i_blocks = fs64_to_cpu(sb, ufs2_inode->ui_blocks); - inode->i_blksize = PAGE_SIZE; /*This is the optimal IO size(for stat)*/ - - inode->i_version++; ufsi->i_flags = fs32_to_cpu(sb, ufs2_inode->ui_flags); ufsi->i_gen = fs32_to_cpu(sb, ufs2_inode->ui_gen); /* ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); */ - ufsi->i_lastfrag= (inode->i_size + uspi->s_fsize- 1) >> uspi->s_fshift; if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { for (i = 0; i < (UFS_NDADDR + UFS_NINDIR); i++) ufsi->i_u1.u2_i_data[i] = ufs2_inode->ui_u2.ui_addr.ui_db[i]; - } - else { + } else { for (i = 0; i < (UFS_NDADDR + UFS_NINDIR) * 4; i++) ufsi->i_u1.i_symlink[i] = ufs2_inode->ui_u2.ui_symlink[i]; } +} + +void ufs_read_inode(struct inode * inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct buffer_head * bh; + + UFSD("ENTER, ino %lu\n", inode->i_ino); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + if (inode->i_ino < UFS_ROOTINO || + inode->i_ino > (uspi->s_ncg * uspi->s_ipg)) { + ufs_warning(sb, "ufs_read_inode", "bad inode number (%lu)\n", + inode->i_ino); + goto bad_inode; + } + + bh = sb_bread(sb, uspi->s_sbbase + ufs_inotofsba(inode->i_ino)); + if (!bh) { + ufs_warning(sb, "ufs_read_inode", "unable to read inode %lu\n", + inode->i_ino); + goto bad_inode; + } + if ((UFS_SB(sb)->s_flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { + struct ufs2_inode *ufs2_inode = (struct ufs2_inode *)bh->b_data; + + ufs2_read_inode(inode, + ufs2_inode + ufs_inotofsbo(inode->i_ino)); + } else { + struct ufs_inode *ufs_inode = (struct ufs_inode *)bh->b_data; + + ufs1_read_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino)); + } + + inode->i_blksize = PAGE_SIZE;/*This is the optimal IO size (for stat)*/ + inode->i_version++; + ufsi->i_lastfrag = + (inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift; + ufsi->i_dir_start_lookup = 0; ufsi->i_osync = 0; ufs_set_inode_ops(inode); @@ -745,6 +745,9 @@ ufs2_inode : UFSD("EXIT\n"); return; + +bad_inode: + make_bad_inode(inode); } static int ufs_update_inode(struct inode * inode, int do_sync) diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 12810baeb5d..d9180020de6 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -419,16 +419,15 @@ xfs_vn_link( int error; ip = old_dentry->d_inode; /* inode being linked to */ - if (S_ISDIR(ip->i_mode)) - return -EPERM; - tdvp = vn_from_inode(dir); vp = vn_from_inode(ip); + VN_HOLD(vp); error = bhv_vop_link(tdvp, vp, dentry, NULL); - if (likely(!error)) { + if (unlikely(error)) { + VN_RELE(vp); + } else { VMODIFY(tdvp); - VN_HOLD(vp); xfs_validate_fields(ip, &vattr); d_instantiate(dentry, ip); } diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index aa26ab906c8..028eb17ec2e 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -140,9 +140,7 @@ BUFFER_FNS(PrivateStart, unwritten); #define current_pid() (current->pid) #define current_fsuid(cred) (current->fsuid) #define current_fsgid(cred) (current->fsgid) -#define current_set_flags(f) (current->flags |= (f)) #define current_test_flags(f) (current->flags & (f)) -#define current_clear_flags(f) (current->flags & ~(f)) #define current_set_flags_nested(sp, f) \ (*(sp) = current->flags, current->flags |= (f)) #define current_clear_flags_nested(sp, f) \ diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 35c6a01963a..c42b3221b20 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -93,7 +93,7 @@ typedef enum { */ static inline struct bhv_vnode *vn_from_inode(struct inode *inode) { - return (bhv_vnode_t *)list_entry(inode, bhv_vnode_t, v_inode); + return container_of(inode, bhv_vnode_t, v_inode); } static inline struct inode *vn_to_inode(struct bhv_vnode *vnode) { diff --git a/fs/xfs/xfs_behavior.h b/fs/xfs/xfs_behavior.h index 1d8ff103201..6e6e56fb352 100644 --- a/fs/xfs/xfs_behavior.h +++ b/fs/xfs/xfs_behavior.h @@ -78,15 +78,12 @@ * */ -struct bhv_head_lock; - /* * Behavior head. Head of the chain of behaviors. * Contained within each virtualized object data structure. */ typedef struct bhv_head { struct bhv_desc *bh_first; /* first behavior in chain */ - struct bhv_head_lock *bh_lockp; /* pointer to lock info struct */ } bhv_head_t; /* diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 5fa0adb7e17..86c1bf0bba9 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1961,9 +1961,9 @@ xfs_iunlink_remove( xfs_agino_t agino; xfs_agino_t next_agino; xfs_buf_t *last_ibp; - xfs_dinode_t *last_dip; + xfs_dinode_t *last_dip = NULL; short bucket_index; - int offset, last_offset; + int offset, last_offset = 0; int error; int agi_ok; diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index d8f5d4cbe8b..e730328636c 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1740,10 +1740,10 @@ xlog_write(xfs_mount_t * mp, xlog_in_core_t **commit_iclog, uint flags) { - xlog_t *log = mp->m_log; + xlog_t *log = mp->m_log; xlog_ticket_t *ticket = (xlog_ticket_t *)tic; + xlog_in_core_t *iclog = NULL; /* ptr to current in-core log */ xlog_op_header_t *logop_head; /* ptr to log operation header */ - xlog_in_core_t *iclog; /* ptr to current in-core log */ __psint_t ptr; /* copy address into data region */ int len; /* # xlog_write() bytes 2 still copy */ int index; /* region index currently copying */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 55b4237c215..3cb678e3a13 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -990,6 +990,8 @@ xlog_find_zeroed( xfs_daddr_t num_scan_bblks; int error, log_bbnum = log->l_logBBsize; + *blk_no = 0; + /* check totally zeroed log */ bp = xlog_get_bp(log, 1); if (!bp) diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 10dbf203c62..4be5c0b2d29 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -1721,15 +1721,14 @@ xfs_mount_log_sbunit( * is present to prevent thrashing). */ +#ifdef CONFIG_HOTPLUG_CPU /* * hot-plug CPU notifier support. * - * We cannot use the hotcpu_register() function because it does - * not allow notifier instances. We need a notifier per filesystem - * as we need to be able to identify the filesystem to balance - * the counters out. This is achieved by having a notifier block - * embedded in the xfs_mount_t and doing pointer magic to get the - * mount pointer from the notifier block address. + * We need a notifier per filesystem as we need to be able to identify + * the filesystem to balance the counters out. This is achieved by + * having a notifier block embedded in the xfs_mount_t and doing pointer + * magic to get the mount pointer from the notifier block address. */ STATIC int xfs_icsb_cpu_notify( @@ -1779,6 +1778,7 @@ xfs_icsb_cpu_notify( return NOTIFY_OK; } +#endif /* CONFIG_HOTPLUG_CPU */ int xfs_icsb_init_counters( @@ -1791,9 +1791,11 @@ xfs_icsb_init_counters( if (mp->m_sb_cnts == NULL) return -ENOMEM; +#ifdef CONFIG_HOTPLUG_CPU mp->m_icsb_notifier.notifier_call = xfs_icsb_cpu_notify; mp->m_icsb_notifier.priority = 0; - register_cpu_notifier(&mp->m_icsb_notifier); + register_hotcpu_notifier(&mp->m_icsb_notifier); +#endif /* CONFIG_HOTPLUG_CPU */ for_each_online_cpu(i) { cntp = (xfs_icsb_cnts_t *)per_cpu_ptr(mp->m_sb_cnts, i); @@ -1812,7 +1814,7 @@ xfs_icsb_destroy_counters( xfs_mount_t *mp) { if (mp->m_sb_cnts) { - unregister_cpu_notifier(&mp->m_icsb_notifier); + unregister_hotcpu_notifier(&mp->m_icsb_notifier); free_percpu(mp->m_sb_cnts); } } @@ -2026,7 +2028,7 @@ xfs_icsb_balance_counter( xfs_sb_field_t field, int flags) { - uint64_t count, resid = 0; + uint64_t count, resid; int weight = num_online_cpus(); int s; @@ -2058,6 +2060,7 @@ xfs_icsb_balance_counter( break; default: BUG(); + count = resid = 0; /* quiet, gcc */ break; } diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 0c1e42b037e..5a0b678956e 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -1929,7 +1929,7 @@ xfs_growfs_rt( /* * Initial error checking. */ - if (mp->m_rtdev_targp || mp->m_rbmip == NULL || + if (mp->m_rtdev_targp == NULL || mp->m_rbmip == NULL || (nrblocks = in->newblocks) <= sbp->sb_rblocks || (sbp->sb_rblocks && (in->extsize != sbp->sb_rextsize))) return XFS_ERROR(EINVAL); diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index cb65c3a603f..9dc88b38060 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -338,8 +338,6 @@ typedef void (*xfs_trans_callback_t)(struct xfs_trans *, void *); typedef struct xfs_trans { unsigned int t_magic; /* magic number */ xfs_log_callback_t t_logcb; /* log callback struct */ - struct xfs_trans *t_forw; /* async list pointers */ - struct xfs_trans *t_back; /* async list pointers */ unsigned int t_type; /* transaction type */ unsigned int t_log_res; /* amt of log space resvd */ unsigned int t_log_count; /* count for perm log res */ @@ -364,9 +362,11 @@ typedef struct xfs_trans { long t_res_fdblocks_delta; /* on-disk only chg */ long t_frextents_delta;/* superblock freextents chg*/ long t_res_frextents_delta; /* on-disk only chg */ +#ifdef DEBUG long t_ag_freeblks_delta; /* debugging counter */ long t_ag_flist_delta; /* debugging counter */ long t_ag_btree_delta; /* debugging counter */ +#endif long t_dblocks_delta;/* superblock dblocks change */ long t_agcount_delta;/* superblock agcount change */ long t_imaxpct_delta;/* superblock imaxpct change */ diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 00a6b7dc24a..23cfa583772 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -2603,8 +2603,7 @@ xfs_link( vn_trace_entry(src_vp, __FUNCTION__, (inst_t *)__return_address); target_namelen = VNAMELEN(dentry); - if (VN_ISDIR(src_vp)) - return XFS_ERROR(EPERM); + ASSERT(!VN_ISDIR(src_vp)); sip = xfs_vtoi(src_vp); tdp = XFS_BHVTOI(target_dir_bdp); @@ -2699,9 +2698,8 @@ xfs_link( xfs_trans_log_inode(tp, tdp, XFS_ILOG_CORE); error = xfs_bumplink(tp, sip); - if (error) { + if (error) goto abort_return; - } /* * If this is a synchronous mount, make sure that the @@ -2719,9 +2717,8 @@ xfs_link( } error = xfs_trans_commit(tp, XFS_TRANS_RELEASE_LOG_RES, NULL); - if (error) { + if (error) goto std_return; - } /* Fall through to std_return with error = 0. */ std_return: @@ -2742,6 +2739,8 @@ std_return: xfs_trans_cancel(tp, cancel_flags); goto std_return; } + + /* * xfs_mkdir * |