Diffstat (limited to 'fs')
54 files changed, 2175 insertions, 2019 deletions
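A large share of the hunks below (afs, autofs4, configfs, dcache, jffs2, nfsd, ocfs2) are the same mechanical refactor: an open-coded `list_del()` followed by `list_add()`/`list_add_tail()` is collapsed into `list_move()`/`list_move_tail()`, and `list_add(entry, head->prev)` is rewritten as the clearer `list_add_tail(entry, head)`. The sketch below is a minimal user-space re-implementation of those helpers, only to show that the before/after forms are equivalent; the struct and function names mirror <linux/list.h> but this is an illustrative stand-in, not the kernel code itself.

	#include <stdio.h>

	/* minimal circular doubly-linked list, modelled on <linux/list.h> */
	struct list_head {
		struct list_head *next, *prev;
	};

	static void INIT_LIST_HEAD(struct list_head *h)
	{
		h->next = h->prev = h;
	}

	static void __list_add(struct list_head *new,
			       struct list_head *prev, struct list_head *next)
	{
		next->prev = new;
		new->next = next;
		new->prev = prev;
		prev->next = new;
	}

	static void list_add(struct list_head *new, struct list_head *head)
	{
		__list_add(new, head, head->next);	/* insert at head */
	}

	static void list_add_tail(struct list_head *new, struct list_head *head)
	{
		__list_add(new, head->prev, head);	/* insert at tail */
	}

	/* what the "list_del() + list_add_tail()" pairs in this patch collapse to */
	static void list_move_tail(struct list_head *entry, struct list_head *head)
	{
		/* unlink from the old list ... */
		entry->prev->next = entry->next;
		entry->next->prev = entry->prev;
		/* ... and re-link at the tail of the new one */
		list_add_tail(entry, head);
	}

	int main(void)
	{
		struct list_head a_list, b_list, node;

		INIT_LIST_HEAD(&a_list);
		INIT_LIST_HEAD(&b_list);
		list_add(&node, &a_list);

		/* old style: list_del(&node); list_add_tail(&node, &b_list); */
		list_move_tail(&node, &b_list);

		printf("node moved to b_list: %s\n",
		       (b_list.next == &node && node.prev == &b_list) ? "yes" : "no");
		return 0;
	}

Note also that `list_add(entry, head->prev)` (as previously used in fs/coda, fs/dquot.c and fs/dcache.c) appends before `head`, which is exactly `list_add_tail(entry, head)`; the conversions change readability, not behaviour.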
diff --git a/fs/9p/mux.c b/fs/9p/mux.c index f4407eb276c..12e1baa4508 100644 --- a/fs/9p/mux.c +++ b/fs/9p/mux.c @@ -712,7 +712,7 @@ static void v9fs_read_work(void *a) * v9fs_send_request - send 9P request * The function can sleep until the request is scheduled for sending. * The function can be interrupted. Return from the function is not - * a guarantee that the request is sent succesfully. Can return errors + * a guarantee that the request is sent successfully. Can return errors * that can be retrieved by PTR_ERR macros. * * @m: mux data diff --git a/fs/Kconfig b/fs/Kconfig index 1cdc043922d..6c5051802bd 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -1490,7 +1490,12 @@ config NFSD select LOCKD select SUNRPC select EXPORTFS - select NFS_ACL_SUPPORT if NFSD_V3_ACL || NFSD_V2_ACL + select NFSD_V2_ACL if NFSD_V3_ACL + select NFS_ACL_SUPPORT if NFSD_V2_ACL + select NFSD_TCP if NFSD_V4 + select CRYPTO_MD5 if NFSD_V4 + select CRYPTO if NFSD_V4 + select FS_POSIX_ACL if NFSD_V4 help If you want your Linux box to act as an NFS *server*, so that other computers on your local network which support NFS can access certain @@ -1528,7 +1533,6 @@ config NFSD_V3 config NFSD_V3_ACL bool "Provide server support for the NFSv3 ACL protocol extension" depends on NFSD_V3 - select NFSD_V2_ACL help Implement the NFSv3 ACL protocol extension for manipulating POSIX Access Control Lists on exported file systems. NFS clients should @@ -1538,10 +1542,6 @@ config NFSD_V3_ACL config NFSD_V4 bool "Provide NFSv4 server support (EXPERIMENTAL)" depends on NFSD_V3 && EXPERIMENTAL - select NFSD_TCP - select CRYPTO_MD5 - select CRYPTO - select FS_POSIX_ACL help If you would like to include the NFSv4 server as well as the NFSv2 and NFSv3 servers, say Y here. This feature is experimental, and diff --git a/fs/afs/cell.c b/fs/afs/cell.c index 009a9ae88d6..bfc1fd22d5b 100644 --- a/fs/afs/cell.c +++ b/fs/afs/cell.c @@ -413,8 +413,7 @@ int afs_server_find_by_peer(const struct rxrpc_peer *peer, /* we found it in the graveyard - resurrect it */ found_dead_server: - list_del(&server->link); - list_add_tail(&server->link, &cell->sv_list); + list_move_tail(&server->link, &cell->sv_list); afs_get_server(server); afs_kafstimod_del_timer(&server->timeout); spin_unlock(&cell->sv_gylock); diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c index 7ac07d0d47b..f09a794f248 100644 --- a/fs/afs/kafsasyncd.c +++ b/fs/afs/kafsasyncd.c @@ -136,8 +136,7 @@ static int kafsasyncd(void *arg) if (!list_empty(&kafsasyncd_async_attnq)) { op = list_entry(kafsasyncd_async_attnq.next, struct afs_async_op, link); - list_del(&op->link); - list_add_tail(&op->link, + list_move_tail(&op->link, &kafsasyncd_async_busyq); } @@ -204,8 +203,7 @@ void afs_kafsasyncd_begin_op(struct afs_async_op *op) init_waitqueue_entry(&op->waiter, kafsasyncd_task); add_wait_queue(&op->call->waitq, &op->waiter); - list_del(&op->link); - list_add_tail(&op->link, &kafsasyncd_async_busyq); + list_move_tail(&op->link, &kafsasyncd_async_busyq); spin_unlock(&kafsasyncd_async_lock); @@ -223,8 +221,7 @@ void afs_kafsasyncd_attend_op(struct afs_async_op *op) spin_lock(&kafsasyncd_async_lock); - list_del(&op->link); - list_add_tail(&op->link, &kafsasyncd_async_attnq); + list_move_tail(&op->link, &kafsasyncd_async_attnq); spin_unlock(&kafsasyncd_async_lock); diff --git a/fs/afs/server.c b/fs/afs/server.c index 62b093aa41c..22afaae1a4c 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -123,8 +123,7 @@ int afs_server_lookup(struct afs_cell *cell, const struct in_addr *addr, resurrect_server: 
_debug("resurrecting server"); - list_del(&zombie->link); - list_add_tail(&zombie->link, &cell->sv_list); + list_move_tail(&zombie->link, &cell->sv_list); afs_get_server(zombie); afs_kafstimod_del_timer(&zombie->timeout); spin_unlock(&cell->sv_gylock); @@ -168,8 +167,7 @@ void afs_put_server(struct afs_server *server) } spin_lock(&cell->sv_gylock); - list_del(&server->link); - list_add_tail(&server->link, &cell->sv_graveyard); + list_move_tail(&server->link, &cell->sv_graveyard); /* time out in 10 secs */ afs_kafstimod_add_timer(&server->timeout, 10 * HZ); diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index eced20618ec..331f730a1fb 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -326,8 +326,7 @@ int afs_vlocation_lookup(struct afs_cell *cell, /* found in the graveyard - resurrect */ _debug("found in graveyard"); atomic_inc(&vlocation->usage); - list_del(&vlocation->link); - list_add_tail(&vlocation->link, &cell->vl_list); + list_move_tail(&vlocation->link, &cell->vl_list); spin_unlock(&cell->vl_gylock); afs_kafstimod_del_timer(&vlocation->timeout); @@ -478,8 +477,7 @@ static void __afs_put_vlocation(struct afs_vlocation *vlocation) } /* move to graveyard queue */ - list_del(&vlocation->link); - list_add_tail(&vlocation->link,&cell->vl_graveyard); + list_move_tail(&vlocation->link,&cell->vl_graveyard); /* remove from pending timeout queue (refcounted if actually being * updated) */ diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c index 9867fef3261..cf62da5d782 100644 --- a/fs/afs/vnode.c +++ b/fs/afs/vnode.c @@ -104,8 +104,7 @@ static void afs_vnode_finalise_status_update(struct afs_vnode *vnode, vnode->cb_expiry * HZ); spin_lock(&afs_cb_hash_lock); - list_del(&vnode->cb_hash_link); - list_add_tail(&vnode->cb_hash_link, + list_move_tail(&vnode->cb_hash_link, &afs_cb_hash(server, &vnode->fid)); spin_unlock(&afs_cb_hash_lock); @@ -641,7 +641,7 @@ static inline int __queue_kicked_iocb(struct kiocb *iocb) * invoked both for initial i/o submission and * subsequent retries via the aio_kick_handler. * Expects to be invoked with iocb->ki_ctx->lock - * already held. The lock is released and reaquired + * already held. The lock is released and reacquired * as needed during processing. 
* * Calls the iocb retry method (already setup for the diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 4456d1daa40..8dbd44f10e9 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -376,8 +376,7 @@ next: DPRINTK("returning %p %.*s", expired, (int)expired->d_name.len, expired->d_name.name); spin_lock(&dcache_lock); - list_del(&expired->d_parent->d_subdirs); - list_add(&expired->d_parent->d_subdirs, &expired->d_u.d_child); + list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child); spin_unlock(&dcache_lock); return expired; } diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index b1c902e319c..c94d52eafd1 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -510,7 +510,7 @@ static int load_flat_file(struct linux_binprm * bprm, } /* OK, This is the point of no return */ - set_personality(PER_LINUX); + set_personality(PER_LINUX_32BIT); } /* diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c index 6c6771db36d..7caee8d8ea3 100644 --- a/fs/coda/psdev.c +++ b/fs/coda/psdev.c @@ -259,7 +259,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf, /* If request was not a signal, enqueue and don't free */ if (!(req->uc_flags & REQ_ASYNC)) { req->uc_flags |= REQ_READ; - list_add(&(req->uc_chain), vcp->vc_processing.prev); + list_add_tail(&(req->uc_chain), &vcp->vc_processing); goto out; } diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c index b040eba13a7..a5b5e631ba6 100644 --- a/fs/coda/upcall.c +++ b/fs/coda/upcall.c @@ -725,7 +725,7 @@ static int coda_upcall(struct coda_sb_info *sbi, ((union inputArgs *)buffer)->ih.unique = req->uc_unique; /* Append msg to pending queue and poke Venus. */ - list_add(&(req->uc_chain), vcommp->vc_pending.prev); + list_add_tail(&(req->uc_chain), &vcommp->vc_pending); wake_up_interruptible(&vcommp->vc_waitq); /* We can be interrupted while we wait for Venus to process diff --git a/fs/compat.c b/fs/compat.c index 7e7e5bc4f3c..e31e9cf9664 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -55,6 +55,20 @@ extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat); +int compat_log = 1; + +int compat_printk(const char *fmt, ...) +{ + va_list ap; + int ret; + if (!compat_log) + return 0; + va_start(ap, fmt); + ret = vprintk(fmt, ap); + va_end(ap); + return ret; +} + /* * Not all architectures have sys_utime, so implement this in terms * of sys_utimes. 
@@ -359,7 +373,7 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd, sprintf(buf,"'%c'", (cmd>>24) & 0x3f); if (!isprint(buf[1])) sprintf(buf, "%02x", buf[1]); - printk("ioctl32(%s:%d): Unknown cmd fd(%d) " + compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) " "cmd(%08x){%s} arg(%08x) on %s\n", current->comm, current->pid, (int)fd, (unsigned int)cmd, buf, diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 9eb9824dd33..d8ecfedef18 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -80,6 +80,7 @@ #include <net/bluetooth/rfcomm.h> #include <linux/capi.h> +#include <linux/gigaset_dev.h> #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 5f952187fc5..207f8006fd6 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1009,8 +1009,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir /* fallthrough */ default: if (filp->f_pos == 2) { - list_del(q); - list_add(q, &parent_sd->s_children); + list_move(q, &parent_sd->s_children); } for (p=q->next; p!= &parent_sd->s_children; p=p->next) { struct configfs_dirent *next; @@ -1033,8 +1032,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir dt_type(next)) < 0) return 0; - list_del(q); - list_add(q, p); + list_move(q, p); p = q; filp->f_pos++; } diff --git a/fs/dcache.c b/fs/dcache.c index b85fda36053..48b44a714b3 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -522,8 +522,7 @@ void shrink_dcache_sb(struct super_block * sb) dentry = list_entry(tmp, struct dentry, d_lru); if (dentry->d_sb != sb) continue; - list_del(tmp); - list_add(tmp, &dentry_unused); + list_move(tmp, &dentry_unused); } /* @@ -638,7 +637,7 @@ resume: * of the unused list for prune_dcache */ if (!atomic_read(&dentry->d_count)) { - list_add(&dentry->d_lru, dentry_unused.prev); + list_add_tail(&dentry->d_lru, &dentry_unused); dentry_stat.nr_unused++; found++; } diff --git a/fs/dquot.c b/fs/dquot.c index 81d87a413c6..0122a279106 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -250,7 +250,7 @@ static inline struct dquot *find_dquot(unsigned int hashent, struct super_block /* Add a dquot to the tail of the free list */ static inline void put_dquot_last(struct dquot *dquot) { - list_add(&dquot->dq_free, free_dquots.prev); + list_add_tail(&dquot->dq_free, &free_dquots); dqstats.free_dquots++; } @@ -266,7 +266,7 @@ static inline void put_inuse(struct dquot *dquot) { /* We add to the back of inuse list so we don't have to restart * when traversing this list and we block */ - list_add(&dquot->dq_inuse, inuse_list.prev); + list_add_tail(&dquot->dq_inuse, &inuse_list); dqstats.allocated_dquots++; } diff --git a/fs/exec.c b/fs/exec.c index 0b88bf64614..c8494f513ea 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -666,8 +666,6 @@ static int de_thread(struct task_struct *tsk) * and to assume its PID: */ if (!thread_group_leader(current)) { - struct dentry *proc_dentry1, *proc_dentry2; - /* * Wait for the thread group leader to be a zombie. 
* It should already be zombie at this point, most @@ -689,10 +687,6 @@ static int de_thread(struct task_struct *tsk) */ current->start_time = leader->start_time; - spin_lock(&leader->proc_lock); - spin_lock(¤t->proc_lock); - proc_dentry1 = proc_pid_unhash(current); - proc_dentry2 = proc_pid_unhash(leader); write_lock_irq(&tasklist_lock); BUG_ON(leader->tgid != current->tgid); @@ -713,7 +707,7 @@ static int de_thread(struct task_struct *tsk) attach_pid(current, PIDTYPE_PID, current->pid); attach_pid(current, PIDTYPE_PGID, current->signal->pgrp); attach_pid(current, PIDTYPE_SID, current->signal->session); - list_add_tail_rcu(¤t->tasks, &init_task.tasks); + list_replace_rcu(&leader->tasks, ¤t->tasks); current->group_leader = current; leader->group_leader = current; @@ -721,7 +715,6 @@ static int de_thread(struct task_struct *tsk) /* Reduce leader to a thread */ detach_pid(leader, PIDTYPE_PGID); detach_pid(leader, PIDTYPE_SID); - list_del_init(&leader->tasks); current->exit_signal = SIGCHLD; @@ -729,10 +722,6 @@ static int de_thread(struct task_struct *tsk) leader->exit_state = EXIT_DEAD; write_unlock_irq(&tasklist_lock); - spin_unlock(&leader->proc_lock); - spin_unlock(¤t->proc_lock); - proc_pid_flush(proc_dentry1); - proc_pid_flush(proc_dentry2); } /* @@ -1379,67 +1368,102 @@ static void format_corename(char *corename, const char *pattern, long signr) *out_ptr = 0; } -static void zap_threads (struct mm_struct *mm) +static void zap_process(struct task_struct *start) { - struct task_struct *g, *p; - struct task_struct *tsk = current; - struct completion *vfork_done = tsk->vfork_done; - int traced = 0; + struct task_struct *t; - /* - * Make sure nobody is waiting for us to release the VM, - * otherwise we can deadlock when we wait on each other - */ - if (vfork_done) { - tsk->vfork_done = NULL; - complete(vfork_done); - } + start->signal->flags = SIGNAL_GROUP_EXIT; + start->signal->group_stop_count = 0; - read_lock(&tasklist_lock); - do_each_thread(g,p) - if (mm == p->mm && p != tsk) { - force_sig_specific(SIGKILL, p); - mm->core_waiters++; - if (unlikely(p->ptrace) && - unlikely(p->parent->mm == mm)) - traced = 1; + t = start; + do { + if (t != current && t->mm) { + t->mm->core_waiters++; + sigaddset(&t->pending.signal, SIGKILL); + signal_wake_up(t, 1); } - while_each_thread(g,p); + } while ((t = next_thread(t)) != start); +} - read_unlock(&tasklist_lock); +static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm, + int exit_code) +{ + struct task_struct *g, *p; + unsigned long flags; + int err = -EAGAIN; + + spin_lock_irq(&tsk->sighand->siglock); + if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) { + tsk->signal->group_exit_code = exit_code; + zap_process(tsk); + err = 0; + } + spin_unlock_irq(&tsk->sighand->siglock); + if (err) + return err; - if (unlikely(traced)) { - /* - * We are zapping a thread and the thread it ptraces. - * If the tracee went into a ptrace stop for exit tracing, - * we could deadlock since the tracer is waiting for this - * coredump to finish. Detach them so they can both die. 
- */ - write_lock_irq(&tasklist_lock); - do_each_thread(g,p) { - if (mm == p->mm && p != tsk && - p->ptrace && p->parent->mm == mm) { - __ptrace_detach(p, 0); + if (atomic_read(&mm->mm_users) == mm->core_waiters + 1) + goto done; + + rcu_read_lock(); + for_each_process(g) { + if (g == tsk->group_leader) + continue; + + p = g; + do { + if (p->mm) { + if (p->mm == mm) { + /* + * p->sighand can't disappear, but + * may be changed by de_thread() + */ + lock_task_sighand(p, &flags); + zap_process(p); + unlock_task_sighand(p, &flags); + } + break; } - } while_each_thread(g,p); - write_unlock_irq(&tasklist_lock); + } while ((p = next_thread(p)) != g); } + rcu_read_unlock(); +done: + return mm->core_waiters; } -static void coredump_wait(struct mm_struct *mm) +static int coredump_wait(int exit_code) { - DECLARE_COMPLETION(startup_done); + struct task_struct *tsk = current; + struct mm_struct *mm = tsk->mm; + struct completion startup_done; + struct completion *vfork_done; int core_waiters; + init_completion(&mm->core_done); + init_completion(&startup_done); mm->core_startup_done = &startup_done; - zap_threads(mm); - core_waiters = mm->core_waiters; + core_waiters = zap_threads(tsk, mm, exit_code); up_write(&mm->mmap_sem); + if (unlikely(core_waiters < 0)) + goto fail; + + /* + * Make sure nobody is waiting for us to release the VM, + * otherwise we can deadlock when we wait on each other + */ + vfork_done = tsk->vfork_done; + if (vfork_done) { + tsk->vfork_done = NULL; + complete(vfork_done); + } + if (core_waiters) wait_for_completion(&startup_done); +fail: BUG_ON(mm->core_waiters); + return core_waiters; } int do_coredump(long signr, int exit_code, struct pt_regs * regs) @@ -1473,22 +1497,9 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs) } mm->dumpable = 0; - retval = -EAGAIN; - spin_lock_irq(¤t->sighand->siglock); - if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) { - current->signal->flags = SIGNAL_GROUP_EXIT; - current->signal->group_exit_code = exit_code; - current->signal->group_stop_count = 0; - retval = 0; - } - spin_unlock_irq(¤t->sighand->siglock); - if (retval) { - up_write(&mm->mmap_sem); + retval = coredump_wait(exit_code); + if (retval < 0) goto fail; - } - - init_completion(&mm->core_done); - coredump_wait(mm); /* * Clear any false indication of pending signals that might diff --git a/fs/ext3/super.c b/fs/ext3/super.c index b2891cc29db..b7483360a2d 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -630,7 +630,7 @@ enum { Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro, Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, + Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh, Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, @@ -666,6 +666,7 @@ static match_table_t tokens = { {Opt_noreservation, "noreservation"}, {Opt_noload, "noload"}, {Opt_nobh, "nobh"}, + {Opt_bh, "bh"}, {Opt_commit, "commit=%u"}, {Opt_journal_update, "journal=update"}, {Opt_journal_inum, "journal=%u"}, @@ -1014,6 +1015,9 @@ clear_qf_name: case Opt_nobh: set_opt(sbi->s_mount_opt, NOBH); break; + case Opt_bh: + clear_opt(sbi->s_mount_opt, NOBH); + break; default: printk (KERN_ERR "EXT3-fs: Unrecognized mount option \"%s\" " diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 
1862e8bc101..b8886f048ea 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -53,8 +53,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, if (!instr) { printk(KERN_WARNING "kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n"); spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->erasing_size -= c->sector_size; c->dirty_size += c->sector_size; jeb->dirty_size = c->sector_size; @@ -86,8 +85,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c, /* Erase failed immediately. Refile it on the list */ D1(printk(KERN_DEBUG "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", jeb->offset, ret)); spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->erasing_size -= c->sector_size; c->dirty_size += c->sector_size; jeb->dirty_size = c->sector_size; @@ -161,8 +159,7 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo { D1(printk(KERN_DEBUG "Erase completed successfully at 0x%08x\n", jeb->offset)); spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add_tail(&jeb->list, &c->erase_complete_list); + list_move_tail(&jeb->list, &c->erase_complete_list); spin_unlock(&c->erase_completion_lock); /* Ensure that kupdated calls us again to mark them clean */ jffs2_erase_pending_trigger(c); @@ -178,8 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { /* We'd like to give this block another try. */ spin_lock(&c->erase_completion_lock); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->erasing_size -= c->sector_size; c->dirty_size += c->sector_size; jeb->dirty_size = c->sector_size; @@ -191,8 +187,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock spin_lock(&c->erase_completion_lock); c->erasing_size -= c->sector_size; c->bad_size += c->sector_size; - list_del(&jeb->list); - list_add(&jeb->list, &c->bad_list); + list_move(&jeb->list, &c->bad_list); c->nr_erasing_blocks--; spin_unlock(&c->erase_completion_lock); wake_up(&c->erase_wait); diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c index 8bedfd2ff68..ac0c350ed7d 100644 --- a/fs/jffs2/nodemgmt.c +++ b/fs/jffs2/nodemgmt.c @@ -211,8 +211,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c) struct jffs2_eraseblock *ejeb; ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list); - list_del(&ejeb->list); - list_add_tail(&ejeb->list, &c->erase_pending_list); + list_move_tail(&ejeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; jffs2_erase_pending_trigger(c); D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n", diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c index 0b02fc79e4d..be1acc3dad9 100644 --- a/fs/jffs2/summary.c +++ b/fs/jffs2/summary.c @@ -43,7 +43,7 @@ int jffs2_sum_init(struct jffs2_sb_info *c) return -ENOMEM; } - dbg_summary("returned succesfully\n"); + dbg_summary("returned successfully\n"); return 0; } diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index a7f153f79ec..b9b700730df 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -495,8 +495,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c) /* Fix up the original jeb now it's on the bad_list 
*/ if (first_raw == jeb->first_node) { D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset)); - list_del(&jeb->list); - list_add(&jeb->list, &c->erase_pending_list); + list_move(&jeb->list, &c->erase_pending_list); c->nr_erasing_blocks++; jffs2_erase_pending_trigger(c); } diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index 5549378358b..4d52593a5fc 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -126,7 +126,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr) /* allocate the disk blocks for the extent. initially, extBalloc() * will try to allocate disk blocks for the requested size (xlen). - * if this fails (xlen contigious free blocks not avaliable), it'll + * if this fails (xlen contiguous free blocks not avaliable), it'll * try to allocate a smaller number of blocks (producing a smaller * extent), with this smaller number of blocks consisting of the * requested number of blocks rounded down to the next smaller @@ -493,7 +493,7 @@ int extFill(struct inode *ip, xad_t * xp) * * initially, we will try to allocate disk blocks for the * requested size (nblocks). if this fails (nblocks - * contigious free blocks not avaliable), we'll try to allocate + * contiguous free blocks not avaliable), we'll try to allocate * a smaller number of blocks (producing a smaller extent), with * this smaller number of blocks consisting of the requested * number of blocks rounded down to the next smaller power of 2 @@ -529,7 +529,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) /* get the number of blocks to initially attempt to allocate. * we'll first try the number of blocks requested unless this - * number is greater than the maximum number of contigious free + * number is greater than the maximum number of contiguous free * blocks in the map. in that case, we'll start off with the * maximum free. */ @@ -586,7 +586,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno) * in place. if this fails, we'll try to move the extent * to a new set of blocks. if moving the extent, we initially * will try to allocate disk blocks for the requested size - * (nnew). if this fails (nnew contigious free blocks not + * (nnew). 
if this fails (new contiguous free blocks not * avaliable), we'll try to allocate a smaller number of * blocks (producing a smaller extent), with this smaller * number of blocks consisting of the requested number of diff --git a/fs/libfs.c b/fs/libfs.c index fc785d8befb..ac02ea602c3 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -149,10 +149,9 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) /* fallthrough */ default: spin_lock(&dcache_lock); - if (filp->f_pos == 2) { - list_del(q); - list_add(q, &dentry->d_subdirs); - } + if (filp->f_pos == 2) + list_move(q, &dentry->d_subdirs); + for (p=q->next; p != &dentry->d_subdirs; p=p->next) { struct dentry *next; next = list_entry(p, struct dentry, d_u.d_child); @@ -164,8 +163,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir) return 0; spin_lock(&dcache_lock); /* next is still alive */ - list_del(q); - list_add(q, p); + list_move(q, p); p = q; filp->f_pos++; } diff --git a/fs/namespace.c b/fs/namespace.c index 866430bb024..b3ed212ea41 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -526,10 +526,8 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill) { struct vfsmount *p; - for (p = mnt; p; p = next_mnt(p, mnt)) { - list_del(&p->mnt_hash); - list_add(&p->mnt_hash, kill); - } + for (p = mnt; p; p = next_mnt(p, mnt)) + list_move(&p->mnt_hash, kill); if (propagate) propagate_umount(kill); diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 96c7578cbe1..1630b5670dc 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -529,8 +529,7 @@ move_to_confirmed(struct nfs4_client *clp) dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp); list_del_init(&clp->cl_strhash); - list_del_init(&clp->cl_idhash); - list_add(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); + list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]); strhashval = clientstr_hashval(clp->cl_recdir); list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]); renew_client(clp); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index d852ebb538e..fdf7cf3dfad 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -103,8 +103,7 @@ nfsd_cache_shutdown(void) static void lru_put_end(struct svc_cacherep *rp) { - list_del(&rp->c_lru); - list_add_tail(&rp->c_lru, &lru_head); + list_move_tail(&rp->c_lru, &lru_head); } /* diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index 355593dd8ef..42775e2bbe2 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -197,12 +197,14 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, lock->ml.node == dlm->node_num ? "master" : "remote"); memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN); - } else if (lksb->flags & DLM_LKSB_PUT_LVB) { - mlog(0, "setting lvb from lockres for %s node\n", - lock->ml.node == dlm->node_num ? "master" : - "remote"); - memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN); } + /* Do nothing for lvb put requests - they should be done in + * place when the lock is downconverted - otherwise we risk + * racing gets and puts which could result in old lvb data + * being propagated. We leave the put flag set and clear it + * here. In the future we might want to clear it at the time + * the put is actually done. + */ spin_unlock(&res->spinlock); } @@ -381,8 +383,7 @@ do_ast: ret = DLM_NORMAL; if (past->type == DLM_AST) { /* do not alter lock refcount. switching lists. 
*/ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); mlog(0, "ast: adding to granted list... type=%d, " "convert_type=%d\n", lock->ml.type, lock->ml.convert_type); if (lock->ml.convert_type != LKM_IVMODE) { diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h index 88cc43df18f..9bdc9cf6599 100644 --- a/fs/ocfs2/dlm/dlmcommon.h +++ b/fs/ocfs2/dlm/dlmcommon.h @@ -37,7 +37,17 @@ #define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes #define DLM_THREAD_MS 200 // flush at least every 200 ms -#define DLM_HASH_BUCKETS (PAGE_SIZE / sizeof(struct hlist_head)) +#define DLM_HASH_SIZE_DEFAULT (1 << 14) +#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE +# define DLM_HASH_PAGES 1 +#else +# define DLM_HASH_PAGES (DLM_HASH_SIZE_DEFAULT / PAGE_SIZE) +#endif +#define DLM_BUCKETS_PER_PAGE (PAGE_SIZE / sizeof(struct hlist_head)) +#define DLM_HASH_BUCKETS (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE) + +/* Intended to make it easier for us to switch out hash functions */ +#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l) enum dlm_ast_type { DLM_AST = 0, @@ -61,7 +71,8 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len) return 0; } -#define DLM_RECO_STATE_ACTIVE 0x0001 +#define DLM_RECO_STATE_ACTIVE 0x0001 +#define DLM_RECO_STATE_FINALIZE 0x0002 struct dlm_recovery_ctxt { @@ -85,7 +96,7 @@ enum dlm_ctxt_state { struct dlm_ctxt { struct list_head list; - struct hlist_head *lockres_hash; + struct hlist_head **lockres_hash; struct list_head dirty_list; struct list_head purge_list; struct list_head pending_asts; @@ -120,6 +131,7 @@ struct dlm_ctxt struct o2hb_callback_func dlm_hb_down; struct task_struct *dlm_thread_task; struct task_struct *dlm_reco_thread_task; + struct workqueue_struct *dlm_worker; wait_queue_head_t dlm_thread_wq; wait_queue_head_t dlm_reco_thread_wq; wait_queue_head_t ast_wq; @@ -132,6 +144,11 @@ struct dlm_ctxt struct list_head dlm_eviction_callbacks; }; +static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned i) +{ + return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE); +} + /* these keventd work queue items are for less-frequently * called functions that cannot be directly called from the * net message handlers for some reason, usually because @@ -216,20 +233,29 @@ struct dlm_lock_resource /* WARNING: Please see the comment in dlm_init_lockres before * adding fields here. */ struct hlist_node hash_node; + struct qstr lockname; struct kref refs; - /* please keep these next 3 in this order - * some funcs want to iterate over all lists */ + /* + * Please keep granted, converting, and blocked in this order, + * as some funcs want to iterate over all lists. + * + * All four lists are protected by the hash's reference. + */ struct list_head granted; struct list_head converting; struct list_head blocked; + struct list_head purge; + /* + * These two lists require you to hold an additional reference + * while they are on the list. + */ struct list_head dirty; struct list_head recovering; // dlm_recovery_ctxt.resources list /* unused lock resources have their last_used stamped and are * put on a list for the dlm thread to run. 
*/ - struct list_head purge; unsigned long last_used; unsigned migration_pending:1; @@ -238,7 +264,6 @@ struct dlm_lock_resource wait_queue_head_t wq; u8 owner; //node which owns the lock resource, or unknown u16 state; - struct qstr lockname; char lvb[DLM_LVB_LEN]; }; @@ -300,6 +325,15 @@ enum dlm_lockres_list { DLM_BLOCKED_LIST }; +static inline int dlm_lvb_is_empty(char *lvb) +{ + int i; + for (i=0; i<DLM_LVB_LEN; i++) + if (lvb[i]) + return 0; + return 1; +} + static inline struct list_head * dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx) { @@ -609,7 +643,8 @@ struct dlm_finalize_reco { u8 node_idx; u8 dead_node; - __be16 pad1; + u8 flags; + u8 pad1; __be32 pad2; }; @@ -676,6 +711,7 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm); void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); +int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); void dlm_put(struct dlm_ctxt *dlm); struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); @@ -687,14 +723,20 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); void dlm_purge_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *lockres); -void dlm_lockres_get(struct dlm_lock_resource *res); +static inline void dlm_lockres_get(struct dlm_lock_resource *res) +{ + /* This is called on every lookup, so it might be worth + * inlining. */ + kref_get(&res->refs); +} void dlm_lockres_put(struct dlm_lock_resource *res); void __dlm_unhash_lockres(struct dlm_lock_resource *res); void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res); struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, - unsigned int len); + unsigned int len, + unsigned int hash); struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, const char *name, unsigned int len); @@ -819,6 +861,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node); int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock); +int __dlm_lockres_unused(struct dlm_lock_resource *res); static inline const char * dlm_lock_mode_name(int mode) { diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c index 8285228d9e3..c764dc8e40a 100644 --- a/fs/ocfs2/dlm/dlmconvert.c +++ b/fs/ocfs2/dlm/dlmconvert.c @@ -214,6 +214,9 @@ grant: if (lock->ml.node == dlm->node_num) mlog(0, "doing in-place convert for nonlocal lock\n"); lock->ml.type = type; + if (lock->lksb->flags & DLM_LKSB_PUT_LVB) + memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN); + status = DLM_NORMAL; *call_ast = 1; goto unlock_exit; @@ -231,8 +234,7 @@ switch_queues: lock->ml.convert_type = type; /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->converting); + list_move_tail(&lock->list, &res->converting); unlock_exit: spin_unlock(&lock->spinlock); @@ -248,8 +250,7 @@ void dlm_revert_pending_convert(struct dlm_lock_resource *res, struct dlm_lock *lock) { /* do not alter lock refcount. switching lists. 
*/ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); lock->ml.convert_type = LKM_IVMODE; lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB); } @@ -294,8 +295,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm, res->state |= DLM_LOCK_RES_IN_PROGRESS; /* move lock to local convert queue */ /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->converting); + list_move_tail(&lock->list, &res->converting); lock->convert_pending = 1; lock->ml.convert_type = type; @@ -464,6 +464,12 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) } spin_lock(&res->spinlock); + status = __dlm_lockres_state_to_status(res); + if (status != DLM_NORMAL) { + spin_unlock(&res->spinlock); + dlm_error(status); + goto leave; + } list_for_each(iter, &res->granted) { lock = list_entry(iter, struct dlm_lock, list); if (lock->ml.cookie == cnv->cookie && @@ -473,6 +479,21 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data) } lock = NULL; } + if (!lock) { + __dlm_print_one_lock_resource(res); + list_for_each(iter, &res->granted) { + lock = list_entry(iter, struct dlm_lock, list); + if (lock->ml.node == cnv->node_idx) { + mlog(ML_ERROR, "There is something here " + "for node %u, lock->ml.cookie=%llu, " + "cnv->cookie=%llu\n", cnv->node_idx, + (unsigned long long)lock->ml.cookie, + (unsigned long long)cnv->cookie); + break; + } + } + lock = NULL; + } spin_unlock(&res->spinlock); if (!lock) { status = DLM_IVLOCKID; diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index c7eae5d3324..3f6c8d88f7a 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -37,10 +37,8 @@ #include "dlmapi.h" #include "dlmcommon.h" -#include "dlmdebug.h" #include "dlmdomain.h" -#include "dlmdebug.h" #define MLOG_MASK_PREFIX ML_DLM #include "cluster/masklog.h" @@ -120,6 +118,7 @@ void dlm_print_one_lock(struct dlm_lock *lockid) } EXPORT_SYMBOL_GPL(dlm_print_one_lock); +#if 0 void dlm_dump_lock_resources(struct dlm_ctxt *dlm) { struct dlm_lock_resource *res; @@ -136,12 +135,13 @@ void dlm_dump_lock_resources(struct dlm_ctxt *dlm) spin_lock(&dlm->spinlock); for (i=0; i<DLM_HASH_BUCKETS; i++) { - bucket = &(dlm->lockres_hash[i]); + bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, iter, bucket, hash_node) dlm_print_one_lock_resource(res); } spin_unlock(&dlm->spinlock); } +#endif /* 0 */ static const char *dlm_errnames[] = { [DLM_NORMAL] = "DLM_NORMAL", diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h deleted file mode 100644 index 6858510c3cc..00000000000 --- a/fs/ocfs2/dlm/dlmdebug.h +++ /dev/null @@ -1,30 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * - * dlmdebug.h - * - * Copyright (C) 2004 Oracle. All rights reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. 
- * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the - * Free Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 021110-1307, USA. - * - */ - -#ifndef DLMDEBUG_H -#define DLMDEBUG_H - -void dlm_dump_lock_resources(struct dlm_ctxt *dlm); - -#endif diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 8f3a9e3106f..ba27c5c5e95 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -41,7 +41,6 @@ #include "dlmapi.h" #include "dlmcommon.h" -#include "dlmdebug.h" #include "dlmdomain.h" #include "dlmver.h" @@ -49,6 +48,33 @@ #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN) #include "cluster/masklog.h" +static void dlm_free_pagevec(void **vec, int pages) +{ + while (pages--) + free_page((unsigned long)vec[pages]); + kfree(vec); +} + +static void **dlm_alloc_pagevec(int pages) +{ + void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL); + int i; + + if (!vec) + return NULL; + + for (i = 0; i < pages; i++) + if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL))) + goto out_free; + + mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n", + pages, DLM_HASH_PAGES, (unsigned long)DLM_BUCKETS_PER_PAGE); + return vec; +out_free: + dlm_free_pagevec(vec, i); + return NULL; +} + /* * * spinlock lock ordering: if multiple locks are needed, obey this ordering: @@ -90,8 +116,7 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm, assert_spin_locked(&dlm->spinlock); q = &res->lockname; - q->hash = full_name_hash(q->name, q->len); - bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]); + bucket = dlm_lockres_hash(dlm, q->hash); /* get a reference for our hashtable */ dlm_lockres_get(res); @@ -100,34 +125,32 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm, } struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm, - const char *name, - unsigned int len) + const char *name, + unsigned int len, + unsigned int hash) { - unsigned int hash; - struct hlist_node *iter; - struct dlm_lock_resource *tmpres=NULL; struct hlist_head *bucket; + struct hlist_node *list; mlog_entry("%.*s\n", len, name); assert_spin_locked(&dlm->spinlock); - hash = full_name_hash(name, len); - - bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]); - - /* check for pre-existing lock */ - hlist_for_each(iter, bucket) { - tmpres = hlist_entry(iter, struct dlm_lock_resource, hash_node); - if (tmpres->lockname.len == len && - memcmp(tmpres->lockname.name, name, len) == 0) { - dlm_lockres_get(tmpres); - break; - } + bucket = dlm_lockres_hash(dlm, hash); - tmpres = NULL; + hlist_for_each(list, bucket) { + struct dlm_lock_resource *res = hlist_entry(list, + struct dlm_lock_resource, hash_node); + if (res->lockname.name[0] != name[0]) + continue; + if (unlikely(res->lockname.len != len)) + continue; + if (memcmp(res->lockname.name + 1, name + 1, len - 1)) + continue; + dlm_lockres_get(res); + return res; } - return tmpres; + return NULL; } struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, @@ -135,9 +158,10 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm, unsigned int len) { struct dlm_lock_resource *res; + unsigned int hash = dlm_lockid_hash(name, len); spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, name, len); + res = __dlm_lookup_lockres(dlm, name, len, hash); spin_unlock(&dlm->spinlock); return res; } @@ -194,7 +218,7 @@ static int dlm_wait_on_domain_helper(const char *domain) static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm) 
{ if (dlm->lockres_hash) - free_page((unsigned long) dlm->lockres_hash); + dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); if (dlm->name) kfree(dlm->name); @@ -278,11 +302,21 @@ int dlm_domain_fully_joined(struct dlm_ctxt *dlm) return ret; } +static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm) +{ + if (dlm->dlm_worker) { + flush_workqueue(dlm->dlm_worker); + destroy_workqueue(dlm->dlm_worker); + dlm->dlm_worker = NULL; + } +} + static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm) { dlm_unregister_domain_handlers(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); + dlm_destroy_dlm_worker(dlm); /* We've left the domain. Now we can take ourselves out of the * list and allow the kref stuff to help us free the @@ -304,8 +338,8 @@ static void dlm_migrate_all_locks(struct dlm_ctxt *dlm) restart: spin_lock(&dlm->spinlock); for (i = 0; i < DLM_HASH_BUCKETS; i++) { - while (!hlist_empty(&dlm->lockres_hash[i])) { - res = hlist_entry(dlm->lockres_hash[i].first, + while (!hlist_empty(dlm_lockres_hash(dlm, i))) { + res = hlist_entry(dlm_lockres_hash(dlm, i)->first, struct dlm_lock_resource, hash_node); /* need reference when manually grabbing lockres */ dlm_lockres_get(res); @@ -1126,6 +1160,13 @@ static int dlm_join_domain(struct dlm_ctxt *dlm) goto bail; } + dlm->dlm_worker = create_singlethread_workqueue("dlm_wq"); + if (!dlm->dlm_worker) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + do { unsigned int backoff; status = dlm_try_to_join_domain(dlm); @@ -1166,6 +1207,7 @@ bail: dlm_unregister_domain_handlers(dlm); dlm_complete_thread(dlm); dlm_complete_recovery_thread(dlm); + dlm_destroy_dlm_worker(dlm); } return status; @@ -1191,7 +1233,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, goto leave; } - dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL); + dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES); if (!dlm->lockres_hash) { mlog_errno(-ENOMEM); kfree(dlm->name); @@ -1200,8 +1242,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, goto leave; } - for (i=0; i<DLM_HASH_BUCKETS; i++) - INIT_HLIST_HEAD(&dlm->lockres_hash[i]); + for (i = 0; i < DLM_HASH_BUCKETS; i++) + INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i)); strcpy(dlm->name, domain); dlm->key = key; @@ -1231,6 +1273,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->dlm_thread_task = NULL; dlm->dlm_reco_thread_task = NULL; + dlm->dlm_worker = NULL; init_waitqueue_head(&dlm->dlm_thread_wq); init_waitqueue_head(&dlm->dlm_reco_thread_wq); init_waitqueue_head(&dlm->reco.event); diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c index 7273d9fa6ba..033ad170123 100644 --- a/fs/ocfs2/dlm/dlmfs.c +++ b/fs/ocfs2/dlm/dlmfs.c @@ -116,7 +116,7 @@ static int dlmfs_file_open(struct inode *inode, * doesn't make sense for LVB writes. 
*/ file->f_flags &= ~O_APPEND; - fp = kmalloc(sizeof(*fp), GFP_KERNEL); + fp = kmalloc(sizeof(*fp), GFP_NOFS); if (!fp) { status = -ENOMEM; goto bail; @@ -196,7 +196,7 @@ static ssize_t dlmfs_file_read(struct file *filp, else readlen = count - *ppos; - lvb_buf = kmalloc(readlen, GFP_KERNEL); + lvb_buf = kmalloc(readlen, GFP_NOFS); if (!lvb_buf) return -ENOMEM; @@ -240,7 +240,7 @@ static ssize_t dlmfs_file_write(struct file *filp, else writelen = count - *ppos; - lvb_buf = kmalloc(writelen, GFP_KERNEL); + lvb_buf = kmalloc(writelen, GFP_NOFS); if (!lvb_buf) return -ENOMEM; diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c index 6fea28318d6..d6f89577e25 100644 --- a/fs/ocfs2/dlm/dlmlock.c +++ b/fs/ocfs2/dlm/dlmlock.c @@ -201,6 +201,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, struct dlm_lock *lock, int flags) { enum dlm_status status = DLM_DENIED; + int lockres_changed = 1; mlog_entry("type=%d\n", lock->ml.type); mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len, @@ -226,8 +227,25 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, res->state &= ~DLM_LOCK_RES_IN_PROGRESS; lock->lock_pending = 0; if (status != DLM_NORMAL) { - if (status != DLM_NOTQUEUED) + if (status == DLM_RECOVERING && + dlm_is_recovery_lock(res->lockname.name, + res->lockname.len)) { + /* recovery lock was mastered by dead node. + * we need to have calc_usage shoot down this + * lockres and completely remaster it. */ + mlog(0, "%s: recovery lock was owned by " + "dead node %u, remaster it now.\n", + dlm->name, res->owner); + } else if (status != DLM_NOTQUEUED) { + /* + * DO NOT call calc_usage, as this would unhash + * the remote lockres before we ever get to use + * it. treat as if we never made any change to + * the lockres. + */ + lockres_changed = 0; dlm_error(status); + } dlm_revert_pending_lock(res, lock); dlm_lock_put(lock); } else if (dlm_is_recovery_lock(res->lockname.name, @@ -239,12 +257,12 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm, mlog(0, "%s: $RECOVERY lock for this node (%u) is " "mastered by %u; got lock, manually granting (no ast)\n", dlm->name, dlm->node_num, res->owner); - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); } spin_unlock(&res->spinlock); - dlm_lockres_calc_usage(dlm, res); + if (lockres_changed) + dlm_lockres_calc_usage(dlm, res); wake_up(&res->wq); return status; @@ -281,6 +299,14 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm, if (tmpret >= 0) { // successfully sent and received ret = status; // this is already a dlm_status + if (ret == DLM_REJECTED) { + mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres " + "no longer owned by %u. 
that node is coming back " + "up currently.\n", dlm->name, create.namelen, + create.name, res->owner); + dlm_print_one_lock_resource(res); + BUG(); + } } else { mlog_errno(tmpret); if (dlm_is_host_down(tmpret)) { @@ -382,13 +408,13 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie, struct dlm_lock *lock; int kernel_allocated = 0; - lock = kcalloc(1, sizeof(*lock), GFP_KERNEL); + lock = kcalloc(1, sizeof(*lock), GFP_NOFS); if (!lock) return NULL; if (!lksb) { /* zero memory only if kernel-allocated */ - lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL); + lksb = kcalloc(1, sizeof(*lksb), GFP_NOFS); if (!lksb) { kfree(lock); return NULL; @@ -429,11 +455,16 @@ int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data) if (!dlm_grab(dlm)) return DLM_REJECTED; - mlog_bug_on_msg(!dlm_domain_fully_joined(dlm), - "Domain %s not fully joined!\n", dlm->name); - name = create->name; namelen = create->namelen; + status = DLM_REJECTED; + if (!dlm_domain_fully_joined(dlm)) { + mlog(ML_ERROR, "Domain %s not fully joined, but node %u is " + "sending a create_lock message for lock %.*s!\n", + dlm->name, create->node_idx, namelen, name); + dlm_error(status); + goto leave; + } status = DLM_IVBUFLEN; if (namelen > DLM_LOCKID_NAME_MAX) { @@ -669,18 +700,22 @@ retry_lock: msleep(100); /* no waiting for dlm_reco_thread */ if (recovery) { - if (status == DLM_RECOVERING) { - mlog(0, "%s: got RECOVERING " - "for $REOCVERY lock, master " - "was %u\n", dlm->name, - res->owner); - dlm_wait_for_node_death(dlm, res->owner, - DLM_NODE_DEATH_WAIT_MAX); - } + if (status != DLM_RECOVERING) + goto retry_lock; + + mlog(0, "%s: got RECOVERING " + "for $RECOVERY lock, master " + "was %u\n", dlm->name, + res->owner); + /* wait to see the node go down, then + * drop down and allow the lockres to + * get cleaned up. need to remaster. 
*/ + dlm_wait_for_node_death(dlm, res->owner, + DLM_NODE_DEATH_WAIT_MAX); } else { dlm_wait_for_recovery(dlm); + goto retry_lock; } - goto retry_lock; } if (status != DLM_NORMAL) { diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 940be4c13b1..1b8346dd057 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -47,7 +47,6 @@ #include "dlmapi.h" #include "dlmcommon.h" -#include "dlmdebug.h" #include "dlmdomain.h" #define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER) @@ -74,6 +73,7 @@ struct dlm_master_list_entry wait_queue_head_t wq; atomic_t woken; struct kref mle_refs; + int inuse; unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; @@ -127,18 +127,30 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm, return 1; } -#if 0 -/* Code here is included but defined out as it aids debugging */ +#define dlm_print_nodemap(m) _dlm_print_nodemap(m,#m) +static void _dlm_print_nodemap(unsigned long *map, const char *mapname) +{ + int i; + printk("%s=[ ", mapname); + for (i=0; i<O2NM_MAX_NODES; i++) + if (test_bit(i, map)) + printk("%d ", i); + printk("]"); +} -void dlm_print_one_mle(struct dlm_master_list_entry *mle) +static void dlm_print_one_mle(struct dlm_master_list_entry *mle) { - int i = 0, refs; + int refs; char *type; char attached; u8 master; unsigned int namelen; const char *name; struct kref *k; + unsigned long *maybe = mle->maybe_map, + *vote = mle->vote_map, + *resp = mle->response_map, + *node = mle->node_map; k = &mle->mle_refs; if (mle->type == DLM_MLE_BLOCK) @@ -159,18 +171,29 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle) name = mle->u.res->lockname.name; } - mlog(ML_NOTICE, " #%3d: %3s %3d %3u %3u %c (%d)%.*s\n", - i, type, refs, master, mle->new_master, attached, - namelen, namelen, name); + mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ", + namelen, name, type, refs, master, mle->new_master, attached, + mle->inuse); + dlm_print_nodemap(maybe); + printk(", "); + dlm_print_nodemap(vote); + printk(", "); + dlm_print_nodemap(resp); + printk(", "); + dlm_print_nodemap(node); + printk(", "); + printk("\n"); } +#if 0 +/* Code here is included but defined out as it aids debugging */ + static void dlm_dump_mles(struct dlm_ctxt *dlm) { struct dlm_master_list_entry *mle; struct list_head *iter; mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name); - mlog(ML_NOTICE, " ####: type refs owner new events? 
lockname nodemap votemap respmap maybemap\n"); spin_lock(&dlm->master_lock); list_for_each(iter, &dlm->master_list) { mle = list_entry(iter, struct dlm_master_list_entry, list); @@ -314,6 +337,31 @@ static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm, spin_unlock(&dlm->spinlock); } +static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&dlm->master_lock); + mle->inuse++; + kref_get(&mle->mle_refs); +} + +static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle) +{ + struct dlm_ctxt *dlm; + dlm = mle->dlm; + + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + mle->inuse--; + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + +} + /* remove from list and free */ static void __dlm_put_mle(struct dlm_master_list_entry *mle) { @@ -322,9 +370,14 @@ static void __dlm_put_mle(struct dlm_master_list_entry *mle) assert_spin_locked(&dlm->spinlock); assert_spin_locked(&dlm->master_lock); - BUG_ON(!atomic_read(&mle->mle_refs.refcount)); - - kref_put(&mle->mle_refs, dlm_mle_release); + if (!atomic_read(&mle->mle_refs.refcount)) { + /* this may or may not crash, but who cares. + * it's a BUG. */ + mlog(ML_ERROR, "bad mle: %p\n", mle); + dlm_print_one_mle(mle); + BUG(); + } else + kref_put(&mle->mle_refs, dlm_mle_release); } @@ -367,6 +420,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle, memset(mle->response_map, 0, sizeof(mle->response_map)); mle->master = O2NM_MAX_NODES; mle->new_master = O2NM_MAX_NODES; + mle->inuse = 0; if (mle->type == DLM_MLE_MASTER) { BUG_ON(!res); @@ -564,6 +618,28 @@ static void dlm_lockres_release(struct kref *kref) mlog(0, "destroying lockres %.*s\n", res->lockname.len, res->lockname.name); + if (!hlist_unhashed(&res->hash_node) || + !list_empty(&res->granted) || + !list_empty(&res->converting) || + !list_empty(&res->blocked) || + !list_empty(&res->dirty) || + !list_empty(&res->recovering) || + !list_empty(&res->purge)) { + mlog(ML_ERROR, + "Going to BUG for resource %.*s." + " We're on a list! [%c%c%c%c%c%c%c]\n", + res->lockname.len, res->lockname.name, + !hlist_unhashed(&res->hash_node) ? 'H' : ' ', + !list_empty(&res->granted) ? 'G' : ' ', + !list_empty(&res->converting) ? 'C' : ' ', + !list_empty(&res->blocked) ? 'B' : ' ', + !list_empty(&res->dirty) ? 'D' : ' ', + !list_empty(&res->recovering) ? 'R' : ' ', + !list_empty(&res->purge) ? 'P' : ' '); + + dlm_print_one_lock_resource(res); + } + /* By the time we're ready to blow this guy away, we shouldn't * be on any lists. 
*/ BUG_ON(!hlist_unhashed(&res->hash_node)); @@ -579,11 +655,6 @@ static void dlm_lockres_release(struct kref *kref) kfree(res); } -void dlm_lockres_get(struct dlm_lock_resource *res) -{ - kref_get(&res->refs); -} - void dlm_lockres_put(struct dlm_lock_resource *res) { kref_put(&res->refs, dlm_lockres_release); @@ -603,7 +674,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, memcpy(qname, name, namelen); res->lockname.len = namelen; - res->lockname.hash = full_name_hash(name, namelen); + res->lockname.hash = dlm_lockid_hash(name, namelen); init_waitqueue_head(&res->wq); spin_lock_init(&res->spinlock); @@ -637,11 +708,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, { struct dlm_lock_resource *res; - res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL); + res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS); if (!res) return NULL; - res->lockname.name = kmalloc(namelen, GFP_KERNEL); + res->lockname.name = kmalloc(namelen, GFP_NOFS); if (!res->lockname.name) { kfree(res); return NULL; @@ -677,19 +748,20 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, int blocked = 0; int ret, nodenum; struct dlm_node_iter iter; - unsigned int namelen; + unsigned int namelen, hash; int tries = 0; int bit, wait_on_recovery = 0; BUG_ON(!lockid); namelen = strlen(lockid); + hash = dlm_lockid_hash(lockid, namelen); mlog(0, "get lockres %s (len %d)\n", lockid, namelen); lookup: spin_lock(&dlm->spinlock); - tmpres = __dlm_lookup_lockres(dlm, lockid, namelen); + tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash); if (tmpres) { spin_unlock(&dlm->spinlock); mlog(0, "found in hash!\n"); @@ -704,7 +776,7 @@ lookup: mlog(0, "allocating a new resource\n"); /* nothing found and we need to allocate one. */ alloc_mle = (struct dlm_master_list_entry *) - kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); + kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!alloc_mle) goto leave; res = dlm_new_lockres(dlm, lockid, namelen); @@ -790,10 +862,11 @@ lookup: * if so, the creator of the BLOCK may try to put the last * ref at this time in the assert master handler, so we * need an extra one to keep from a bad ptr deref. 
*/ - dlm_get_mle(mle); + dlm_get_mle_inuse(mle); spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); +redo_request: while (wait_on_recovery) { /* any cluster changes that occurred after dropping the * dlm spinlock would be detectable be a change on the mle, @@ -812,7 +885,7 @@ lookup: } dlm_kick_recovery_thread(dlm); - msleep(100); + msleep(1000); dlm_wait_for_recovery(dlm); spin_lock(&dlm->spinlock); @@ -825,13 +898,15 @@ lookup: } else wait_on_recovery = 0; spin_unlock(&dlm->spinlock); + + if (wait_on_recovery) + dlm_wait_for_node_recovery(dlm, bit, 10000); } /* must wait for lock to be mastered elsewhere */ if (blocked) goto wait; -redo_request: ret = -EINVAL; dlm_node_iter_init(mle->vote_map, &iter); while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { @@ -856,6 +931,7 @@ wait: /* keep going until the response map includes all nodes */ ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); if (ret < 0) { + wait_on_recovery = 1; mlog(0, "%s:%.*s: node map changed, redo the " "master request now, blocked=%d\n", dlm->name, res->lockname.len, @@ -866,7 +942,7 @@ wait: dlm->name, res->lockname.len, res->lockname.name, blocked); dlm_print_one_lock_resource(res); - /* dlm_print_one_mle(mle); */ + dlm_print_one_mle(mle); tries = 0; } goto redo_request; @@ -880,7 +956,7 @@ wait: dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); /* put the extra ref */ - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); wake_waiters: spin_lock(&res->spinlock); @@ -921,12 +997,14 @@ recheck: spin_unlock(&res->spinlock); /* this will cause the master to re-assert across * the whole cluster, freeing up mles */ - ret = dlm_do_master_request(mle, res->owner); - if (ret < 0) { - /* give recovery a chance to run */ - mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); - msleep(500); - goto recheck; + if (res->owner != dlm->node_num) { + ret = dlm_do_master_request(mle, res->owner); + if (ret < 0) { + /* give recovery a chance to run */ + mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); + msleep(500); + goto recheck; + } } ret = 0; goto leave; @@ -962,6 +1040,12 @@ recheck: "rechecking now\n", dlm->name, res->lockname.len, res->lockname.name); goto recheck; + } else { + if (!voting_done) { + mlog(0, "map not changed and voting not done " + "for %s:%.*s\n", dlm->name, res->lockname.len, + res->lockname.name); + } } if (m != O2NM_MAX_NODES) { @@ -1129,18 +1213,6 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, set_bit(node, mle->vote_map); } else { mlog(ML_ERROR, "node down! %d\n", node); - - /* if the node wasn't involved in mastery skip it, - * but clear it out from the maps so that it will - * not affect mastery of this lockres */ - clear_bit(node, mle->response_map); - clear_bit(node, mle->vote_map); - if (!test_bit(node, mle->maybe_map)) - goto next; - - /* if we're already blocked on lock mastery, and the - * dead node wasn't the expected master, or there is - * another node in the maybe_map, keep waiting */ if (blocked) { int lowest = find_next_bit(mle->maybe_map, O2NM_MAX_NODES, 0); @@ -1148,54 +1220,53 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, /* act like it was never there */ clear_bit(node, mle->maybe_map); - if (node != lowest) - goto next; - - mlog(ML_ERROR, "expected master %u died while " - "this node was blocked waiting on it!\n", - node); - lowest = find_next_bit(mle->maybe_map, - O2NM_MAX_NODES, - lowest+1); - if (lowest < O2NM_MAX_NODES) { - mlog(0, "still blocked. 
waiting " - "on %u now\n", lowest); - goto next; + if (node == lowest) { + mlog(0, "expected master %u died" + " while this node was blocked " + "waiting on it!\n", node); + lowest = find_next_bit(mle->maybe_map, + O2NM_MAX_NODES, + lowest+1); + if (lowest < O2NM_MAX_NODES) { + mlog(0, "%s:%.*s:still " + "blocked. waiting on %u " + "now\n", dlm->name, + res->lockname.len, + res->lockname.name, + lowest); + } else { + /* mle is an MLE_BLOCK, but + * there is now nothing left to + * block on. we need to return + * all the way back out and try + * again with an MLE_MASTER. + * dlm_do_local_recovery_cleanup + * has already run, so the mle + * refcount is ok */ + mlog(0, "%s:%.*s: no " + "longer blocking. try to " + "master this here\n", + dlm->name, + res->lockname.len, + res->lockname.name); + mle->type = DLM_MLE_MASTER; + mle->u.res = res; + } } - - /* mle is an MLE_BLOCK, but there is now - * nothing left to block on. we need to return - * all the way back out and try again with - * an MLE_MASTER. dlm_do_local_recovery_cleanup - * has already run, so the mle refcount is ok */ - mlog(0, "no longer blocking. we can " - "try to master this here\n"); - mle->type = DLM_MLE_MASTER; - memset(mle->maybe_map, 0, - sizeof(mle->maybe_map)); - memset(mle->response_map, 0, - sizeof(mle->maybe_map)); - memcpy(mle->vote_map, mle->node_map, - sizeof(mle->node_map)); - mle->u.res = res; - set_bit(dlm->node_num, mle->maybe_map); - - ret = -EAGAIN; - goto next; } - clear_bit(node, mle->maybe_map); - if (node > dlm->node_num) - goto next; - - mlog(0, "dead node in map!\n"); - /* yuck. go back and re-contact all nodes - * in the vote_map, removing this node. */ - memset(mle->response_map, 0, - sizeof(mle->response_map)); + /* now blank out everything, as if we had never + * contacted anyone */ + memset(mle->maybe_map, 0, sizeof(mle->maybe_map)); + memset(mle->response_map, 0, sizeof(mle->response_map)); + /* reset the vote_map to the current node_map */ + memcpy(mle->vote_map, mle->node_map, + sizeof(mle->node_map)); + /* put myself into the maybe map */ + if (mle->type != DLM_MLE_BLOCK) + set_bit(dlm->node_num, mle->maybe_map); } ret = -EAGAIN; -next: node = dlm_bitmap_diff_iter_next(&bdi, &sc); } return ret; @@ -1316,7 +1387,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_master_request *request = (struct dlm_master_request *) msg->buf; struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL; char *name; - unsigned int namelen; + unsigned int namelen, hash; int found, ret; int set_maybe; int dispatch_assert = 0; @@ -1331,6 +1402,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) name = request->name; namelen = request->namelen; + hash = dlm_lockid_hash(name, namelen); if (namelen > DLM_LOCKID_NAME_MAX) { response = DLM_IVBUFLEN; @@ -1339,7 +1411,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) way_up_top: spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, name, namelen); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); if (res) { spin_unlock(&dlm->spinlock); @@ -1459,21 +1531,18 @@ way_up_top: spin_unlock(&dlm->spinlock); mle = (struct dlm_master_list_entry *) - kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL); + kmem_cache_alloc(dlm_mle_cache, GFP_NOFS); if (!mle) { response = DLM_MASTER_RESP_ERROR; mlog_errno(-ENOMEM); goto send_response; } - spin_lock(&dlm->spinlock); - dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, - name, namelen); - spin_unlock(&dlm->spinlock); goto way_up_top; } // mlog(0, 
"this is second time thru, already allocated, " // "add the block.\n"); + dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen); set_bit(request->node_idx, mle->maybe_map); list_add(&mle->list, &dlm->master_list); response = DLM_MASTER_RESP_NO; @@ -1556,6 +1625,8 @@ again: dlm_node_iter_init(nodemap, &iter); while ((to = dlm_node_iter_next(&iter)) >= 0) { int r = 0; + struct dlm_master_list_entry *mle = NULL; + mlog(0, "sending assert master to %d (%.*s)\n", to, namelen, lockname); memset(&assert, 0, sizeof(assert)); @@ -1567,20 +1638,28 @@ again: tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key, &assert, sizeof(assert), to, &r); if (tmpret < 0) { - mlog(ML_ERROR, "assert_master returned %d!\n", tmpret); + mlog(0, "assert_master returned %d!\n", tmpret); if (!dlm_is_host_down(tmpret)) { - mlog(ML_ERROR, "unhandled error!\n"); + mlog(ML_ERROR, "unhandled error=%d!\n", tmpret); BUG(); } /* a node died. finish out the rest of the nodes. */ - mlog(ML_ERROR, "link to %d went down!\n", to); + mlog(0, "link to %d went down!\n", to); /* any nonzero status return will do */ ret = tmpret; } else if (r < 0) { /* ok, something horribly messed. kill thyself. */ mlog(ML_ERROR,"during assert master of %.*s to %u, " "got %d.\n", namelen, lockname, to, r); - dlm_dump_lock_resources(dlm); + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + if (dlm_find_mle(dlm, &mle, (char *)lockname, + namelen)) { + dlm_print_one_mle(mle); + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); BUG(); } else if (r == EAGAIN) { mlog(0, "%.*s: node %u create mles on other " @@ -1612,7 +1691,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf; struct dlm_lock_resource *res = NULL; char *name; - unsigned int namelen; + unsigned int namelen, hash; u32 flags; int master_request = 0; int ret = 0; @@ -1622,6 +1701,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) name = assert->name; namelen = assert->namelen; + hash = dlm_lockid_hash(name, namelen); flags = be32_to_cpu(assert->flags); if (namelen > DLM_LOCKID_NAME_MAX) { @@ -1646,7 +1726,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) if (bit >= O2NM_MAX_NODES) { /* not necessarily an error, though less likely. * could be master just re-asserting. */ - mlog(ML_ERROR, "no bits set in the maybe_map, but %u " + mlog(0, "no bits set in the maybe_map, but %u " "is asserting! (%.*s)\n", assert->node_idx, namelen, name); } else if (bit != assert->node_idx) { @@ -1658,19 +1738,36 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) * number winning the mastery will respond * YES to mastery requests, but this node * had no way of knowing. let it pass. */ - mlog(ML_ERROR, "%u is the lowest node, " + mlog(0, "%u is the lowest node, " "%u is asserting. 
(%.*s) %u must " "have begun after %u won.\n", bit, assert->node_idx, namelen, name, bit, assert->node_idx); } } + if (mle->type == DLM_MLE_MIGRATION) { + if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) { + mlog(0, "%s:%.*s: got cleanup assert" + " from %u for migration\n", + dlm->name, namelen, name, + assert->node_idx); + } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) { + mlog(0, "%s:%.*s: got unrelated assert" + " from %u for migration, ignoring\n", + dlm->name, namelen, name, + assert->node_idx); + __dlm_put_mle(mle); + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + goto done; + } + } } spin_unlock(&dlm->master_lock); /* ok everything checks out with the MLE * now check to see if there is a lockres */ - res = __dlm_lookup_lockres(dlm, name, namelen); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); if (res) { spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_RECOVERING) { @@ -1679,7 +1776,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) goto kill; } if (!mle) { - if (res->owner != assert->node_idx) { + if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && + res->owner != assert->node_idx) { mlog(ML_ERROR, "assert_master from " "%u, but current owner is " "%u! (%.*s)\n", @@ -1732,6 +1830,7 @@ ok: if (mle) { int extra_ref = 0; int nn = -1; + int rr, err = 0; spin_lock(&mle->spinlock); if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION) @@ -1751,27 +1850,64 @@ ok: wake_up(&mle->wq); spin_unlock(&mle->spinlock); - if (mle->type == DLM_MLE_MIGRATION && res) { - mlog(0, "finishing off migration of lockres %.*s, " - "from %u to %u\n", - res->lockname.len, res->lockname.name, - dlm->node_num, mle->new_master); + if (res) { spin_lock(&res->spinlock); - res->state &= ~DLM_LOCK_RES_MIGRATING; - dlm_change_lockres_owner(dlm, res, mle->new_master); - BUG_ON(res->state & DLM_LOCK_RES_DIRTY); + if (mle->type == DLM_MLE_MIGRATION) { + mlog(0, "finishing off migration of lockres %.*s, " + "from %u to %u\n", + res->lockname.len, res->lockname.name, + dlm->node_num, mle->new_master); + res->state &= ~DLM_LOCK_RES_MIGRATING; + dlm_change_lockres_owner(dlm, res, mle->new_master); + BUG_ON(res->state & DLM_LOCK_RES_DIRTY); + } else { + dlm_change_lockres_owner(dlm, res, mle->master); + } spin_unlock(&res->spinlock); } - /* master is known, detach if not already detached */ - dlm_mle_detach_hb_events(dlm, mle); - dlm_put_mle(mle); - + + /* master is known, detach if not already detached. + * ensures that only one assert_master call will happen + * on this mle. */ + spin_lock(&dlm->spinlock); + spin_lock(&dlm->master_lock); + + rr = atomic_read(&mle->mle_refs.refcount); + if (mle->inuse > 0) { + if (extra_ref && rr < 3) + err = 1; + else if (!extra_ref && rr < 2) + err = 1; + } else { + if (extra_ref && rr < 2) + err = 1; + else if (!extra_ref && rr < 1) + err = 1; + } + if (err) { + mlog(ML_ERROR, "%s:%.*s: got assert master from %u " + "that will mess up this node, refs=%d, extra=%d, " + "inuse=%d\n", dlm->name, namelen, name, + assert->node_idx, rr, extra_ref, mle->inuse); + dlm_print_one_mle(mle); + } + list_del_init(&mle->list); + __dlm_mle_detach_hb_events(dlm, mle); + __dlm_put_mle(mle); if (extra_ref) { /* the assert master message now balances the extra * ref given by the master / migration request message. * if this is the last put, it will be removed * from the list. 
*/ - dlm_put_mle(mle); + __dlm_put_mle(mle); + } + spin_unlock(&dlm->master_lock); + spin_unlock(&dlm->spinlock); + } else if (res) { + if (res->owner != assert->node_idx) { + mlog(0, "assert_master from %u, but current " + "owner is %u (%.*s), no mle\n", assert->node_idx, + res->owner, namelen, name); } } @@ -1788,12 +1924,12 @@ done: kill: /* kill the caller! */ + mlog(ML_ERROR, "Bad message received from another node. Dumping state " + "and killing the other node now! This node is OK and can continue.\n"); + __dlm_print_one_lock_resource(res); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); dlm_lockres_put(res); - mlog(ML_ERROR, "Bad message received from another node. Dumping state " - "and killing the other node now! This node is OK and can continue.\n"); - dlm_dump_lock_resources(dlm); dlm_put(dlm); return -EINVAL; } @@ -1803,7 +1939,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, int ignore_higher, u8 request_from, u32 flags) { struct dlm_work_item *item; - item = kcalloc(1, sizeof(*item), GFP_KERNEL); + item = kcalloc(1, sizeof(*item), GFP_NOFS); if (!item) return -ENOMEM; @@ -1825,7 +1961,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); - schedule_work(&dlm->dispatched_work); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); return 0; } @@ -1866,6 +2002,23 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) } } + /* + * If we're migrating this lock to someone else, we are no + * longer allowed to assert our own mastery. OTOH, we need to + * prevent migration from starting while we're still asserting + * our dominance. The reserved ast delays migration. + */ + spin_lock(&res->spinlock); + if (res->state & DLM_LOCK_RES_MIGRATING) { + mlog(0, "Someone asked us to assert mastery, but we're " + "in the middle of migration. Skipping assert, " + "the new master will handle that.\n"); + spin_unlock(&res->spinlock); + goto put; + } else + __dlm_lockres_reserve_ast(res); + spin_unlock(&res->spinlock); + /* this call now finishes out the nodemap * even if one or more nodes die */ mlog(0, "worker about to master %.*s here, this=%u\n", @@ -1875,9 +2028,14 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) nodemap, flags); if (ret < 0) { /* no need to restart, we are done */ - mlog_errno(ret); + if (!dlm_is_host_down(ret)) + mlog_errno(ret); } + /* Ok, we've asserted ourselves. Let's let migration start. */ + dlm_lockres_release_ast(dlm, res); + +put: dlm_lockres_put(res); mlog(0, "finished with dlm_assert_master_worker\n"); @@ -1916,6 +2074,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, BUG(); /* host is down, so answer for that node would be * DLM_LOCK_RES_OWNER_UNKNOWN. continue. 
*/ + ret = 0; } if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { @@ -2016,14 +2175,14 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, */ ret = -ENOMEM; - mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL); + mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS); if (!mres) { mlog_errno(ret); goto leave; } mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, - GFP_KERNEL); + GFP_NOFS); if (!mle) { mlog_errno(ret); goto leave; @@ -2117,7 +2276,7 @@ fail: * take both dlm->spinlock and dlm->master_lock */ spin_lock(&dlm->spinlock); spin_lock(&dlm->master_lock); - dlm_get_mle(mle); + dlm_get_mle_inuse(mle); spin_unlock(&dlm->master_lock); spin_unlock(&dlm->spinlock); @@ -2134,7 +2293,10 @@ fail: /* migration failed, detach and clean up mle */ dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); goto leave; } @@ -2164,8 +2326,8 @@ fail: /* avoid hang during shutdown when migrating lockres * to a node which also goes down */ if (dlm_is_node_dead(dlm, target)) { - mlog(0, "%s:%.*s: expected migration target %u " - "is no longer up. restarting.\n", + mlog(0, "%s:%.*s: expected migration " + "target %u is no longer up, restarting\n", dlm->name, res->lockname.len, res->lockname.name, target); ret = -ERESTARTSYS; @@ -2175,7 +2337,10 @@ fail: /* migration failed, detach and clean up mle */ dlm_mle_detach_hb_events(dlm, mle); dlm_put_mle(mle); - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); + spin_lock(&res->spinlock); + res->state &= ~DLM_LOCK_RES_MIGRATING; + spin_unlock(&res->spinlock); goto leave; } /* TODO: if node died: stop, clean up, return error */ @@ -2191,7 +2356,7 @@ fail: /* master is known, detach if not already detached */ dlm_mle_detach_hb_events(dlm, mle); - dlm_put_mle(mle); + dlm_put_mle_inuse(mle); ret = 0; dlm_lockres_calc_usage(dlm, res); @@ -2462,7 +2627,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf; struct dlm_master_list_entry *mle = NULL, *oldmle = NULL; const char *name; - unsigned int namelen; + unsigned int namelen, hash; int ret = 0; if (!dlm_grab(dlm)) @@ -2470,10 +2635,11 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) name = migrate->name; namelen = migrate->namelen; + hash = dlm_lockid_hash(name, namelen); /* preallocate.. 
if this fails, abort */ mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache, - GFP_KERNEL); + GFP_NOFS); if (!mle) { ret = -ENOMEM; @@ -2482,7 +2648,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) /* check for pre-existing lock */ spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, name, namelen); + res = __dlm_lookup_lockres(dlm, name, namelen, hash); spin_lock(&dlm->master_lock); if (res) { @@ -2580,6 +2746,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, /* remove it from the list so that only one * mle will be found */ list_del_init(&tmp->list); + __dlm_mle_detach_hb_events(dlm, mle); } spin_unlock(&tmp->spinlock); } @@ -2601,6 +2768,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node) struct list_head *iter, *iter2; struct dlm_master_list_entry *mle; struct dlm_lock_resource *res; + unsigned int hash; mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node); top: @@ -2640,7 +2808,7 @@ top: * may result in the mle being unlinked and * freed, but there may still be a process * waiting in the dlmlock path which is fine. */ - mlog(ML_ERROR, "node %u was expected master\n", + mlog(0, "node %u was expected master\n", dead_node); atomic_set(&mle->woken, 1); spin_unlock(&mle->spinlock); @@ -2673,19 +2841,21 @@ top: /* remove from the list early. NOTE: unlinking * list_head while in list_for_each_safe */ + __dlm_mle_detach_hb_events(dlm, mle); spin_lock(&mle->spinlock); list_del_init(&mle->list); atomic_set(&mle->woken, 1); spin_unlock(&mle->spinlock); wake_up(&mle->wq); - mlog(0, "node %u died during migration from " - "%u to %u!\n", dead_node, + mlog(0, "%s: node %u died during migration from " + "%u to %u!\n", dlm->name, dead_node, mle->master, mle->new_master); /* if there is a lockres associated with this * mle, find it and set its owner to UNKNOWN */ + hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len); res = __dlm_lookup_lockres(dlm, mle->u.name.name, - mle->u.name.len); + mle->u.name.len, hash); if (res) { /* unfortunately if we hit this rare case, our * lock ordering is messed. 
we need to drop diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 805cbabac05..da399013516 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -115,12 +115,37 @@ static u64 dlm_get_next_mig_cookie(void) return c; } +static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm, + u8 dead_node) +{ + assert_spin_locked(&dlm->spinlock); + if (dlm->reco.dead_node != dead_node) + mlog(0, "%s: changing dead_node from %u to %u\n", + dlm->name, dlm->reco.dead_node, dead_node); + dlm->reco.dead_node = dead_node; +} + +static inline void dlm_set_reco_master(struct dlm_ctxt *dlm, + u8 master) +{ + assert_spin_locked(&dlm->spinlock); + mlog(0, "%s: changing new_master from %u to %u\n", + dlm->name, dlm->reco.new_master, master); + dlm->reco.new_master = master; +} + +static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm) +{ + assert_spin_locked(&dlm->spinlock); + clear_bit(dlm->reco.dead_node, dlm->recovery_map); + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); + dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); +} + static inline void dlm_reset_recovery(struct dlm_ctxt *dlm) { spin_lock(&dlm->spinlock); - clear_bit(dlm->reco.dead_node, dlm->recovery_map); - dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; - dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + __dlm_reset_recovery(dlm); spin_unlock(&dlm->spinlock); } @@ -132,12 +157,21 @@ void dlm_dispatch_work(void *data) struct list_head *iter, *iter2; struct dlm_work_item *item; dlm_workfunc_t *workfunc; + int tot=0; + + if (!dlm_joined(dlm)) + return; spin_lock(&dlm->work_lock); list_splice_init(&dlm->work_list, &tmp_list); spin_unlock(&dlm->work_lock); list_for_each_safe(iter, iter2, &tmp_list) { + tot++; + } + mlog(0, "%s: work thread has %d work items\n", dlm->name, tot); + + list_for_each_safe(iter, iter2, &tmp_list) { item = list_entry(iter, struct dlm_work_item, list); workfunc = item->func; list_del_init(&item->list); @@ -220,6 +254,52 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm) * */ +static void dlm_print_reco_node_status(struct dlm_ctxt *dlm) +{ + struct dlm_reco_node_data *ndata; + struct dlm_lock_resource *res; + + mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n", + dlm->name, dlm->dlm_reco_thread_task->pid, + dlm->reco.state & DLM_RECO_STATE_ACTIVE ? 
"ACTIVE" : "inactive", + dlm->reco.dead_node, dlm->reco.new_master); + + list_for_each_entry(ndata, &dlm->reco.node_data, list) { + char *st = "unknown"; + switch (ndata->state) { + case DLM_RECO_NODE_DATA_INIT: + st = "init"; + break; + case DLM_RECO_NODE_DATA_REQUESTING: + st = "requesting"; + break; + case DLM_RECO_NODE_DATA_DEAD: + st = "dead"; + break; + case DLM_RECO_NODE_DATA_RECEIVING: + st = "receiving"; + break; + case DLM_RECO_NODE_DATA_REQUESTED: + st = "requested"; + break; + case DLM_RECO_NODE_DATA_DONE: + st = "done"; + break; + case DLM_RECO_NODE_DATA_FINALIZE_SENT: + st = "finalize-sent"; + break; + default: + st = "bad"; + break; + } + mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n", + dlm->name, ndata->node_num, st); + } + list_for_each_entry(res, &dlm->reco.resources, recovering) { + mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n", + dlm->name, res->lockname.len, res->lockname.name); + } +} #define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000) @@ -267,11 +347,23 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node) { int dead; spin_lock(&dlm->spinlock); - dead = test_bit(node, dlm->domain_map); + dead = !test_bit(node, dlm->domain_map); spin_unlock(&dlm->spinlock); return dead; } +/* returns true if node is no longer in the domain + * could be dead or just not joined */ +static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node) +{ + int recovered; + spin_lock(&dlm->spinlock); + recovered = !test_bit(node, dlm->recovery_map); + spin_unlock(&dlm->spinlock); + return recovered; +} + + int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) { if (timeout) { @@ -290,6 +382,24 @@ int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) return 0; } +int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) +{ + if (timeout) { + mlog(0, "%s: waiting %dms for notification of " + "recovery of node %u\n", dlm->name, timeout, node); + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_recovered(dlm, node), + msecs_to_jiffies(timeout)); + } else { + mlog(0, "%s: waiting indefinitely for notification " + "of recovery of node %u\n", dlm->name, node); + wait_event(dlm->dlm_reco_thread_wq, + dlm_is_node_recovered(dlm, node)); + } + /* for now, return 0 */ + return 0; +} + /* callers of the top-level api calls (dlmlock/dlmunlock) should * block on the dlm->reco.event when recovery is in progress. 
* the dlm recovery thread will set this state when it begins @@ -308,6 +418,13 @@ static int dlm_in_recovery(struct dlm_ctxt *dlm) void dlm_wait_for_recovery(struct dlm_ctxt *dlm) { + if (dlm_in_recovery(dlm)) { + mlog(0, "%s: reco thread %d in recovery: " + "state=%d, master=%u, dead=%u\n", + dlm->name, dlm->dlm_reco_thread_task->pid, + dlm->reco.state, dlm->reco.new_master, + dlm->reco.dead_node); + } wait_event(dlm->reco.event, !dlm_in_recovery(dlm)); } @@ -341,7 +458,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) mlog(0, "new master %u died while recovering %u!\n", dlm->reco.new_master, dlm->reco.dead_node); /* unset the new_master, leave dead_node */ - dlm->reco.new_master = O2NM_INVALID_NODE_NUM; + dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM); } /* select a target to recover */ @@ -350,14 +467,14 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0); if (bit >= O2NM_MAX_NODES || bit < 0) - dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); else - dlm->reco.dead_node = bit; + dlm_set_reco_dead_node(dlm, bit); } else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) { /* BUG? */ mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n", dlm->reco.dead_node); - dlm->reco.dead_node = O2NM_INVALID_NODE_NUM; + dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM); } if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) { @@ -366,7 +483,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) /* return to main thread loop and sleep. */ return 0; } - mlog(0, "recovery thread found node %u in the recovery map!\n", + mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n", + dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.dead_node); spin_unlock(&dlm->spinlock); @@ -389,8 +507,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) } mlog(0, "another node will master this recovery session.\n"); } - mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n", - dlm->name, dlm->reco.new_master, + mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", + dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.new_master, dlm->node_num, dlm->reco.dead_node); /* it is safe to start everything back up here @@ -402,11 +520,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) return 0; master_here: - mlog(0, "mastering recovery of %s:%u here(this=%u)!\n", + mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n", + dlm->dlm_reco_thread_task->pid, dlm->name, dlm->reco.dead_node, dlm->node_num); status = dlm_remaster_locks(dlm, dlm->reco.dead_node); if (status < 0) { + /* we should never hit this anymore */ mlog(ML_ERROR, "error %d remastering locks for node %u, " "retrying.\n", status, dlm->reco.dead_node); /* yield a bit to allow any final network messages @@ -433,9 +553,16 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) int destroy = 0; int pass = 0; - status = dlm_init_recovery_area(dlm, dead_node); - if (status < 0) - goto leave; + do { + /* we have become recovery master. there is no escaping + * this, so just keep trying until we get it. 
*/ + status = dlm_init_recovery_area(dlm, dead_node); + if (status < 0) { + mlog(ML_ERROR, "%s: failed to alloc recovery area, " + "retrying\n", dlm->name); + msleep(1000); + } + } while (status != 0); /* safe to access the node data list without a lock, since this * process is the only one to change the list */ @@ -452,16 +579,36 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) continue; } - status = dlm_request_all_locks(dlm, ndata->node_num, dead_node); - if (status < 0) { - mlog_errno(status); - if (dlm_is_host_down(status)) - ndata->state = DLM_RECO_NODE_DATA_DEAD; - else { - destroy = 1; - goto leave; + do { + status = dlm_request_all_locks(dlm, ndata->node_num, + dead_node); + if (status < 0) { + mlog_errno(status); + if (dlm_is_host_down(status)) { + /* node died, ignore it for recovery */ + status = 0; + ndata->state = DLM_RECO_NODE_DATA_DEAD; + /* wait for the domain map to catch up + * with the network state. */ + wait_event_timeout(dlm->dlm_reco_thread_wq, + dlm_is_node_dead(dlm, + ndata->node_num), + msecs_to_jiffies(1000)); + mlog(0, "waited 1 sec for %u, " + "dead? %s\n", ndata->node_num, + dlm_is_node_dead(dlm, ndata->node_num) ? + "yes" : "no"); + } else { + /* -ENOMEM on the other node */ + mlog(0, "%s: node %u returned " + "%d during recovery, retrying " + "after a short wait\n", + dlm->name, ndata->node_num, + status); + msleep(100); + } } - } + } while (status != 0); switch (ndata->state) { case DLM_RECO_NODE_DATA_INIT: @@ -473,10 +620,9 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) mlog(0, "node %u died after requesting " "recovery info for node %u\n", ndata->node_num, dead_node); - // start all over - destroy = 1; - status = -EAGAIN; - goto leave; + /* fine. don't need this node's info. + * continue without it. */ + break; case DLM_RECO_NODE_DATA_REQUESTING: ndata->state = DLM_RECO_NODE_DATA_REQUESTED; mlog(0, "now receiving recovery data from " @@ -520,35 +666,26 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) BUG(); break; case DLM_RECO_NODE_DATA_DEAD: - mlog(ML_NOTICE, "node %u died after " + mlog(0, "node %u died after " "requesting recovery info for " "node %u\n", ndata->node_num, dead_node); - spin_unlock(&dlm_reco_state_lock); - // start all over - destroy = 1; - status = -EAGAIN; - /* instead of spinning like crazy here, - * wait for the domain map to catch up - * with the network state. otherwise this - * can be hit hundreds of times before - * the node is really seen as dead. */ - wait_event_timeout(dlm->dlm_reco_thread_wq, - dlm_is_node_dead(dlm, - ndata->node_num), - msecs_to_jiffies(1000)); - mlog(0, "waited 1 sec for %u, " - "dead? %s\n", ndata->node_num, - dlm_is_node_dead(dlm, ndata->node_num) ? - "yes" : "no"); - goto leave; + break; case DLM_RECO_NODE_DATA_RECEIVING: case DLM_RECO_NODE_DATA_REQUESTED: + mlog(0, "%s: node %u still in state %s\n", + dlm->name, ndata->node_num, + ndata->state==DLM_RECO_NODE_DATA_RECEIVING ? 
+ "receiving" : "requested"); all_nodes_done = 0; break; case DLM_RECO_NODE_DATA_DONE: + mlog(0, "%s: node %u state is done\n", + dlm->name, ndata->node_num); break; case DLM_RECO_NODE_DATA_FINALIZE_SENT: + mlog(0, "%s: node %u state is finalize\n", + dlm->name, ndata->node_num); break; } } @@ -578,7 +715,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) jiffies, dlm->reco.dead_node, dlm->node_num, dlm->reco.new_master); destroy = 1; - status = ret; + status = 0; /* rescan everything marked dirty along the way */ dlm_kick_thread(dlm, NULL); break; @@ -591,7 +728,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) } -leave: if (destroy) dlm_destroy_recovery_area(dlm, dead_node); @@ -617,7 +753,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node) } BUG_ON(num == dead_node); - ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL); + ndata = kcalloc(1, sizeof(*ndata), GFP_NOFS); if (!ndata) { dlm_destroy_recovery_area(dlm, dead_node); return -ENOMEM; @@ -691,16 +827,25 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) if (!dlm_grab(dlm)) return -EINVAL; + if (lr->dead_node != dlm->reco.dead_node) { + mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local " + "dead_node is %u\n", dlm->name, lr->node_idx, + lr->dead_node, dlm->reco.dead_node); + dlm_print_reco_node_status(dlm); + /* this is a hack */ + dlm_put(dlm); + return -ENOMEM; + } BUG_ON(lr->dead_node != dlm->reco.dead_node); - item = kcalloc(1, sizeof(*item), GFP_KERNEL); + item = kcalloc(1, sizeof(*item), GFP_NOFS); if (!item) { dlm_put(dlm); return -ENOMEM; } /* this will get freed by dlm_request_all_locks_worker */ - buf = (char *) __get_free_page(GFP_KERNEL); + buf = (char *) __get_free_page(GFP_NOFS); if (!buf) { kfree(item); dlm_put(dlm); @@ -715,7 +860,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&dlm->work_lock); list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); - schedule_work(&dlm->dispatched_work); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); dlm_put(dlm); return 0; @@ -730,32 +875,34 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) struct list_head *iter; int ret; u8 dead_node, reco_master; + int skip_all_done = 0; dlm = item->dlm; dead_node = item->u.ral.dead_node; reco_master = item->u.ral.reco_master; mres = (struct dlm_migratable_lockres *)data; + mlog(0, "%s: recovery worker started, dead=%u, master=%u\n", + dlm->name, dead_node, reco_master); + if (dead_node != dlm->reco.dead_node || reco_master != dlm->reco.new_master) { - /* show extra debug info if the recovery state is messed */ - mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), " - "request(dead=%u, master=%u)\n", - dlm->name, dlm->reco.dead_node, dlm->reco.new_master, - dead_node, reco_master); - mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u " - "entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n", - dlm->name, mres->lockname_len, mres->lockname, mres->master, - mres->num_locks, mres->total_locks, mres->flags, - dlm_get_lock_cookie_node(mres->ml[0].cookie), - dlm_get_lock_cookie_seq(mres->ml[0].cookie), - mres->ml[0].list, mres->ml[0].flags, - mres->ml[0].type, mres->ml[0].convert_type, - mres->ml[0].highest_blocked, mres->ml[0].node); - BUG(); + /* worker could have been created before the recovery master + * died. if so, do not continue, but do not error. 
*/ + if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { + mlog(ML_NOTICE, "%s: will not send recovery state, " + "recovery master %u died, thread=(dead=%u,mas=%u)" + " current=(dead=%u,mas=%u)\n", dlm->name, + reco_master, dead_node, reco_master, + dlm->reco.dead_node, dlm->reco.new_master); + } else { + mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, " + "master=%u), request(dead=%u, master=%u)\n", + dlm->name, dlm->reco.dead_node, + dlm->reco.new_master, dead_node, reco_master); + } + goto leave; } - BUG_ON(dead_node != dlm->reco.dead_node); - BUG_ON(reco_master != dlm->reco.new_master); /* lock resources should have already been moved to the * dlm->reco.resources list. now move items from that list @@ -766,12 +913,20 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) dlm_move_reco_locks_to_list(dlm, &resources, dead_node); /* now we can begin blasting lockreses without the dlm lock */ + + /* any errors returned will be due to the new_master dying, + * the dlm_reco_thread should detect this */ list_for_each(iter, &resources) { res = list_entry (iter, struct dlm_lock_resource, recovering); ret = dlm_send_one_lockres(dlm, res, mres, reco_master, DLM_MRES_RECOVERY); - if (ret < 0) - mlog_errno(ret); + if (ret < 0) { + mlog(ML_ERROR, "%s: node %u went down while sending " + "recovery state for dead node %u, ret=%d\n", dlm->name, + reco_master, dead_node, ret); + skip_all_done = 1; + break; + } } /* move the resources back to the list */ @@ -779,10 +934,15 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) list_splice_init(&resources, &dlm->reco.resources); spin_unlock(&dlm->spinlock); - ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); - if (ret < 0) - mlog_errno(ret); - + if (!skip_all_done) { + ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); + if (ret < 0) { + mlog(ML_ERROR, "%s: node %u went down while sending " + "recovery all-done for dead node %u, ret=%d\n", + dlm->name, reco_master, dead_node, ret); + } + } +leave: free_page((unsigned long)data); } @@ -801,8 +961,14 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, sizeof(done_msg), send_to, &tmpret); - /* negative status is ignored by the caller */ - if (ret >= 0) + if (ret < 0) { + if (!dlm_is_host_down(ret)) { + mlog_errno(ret); + mlog(ML_ERROR, "%s: unknown error sending data-done " + "to %u\n", dlm->name, send_to); + BUG(); + } + } else ret = tmpret; return ret; } @@ -822,7 +988,11 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data) mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, " "node_idx=%u, this node=%u\n", done->dead_node, dlm->reco.dead_node, done->node_idx, dlm->node_num); - BUG_ON(done->dead_node != dlm->reco.dead_node); + + mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node), + "Got DATA DONE: dead_node=%u, reco.dead_node=%u, " + "node_idx=%u, this node=%u\n", done->dead_node, + dlm->reco.dead_node, done->node_idx, dlm->node_num); spin_lock(&dlm_reco_state_lock); list_for_each(iter, &dlm->reco.node_data) { @@ -905,13 +1075,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, mlog(0, "found lockres owned by dead node while " "doing recovery for node %u. 
sending it.\n", dead_node); - list_del_init(&res->recovering); - list_add_tail(&res->recovering, list); + list_move_tail(&res->recovering, list); } else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { mlog(0, "found UNKNOWN owner while doing recovery " "for node %u. sending it.\n", dead_node); - list_del_init(&res->recovering); - list_add_tail(&res->recovering, list); + list_move_tail(&res->recovering, list); } } spin_unlock(&dlm->spinlock); @@ -1023,8 +1191,9 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock, ml->type == LKM_PRMODE) { /* if it is already set, this had better be a PR * and it has to match */ - if (mres->lvb[0] && (ml->type == LKM_EXMODE || - memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { + if (!dlm_lvb_is_empty(mres->lvb) && + (ml->type == LKM_EXMODE || + memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) { mlog(ML_ERROR, "mismatched lvbs!\n"); __dlm_print_one_lock_resource(lock->lockres); BUG(); @@ -1083,22 +1252,25 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, * we must send it immediately. */ ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); - if (ret < 0) { - // TODO - mlog(ML_ERROR, "dlm_send_mig_lockres_msg " - "returned %d, TODO\n", ret); - BUG(); - } + if (ret < 0) + goto error; } } /* flush any remaining locks */ ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); - if (ret < 0) { - // TODO - mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, " - "TODO\n", ret); + if (ret < 0) + goto error; + return ret; + +error: + mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n", + dlm->name, ret); + if (!dlm_is_host_down(ret)) BUG(); - } + mlog(0, "%s: node %u went down while sending %s " + "lockres %.*s\n", dlm->name, send_to, + flags & DLM_MRES_RECOVERY ? "recovery" : "migration", + res->lockname.len, res->lockname.name); return ret; } @@ -1146,8 +1318,8 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) mlog(0, "all done flag. 
all lockres data received!\n"); ret = -ENOMEM; - buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL); - item = kcalloc(1, sizeof(*item), GFP_KERNEL); + buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS); + item = kcalloc(1, sizeof(*item), GFP_NOFS); if (!buf || !item) goto leave; @@ -1238,7 +1410,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data) spin_lock(&dlm->work_lock); list_add_tail(&item->list, &dlm->work_list); spin_unlock(&dlm->work_lock); - schedule_work(&dlm->dispatched_work); + queue_work(dlm->dlm_worker, &dlm->dispatched_work); leave: dlm_put(dlm); @@ -1406,6 +1578,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data) struct dlm_ctxt *dlm = data; struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf; struct dlm_lock_resource *res = NULL; + unsigned int hash; int master = DLM_LOCK_RES_OWNER_UNKNOWN; u32 flags = DLM_ASSERT_MASTER_REQUERY; @@ -1415,8 +1588,10 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data) return master; } + hash = dlm_lockid_hash(req->name, req->namelen); + spin_lock(&dlm->spinlock); - res = __dlm_lookup_lockres(dlm, req->name, req->namelen); + res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash); if (res) { spin_lock(&res->spinlock); master = res->owner; @@ -1483,7 +1658,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, struct dlm_lock *newlock = NULL; struct dlm_lockstatus *lksb = NULL; int ret = 0; - int i; + int i, bad; struct list_head *iter; struct dlm_lock *lock = NULL; @@ -1529,8 +1704,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, /* move the lock to its proper place */ /* do not alter lock refcount. switching lists. */ - list_del_init(&lock->list); - list_add_tail(&lock->list, queue); + list_move_tail(&lock->list, queue); spin_unlock(&res->spinlock); mlog(0, "just reordered a local lock!\n"); @@ -1553,28 +1727,48 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, } lksb->flags |= (ml->flags & (DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB)); - - if (mres->lvb[0]) { + + if (ml->type == LKM_NLMODE) + goto skip_lvb; + + if (!dlm_lvb_is_empty(mres->lvb)) { if (lksb->flags & DLM_LKSB_PUT_LVB) { /* other node was trying to update * lvb when node died. recreate the * lksb with the updated lvb. */ memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN); + /* the lock resource lvb update must happen + * NOW, before the spinlock is dropped. + * we no longer wait for the AST to update + * the lvb. */ + memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); } else { /* otherwise, the node is sending its * most recent valid lvb info */ BUG_ON(ml->type != LKM_EXMODE && ml->type != LKM_PRMODE); - if (res->lvb[0] && (ml->type == LKM_EXMODE || - memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { - mlog(ML_ERROR, "received bad lvb!\n"); - __dlm_print_one_lock_resource(res); - BUG(); + if (!dlm_lvb_is_empty(res->lvb) && + (ml->type == LKM_EXMODE || + memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) { + int i; + mlog(ML_ERROR, "%s:%.*s: received bad " + "lvb! 
type=%d\n", dlm->name, + res->lockname.len, + res->lockname.name, ml->type); + printk("lockres lvb=["); + for (i=0; i<DLM_LVB_LEN; i++) + printk("%02x", res->lvb[i]); + printk("]\nmigrated lvb=["); + for (i=0; i<DLM_LVB_LEN; i++) + printk("%02x", mres->lvb[i]); + printk("]\n"); + dlm_print_one_lock_resource(res); + BUG(); } memcpy(res->lvb, mres->lvb, DLM_LVB_LEN); } } - +skip_lvb: /* NOTE: * wrt lock queue ordering and recovery: @@ -1592,9 +1786,33 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm, * relative to each other, but clearly *not* * preserved relative to locks from other nodes. */ + bad = 0; spin_lock(&res->spinlock); - dlm_lock_get(newlock); - list_add_tail(&newlock->list, queue); + list_for_each_entry(lock, queue, list) { + if (lock->ml.cookie == ml->cookie) { + u64 c = lock->ml.cookie; + mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already " + "exists on this lockres!\n", dlm->name, + res->lockname.len, res->lockname.name, + dlm_get_lock_cookie_node(c), + dlm_get_lock_cookie_seq(c)); + + mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, " + "node=%u, cookie=%u:%llu, queue=%d\n", + ml->type, ml->convert_type, ml->node, + dlm_get_lock_cookie_node(ml->cookie), + dlm_get_lock_cookie_seq(ml->cookie), + ml->list); + + __dlm_print_one_lock_resource(res); + bad = 1; + break; + } + } + if (!bad) { + dlm_lock_get(newlock); + list_add_tail(&newlock->list, queue); + } spin_unlock(&res->spinlock); } mlog(0, "done running all the locks\n"); @@ -1618,8 +1836,14 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm, struct dlm_lock *lock; res->state |= DLM_LOCK_RES_RECOVERING; - if (!list_empty(&res->recovering)) + if (!list_empty(&res->recovering)) { + mlog(0, + "Recovering res %s:%.*s, is already on recovery list!\n", + dlm->name, res->lockname.len, res->lockname.name); list_del_init(&res->recovering); + } + /* We need to hold a reference while on the recovery list */ + dlm_lockres_get(res); list_add_tail(&res->recovering, &dlm->reco.resources); /* find any pending locks and put them back on proper list */ @@ -1708,9 +1932,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); dlm_change_lockres_owner(dlm, res, new_master); res->state &= ~DLM_LOCK_RES_RECOVERING; - __dlm_dirty_lockres(dlm, res); + if (!__dlm_lockres_unused(res)) + __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); + dlm_lockres_put(res); } } @@ -1719,7 +1945,7 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, * the RECOVERING state and set the owner * if necessary */ for (i = 0; i < DLM_HASH_BUCKETS; i++) { - bucket = &(dlm->lockres_hash[i]); + bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, hash_iter, bucket, hash_node) { if (res->state & DLM_LOCK_RES_RECOVERING) { if (res->owner == dead_node) { @@ -1743,11 +1969,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm, dlm->name, res->lockname.len, res->lockname.name, res->owner); list_del_init(&res->recovering); + dlm_lockres_put(res); } spin_lock(&res->spinlock); dlm_change_lockres_owner(dlm, res, new_master); res->state &= ~DLM_LOCK_RES_RECOVERING; - __dlm_dirty_lockres(dlm, res); + if (!__dlm_lockres_unused(res)) + __dlm_dirty_lockres(dlm, res); spin_unlock(&res->spinlock); wake_up(&res->wq); } @@ -1884,7 +2112,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) * need to be fired as a result. 
*/ for (i = 0; i < DLM_HASH_BUCKETS; i++) { - bucket = &(dlm->lockres_hash[i]); + bucket = dlm_lockres_hash(dlm, i); hlist_for_each_entry(res, iter, bucket, hash_node) { /* always prune any $RECOVERY entries for dead nodes, * otherwise hangs can occur during later recovery */ @@ -1924,6 +2152,20 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx) { assert_spin_locked(&dlm->spinlock); + if (dlm->reco.new_master == idx) { + mlog(0, "%s: recovery master %d just died\n", + dlm->name, idx); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + /* finalize1 was reached, so it is safe to clear + * the new_master and dead_node. that recovery + * is complete. */ + mlog(0, "%s: dead master %d had reached " + "finalize1 state, clearing\n", dlm->name, idx); + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + __dlm_reset_recovery(dlm); + } + } + /* check to see if the node is already considered dead */ if (!test_bit(idx, dlm->live_nodes_map)) { mlog(0, "for domain %s, node %d is already dead. " @@ -2087,7 +2329,7 @@ again: /* set the new_master to this node */ spin_lock(&dlm->spinlock); - dlm->reco.new_master = dlm->node_num; + dlm_set_reco_master(dlm, dlm->node_num); spin_unlock(&dlm->spinlock); } @@ -2125,6 +2367,10 @@ again: mlog(0, "%s: reco master %u is ready to recover %u\n", dlm->name, dlm->reco.new_master, dlm->reco.dead_node); status = -EEXIST; + } else if (ret == DLM_RECOVERING) { + mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n", + dlm->name, dlm->node_num); + goto again; } else { struct dlm_lock_resource *res; @@ -2156,7 +2402,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) mlog_entry("%u\n", dead_node); - mlog(0, "dead node is %u\n", dead_node); + mlog(0, "%s: dead node is %u\n", dlm->name, dead_node); spin_lock(&dlm->spinlock); dlm_node_iter_init(dlm->domain_map, &iter); @@ -2214,6 +2460,14 @@ retry: * another ENOMEM */ msleep(100); goto retry; + } else if (ret == EAGAIN) { + mlog(0, "%s: trying to start recovery of node " + "%u, but node %u is waiting for last recovery " + "to complete, backoff for a bit\n", dlm->name, + dead_node, nodenum); + /* TODO Look into replacing msleep with cond_resched() */ + msleep(100); + goto retry; } } @@ -2229,8 +2483,20 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) if (!dlm_grab(dlm)) return 0; - mlog(0, "node %u wants to recover node %u\n", - br->node_idx, br->dead_node); + spin_lock(&dlm->spinlock); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + mlog(0, "%s: node %u wants to recover node %u (%u:%u) " + "but this node is in finalize state, waiting on finalize2\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + spin_unlock(&dlm->spinlock); + return EAGAIN; + } + spin_unlock(&dlm->spinlock); + + mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); dlm_fire_domain_eviction_callbacks(dlm, br->dead_node); @@ -2252,8 +2518,8 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) "node %u changing it to %u\n", dlm->name, dlm->reco.dead_node, br->node_idx, br->dead_node); } - dlm->reco.new_master = br->node_idx; - dlm->reco.dead_node = br->dead_node; + dlm_set_reco_master(dlm, br->node_idx); + dlm_set_reco_dead_node(dlm, br->dead_node); if (!test_bit(br->dead_node, dlm->recovery_map)) { mlog(0, "recovery master %u sees %u as dead, but this " "node has not yet. 
marking %u as dead\n", @@ -2272,10 +2538,16 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) spin_unlock(&dlm->spinlock); dlm_kick_recovery_thread(dlm); + + mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n", + dlm->name, br->node_idx, br->dead_node, + dlm->reco.dead_node, dlm->reco.new_master); + dlm_put(dlm); return 0; } +#define DLM_FINALIZE_STAGE2 0x01 static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) { int ret = 0; @@ -2283,25 +2555,31 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) struct dlm_node_iter iter; int nodenum; int status; + int stage = 1; - mlog(0, "finishing recovery for node %s:%u\n", - dlm->name, dlm->reco.dead_node); + mlog(0, "finishing recovery for node %s:%u, " + "stage %d\n", dlm->name, dlm->reco.dead_node, stage); spin_lock(&dlm->spinlock); dlm_node_iter_init(dlm->domain_map, &iter); spin_unlock(&dlm->spinlock); +stage2: memset(&fr, 0, sizeof(fr)); fr.node_idx = dlm->node_num; fr.dead_node = dlm->reco.dead_node; + if (stage == 2) + fr.flags |= DLM_FINALIZE_STAGE2; while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { if (nodenum == dlm->node_num) continue; ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key, &fr, sizeof(fr), nodenum, &status); - if (ret >= 0) { + if (ret >= 0) ret = status; + if (ret < 0) { + mlog_errno(ret); if (dlm_is_host_down(ret)) { /* this has no effect on this recovery * session, so set the status to zero to @@ -2309,13 +2587,17 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm) mlog(ML_ERROR, "node %u went down after this " "node finished recovery.\n", nodenum); ret = 0; + continue; } - } - if (ret < 0) { - mlog_errno(ret); break; } } + if (stage == 1) { + /* reset the node_iter back to the top and send finalize2 */ + iter.curnode = -1; + stage = 2; + goto stage2; + } return ret; } @@ -2324,14 +2606,19 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) { struct dlm_ctxt *dlm = data; struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf; + int stage = 1; /* ok to return 0, domain has gone away */ if (!dlm_grab(dlm)) return 0; - mlog(0, "node %u finalizing recovery of node %u\n", - fr->node_idx, fr->dead_node); + if (fr->flags & DLM_FINALIZE_STAGE2) + stage = 2; + mlog(0, "%s: node %u finalizing recovery stage%d of " + "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage, + fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master); + spin_lock(&dlm->spinlock); if (dlm->reco.new_master != fr->node_idx) { @@ -2347,13 +2634,41 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data) BUG(); } - dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); - - spin_unlock(&dlm->spinlock); + switch (stage) { + case 1: + dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx); + if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) { + mlog(ML_ERROR, "%s: received finalize1 from " + "new master %u for dead node %u, but " + "this node has already received it!\n", + dlm->name, fr->node_idx, fr->dead_node); + dlm_print_reco_node_status(dlm); + BUG(); + } + dlm->reco.state |= DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + break; + case 2: + if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) { + mlog(ML_ERROR, "%s: received finalize2 from " + "new master %u for dead node %u, but " + "this node did not have finalize1!\n", + dlm->name, fr->node_idx, fr->dead_node); + dlm_print_reco_node_status(dlm); + BUG(); + } + dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE; + spin_unlock(&dlm->spinlock); + 
dlm_reset_recovery(dlm); + dlm_kick_recovery_thread(dlm); + break; + default: + BUG(); + } - dlm_reset_recovery(dlm); + mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", + dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master); - dlm_kick_recovery_thread(dlm); dlm_put(dlm); return 0; } diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index 5be9d14f12c..0c822f3ffb0 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -39,6 +39,7 @@ #include <linux/inet.h> #include <linux/timer.h> #include <linux/kthread.h> +#include <linux/delay.h> #include "cluster/heartbeat.h" @@ -53,6 +54,8 @@ #include "cluster/masklog.h" static int dlm_thread(void *data); +static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, + struct dlm_lock_resource *lockres); static void dlm_flush_asts(struct dlm_ctxt *dlm); @@ -80,7 +83,7 @@ repeat: } -static int __dlm_lockres_unused(struct dlm_lock_resource *res) +int __dlm_lockres_unused(struct dlm_lock_resource *res) { if (list_empty(&res->granted) && list_empty(&res->converting) && @@ -103,6 +106,20 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, assert_spin_locked(&res->spinlock); if (__dlm_lockres_unused(res)){ + /* For now, just keep any resource we master */ + if (res->owner == dlm->node_num) + { + if (!list_empty(&res->purge)) { + mlog(0, "we master %s:%.*s, but it is on " + "the purge list. Removing\n", + dlm->name, res->lockname.len, + res->lockname.name); + list_del_init(&res->purge); + dlm->purge_count--; + } + return; + } + if (list_empty(&res->purge)) { mlog(0, "putting lockres %.*s from purge list\n", res->lockname.len, res->lockname.name); @@ -110,10 +127,23 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm, res->last_used = jiffies; list_add_tail(&res->purge, &dlm->purge_list); dlm->purge_count++; + + /* if this node is not the owner, there is + * no way to keep track of who the owner could be. + * unhash it to avoid serious problems. */ + if (res->owner != dlm->node_num) { + mlog(0, "%s:%.*s: doing immediate " + "purge of lockres owned by %u\n", + dlm->name, res->lockname.len, + res->lockname.name, res->owner); + + dlm_purge_lockres_now(dlm, res); + } } } else if (!list_empty(&res->purge)) { - mlog(0, "removing lockres %.*s from purge list\n", - res->lockname.len, res->lockname.name); + mlog(0, "removing lockres %.*s from purge list, " + "owner=%u\n", res->lockname.len, res->lockname.name, + res->owner); list_del_init(&res->purge); dlm->purge_count--; @@ -165,6 +195,7 @@ again: } else if (ret < 0) { mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n", lockres->lockname.len, lockres->lockname.name); + msleep(100); goto again; } @@ -178,6 +209,24 @@ finish: __dlm_unhash_lockres(lockres); } +/* make an unused lockres go away immediately. + * as soon as the dlm spinlock is dropped, this lockres + * will not be found. kfree still happens on last put. 
*/ +static void dlm_purge_lockres_now(struct dlm_ctxt *dlm, + struct dlm_lock_resource *lockres) +{ + assert_spin_locked(&dlm->spinlock); + assert_spin_locked(&lockres->spinlock); + + BUG_ON(!__dlm_lockres_unused(lockres)); + + if (!list_empty(&lockres->purge)) { + list_del_init(&lockres->purge); + dlm->purge_count--; + } + __dlm_unhash_lockres(lockres); +} + static void dlm_run_purge_list(struct dlm_ctxt *dlm, int purge_now) { @@ -318,8 +367,7 @@ converting: target->ml.type = target->ml.convert_type; target->ml.convert_type = LKM_IVMODE; - list_del_init(&target->list); - list_add_tail(&target->list, &res->granted); + list_move_tail(&target->list, &res->granted); BUG_ON(!target->lksb); target->lksb->status = DLM_NORMAL; @@ -380,8 +428,7 @@ blocked: target->ml.type, target->ml.node); // target->ml.type is already correct - list_del_init(&target->list); - list_add_tail(&target->list, &res->granted); + list_move_tail(&target->list, &res->granted); BUG_ON(!target->lksb); target->lksb->status = DLM_NORMAL; @@ -422,6 +469,8 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) /* don't shuffle secondary queues */ if ((res->owner == dlm->node_num) && !(res->state & DLM_LOCK_RES_DIRTY)) { + /* ref for dirty_list */ + dlm_lockres_get(res); list_add_tail(&res->dirty, &dlm->dirty_list); res->state |= DLM_LOCK_RES_DIRTY; } @@ -606,6 +655,8 @@ static int dlm_thread(void *data) list_del_init(&res->dirty); spin_unlock(&res->spinlock); spin_unlock(&dlm->spinlock); + /* Drop dirty_list ref */ + dlm_lockres_put(res); /* lockres can be re-dirtied/re-added to the * dirty_list in this gap, but that is ok */ @@ -642,8 +693,9 @@ static int dlm_thread(void *data) * spinlock and do NOT have the dlm lock. * safe to reserve/queue asts and run the lists. */ - mlog(0, "calling dlm_shuffle_lists with dlm=%p, " - "res=%p\n", dlm, res); + mlog(0, "calling dlm_shuffle_lists with dlm=%s, " + "res=%.*s\n", dlm->name, + res->lockname.len, res->lockname.name); /* called while holding lockres lock */ dlm_shuffle_lists(dlm, res); @@ -657,6 +709,8 @@ in_progress: /* if the lock was in-progress, stick * it on the back of the list */ if (delay) { + /* ref for dirty_list */ + dlm_lockres_get(res); spin_lock(&res->spinlock); list_add_tail(&res->dirty, &dlm->dirty_list); res->state |= DLM_LOCK_RES_DIRTY; @@ -677,7 +731,7 @@ in_progress: /* yield and continue right away if there is more work to do */ if (!n) { - yield(); + cond_resched(); continue; } diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 7b1a2754267..b0c3134f4f7 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -271,8 +271,7 @@ void dlm_commit_pending_unlock(struct dlm_lock_resource *res, void dlm_commit_pending_cancel(struct dlm_lock_resource *res, struct dlm_lock *lock) { - list_del_init(&lock->list); - list_add_tail(&lock->list, &res->granted); + list_move_tail(&lock->list, &res->granted); lock->ml.convert_type = LKM_IVMODE; } @@ -319,6 +318,16 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm, mlog_entry("%.*s\n", res->lockname.len, res->lockname.name); + if (owner == dlm->node_num) { + /* ended up trying to contact ourself. this means + * that the lockres had been remote but became local + * via a migration. 
just retry it, now as local */ + mlog(0, "%s:%.*s: this node became the master due to a " + "migration, re-evaluate now\n", dlm->name, + res->lockname.len, res->lockname.name); + return DLM_FORWARD; + } + memset(&unlock, 0, sizeof(unlock)); unlock.node_idx = dlm->node_num; unlock.flags = cpu_to_be32(flags); diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index 74ca4e5f976..e641b084b34 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -672,7 +672,7 @@ struct dlm_ctxt *user_dlm_register_context(struct qstr *name) u32 dlm_key; char *domain; - domain = kmalloc(name->len + 1, GFP_KERNEL); + domain = kmalloc(name->len + 1, GFP_NOFS); if (!domain) { mlog_errno(-ENOMEM); return ERR_PTR(-ENOMEM); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index eebc3cfa6be..3fe8781c22c 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -222,8 +222,7 @@ void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle, BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list)); OCFS2_I(inode)->ip_handle = handle; - list_del(&(OCFS2_I(inode)->ip_handle_list)); - list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list)); + list_move_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list)); } static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle) diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index efc7c91128a..93a56bd4a2b 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -1,5 +1,4 @@ -/* $Id: inode.c,v 1.15 2001/11/12 09:43:39 davem Exp $ - * openpromfs.c: /proc/openprom handling routines +/* inode.c: /proc/openprom handling routines * * Copyright (C) 1996-1999 Jakub Jelinek (jakub@redhat.com) * Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be) @@ -12,762 +11,245 @@ #include <linux/openprom_fs.h> #include <linux/init.h> #include <linux/slab.h> -#include <linux/smp_lock.h> +#include <linux/seq_file.h> #include <asm/openprom.h> #include <asm/oplib.h> +#include <asm/prom.h> #include <asm/uaccess.h> -#define ALIASES_NNODES 64 - -typedef struct { - u16 parent; - u16 next; - u16 child; - u16 first_prop; - u32 node; -} openpromfs_node; - -typedef struct { -#define OPP_STRING 0x10 -#define OPP_STRINGLIST 0x20 -#define OPP_BINARY 0x40 -#define OPP_HEXSTRING 0x80 -#define OPP_DIRTY 0x01 -#define OPP_QUOTED 0x02 -#define OPP_NOTQUOTED 0x04 -#define OPP_ASCIIZ 0x08 - u32 flag; - u32 alloclen; - u32 len; - char *value; - char name[8]; -} openprom_property; - -static openpromfs_node *nodes; -static int alloced; -static u16 last_node; -static u16 first_prop; -static u16 options = 0xffff; -static u16 aliases = 0xffff; -static int aliases_nodes; -static char *alias_names [ALIASES_NNODES]; - -#define OPENPROM_ROOT_INO 16 -#define OPENPROM_FIRST_INO OPENPROM_ROOT_INO -#define NODE(ino) nodes[ino - OPENPROM_FIRST_INO] -#define NODE2INO(node) (node + OPENPROM_FIRST_INO) -#define NODEP2INO(no) (no + OPENPROM_FIRST_INO + last_node) - -static int openpromfs_create (struct inode *, struct dentry *, int, struct nameidata *); -static int openpromfs_readdir(struct file *, void *, filldir_t); -static struct dentry *openpromfs_lookup(struct inode *, struct dentry *dentry, struct nameidata *nd); -static int openpromfs_unlink (struct inode *, struct dentry *dentry); +static DEFINE_MUTEX(op_mutex); -static inline u16 ptr_nod(void *p) -{ - return (long)p & 0xFFFF; -} +#define OPENPROM_ROOT_INO 0 -static ssize_t nodenum_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +enum op_inode_type { + op_inode_node, + 
op_inode_prop, +}; + +union op_inode_data { + struct device_node *node; + struct property *prop; +}; + +struct op_inode_info { + struct inode vfs_inode; + enum op_inode_type type; + union op_inode_data u; +}; + +static inline struct op_inode_info *OP_I(struct inode *inode) { - struct inode *inode = file->f_dentry->d_inode; - char buffer[10]; - - if (count < 0 || !inode->u.generic_ip) - return -EINVAL; - sprintf (buffer, "%8.8lx\n", (long)inode->u.generic_ip); - if (file->f_pos >= 9) - return 0; - if (count > 9 - file->f_pos) - count = 9 - file->f_pos; - if (copy_to_user(buf, buffer + file->f_pos, count)) - return -EFAULT; - *ppos += count; - return count; + return container_of(inode, struct op_inode_info, vfs_inode); } -static ssize_t property_read(struct file *filp, char __user *buf, - size_t count, loff_t *ppos) +static int is_string(unsigned char *p, int len) { - struct inode *inode = filp->f_dentry->d_inode; - int i, j, k; - u32 node; - char *p, *s; - u32 *q; - openprom_property *op; - char buffer[64]; - - if (!filp->private_data) { - node = nodes[ptr_nod(inode->u.generic_ip)].node; - i = ((u32)(long)inode->u.generic_ip) >> 16; - if (ptr_nod(inode->u.generic_ip) == aliases) { - if (i >= aliases_nodes) - p = NULL; - else - p = alias_names [i]; - } else - for (p = prom_firstprop (node, buffer); - i && p && *p; - p = prom_nextprop (node, p, buffer), i--) - /* nothing */ ; - if (!p || !*p) - return -EIO; - i = prom_getproplen (node, p); - if (i < 0) { - if (ptr_nod(inode->u.generic_ip) == aliases) - i = 0; - else - return -EIO; - } - k = i; - if (i < 64) i = 64; - filp->private_data = kmalloc (sizeof (openprom_property) - + (j = strlen (p)) + 2 * i, - GFP_KERNEL); - if (!filp->private_data) - return -ENOMEM; - op = filp->private_data; - op->flag = 0; - op->alloclen = 2 * i; - strcpy (op->name, p); - op->value = (char *)(((unsigned long)(op->name + j + 4)) & ~3); - op->len = k; - if (k && prom_getproperty (node, p, op->value, i) < 0) - return -EIO; - op->value [k] = 0; - if (k) { - for (s = NULL, p = op->value; p < op->value + k; p++) { - if ((*p >= ' ' && *p <= '~') || *p == '\n') { - op->flag |= OPP_STRING; - s = p; - continue; - } - if (p > op->value && !*p && s == p - 1) { - if (p < op->value + k - 1) - op->flag |= OPP_STRINGLIST; - else - op->flag |= OPP_ASCIIZ; - continue; - } - if (k == 1 && !*p) { - op->flag |= (OPP_STRING|OPP_ASCIIZ); - break; - } - op->flag &= ~(OPP_STRING|OPP_STRINGLIST); - if (k & 3) - op->flag |= OPP_HEXSTRING; - else - op->flag |= OPP_BINARY; - break; - } - if (op->flag & OPP_STRINGLIST) - op->flag &= ~(OPP_STRING); - if (op->flag & OPP_ASCIIZ) - op->len--; - } - } else - op = filp->private_data; - if (!count || !(op->len || (op->flag & OPP_ASCIIZ))) - return 0; - if (*ppos >= 0xffffff || count >= 0xffffff) - return -EINVAL; - if (op->flag & OPP_STRINGLIST) { - for (k = 0, p = op->value; p < op->value + op->len; p++) - if (!*p) - k++; - i = op->len + 4 * k + 3; - } else if (op->flag & OPP_STRING) { - i = op->len + 3; - } else if (op->flag & OPP_BINARY) { - i = (op->len * 9) >> 2; - } else { - i = (op->len << 1) + 1; - } - k = *ppos; - if (k >= i) return 0; - if (count > i - k) count = i - k; - if (op->flag & OPP_STRING) { - if (!k) { - if (put_user('\'', buf)) - return -EFAULT; - k++; - count--; - } + int i; - if (k + count >= i - 2) - j = i - 2 - k; - else - j = count; - - if (j >= 0) { - if (copy_to_user(buf + k - *ppos, - op->value + k - 1, j)) - return -EFAULT; - count -= j; - k += j; - } + for (i = 0; i < len; i++) { + unsigned char val = p[i]; - if 
(count) { - if (put_user('\'', &buf [k++ - *ppos])) - return -EFAULT; - } - if (count > 1) { - if (put_user('\n', &buf [k++ - *ppos])) - return -EFAULT; - } - } else if (op->flag & OPP_STRINGLIST) { - char *tmp; - - tmp = kmalloc (i, GFP_KERNEL); - if (!tmp) - return -ENOMEM; - - s = tmp; - *s++ = '\''; - for (p = op->value; p < op->value + op->len; p++) { - if (!*p) { - strcpy(s, "' + '"); - s += 5; - continue; - } - *s++ = *p; - } - strcpy(s, "'\n"); - - if (copy_to_user(buf, tmp + k, count)) - return -EFAULT; - - kfree(tmp); - k += count; - - } else if (op->flag & OPP_BINARY) { - char buffer[10]; - u32 *first, *last; - int first_off, last_cnt; - - first = ((u32 *)op->value) + k / 9; - first_off = k % 9; - last = ((u32 *)op->value) + (k + count - 1) / 9; - last_cnt = (k + count) % 9; - if (!last_cnt) last_cnt = 9; - - if (first == last) { - sprintf (buffer, "%08x.", *first); - if (copy_to_user(buf, buffer + first_off, - last_cnt - first_off)) - return -EFAULT; - buf += last_cnt - first_off; - } else { - for (q = first; q <= last; q++) { - sprintf (buffer, "%08x.", *q); - if (q == first) { - if (copy_to_user(buf, buffer + first_off, - 9 - first_off)) - return -EFAULT; - buf += 9 - first_off; - } else if (q == last) { - if (copy_to_user(buf, buffer, last_cnt)) - return -EFAULT; - buf += last_cnt; - } else { - if (copy_to_user(buf, buffer, 9)) - return -EFAULT; - buf += 9; - } - } - } + if ((i && !val) || + (val >= ' ' && val <= '~')) + continue; - if (last == (u32 *)(op->value + op->len - 4) && last_cnt == 9) { - if (put_user('\n', (buf - 1))) - return -EFAULT; - } + return 0; + } - k += count; + return 1; +} - } else if (op->flag & OPP_HEXSTRING) { - char buffer[3]; +static int property_show(struct seq_file *f, void *v) +{ + struct property *prop = f->private; + void *pval; + int len; - if ((k < i - 1) && (k & 1)) { - sprintf (buffer, "%02x", - (unsigned char) *(op->value + (k >> 1)) & 0xff); - if (put_user(buffer[1], &buf[k++ - *ppos])) - return -EFAULT; - count--; - } + len = prop->length; + pval = prop->value; - for (; (count > 1) && (k < i - 1); k += 2) { - sprintf (buffer, "%02x", - (unsigned char) *(op->value + (k >> 1)) & 0xff); - if (copy_to_user(buf + k - *ppos, buffer, 2)) - return -EFAULT; - count -= 2; - } + if (is_string(pval, len)) { + while (len > 0) { + int n = strlen(pval); - if (count && (k < i - 1)) { - sprintf (buffer, "%02x", - (unsigned char) *(op->value + (k >> 1)) & 0xff); - if (put_user(buffer[0], &buf[k++ - *ppos])) - return -EFAULT; - count--; - } + seq_printf(f, "%s", (char *) pval); - if (count) { - if (put_user('\n', &buf [k++ - *ppos])) - return -EFAULT; - } - } - count = k - *ppos; - *ppos = k; - return count; -} + /* Skip over the NULL byte too. 
*/ + pval += n + 1; + len -= n + 1; -static ssize_t property_write(struct file *filp, const char __user *buf, - size_t count, loff_t *ppos) -{ - int i, j, k; - char *p; - u32 *q; - void *b; - openprom_property *op; - - if (*ppos >= 0xffffff || count >= 0xffffff) - return -EINVAL; - if (!filp->private_data) { - i = property_read (filp, NULL, 0, NULL); - if (i) - return i; - } - k = *ppos; - op = filp->private_data; - if (!(op->flag & OPP_STRING)) { - u32 *first, *last; - int first_off, last_cnt; - u32 mask, mask2; - char tmp [9]; - int forcelen = 0; - - j = k % 9; - for (i = 0; i < count; i++, j++) { - if (j == 9) j = 0; - if (!j) { - char ctmp; - if (get_user(ctmp, &buf[i])) - return -EFAULT; - if (ctmp != '.') { - if (ctmp != '\n') { - if (op->flag & OPP_BINARY) - return -EINVAL; - else - goto write_try_string; - } else { - count = i + 1; - forcelen = 1; - break; - } - } - } else { - char ctmp; - if (get_user(ctmp, &buf[i])) - return -EFAULT; - if (ctmp < '0' || - (ctmp > '9' && ctmp < 'A') || - (ctmp > 'F' && ctmp < 'a') || - ctmp > 'f') { - if (op->flag & OPP_BINARY) - return -EINVAL; - else - goto write_try_string; - } - } - } - op->flag |= OPP_BINARY; - tmp [8] = 0; - i = ((count + k + 8) / 9) << 2; - if (op->alloclen <= i) { - b = kmalloc (sizeof (openprom_property) + 2 * i, - GFP_KERNEL); - if (!b) - return -ENOMEM; - memcpy (b, filp->private_data, - sizeof (openprom_property) - + strlen (op->name) + op->alloclen); - memset (b + sizeof (openprom_property) - + strlen (op->name) + op->alloclen, - 0, 2 * i - op->alloclen); - op = b; - op->alloclen = 2*i; - b = filp->private_data; - filp->private_data = op; - kfree (b); + if (len > 0) + seq_printf(f, " + "); } - first = ((u32 *)op->value) + (k / 9); - first_off = k % 9; - last = (u32 *)(op->value + i); - last_cnt = (k + count) % 9; - if (first + 1 == last) { - memset (tmp, '0', 8); - if (copy_from_user(tmp + first_off, buf, - (count + first_off > 8) ? 
- 8 - first_off : count)) - return -EFAULT; - mask = 0xffffffff; - mask2 = 0xffffffff; - for (j = 0; j < first_off; j++) - mask >>= 1; - for (j = 8 - count - first_off; j > 0; j--) - mask2 <<= 1; - mask &= mask2; - if (mask) { - *first &= ~mask; - *first |= simple_strtoul (tmp, NULL, 16); - op->flag |= OPP_DIRTY; + } else { + if (len & 3) { + while (len) { + len--; + if (len) + seq_printf(f, "%02x.", + *(unsigned char *) pval); + else + seq_printf(f, "%02x", + *(unsigned char *) pval); + pval++; } } else { - op->flag |= OPP_DIRTY; - for (q = first; q < last; q++) { - if (q == first) { - if (first_off < 8) { - memset (tmp, '0', 8); - if (copy_from_user(tmp + first_off, - buf, - 8 - first_off)) - return -EFAULT; - mask = 0xffffffff; - for (j = 0; j < first_off; j++) - mask >>= 1; - *q &= ~mask; - *q |= simple_strtoul (tmp,NULL,16); - } - buf += 9; - } else if ((q == last - 1) && last_cnt - && (last_cnt < 8)) { - memset (tmp, '0', 8); - if (copy_from_user(tmp, buf, last_cnt)) - return -EFAULT; - mask = 0xffffffff; - for (j = 0; j < 8 - last_cnt; j++) - mask <<= 1; - *q &= ~mask; - *q |= simple_strtoul (tmp, NULL, 16); - buf += last_cnt; - } else { - char tchars[2 * sizeof(long) + 1]; - - if (copy_from_user(tchars, buf, sizeof(tchars) - 1)) - return -EFAULT; - tchars[sizeof(tchars) - 1] = '\0'; - *q = simple_strtoul (tchars, NULL, 16); - buf += 9; - } - } - } - if (!forcelen) { - if (op->len < i) - op->len = i; - } else - op->len = i; - *ppos += count; - } -write_try_string: - if (!(op->flag & OPP_BINARY)) { - if (!(op->flag & (OPP_QUOTED | OPP_NOTQUOTED))) { - char ctmp; - - /* No way, if somebody starts writing from the middle, - * we don't know whether he uses quotes around or not - */ - if (k > 0) - return -EINVAL; - if (get_user(ctmp, buf)) - return -EFAULT; - if (ctmp == '\'') { - op->flag |= OPP_QUOTED; - buf++; - count--; - (*ppos)++; - if (!count) { - op->flag |= OPP_STRING; - return 1; - } - } else - op->flag |= OPP_NOTQUOTED; - } - op->flag |= OPP_STRING; - if (op->alloclen <= count + *ppos) { - b = kmalloc (sizeof (openprom_property) - + 2 * (count + *ppos), GFP_KERNEL); - if (!b) - return -ENOMEM; - memcpy (b, filp->private_data, - sizeof (openprom_property) - + strlen (op->name) + op->alloclen); - memset (b + sizeof (openprom_property) - + strlen (op->name) + op->alloclen, - 0, 2*(count - *ppos) - op->alloclen); - op = b; - op->alloclen = 2*(count + *ppos); - b = filp->private_data; - filp->private_data = op; - kfree (b); - } - p = op->value + *ppos - ((op->flag & OPP_QUOTED) ? 
1 : 0); - if (copy_from_user(p, buf, count)) - return -EFAULT; - op->flag |= OPP_DIRTY; - for (i = 0; i < count; i++, p++) - if (*p == '\n') { - *p = 0; - break; + while (len >= 4) { + len -= 4; + + if (len) + seq_printf(f, "%08x.", + *(unsigned int *) pval); + else + seq_printf(f, "%08x", + *(unsigned int *) pval); + pval += 4; } - if (i < count) { - op->len = p - op->value; - *ppos += i + 1; - if ((p > op->value) && (op->flag & OPP_QUOTED) - && (*(p - 1) == '\'')) - op->len--; - } else { - if (p - op->value > op->len) - op->len = p - op->value; - *ppos += count; } } - return *ppos - k; + seq_printf(f, "\n"); + + return 0; } -int property_release (struct inode *inode, struct file *filp) +static void *property_start(struct seq_file *f, loff_t *pos) { - openprom_property *op = filp->private_data; - int error; - u32 node; - - if (!op) - return 0; - lock_kernel(); - node = nodes[ptr_nod(inode->u.generic_ip)].node; - if (ptr_nod(inode->u.generic_ip) == aliases) { - if ((op->flag & OPP_DIRTY) && (op->flag & OPP_STRING)) { - char *p = op->name; - int i = (op->value - op->name) - strlen (op->name) - 1; - op->value [op->len] = 0; - *(op->value - 1) = ' '; - if (i) { - for (p = op->value - i - 2; p >= op->name; p--) - p[i] = *p; - p = op->name + i; - } - memcpy (p - 8, "nvalias ", 8); - prom_feval (p - 8); - } - } else if (op->flag & OPP_DIRTY) { - if (op->flag & OPP_STRING) { - op->value [op->len] = 0; - error = prom_setprop (node, op->name, - op->value, op->len + 1); - if (error <= 0) - printk (KERN_WARNING "openpromfs: " - "Couldn't write property %s\n", - op->name); - } else if ((op->flag & OPP_BINARY) || !op->len) { - error = prom_setprop (node, op->name, - op->value, op->len); - if (error <= 0) - printk (KERN_WARNING "openpromfs: " - "Couldn't write property %s\n", - op->name); - } else { - printk (KERN_WARNING "openpromfs: " - "Unknown property type of %s\n", - op->name); - } + if (*pos == 0) + return pos; + return NULL; +} + +static void *property_next(struct seq_file *f, void *v, loff_t *pos) +{ + (*pos)++; + return NULL; +} + +static void property_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static struct seq_operations property_op = { + .start = property_start, + .next = property_next, + .stop = property_stop, + .show = property_show +}; + +static int property_open(struct inode *inode, struct file *file) +{ + struct op_inode_info *oi = OP_I(inode); + int ret; + + BUG_ON(oi->type != op_inode_prop); + + ret = seq_open(file, &property_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = oi->u.prop; } - unlock_kernel(); - kfree (filp->private_data); - return 0; + return ret; } static const struct file_operations openpromfs_prop_ops = { - .read = property_read, - .write = property_write, - .release = property_release, + .open = property_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; -static const struct file_operations openpromfs_nodenum_ops = { - .read = nodenum_read, -}; +static int openpromfs_readdir(struct file *, void *, filldir_t); static const struct file_operations openprom_operations = { .read = generic_read_dir, .readdir = openpromfs_readdir, }; -static struct inode_operations openprom_alias_inode_operations = { - .create = openpromfs_create, - .lookup = openpromfs_lookup, - .unlink = openpromfs_unlink, -}; +static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, struct nameidata *); static struct inode_operations openprom_inode_operations = { .lookup = openpromfs_lookup, }; -static int 
lookup_children(u16 n, const char * name, int len) +static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) { - int ret; - u16 node; - for (; n != 0xffff; n = nodes[n].next) { - node = nodes[n].child; - if (node != 0xffff) { - char buffer[128]; - int i; - char *p; - - while (node != 0xffff) { - if (prom_getname (nodes[node].node, - buffer, 128) >= 0) { - i = strlen (buffer); - if ((len == i) - && !strncmp (buffer, name, len)) - return NODE2INO(node); - p = strchr (buffer, '@'); - if (p && (len == p - buffer) - && !strncmp (buffer, name, len)) - return NODE2INO(node); - } - node = nodes[node].next; - } - } else - continue; - ret = lookup_children (nodes[n].child, name, len); - if (ret) return ret; - } - return 0; -} - -static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd) -{ - int ino = 0; -#define OPFSL_DIR 0 -#define OPFSL_PROPERTY 1 -#define OPFSL_NODENUM 2 - int type = 0; - char buffer[128]; - char *p; + struct op_inode_info *ent_oi, *oi = OP_I(dir); + struct device_node *dp, *child; + struct property *prop; + enum op_inode_type ent_type; + union op_inode_data ent_data; const char *name; - u32 n; - u16 dirnode; - unsigned int len; - int i; struct inode *inode; - char buffer2[64]; + unsigned int ino; + int len; - inode = NULL; + BUG_ON(oi->type != op_inode_node); + + dp = oi->u.node; + name = dentry->d_name.name; len = dentry->d_name.len; - lock_kernel(); - if (name [0] == '.' && len == 5 && !strncmp (name + 1, "node", 4)) { - ino = NODEP2INO(NODE(dir->i_ino).first_prop); - type = OPFSL_NODENUM; - } - if (!ino) { - u16 node = NODE(dir->i_ino).child; - while (node != 0xffff) { - if (prom_getname (nodes[node].node, buffer, 128) >= 0) { - i = strlen (buffer); - if (len == i && !strncmp (buffer, name, len)) { - ino = NODE2INO(node); - type = OPFSL_DIR; - break; - } - p = strchr (buffer, '@'); - if (p && (len == p - buffer) - && !strncmp (buffer, name, len)) { - ino = NODE2INO(node); - type = OPFSL_DIR; - break; - } - } - node = nodes[node].next; - } - } - n = NODE(dir->i_ino).node; - dirnode = dir->i_ino - OPENPROM_FIRST_INO; - if (!ino) { - int j = NODEP2INO(NODE(dir->i_ino).first_prop); - if (dirnode != aliases) { - for (p = prom_firstprop (n, buffer2); - p && *p; - p = prom_nextprop (n, p, buffer2)) { - j++; - if ((len == strlen (p)) - && !strncmp (p, name, len)) { - ino = j; - type = OPFSL_PROPERTY; - break; - } - } - } else { - int k; - for (k = 0; k < aliases_nodes; k++) { - j++; - if (alias_names [k] - && (len == strlen (alias_names [k])) - && !strncmp (alias_names [k], name, len)) { - ino = j; - type = OPFSL_PROPERTY; - break; - } - } + + mutex_lock(&op_mutex); + + child = dp->child; + while (child) { + int n = strlen(child->path_component_name); + + if (len == n && + !strncmp(child->path_component_name, name, len)) { + ent_type = op_inode_node; + ent_data.node = child; + ino = child->unique_id; + goto found; } + child = child->sibling; } - if (!ino) { - ino = lookup_children (NODE(dir->i_ino).child, name, len); - if (ino) - type = OPFSL_DIR; - else { - unlock_kernel(); - return ERR_PTR(-ENOENT); + + prop = dp->properties; + while (prop) { + int n = strlen(prop->name); + + if (len == n && !strncmp(prop->name, name, len)) { + ent_type = op_inode_prop; + ent_data.prop = prop; + ino = prop->unique_id; + goto found; } + + prop = prop->next; } - inode = iget (dir->i_sb, ino); - unlock_kernel(); + + mutex_unlock(&op_mutex); + return ERR_PTR(-ENOENT); + +found: + inode = iget(dir->i_sb, ino); 
+ mutex_unlock(&op_mutex); if (!inode) return ERR_PTR(-EINVAL); - switch (type) { - case OPFSL_DIR: + ent_oi = OP_I(inode); + ent_oi->type = ent_type; + ent_oi->u = ent_data; + + switch (ent_type) { + case op_inode_node: inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO; - if (ino == OPENPROM_FIRST_INO + aliases) { - inode->i_mode |= S_IWUSR; - inode->i_op = &openprom_alias_inode_operations; - } else - inode->i_op = &openprom_inode_operations; + inode->i_op = &openprom_inode_operations; inode->i_fop = &openprom_operations; inode->i_nlink = 2; break; - case OPFSL_NODENUM: - inode->i_mode = S_IFREG | S_IRUGO; - inode->i_fop = &openpromfs_nodenum_ops; - inode->i_nlink = 1; - inode->u.generic_ip = (void *)(long)(n); - break; - case OPFSL_PROPERTY: - if ((dirnode == options) && (len == 17) - && !strncmp (name, "security-password", 17)) + case op_inode_prop: + if (!strcmp(dp->name, "options") && (len == 17) && + !strncmp (name, "security-password", 17)) inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; - else { + else inode->i_mode = S_IFREG | S_IRUGO; - if (dirnode == options || dirnode == aliases) { - if (len != 4 || strncmp (name, "name", 4)) - inode->i_mode |= S_IWUSR; - } - } inode->i_fop = &openpromfs_prop_ops; inode->i_nlink = 1; - if (inode->i_size < 0) - inode->i_size = 0; - inode->u.generic_ip = (void *)(long)(((u16)dirnode) | - (((u16)(ino - NODEP2INO(NODE(dir->i_ino).first_prop) - 1)) << 16)); + inode->i_size = ent_oi->u.prop->length; break; } @@ -781,237 +263,89 @@ static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentr static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir) { struct inode *inode = filp->f_dentry->d_inode; + struct op_inode_info *oi = OP_I(inode); + struct device_node *dp = oi->u.node; + struct device_node *child; + struct property *prop; unsigned int ino; - u32 n; - int i, j; - char buffer[128]; - u16 node; - char *p; - char buffer2[64]; - - lock_kernel(); + int i; + + mutex_lock(&op_mutex); ino = inode->i_ino; i = filp->f_pos; switch (i) { case 0: - if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) goto out; + if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) + goto out; i++; filp->f_pos++; /* fall thru */ case 1: - if (filldir(dirent, "..", 2, i, - (NODE(ino).parent == 0xffff) ? - OPENPROM_ROOT_INO : NODE2INO(NODE(ino).parent), DT_DIR) < 0) + if (filldir(dirent, "..", 2, i, + (dp->parent == NULL ? + OPENPROM_ROOT_INO : + dp->parent->unique_id), DT_DIR) < 0) goto out; i++; filp->f_pos++; /* fall thru */ default: i -= 2; - node = NODE(ino).child; - while (i && node != 0xffff) { - node = nodes[node].next; + + /* First, the children nodes as directories. */ + child = dp->child; + while (i && child) { + child = child->sibling; i--; } - while (node != 0xffff) { - if (prom_getname (nodes[node].node, buffer, 128) < 0) - goto out; - if (filldir(dirent, buffer, strlen(buffer), - filp->f_pos, NODE2INO(node), DT_DIR) < 0) + while (child) { + if (filldir(dirent, + child->path_component_name, + strlen(child->path_component_name), + filp->f_pos, child->unique_id, DT_DIR) < 0) goto out; + filp->f_pos++; - node = nodes[node].next; + child = child->sibling; } - j = NODEP2INO(NODE(ino).first_prop); - if (!i) { - if (filldir(dirent, ".node", 5, filp->f_pos, j, DT_REG) < 0) + + /* Next, the properties as files. 
*/ + prop = dp->properties; + while (i && prop) { + prop = prop->next; + i--; + } + while (prop) { + if (filldir(dirent, prop->name, strlen(prop->name), + filp->f_pos, prop->unique_id, DT_REG) < 0) goto out; + filp->f_pos++; - } else - i--; - n = NODE(ino).node; - if (ino == OPENPROM_FIRST_INO + aliases) { - for (j++; i < aliases_nodes; i++, j++) { - if (alias_names [i]) { - if (filldir (dirent, alias_names [i], - strlen (alias_names [i]), - filp->f_pos, j, DT_REG) < 0) goto out; - filp->f_pos++; - } - } - } else { - for (p = prom_firstprop (n, buffer2); - p && *p; - p = prom_nextprop (n, p, buffer2)) { - j++; - if (i) i--; - else { - if (filldir(dirent, p, strlen(p), - filp->f_pos, j, DT_REG) < 0) - goto out; - filp->f_pos++; - } - } + prop = prop->next; } } out: - unlock_kernel(); - return 0; -} - -static int openpromfs_create (struct inode *dir, struct dentry *dentry, int mode, - struct nameidata *nd) -{ - char *p; - struct inode *inode; - - if (!dir) - return -ENOENT; - if (dentry->d_name.len > 256) - return -EINVAL; - p = kmalloc (dentry->d_name.len + 1, GFP_KERNEL); - if (!p) - return -ENOMEM; - strncpy (p, dentry->d_name.name, dentry->d_name.len); - p [dentry->d_name.len] = 0; - lock_kernel(); - if (aliases_nodes == ALIASES_NNODES) { - kfree(p); - unlock_kernel(); - return -EIO; - } - alias_names [aliases_nodes++] = p; - inode = iget (dir->i_sb, - NODEP2INO(NODE(dir->i_ino).first_prop) + aliases_nodes); - if (!inode) { - unlock_kernel(); - return -EINVAL; - } - inode->i_mode = S_IFREG | S_IRUGO | S_IWUSR; - inode->i_fop = &openpromfs_prop_ops; - inode->i_nlink = 1; - if (inode->i_size < 0) inode->i_size = 0; - inode->u.generic_ip = (void *)(long)(((u16)aliases) | - (((u16)(aliases_nodes - 1)) << 16)); - unlock_kernel(); - d_instantiate(dentry, inode); + mutex_unlock(&op_mutex); return 0; } -static int openpromfs_unlink (struct inode *dir, struct dentry *dentry) -{ - unsigned int len; - char *p; - const char *name; - int i; - - name = dentry->d_name.name; - len = dentry->d_name.len; - lock_kernel(); - for (i = 0; i < aliases_nodes; i++) - if ((strlen (alias_names [i]) == len) - && !strncmp (name, alias_names[i], len)) { - char buffer[512]; - - p = alias_names [i]; - alias_names [i] = NULL; - kfree (p); - strcpy (buffer, "nvunalias "); - memcpy (buffer + 10, name, len); - buffer [10 + len] = 0; - prom_feval (buffer); - } - unlock_kernel(); - return 0; -} +static kmem_cache_t *op_inode_cachep; -/* {{{ init section */ -static int __init check_space (u16 n) +static struct inode *openprom_alloc_inode(struct super_block *sb) { - unsigned long pages; + struct op_inode_info *oi; - if ((1 << alloced) * PAGE_SIZE < (n + 2) * sizeof(openpromfs_node)) { - pages = __get_free_pages (GFP_KERNEL, alloced + 1); - if (!pages) - return -1; + oi = kmem_cache_alloc(op_inode_cachep, SLAB_KERNEL); + if (!oi) + return NULL; - if (nodes) { - memcpy ((char *)pages, nodes, - (1 << alloced) * PAGE_SIZE); - free_pages ((unsigned long)nodes, alloced); - } - alloced++; - nodes = (openpromfs_node *)pages; - } - return 0; + return &oi->vfs_inode; } -static u16 __init get_nodes (u16 parent, u32 node) +static void openprom_destroy_inode(struct inode *inode) { - char *p; - u16 n = last_node++, i; - char buffer[64]; - - if (check_space (n) < 0) - return 0xffff; - nodes[n].parent = parent; - nodes[n].node = node; - nodes[n].next = 0xffff; - nodes[n].child = 0xffff; - nodes[n].first_prop = first_prop++; - if (!parent) { - char buffer[8]; - int j; - - if ((j = prom_getproperty (node, "name", buffer, 8)) >= 0) { - buffer[j] 
= 0; - if (!strcmp (buffer, "options")) - options = n; - else if (!strcmp (buffer, "aliases")) - aliases = n; - } - } - if (n != aliases) - for (p = prom_firstprop (node, buffer); - p && p != (char *)-1 && *p; - p = prom_nextprop (node, p, buffer)) - first_prop++; - else { - char *q; - for (p = prom_firstprop (node, buffer); - p && p != (char *)-1 && *p; - p = prom_nextprop (node, p, buffer)) { - if (aliases_nodes == ALIASES_NNODES) - break; - for (i = 0; i < aliases_nodes; i++) - if (!strcmp (p, alias_names [i])) - break; - if (i < aliases_nodes) - continue; - q = kmalloc (strlen (p) + 1, GFP_KERNEL); - if (!q) - return 0xffff; - strcpy (q, p); - alias_names [aliases_nodes++] = q; - } - first_prop += ALIASES_NNODES; - } - node = prom_getchild (node); - if (node) { - parent = get_nodes (n, node); - if (parent == 0xffff) - return 0xffff; - nodes[n].child = parent; - while ((node = prom_getsibling (node)) != 0) { - i = get_nodes (n, node); - if (i == 0xffff) - return 0xffff; - nodes[parent].next = i; - parent = i; - } - } - return n; + kmem_cache_free(op_inode_cachep, OP_I(inode)); } static void openprom_read_inode(struct inode * inode) @@ -1031,6 +365,8 @@ static int openprom_remount(struct super_block *sb, int *flags, char *data) } static struct super_operations openprom_sops = { + .alloc_inode = openprom_alloc_inode, + .destroy_inode = openprom_destroy_inode, .read_inode = openprom_read_inode, .statfs = simple_statfs, .remount_fs = openprom_remount, @@ -1038,7 +374,8 @@ static struct super_operations openprom_sops = { static int openprom_fill_super(struct super_block *s, void *data, int silent) { - struct inode * root_inode; + struct inode *root_inode; + struct op_inode_info *oi; s->s_flags |= MS_NOATIME; s->s_blocksize = 1024; @@ -1049,6 +386,11 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent) root_inode = iget(s, OPENPROM_ROOT_INO); if (!root_inode) goto out_no_root; + + oi = OP_I(root_inode); + oi->type = op_inode_node; + oi->u.node = of_find_node_by_path("/"); + s->s_root = d_alloc_root(root_inode); if (!s->s_root) goto out_no_root; @@ -1073,29 +415,39 @@ static struct file_system_type openprom_fs_type = { .kill_sb = kill_anon_super, }; +static void op_inode_init_once(void *data, kmem_cache_t * cachep, unsigned long flags) +{ + struct op_inode_info *oi = (struct op_inode_info *) data; + + if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) == + SLAB_CTOR_CONSTRUCTOR) + inode_init_once(&oi->vfs_inode); +} + static int __init init_openprom_fs(void) { - nodes = (openpromfs_node *)__get_free_pages(GFP_KERNEL, 0); - if (!nodes) { - printk (KERN_WARNING "openpromfs: can't get free page\n"); - return -EIO; - } - if (get_nodes (0xffff, prom_root_node) == 0xffff) { - printk (KERN_WARNING "openpromfs: couldn't setup tree\n"); - return -EIO; - } - nodes[last_node].first_prop = first_prop; - return register_filesystem(&openprom_fs_type); + int err; + + op_inode_cachep = kmem_cache_create("op_inode_cache", + sizeof(struct op_inode_info), + 0, + (SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD), + op_inode_init_once, NULL); + if (!op_inode_cachep) + return -ENOMEM; + + err = register_filesystem(&openprom_fs_type); + if (err) + kmem_cache_destroy(op_inode_cachep); + + return err; } static void __exit exit_openprom_fs(void) { - int i; unregister_filesystem(&openprom_fs_type); - free_pages ((unsigned long)nodes, alloced); - for (i = 0; i < aliases_nodes; i++) - kfree (alias_names [i]); - nodes = NULL; + kmem_cache_destroy(op_inode_cachep); } module_init(init_openprom_fs) 
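For context, and not part of the patch above: the new openpromfs property files emit exactly one record through seq_file, which is why property_start() only admits position 0. The same single-record behaviour could also be expressed with the kernel's single_open()/single_release() helpers; the sketch below assumes property_show() and OP_I() as defined in the patch, and the names property_open_single and openpromfs_prop_single_ops are hypothetical.

/* Sketch only, under the assumptions stated above.  single_open() stores
 * its third argument in seq_file->private and runs the show callback once,
 * matching the one-record property files in the patch. */
static int property_open_single(struct inode *inode, struct file *file)
{
	return single_open(file, property_show, OP_I(inode)->u.prop);
}

static const struct file_operations openpromfs_prop_single_ops = {
	.open		= property_open_single,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= single_release,
};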
diff --git a/fs/pnode.c b/fs/pnode.c index 37b568ed0e0..da42ee61c1d 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -53,8 +53,7 @@ static int do_make_slave(struct vfsmount *mnt) if (master) { list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) slave_mnt->mnt_master = master; - list_del(&mnt->mnt_slave); - list_add(&mnt->mnt_slave, &master->mnt_slave_list); + list_move(&mnt->mnt_slave, &master->mnt_slave_list); list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); INIT_LIST_HEAD(&mnt->mnt_slave_list); } else { @@ -283,10 +282,8 @@ static void __propagate_umount(struct vfsmount *mnt) * umount the child only if the child has no * other children */ - if (child && list_empty(&child->mnt_mounts)) { - list_del(&child->mnt_hash); - list_add_tail(&child->mnt_hash, &mnt->mnt_hash); - } + if (child && list_empty(&child->mnt_mounts)) + list_move_tail(&child->mnt_hash, &mnt->mnt_hash); } } diff --git a/fs/proc/base.c b/fs/proc/base.c index 6afff725a8c..6ba7785319d 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -74,6 +74,16 @@ #include <linux/poll.h> #include "internal.h" +/* NOTE: + * Implementing inode permission operations in /proc is almost + * certainly an error. Permission checks need to happen during + * each system call not at open time. The reason is that most of + * what we wish to check for permissions in /proc varies at runtime. + * + * The classic example of a problem is opening file descriptors + * in /proc for a task before it execs a suid executable. + */ + /* * For hysterical raisins we keep the same inumbers as in the old procfs. * Feel free to change the macro below - just keep the range distinct from @@ -121,6 +131,8 @@ enum pid_directory_inos { PROC_TGID_ATTR_PREV, PROC_TGID_ATTR_EXEC, PROC_TGID_ATTR_FSCREATE, + PROC_TGID_ATTR_KEYCREATE, + PROC_TGID_ATTR_SOCKCREATE, #endif #ifdef CONFIG_AUDITSYSCALL PROC_TGID_LOGINUID, @@ -162,6 +174,8 @@ enum pid_directory_inos { PROC_TID_ATTR_PREV, PROC_TID_ATTR_EXEC, PROC_TID_ATTR_FSCREATE, + PROC_TID_ATTR_KEYCREATE, + PROC_TID_ATTR_SOCKCREATE, #endif #ifdef CONFIG_AUDITSYSCALL PROC_TID_LOGINUID, @@ -173,6 +187,9 @@ enum pid_directory_inos { PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */ }; +/* Worst case buffer size needed for holding an integer. 
*/ +#define PROC_NUMBUF 10 + struct pid_entry { int type; int len; @@ -275,6 +292,8 @@ static struct pid_entry tgid_attr_stuff[] = { E(PROC_TGID_ATTR_PREV, "prev", S_IFREG|S_IRUGO), E(PROC_TGID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO), E(PROC_TGID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TGID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TGID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO), {0,0,NULL,0} }; static struct pid_entry tid_attr_stuff[] = { @@ -282,6 +301,8 @@ static struct pid_entry tid_attr_stuff[] = { E(PROC_TID_ATTR_PREV, "prev", S_IFREG|S_IRUGO), E(PROC_TID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO), E(PROC_TID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO), + E(PROC_TID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO), {0,0,NULL,0} }; #endif @@ -290,12 +311,15 @@ static struct pid_entry tid_attr_stuff[] = { static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct task_struct *task = proc_task(inode); - struct files_struct *files; + struct task_struct *task = get_proc_task(inode); + struct files_struct *files = NULL; struct file *file; - int fd = proc_type(inode) - PROC_TID_FD_DIR; + int fd = proc_fd(inode); - files = get_files_struct(task); + if (task) { + files = get_files_struct(task); + put_task_struct(task); + } if (files) { /* * We are not taking a ref to the file structure, so we must @@ -327,29 +351,33 @@ static struct fs_struct *get_fs_struct(struct task_struct *task) return fs; } -static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) +static int get_nr_threads(struct task_struct *tsk) { - struct fs_struct *fs = get_fs_struct(proc_task(inode)); - int result = -ENOENT; - if (fs) { - read_lock(&fs->lock); - *mnt = mntget(fs->pwdmnt); - *dentry = dget(fs->pwd); - read_unlock(&fs->lock); - result = 0; - put_fs_struct(fs); + /* Must be called with the rcu_read_lock held */ + unsigned long flags; + int count = 0; + + if (lock_task_sighand(tsk, &flags)) { + count = atomic_read(&tsk->signal->count); + unlock_task_sighand(tsk, &flags); } - return result; + return count; } -static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) +static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct fs_struct *fs = get_fs_struct(proc_task(inode)); + struct task_struct *task = get_proc_task(inode); + struct fs_struct *fs = NULL; int result = -ENOENT; + + if (task) { + fs = get_fs_struct(task); + put_task_struct(task); + } if (fs) { read_lock(&fs->lock); - *mnt = mntget(fs->rootmnt); - *dentry = dget(fs->root); + *mnt = mntget(fs->pwdmnt); + *dentry = dget(fs->pwd); read_unlock(&fs->lock); result = 0; put_fs_struct(fs); @@ -357,42 +385,16 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf return result; } - -/* Same as proc_root_link, but this addionally tries to get fs from other - * threads in the group */ -static int proc_task_root_link(struct inode *inode, struct dentry **dentry, - struct vfsmount **mnt) +static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt) { - struct fs_struct *fs; + struct task_struct *task = get_proc_task(inode); + struct fs_struct *fs = NULL; int result = -ENOENT; - struct task_struct *leader = proc_task(inode); - task_lock(leader); - fs = leader->fs; - if (fs) { - atomic_inc(&fs->count); - 
task_unlock(leader); - } else { - /* Try to get fs from other threads */ - task_unlock(leader); - read_lock(&tasklist_lock); - if (pid_alive(leader)) { - struct task_struct *task = leader; - - while ((task = next_thread(task)) != leader) { - task_lock(task); - fs = task->fs; - if (fs) { - atomic_inc(&fs->count); - task_unlock(task); - break; - } - task_unlock(task); - } - } - read_unlock(&tasklist_lock); + if (task) { + fs = get_fs_struct(task); + put_task_struct(task); } - if (fs) { read_lock(&fs->lock); *mnt = mntget(fs->rootmnt); @@ -404,7 +406,6 @@ static int proc_task_root_link(struct inode *inode, struct dentry **dentry, return result; } - #define MAY_PTRACE(task) \ (task == current || \ (task->parent == current && \ @@ -535,142 +536,22 @@ static int proc_oom_score(struct task_struct *task, char *buffer) /************************************************************************/ /* permission checks */ - -/* If the process being read is separated by chroot from the reading process, - * don't let the reader access the threads. - * - * note: this does dput(root) and mntput(vfsmnt) on exit. - */ -static int proc_check_chroot(struct dentry *root, struct vfsmount *vfsmnt) -{ - struct dentry *de, *base; - struct vfsmount *our_vfsmnt, *mnt; - int res = 0; - - read_lock(&current->fs->lock); - our_vfsmnt = mntget(current->fs->rootmnt); - base = dget(current->fs->root); - read_unlock(&current->fs->lock); - - spin_lock(&vfsmount_lock); - de = root; - mnt = vfsmnt; - - while (mnt != our_vfsmnt) { - if (mnt == mnt->mnt_parent) - goto out; - de = mnt->mnt_mountpoint; - mnt = mnt->mnt_parent; - } - - if (!is_subdir(de, base)) - goto out; - spin_unlock(&vfsmount_lock); - -exit: - dput(base); - mntput(our_vfsmnt); - dput(root); - mntput(vfsmnt); - return res; -out: - spin_unlock(&vfsmount_lock); - res = -EACCES; - goto exit; -} - -static int proc_check_root(struct inode *inode) -{ - struct dentry *root; - struct vfsmount *vfsmnt; - - if (proc_root_link(inode, &root, &vfsmnt)) /* Ewww... 
*/ - return -ENOENT; - return proc_check_chroot(root, vfsmnt); -} - -static int proc_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - if (generic_permission(inode, mask, NULL) != 0) - return -EACCES; - return proc_check_root(inode); -} - -static int proc_task_permission(struct inode *inode, int mask, struct nameidata *nd) -{ - struct dentry *root; - struct vfsmount *vfsmnt; - - if (generic_permission(inode, mask, NULL) != 0) - return -EACCES; - - if (proc_task_root_link(inode, &root, &vfsmnt)) - return -ENOENT; - - return proc_check_chroot(root, vfsmnt); -} - -extern struct seq_operations proc_pid_maps_op; -static int maps_open(struct inode *inode, struct file *file) -{ - struct task_struct *task = proc_task(inode); - int ret = seq_open(file, &proc_pid_maps_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = task; - } - return ret; -} - -static struct file_operations proc_maps_operations = { - .open = maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - -#ifdef CONFIG_NUMA -extern struct seq_operations proc_pid_numa_maps_op; -static int numa_maps_open(struct inode *inode, struct file *file) -{ - struct task_struct *task = proc_task(inode); - int ret = seq_open(file, &proc_pid_numa_maps_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = task; - } - return ret; -} - -static struct file_operations proc_numa_maps_operations = { - .open = numa_maps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - -#ifdef CONFIG_MMU -extern struct seq_operations proc_pid_smaps_op; -static int smaps_open(struct inode *inode, struct file *file) +static int proc_fd_access_allowed(struct inode *inode) { - struct task_struct *task = proc_task(inode); - int ret = seq_open(file, &proc_pid_smaps_op); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = task; + struct task_struct *task; + int allowed = 0; + /* Allow access to a task's file descriptors if it is us or we + * may use ptrace attach to the process and find out that + * information. 
+ */ + task = get_proc_task(inode); + if (task) { + allowed = ptrace_may_attach(task); + put_task_struct(task); } - return ret; + return allowed; } -static struct file_operations proc_smaps_operations = { - .open = smaps_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; -#endif - extern struct seq_operations mounts_op; struct proc_mounts { struct seq_file m; @@ -679,16 +560,19 @@ struct proc_mounts { static int mounts_open(struct inode *inode, struct file *file) { - struct task_struct *task = proc_task(inode); - struct namespace *namespace; + struct task_struct *task = get_proc_task(inode); + struct namespace *namespace = NULL; struct proc_mounts *p; int ret = -EINVAL; - task_lock(task); - namespace = task->namespace; - if (namespace) - get_namespace(namespace); - task_unlock(task); + if (task) { + task_lock(task); + namespace = task->namespace; + if (namespace) + get_namespace(namespace); + task_unlock(task); + put_task_struct(task); + } if (namespace) { ret = -ENOMEM; @@ -745,17 +629,21 @@ static struct file_operations proc_mounts_operations = { extern struct seq_operations mountstats_op; static int mountstats_open(struct inode *inode, struct file *file) { - struct task_struct *task = proc_task(inode); int ret = seq_open(file, &mountstats_op); if (!ret) { struct seq_file *m = file->private_data; - struct namespace *namespace; - task_lock(task); - namespace = task->namespace; - if (namespace) - get_namespace(namespace); - task_unlock(task); + struct namespace *namespace = NULL; + struct task_struct *task = get_proc_task(inode); + + if (task) { + task_lock(task); + namespace = task->namespace; + if (namespace) + get_namespace(namespace); + task_unlock(task); + put_task_struct(task); + } if (namespace) m->private = namespace; @@ -782,18 +670,27 @@ static ssize_t proc_info_read(struct file * file, char __user * buf, struct inode * inode = file->f_dentry->d_inode; unsigned long page; ssize_t length; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PROC_BLOCK_SIZE) count = PROC_BLOCK_SIZE; + + length = -ENOMEM; if (!(page = __get_free_page(GFP_KERNEL))) - return -ENOMEM; + goto out; length = PROC_I(inode)->op.proc_read(task, (char*)page); if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); free_page(page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -810,12 +707,15 @@ static int mem_open(struct inode* inode, struct file* file) static ssize_t mem_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); char *page; unsigned long src = *ppos; int ret = -ESRCH; struct mm_struct *mm; + if (!task) + goto out_no_task; + if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) goto out; @@ -865,6 +765,8 @@ out_put: out_free: free_page((unsigned long) page); out: + put_task_struct(task); +out_no_task: return ret; } @@ -877,15 +779,20 @@ static ssize_t mem_write(struct file * file, const char * buf, { int copied = 0; char *page; - struct task_struct *task = proc_task(file->f_dentry->d_inode); + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); unsigned long dst = *ppos; + copied = -ESRCH; + if (!task) + goto out_no_task; + if (!MAY_PTRACE(task) || !ptrace_may_attach(task)) - return -ESRCH; + goto out; + copied = -ENOMEM; page = (char 
*)__get_free_page(GFP_USER); if (!page) - return -ENOMEM; + goto out; while (count > 0) { int this_len, retval; @@ -908,6 +815,9 @@ static ssize_t mem_write(struct file * file, const char * buf, } *ppos = dst; free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: return copied; } #endif @@ -938,13 +848,18 @@ static struct file_operations proc_mem_operations = { static ssize_t oom_adjust_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); - char buffer[8]; + struct task_struct *task = get_proc_task(file->f_dentry->d_inode); + char buffer[PROC_NUMBUF]; size_t len; - int oom_adjust = task->oomkilladj; + int oom_adjust; loff_t __ppos = *ppos; - len = sprintf(buffer, "%i\n", oom_adjust); + if (!task) + return -ESRCH; + oom_adjust = task->oomkilladj; + put_task_struct(task); + + len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); if (__ppos >= len) return 0; if (count > len-__ppos) @@ -958,15 +873,15 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, static ssize_t oom_adjust_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *task = proc_task(file->f_dentry->d_inode); - char buffer[8], *end; + struct task_struct *task; + char buffer[PROC_NUMBUF], *end; int oom_adjust; if (!capable(CAP_SYS_RESOURCE)) return -EPERM; - memset(buffer, 0, 8); - if (count > 6) - count = 6; + memset(buffer, 0, sizeof(buffer)); + if (count > sizeof(buffer) - 1) + count = sizeof(buffer) - 1; if (copy_from_user(buffer, buf, count)) return -EFAULT; oom_adjust = simple_strtol(buffer, &end, 0); @@ -974,7 +889,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, return -EINVAL; if (*end == '\n') end++; + task = get_proc_task(file->f_dentry->d_inode); + if (!task) + return -ESRCH; task->oomkilladj = oom_adjust; + put_task_struct(task); if (end - buffer == 0) return -EIO; return end - buffer; @@ -985,22 +904,21 @@ static struct file_operations proc_oom_adjust_operations = { .write = oom_adjust_write, }; -static struct inode_operations proc_mem_inode_operations = { - .permission = proc_permission, -}; - #ifdef CONFIG_AUDITSYSCALL #define TMPBUFLEN 21 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { struct inode * inode = file->f_dentry->d_inode; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); ssize_t length; char tmpbuf[TMPBUFLEN]; + if (!task) + return -ESRCH; length = scnprintf(tmpbuf, TMPBUFLEN, "%u", audit_get_loginuid(task->audit_context)); + put_task_struct(task); return simple_read_from_buffer(buf, count, ppos, tmpbuf, length); } @@ -1010,13 +928,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, struct inode * inode = file->f_dentry->d_inode; char *page, *tmp; ssize_t length; - struct task_struct *task = proc_task(inode); uid_t loginuid; if (!capable(CAP_AUDIT_CONTROL)) return -EPERM; - if (current != task) + if (current != pid_task(proc_pid(inode), PIDTYPE_PID)) return -EPERM; if (count >= PAGE_SIZE) @@ -1040,7 +957,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf, goto out_free_page; } - length = audit_set_loginuid(task, loginuid); + length = audit_set_loginuid(current, loginuid); if (likely(length == 0)) length = count; @@ -1059,13 +976,16 @@ static struct file_operations proc_loginuid_operations = { static ssize_t seccomp_read(struct 
file *file, char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *tsk = proc_task(file->f_dentry->d_inode); + struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode); char __buf[20]; loff_t __ppos = *ppos; size_t len; + if (!tsk) + return -ESRCH; /* no need to print the trailing zero, so use only len */ len = sprintf(__buf, "%u\n", tsk->seccomp.mode); + put_task_struct(tsk); if (__ppos >= len) return 0; if (count > len - __ppos) @@ -1079,29 +999,43 @@ static ssize_t seccomp_read(struct file *file, char __user *buf, static ssize_t seccomp_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - struct task_struct *tsk = proc_task(file->f_dentry->d_inode); + struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode); char __buf[20], *end; unsigned int seccomp_mode; + ssize_t result; + + result = -ESRCH; + if (!tsk) + goto out_no_task; /* can set it only once to be even more secure */ + result = -EPERM; if (unlikely(tsk->seccomp.mode)) - return -EPERM; + goto out; + result = -EFAULT; memset(__buf, 0, sizeof(__buf)); count = min(count, sizeof(__buf) - 1); if (copy_from_user(__buf, buf, count)) - return -EFAULT; + goto out; + seccomp_mode = simple_strtoul(__buf, &end, 0); if (*end == '\n') end++; + result = -EINVAL; if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) { tsk->seccomp.mode = seccomp_mode; set_tsk_thread_flag(tsk, TIF_SECCOMP); } else - return -EINVAL; + goto out; + result = -EIO; if (unlikely(!(end - __buf))) - return -EIO; - return end - __buf; + goto out; + result = end - __buf; +out: + put_task_struct(tsk); +out_no_task: + return result; } static struct file_operations proc_seccomp_operations = { @@ -1118,10 +1052,8 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) /* We don't need a base pointer in the /proc filesystem */ path_release(nd); - if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE)) - goto out; - error = proc_check_root(inode); - if (error) + /* Are we allowed to snoop on the tasks file descriptors? */ + if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &nd->dentry, &nd->mnt); @@ -1163,12 +1095,8 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b struct dentry *de; struct vfsmount *mnt = NULL; - lock_kernel(); - - if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE)) - goto out; - error = proc_check_root(inode); - if (error) + /* Are we allowed to snoop on the tasks file descriptors? 
*/ + if (!proc_fd_access_allowed(inode)) goto out; error = PROC_I(inode)->op.proc_get_link(inode, &de, &mnt); @@ -1179,7 +1107,6 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b dput(de); mntput(mnt); out: - unlock_kernel(); return error; } @@ -1188,21 +1115,20 @@ static struct inode_operations proc_pid_link_inode_operations = { .follow_link = proc_pid_follow_link }; -#define NUMBUF 10 - static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) { - struct inode *inode = filp->f_dentry->d_inode; - struct task_struct *p = proc_task(inode); + struct dentry *dentry = filp->f_dentry; + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); unsigned int fd, tid, ino; int retval; - char buf[NUMBUF]; + char buf[PROC_NUMBUF]; struct files_struct * files; struct fdtable *fdt; retval = -ENOENT; - if (!pid_alive(p)) - goto out; + if (!p) + goto out_no_task; retval = 0; tid = p->pid; @@ -1213,7 +1139,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) goto out; filp->f_pos++; case 1: - ino = fake_ino(tid, PROC_TID_INO); + ino = parent_ino(dentry); if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) goto out; filp->f_pos++; @@ -1232,7 +1158,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) continue; rcu_read_unlock(); - j = NUMBUF; + j = PROC_NUMBUF; i = fd; do { j--; @@ -1241,7 +1167,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) } while (i); ino = fake_ino(tid, PROC_TID_FD_DIR + fd); - if (filldir(dirent, buf+j, NUMBUF-j, fd+2, ino, DT_LNK) < 0) { + if (filldir(dirent, buf+j, PROC_NUMBUF-j, fd+2, ino, DT_LNK) < 0) { rcu_read_lock(); break; } @@ -1251,6 +1177,8 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir) put_files_struct(files); } out: + put_task_struct(p); +out_no_task: return retval; } @@ -1262,16 +1190,18 @@ static int proc_pident_readdir(struct file *filp, int pid; struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; + struct task_struct *task = get_proc_task(inode); struct pid_entry *p; ino_t ino; int ret; ret = -ENOENT; - if (!pid_alive(proc_task(inode))) + if (!task) goto out; ret = 0; - pid = proc_task(inode)->pid; + pid = task->pid; + put_task_struct(task); i = filp->f_pos; switch (i) { case 0: @@ -1354,22 +1284,19 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st /* Common stuff */ ei = PROC_I(inode); - ei->task = NULL; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; inode->i_ino = fake_ino(task->pid, ino); - if (!pid_alive(task)) - goto out_unlock; - /* * grab the reference to task. */ - get_task_struct(task); - ei->task = task; - ei->type = ino; + ei->pid = get_pid(task->pids[PIDTYPE_PID].pid); + if (!ei->pid) + goto out_unlock; + inode->i_uid = 0; inode->i_gid = 0; - if (ino == PROC_TGID_INO || ino == PROC_TID_INO || task_dumpable(task)) { + if (task_dumpable(task)) { inode->i_uid = task->euid; inode->i_gid = task->egid; } @@ -1379,7 +1306,6 @@ out: return inode; out_unlock: - ei->pde = NULL; iput(inode); return NULL; } @@ -1393,13 +1319,21 @@ out_unlock: * * Rewrite the inode's ownerships here because the owning task may have * performed a setuid(), etc. + * + * Before the /proc/pid/status file was created the only way to read + * the effective uid of a /process was to stat /proc/pid. Reading + * /proc/pid/status is slow enough that procps and other packages + * kept stating /proc/pid. 
To keep the rules in /proc simple I have + * made this apply to all per process world readable and executable + * directories. */ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - struct task_struct *task = proc_task(inode); - if (pid_alive(task)) { - if (proc_type(inode) == PROC_TGID_INO || proc_type(inode) == PROC_TID_INO || task_dumpable(task)) { + struct task_struct *task = get_proc_task(inode); + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { inode->i_uid = task->euid; inode->i_gid = task->egid; } else { @@ -1407,59 +1341,75 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd) inode->i_gid = 0; } security_task_to_inode(task, inode); + put_task_struct(task); return 1; } d_drop(dentry); return 0; } +static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *task; + generic_fillattr(inode, stat); + + rcu_read_lock(); + stat->uid = 0; + stat->gid = 0; + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || + task_dumpable(task)) { + stat->uid = task->euid; + stat->gid = task->egid; + } + } + rcu_read_unlock(); + return 0; +} + static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - struct task_struct *task = proc_task(inode); - int fd = proc_type(inode) - PROC_TID_FD_DIR; + struct task_struct *task = get_proc_task(inode); + int fd = proc_fd(inode); struct files_struct *files; - files = get_files_struct(task); - if (files) { - rcu_read_lock(); - if (fcheck_files(files, fd)) { + if (task) { + files = get_files_struct(task); + if (files) { + rcu_read_lock(); + if (fcheck_files(files, fd)) { + rcu_read_unlock(); + put_files_struct(files); + if (task_dumpable(task)) { + inode->i_uid = task->euid; + inode->i_gid = task->egid; + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + security_task_to_inode(task, inode); + put_task_struct(task); + return 1; + } rcu_read_unlock(); put_files_struct(files); - if (task_dumpable(task)) { - inode->i_uid = task->euid; - inode->i_gid = task->egid; - } else { - inode->i_uid = 0; - inode->i_gid = 0; - } - security_task_to_inode(task, inode); - return 1; } - rcu_read_unlock(); - put_files_struct(files); + put_task_struct(task); } d_drop(dentry); return 0; } -static void pid_base_iput(struct dentry *dentry, struct inode *inode) -{ - struct task_struct *task = proc_task(inode); - spin_lock(&task->proc_lock); - if (task->proc_dentry == dentry) - task->proc_dentry = NULL; - spin_unlock(&task->proc_lock); - iput(inode); -} - static int pid_delete_dentry(struct dentry * dentry) { /* Is the task we represent dead? * If so, then don't put the dentry on the lru list, * kill it immediately. 
*/ - return !pid_alive(proc_task(dentry->d_inode)); + return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first; } static struct dentry_operations tid_fd_dentry_operations = @@ -1474,13 +1424,6 @@ static struct dentry_operations pid_dentry_operations = .d_delete = pid_delete_dentry, }; -static struct dentry_operations pid_base_dentry_operations = -{ - .d_revalidate = pid_revalidate, - .d_iput = pid_base_iput, - .d_delete = pid_delete_dentry, -}; - /* Lookups */ static unsigned name_to_int(struct dentry *dentry) @@ -1508,22 +1451,24 @@ out: /* SMP-safe */ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd) { - struct task_struct *task = proc_task(dir); + struct task_struct *task = get_proc_task(dir); unsigned fd = name_to_int(dentry); + struct dentry *result = ERR_PTR(-ENOENT); struct file * file; struct files_struct * files; struct inode *inode; struct proc_inode *ei; + if (!task) + goto out_no_task; if (fd == ~0U) goto out; - if (!pid_alive(task)) - goto out; inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd); if (!inode) goto out; ei = PROC_I(inode); + ei->fd = fd; files = get_files_struct(task); if (!files) goto out_unlock; @@ -1548,19 +1493,25 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, ei->op.proc_get_link = proc_fd_link; dentry->d_op = &tid_fd_dentry_operations; d_add(dentry, inode); - return NULL; + /* Close the race of the process dying before we return the dentry */ + if (tid_fd_revalidate(dentry, NULL)) + result = NULL; +out: + put_task_struct(task); +out_no_task: + return result; out_unlock2: spin_unlock(&files->file_lock); put_files_struct(files); out_unlock: iput(inode); -out: - return ERR_PTR(-ENOENT); + goto out; } static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir); static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd); +static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); static struct file_operations proc_fd_operations = { .read = generic_read_dir, @@ -1577,12 +1528,11 @@ static struct file_operations proc_task_operations = { */ static struct inode_operations proc_fd_inode_operations = { .lookup = proc_lookupfd, - .permission = proc_permission, }; static struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, - .permission = proc_task_permission, + .getattr = proc_task_getattr, }; #ifdef CONFIG_SECURITY @@ -1592,12 +1542,17 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, struct inode * inode = file->f_dentry->d_inode; unsigned long page; ssize_t length; - struct task_struct *task = proc_task(inode); + struct task_struct *task = get_proc_task(inode); + + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PAGE_SIZE) count = PAGE_SIZE; + length = -ENOMEM; if (!(page = __get_free_page(GFP_KERNEL))) - return -ENOMEM; + goto out; length = security_getprocattr(task, (char*)file->f_dentry->d_name.name, @@ -1605,6 +1560,9 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf, if (length >= 0) length = simple_read_from_buffer(buf, count, ppos, (char *)page, length); free_page(page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -1614,26 +1572,36 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, struct inode * inode = file->f_dentry->d_inode; char *page; ssize_t length; - struct task_struct *task = proc_task(inode); + 
struct task_struct *task = get_proc_task(inode); + length = -ESRCH; + if (!task) + goto out_no_task; if (count > PAGE_SIZE) count = PAGE_SIZE; - if (*ppos != 0) { - /* No partial writes. */ - return -EINVAL; - } + + /* No partial writes. */ + length = -EINVAL; + if (*ppos != 0) + goto out; + + length = -ENOMEM; page = (char*)__get_free_page(GFP_USER); if (!page) - return -ENOMEM; + goto out; + length = -EFAULT; if (copy_from_user(page, buf, count)) - goto out; + goto out_free; length = security_setprocattr(task, (char*)file->f_dentry->d_name.name, (void*)page, count); -out: +out_free: free_page((unsigned long) page); +out: + put_task_struct(task); +out_no_task: return length; } @@ -1648,24 +1616,22 @@ static struct file_operations proc_tgid_attr_operations; static struct inode_operations proc_tgid_attr_inode_operations; #endif -static int get_tid_list(int index, unsigned int *tids, struct inode *dir); - /* SMP-safe */ static struct dentry *proc_pident_lookup(struct inode *dir, struct dentry *dentry, struct pid_entry *ents) { struct inode *inode; - int error; - struct task_struct *task = proc_task(dir); + struct dentry *error; + struct task_struct *task = get_proc_task(dir); struct pid_entry *p; struct proc_inode *ei; - error = -ENOENT; + error = ERR_PTR(-ENOENT); inode = NULL; - if (!pid_alive(task)) - goto out; + if (!task) + goto out_no_task; for (p = ents; p->name; p++) { if (p->len != dentry->d_name.len) @@ -1676,7 +1642,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir, if (!p->name) goto out; - error = -EINVAL; + error = ERR_PTR(-EINVAL); inode = proc_pid_make_inode(dir->i_sb, task, p->type); if (!inode) goto out; @@ -1689,7 +1655,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir, */ switch(p->type) { case PROC_TGID_TASK: - inode->i_nlink = 2 + get_tid_list(2, NULL, dir); + inode->i_nlink = 2; inode->i_op = &proc_task_inode_operations; inode->i_fop = &proc_task_operations; break; @@ -1759,7 +1725,6 @@ static struct dentry *proc_pident_lookup(struct inode *dir, #endif case PROC_TID_MEM: case PROC_TGID_MEM: - inode->i_op = &proc_mem_inode_operations; inode->i_fop = &proc_mem_operations; break; #ifdef CONFIG_SECCOMP @@ -1801,6 +1766,10 @@ static struct dentry *proc_pident_lookup(struct inode *dir, case PROC_TGID_ATTR_EXEC: case PROC_TID_ATTR_FSCREATE: case PROC_TGID_ATTR_FSCREATE: + case PROC_TID_ATTR_KEYCREATE: + case PROC_TGID_ATTR_KEYCREATE: + case PROC_TID_ATTR_SOCKCREATE: + case PROC_TGID_ATTR_SOCKCREATE: inode->i_fop = &proc_pid_attr_operations; break; #endif @@ -1842,14 +1811,18 @@ static struct dentry *proc_pident_lookup(struct inode *dir, default: printk("procfs: impossible type (%d)",p->type); iput(inode); - return ERR_PTR(-EINVAL); + error = ERR_PTR(-EINVAL); + goto out; } dentry->d_op = &pid_dentry_operations; d_add(dentry, inode); - return NULL; - + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + error = NULL; out: - return ERR_PTR(error); + put_task_struct(task); +out_no_task: + return error; } static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){ @@ -1872,10 +1845,12 @@ static struct file_operations proc_tid_base_operations = { static struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, + .getattr = pid_getattr, }; static struct inode_operations proc_tid_base_inode_operations = { .lookup = proc_tid_base_lookup, + .getattr = pid_getattr, }; #ifdef CONFIG_SECURITY @@ -1917,10 +1892,12 @@ static 
struct dentry *proc_tid_attr_lookup(struct inode *dir, static struct inode_operations proc_tgid_attr_inode_operations = { .lookup = proc_tgid_attr_lookup, + .getattr = pid_getattr, }; static struct inode_operations proc_tid_attr_inode_operations = { .lookup = proc_tid_attr_lookup, + .getattr = pid_getattr, }; #endif @@ -1930,14 +1907,14 @@ static struct inode_operations proc_tid_attr_inode_operations = { static int proc_self_readlink(struct dentry *dentry, char __user *buffer, int buflen) { - char tmp[30]; + char tmp[PROC_NUMBUF]; sprintf(tmp, "%d", current->tgid); return vfs_readlink(dentry,buffer,buflen,tmp); } static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd) { - char tmp[30]; + char tmp[PROC_NUMBUF]; sprintf(tmp, "%d", current->tgid); return ERR_PTR(vfs_follow_link(nd,tmp)); } @@ -1948,67 +1925,80 @@ static struct inode_operations proc_self_inode_operations = { }; /** - * proc_pid_unhash - Unhash /proc/@pid entry from the dcache. - * @p: task that should be flushed. + * proc_flush_task - Remove dcache entries for @task from the /proc dcache. + * + * @task: task that should be flushed. + * + * Looks in the dcache for + * /proc/@pid + * /proc/@tgid/task/@pid + * if either directory is present, flushes it and all of its children + * from the dcache. * - * Drops the /proc/@pid dcache entry from the hash chains. + * It is safe and reasonable to cache /proc entries for a task until + * that task exits. After that they just clog up the dcache with + * useless entries, possibly causing useful dcache entries to be + * flushed instead. This routine is provided to flush those useless + * dcache entries at process exit time. * - * Dropping /proc/@pid entries and detach_pid must be synchroneous, - * otherwise e.g. /proc/@pid/exe might point to the wrong executable, - * if the pid value is immediately reused. This is enforced by - * - caller must acquire spin_lock(p->proc_lock) - * - must be called before detach_pid() - * - proc_pid_lookup acquires proc_lock, and checks that - * the target is not dead by looking at the attach count - * of PIDTYPE_PID. + * NOTE: This routine is just an optimization so it does not guarantee + * that no dcache entries will exist at process exit time; it + * just makes it very unlikely that any will persist. */ - -struct dentry *proc_pid_unhash(struct task_struct *p) +void proc_flush_task(struct task_struct *task) { - struct dentry *proc_dentry; + struct dentry *dentry, *leader, *dir; + char buf[PROC_NUMBUF]; + struct qstr name; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", task->pid); + dentry = d_hash_and_lookup(proc_mnt->mnt_root, &name); + if (dentry) { + shrink_dcache_parent(dentry); + d_drop(dentry); + dput(dentry); + } - proc_dentry = p->proc_dentry; - if (proc_dentry != NULL) { + if (thread_group_leader(task)) + goto out; - spin_lock(&dcache_lock); - spin_lock(&proc_dentry->d_lock); - if (!d_unhashed(proc_dentry)) { - dget_locked(proc_dentry); - __d_drop(proc_dentry); - spin_unlock(&proc_dentry->d_lock); - } else { - spin_unlock(&proc_dentry->d_lock); - proc_dentry = NULL; - } - spin_unlock(&dcache_lock); - } - return proc_dentry; -} + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", task->tgid); + leader = d_hash_and_lookup(proc_mnt->mnt_root, &name); + if (!leader) + goto out; -/** - * proc_pid_flush - recover memory used by stale /proc/@pid/x entries - * @proc_dentry: directoy to prune. - * - * Shrink the /proc directory that was used by the just killed thread.
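proc_flush_task() above repeats one small step per name: hash the pid string under a parent dentry with d_hash_and_lookup(), and if the entry is cached, shrink its children and unhash it. A rough sketch of that step, with a made-up helper name:

    static void prune_one_proc_dentry(struct dentry *parent, const char *buf, int len)
    {
            struct qstr name;
            struct dentry *dentry;

            name.name = buf;
            name.len = len;
            dentry = d_hash_and_lookup(parent, &name);
            if (dentry) {
                    shrink_dcache_parent(dentry);   /* flush the cached children first */
                    d_drop(dentry);                 /* then unhash the directory itself */
                    dput(dentry);
            }
    }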
- */ - -void proc_pid_flush(struct dentry *proc_dentry) -{ - might_sleep(); - if(proc_dentry != NULL) { - shrink_dcache_parent(proc_dentry); - dput(proc_dentry); + name.name = "task"; + name.len = strlen(name.name); + dir = d_hash_and_lookup(leader, &name); + if (!dir) + goto out_put_leader; + + name.name = buf; + name.len = snprintf(buf, sizeof(buf), "%d", task->pid); + dentry = d_hash_and_lookup(dir, &name); + if (dentry) { + shrink_dcache_parent(dentry); + d_drop(dentry); + dput(dentry); } + + dput(dir); +out_put_leader: + dput(leader); +out: + return; } /* SMP-safe */ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { + struct dentry *result = ERR_PTR(-ENOENT); struct task_struct *task; struct inode *inode; struct proc_inode *ei; unsigned tgid; - int died; if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)) { inode = new_inode(dir->i_sb); @@ -2029,21 +2019,18 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct if (tgid == ~0U) goto out; - read_lock(&tasklist_lock); + rcu_read_lock(); task = find_task_by_pid(tgid); if (task) get_task_struct(task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!task) goto out; inode = proc_pid_make_inode(dir->i_sb, task, PROC_TGID_INO); + if (!inode) + goto out_put_task; - - if (!inode) { - put_task_struct(task); - goto out; - } inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; @@ -2054,45 +2041,40 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct inode->i_nlink = 4; #endif - dentry->d_op = &pid_base_dentry_operations; + dentry->d_op = &pid_dentry_operations; - died = 0; d_add(dentry, inode); - spin_lock(&task->proc_lock); - task->proc_dentry = dentry; - if (!pid_alive(task)) { - dentry = proc_pid_unhash(task); - died = 1; - } - spin_unlock(&task->proc_lock); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + result = NULL; +out_put_task: put_task_struct(task); - if (died) { - proc_pid_flush(dentry); - goto out; - } - return NULL; out: - return ERR_PTR(-ENOENT); + return result; } /* SMP-safe */ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd) { + struct dentry *result = ERR_PTR(-ENOENT); struct task_struct *task; - struct task_struct *leader = proc_task(dir); + struct task_struct *leader = get_proc_task(dir); struct inode *inode; unsigned tid; + if (!leader) + goto out_no_task; + tid = name_to_int(dentry); if (tid == ~0U) goto out; - read_lock(&tasklist_lock); + rcu_read_lock(); task = find_task_by_pid(tid); if (task) get_task_struct(task); - read_unlock(&tasklist_lock); + rcu_read_unlock(); if (!task) goto out; if (leader->tgid != task->tgid) @@ -2113,101 +2095,95 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry inode->i_nlink = 3; #endif - dentry->d_op = &pid_base_dentry_operations; + dentry->d_op = &pid_dentry_operations; d_add(dentry, inode); + /* Close the race of the process dying before we return the dentry */ + if (pid_revalidate(dentry, NULL)) + result = NULL; - put_task_struct(task); - return NULL; out_drop_task: put_task_struct(task); out: - return ERR_PTR(-ENOENT); + put_task_struct(leader); +out_no_task: + return result; } -#define PROC_NUMBUF 10 -#define PROC_MAXPIDS 20 - /* - * Get a few tgid's to return for filldir - we need to hold the - * tasklist lock while doing this, 
and we must release it before - * we actually do the filldir itself, so we use a temp buffer.. + * Find the first tgid to return to user space. + * + * Usually this is just whatever follows &init_task, but if the users + * buffer was too small to hold the full list or there was a seek into + * the middle of the directory we have more work to do. + * + * In the case of a short read we start with find_task_by_pid. + * + * In the case of a seek we start with &init_task and walk nr + * threads past it. */ -static int get_tgid_list(int index, unsigned long version, unsigned int *tgids) -{ - struct task_struct *p; - int nr_tgids = 0; - - index--; - read_lock(&tasklist_lock); - p = NULL; - if (version) { - p = find_task_by_pid(version); - if (p && !thread_group_leader(p)) - p = NULL; +static struct task_struct *first_tgid(int tgid, unsigned int nr) +{ + struct task_struct *pos; + rcu_read_lock(); + if (tgid && nr) { + pos = find_task_by_pid(tgid); + if (pos && thread_group_leader(pos)) + goto found; } + /* If nr exceeds the number of processes get out quickly */ + pos = NULL; + if (nr && nr >= nr_processes()) + goto done; - if (p) - index = 0; - else - p = next_task(&init_task); - - for ( ; p != &init_task; p = next_task(p)) { - int tgid = p->pid; - if (!pid_alive(p)) - continue; - if (--index >= 0) - continue; - tgids[nr_tgids] = tgid; - nr_tgids++; - if (nr_tgids >= PROC_MAXPIDS) - break; + /* If we haven't found our starting place yet start with + * the init_task and walk nr tasks forward. + */ + for (pos = next_task(&init_task); nr > 0; --nr) { + pos = next_task(pos); + if (pos == &init_task) { + pos = NULL; + goto done; + } } - read_unlock(&tasklist_lock); - return nr_tgids; +found: + get_task_struct(pos); +done: + rcu_read_unlock(); + return pos; } /* - * Get a few tid's to return for filldir - we need to hold the - * tasklist lock while doing this, and we must release it before - * we actually do the filldir itself, so we use a temp buffer.. + * Find the next task in the task list. + * Return NULL if we loop or there is any error. + * + * The reference to the input task_struct is released. */ -static int get_tid_list(int index, unsigned int *tids, struct inode *dir) -{ - struct task_struct *leader_task = proc_task(dir); - struct task_struct *task = leader_task; - int nr_tids = 0; - - index -= 2; - read_lock(&tasklist_lock); - /* - * The starting point task (leader_task) might be an already - * unlinked task, which cannot be used to access the task-list - * via next_thread(). 
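first_tgid() and next_tgid() form a simple hand-off: each call to next_tgid() drops the reference it was given and returns the following process with a fresh reference, or NULL once the list wraps back to init_task. A rough usage sketch, assuming a hypothetical debug walker in the same file (both helpers are static):

    static void walk_all_tgids(void)
    {
            struct task_struct *task;

            /* 0, 0: no cached restart tgid, no seek offset */
            for (task = first_tgid(0, 0); task; task = next_tgid(task))
                    printk(KERN_DEBUG "tgid %d\n", task->pid);
            /* nothing to put here: next_tgid() released every reference,
             * including the one on the last task when it returned NULL */
    }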
- */ - if (pid_alive(task)) do { - int tid = task->pid; - - if (--index >= 0) - continue; - if (tids != NULL) - tids[nr_tids] = tid; - nr_tids++; - if (nr_tids >= PROC_MAXPIDS) - break; - } while ((task = next_thread(task)) != leader_task); - read_unlock(&tasklist_lock); - return nr_tids; +static struct task_struct *next_tgid(struct task_struct *start) +{ + struct task_struct *pos; + rcu_read_lock(); + pos = start; + if (pid_alive(start)) + pos = next_task(start); + if (pid_alive(pos) && (pos != &init_task)) { + get_task_struct(pos); + goto done; + } + pos = NULL; +done: + rcu_read_unlock(); + put_task_struct(start); + return pos; } /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int tgid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY; - unsigned int nr_tgids, i; - int next_tgid; + struct task_struct *task; + int tgid; if (!nr) { ino_t ino = fake_ino(0,PROC_TGID_INO); @@ -2216,63 +2192,116 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) filp->f_pos++; nr++; } + nr -= 1; /* f_version caches the tgid value that the last readdir call couldn't * return. lseek aka telldir automagically resets f_version to 0. */ - next_tgid = filp->f_version; + tgid = filp->f_version; filp->f_version = 0; - for (;;) { - nr_tgids = get_tgid_list(nr, next_tgid, tgid_array); - if (!nr_tgids) { - /* no more entries ! */ + for (task = first_tgid(tgid, nr); + task; + task = next_tgid(task), filp->f_pos++) { + int len; + ino_t ino; + tgid = task->pid; + len = snprintf(buf, sizeof(buf), "%d", tgid); + ino = fake_ino(tgid, PROC_TGID_INO); + if (filldir(dirent, buf, len, filp->f_pos, ino, DT_DIR) < 0) { + /* returning this tgid failed, save it as the first + * pid for the next readdir call */ + filp->f_version = tgid; + put_task_struct(task); break; } - next_tgid = 0; + } + return 0; +} - /* do not use the last found pid, reserve it for next_tgid */ - if (nr_tgids == PROC_MAXPIDS) { - nr_tgids--; - next_tgid = tgid_array[nr_tgids]; - } +/* + * Find the first tid of a thread group to return to user space. + * + * Usually this is just the thread group leader, but if the user's + * buffer was too small or there was a seek into the middle of the + * directory we have more work to do. + * + * In the case of a short read we start with find_task_by_pid. + * + * In the case of a seek we start with the leader and walk nr + * threads past it. + */ +static struct task_struct *first_tid(struct task_struct *leader, + int tid, int nr) +{ + struct task_struct *pos; - for (i=0;i<nr_tgids;i++) { - int tgid = tgid_array[i]; - ino_t ino = fake_ino(tgid,PROC_TGID_INO); - unsigned long j = PROC_NUMBUF; + rcu_read_lock(); + /* Attempt to start with the pid of a thread */ + if (tid && (nr > 0)) { + pos = find_task_by_pid(tid); + if (pos && (pos->group_leader == leader)) + goto found; } - do - buf[--j] = '0' + (tgid % 10); - while ((tgid /= 10) != 0); + /* If nr exceeds the number of threads there is nothing to do */ + pos = NULL; + if (nr && nr >= get_nr_threads(leader)) + goto out; - if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) { - /* returning this tgid failed, save it as the first - * pid for the next readir call */ - filp->f_version = tgid_array[i]; - goto out; - } - filp->f_pos++; - nr++; + /* If we haven't found our starting place yet start + * with the leader and walk nr threads forward.
+ */ + for (pos = leader; nr > 0; --nr) { + pos = next_thread(pos); + if (pos == leader) { + pos = NULL; + goto out; } } +found: + get_task_struct(pos); out: - return 0; + rcu_read_unlock(); + return pos; +} + +/* + * Find the next thread in the thread list. + * Return NULL if there is an error or no next thread. + * + * The reference to the input task_struct is released. + */ +static struct task_struct *next_tid(struct task_struct *start) +{ + struct task_struct *pos = NULL; + rcu_read_lock(); + if (pid_alive(start)) { + pos = next_thread(start); + if (thread_group_leader(pos)) + pos = NULL; + else + get_task_struct(pos); + } + rcu_read_unlock(); + put_task_struct(start); + return pos; } /* for the /proc/TGID/task/ directories */ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir) { - unsigned int tid_array[PROC_MAXPIDS]; char buf[PROC_NUMBUF]; - unsigned int nr_tids, i; struct dentry *dentry = filp->f_dentry; struct inode *inode = dentry->d_inode; + struct task_struct *leader = get_proc_task(inode); + struct task_struct *task; int retval = -ENOENT; ino_t ino; + int tid; unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */ - if (!pid_alive(proc_task(inode))) - goto out; + if (!leader) + goto out_no_task; retval = 0; switch (pos) { @@ -2290,24 +2319,45 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi /* fall through */ } - nr_tids = get_tid_list(pos, tid_array, inode); - inode->i_nlink = pos + nr_tids; - - for (i = 0; i < nr_tids; i++) { - unsigned long j = PROC_NUMBUF; - int tid = tid_array[i]; - - ino = fake_ino(tid,PROC_TID_INO); - - do - buf[--j] = '0' + (tid % 10); - while ((tid /= 10) != 0); - - if (filldir(dirent, buf+j, PROC_NUMBUF-j, pos, ino, DT_DIR) < 0) + /* f_version caches the tgid value that the last readdir call couldn't + * return. lseek aka telldir automagically resets f_version to 0. 
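first_tid() and next_tid() follow the same reference hand-off for threads, so the only case where the caller must drop a reference itself is when it abandons the walk early; that is exactly what the filldir failure path below does with put_task_struct(). An illustrative early-exit walker, assuming the caller already holds a reference on the leader (helper name and stop condition are invented for this sketch):

    static void walk_tids_until(struct task_struct *leader, int cutoff)
    {
            struct task_struct *task;

            for (task = first_tid(leader, 0, 0); task; task = next_tid(task)) {
                    if (task->pid > cutoff) {
                            /* breaking out early: we still own this reference */
                            put_task_struct(task);
                            break;
                    }
            }
    }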
+ */ + tid = filp->f_version; + filp->f_version = 0; + for (task = first_tid(leader, tid, pos - 2); + task; + task = next_tid(task), pos++) { + int len; + tid = task->pid; + len = snprintf(buf, sizeof(buf), "%d", tid); + ino = fake_ino(tid, PROC_TID_INO); + if (filldir(dirent, buf, len, pos, ino, DT_DIR) < 0) { + /* returning this tid failed, save it as the first + * pid for the next readdir call */ + filp->f_version = tid; + put_task_struct(task); break; - pos++; + } } out: filp->f_pos = pos; + put_task_struct(leader); +out_no_task: return retval; } + +static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + struct task_struct *p = get_proc_task(inode); + generic_fillattr(inode, stat); + + if (p) { + rcu_read_lock(); + stat->nlink += get_nr_threads(p); + rcu_read_unlock(); + put_task_struct(p); + } + + return 0; +} diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 722b9c46311..6dcef089e18 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -58,14 +58,11 @@ static void de_put(struct proc_dir_entry *de) static void proc_delete_inode(struct inode *inode) { struct proc_dir_entry *de; - struct task_struct *tsk; truncate_inode_pages(&inode->i_data, 0); - /* Let go of any associated process */ - tsk = PROC_I(inode)->task; - if (tsk) - put_task_struct(tsk); + /* Stop tracking associated processes */ + put_pid(PROC_I(inode)->pid); /* Let go of any associated proc directory entry */ de = PROC_I(inode)->pde; @@ -94,8 +91,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb) ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL); if (!ei) return NULL; - ei->task = NULL; - ei->type = 0; + ei->pid = NULL; + ei->fd = 0; ei->op.proc_get_link = NULL; ei->pde = NULL; inode = &ei->vfs_inode; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 0502f17b860..146a434ba94 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -37,16 +37,26 @@ extern int proc_tgid_stat(struct task_struct *, char *); extern int proc_pid_status(struct task_struct *, char *); extern int proc_pid_statm(struct task_struct *, char *); +extern struct file_operations proc_maps_operations; +extern struct file_operations proc_numa_maps_operations; +extern struct file_operations proc_smaps_operations; + + void free_proc_entry(struct proc_dir_entry *de); int proc_init_inodecache(void); -static inline struct task_struct *proc_task(struct inode *inode) +static inline struct pid *proc_pid(struct inode *inode) +{ + return PROC_I(inode)->pid; +} + +static inline struct task_struct *get_proc_task(struct inode *inode) { - return PROC_I(inode)->task; + return get_pid_task(proc_pid(inode), PIDTYPE_PID); } -static inline int proc_type(struct inode *inode) +static inline int proc_fd(struct inode *inode) { - return PROC_I(inode)->type; + return PROC_I(inode)->fd; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 91b7c15ab37..0137ec4c136 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -75,9 +75,13 @@ int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount * { struct vm_area_struct * vma; int result = -ENOENT; - struct task_struct *task = proc_task(inode); - struct mm_struct * mm = get_task_mm(task); + struct task_struct *task = get_proc_task(inode); + struct mm_struct * mm = NULL; + if (task) { + mm =
get_task_mm(task); + put_task_struct(task); + } if (!mm) goto out; down_read(&mm->mmap_sem); @@ -120,7 +124,8 @@ struct mem_size_stats static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; + struct task_struct *task = priv->task; struct vm_area_struct *vma = v; struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; @@ -295,12 +300,16 @@ static int show_smap(struct seq_file *m, void *v) static void *m_start(struct seq_file *m, loff_t *pos) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; unsigned long last_addr = m->version; struct mm_struct *mm; - struct vm_area_struct *vma, *tail_vma; + struct vm_area_struct *vma, *tail_vma = NULL; loff_t l = *pos; + /* Clear the per syscall fields in priv */ + priv->task = NULL; + priv->tail_vma = NULL; + /* * We remember last_addr rather than next_addr to hit with * mmap_cache most of the time. We have zero last_addr at @@ -311,11 +320,15 @@ static void *m_start(struct seq_file *m, loff_t *pos) if (last_addr == -1UL) return NULL; - mm = get_task_mm(task); + priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + if (!priv->task) + return NULL; + + mm = get_task_mm(priv->task); if (!mm) return NULL; - tail_vma = get_gate_vma(task); + priv->tail_vma = tail_vma = get_gate_vma(priv->task); down_read(&mm->mmap_sem); /* Start with last addr hint */ @@ -350,11 +363,9 @@ out: return tail_vma; } -static void m_stop(struct seq_file *m, void *v) +static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) { - struct task_struct *task = m->private; - struct vm_area_struct *vma = v; - if (vma && vma != get_gate_vma(task)) { + if (vma && vma != priv->tail_vma) { struct mm_struct *mm = vma->vm_mm; up_read(&mm->mmap_sem); mmput(mm); @@ -363,38 +374,103 @@ static void m_stop(struct seq_file *m, void *v) static void *m_next(struct seq_file *m, void *v, loff_t *pos) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; - struct vm_area_struct *tail_vma = get_gate_vma(task); + struct vm_area_struct *tail_vma = priv->tail_vma; (*pos)++; if (vma && (vma != tail_vma) && vma->vm_next) return vma->vm_next; - m_stop(m, v); + vma_stop(priv, vma); return (vma != tail_vma)? 
tail_vma: NULL; } -struct seq_operations proc_pid_maps_op = { +static void m_stop(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + + vma_stop(priv, vma); + if (priv->task) + put_task_struct(priv->task); +} + +static struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_map }; -struct seq_operations proc_pid_smaps_op = { +static struct seq_operations proc_pid_smaps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_smap }; +static int do_maps_open(struct inode *inode, struct file *file, + struct seq_operations *ops) +{ + struct proc_maps_private *priv; + int ret = -ENOMEM; + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (priv) { + priv->pid = proc_pid(inode); + ret = seq_open(file, ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = priv; + } else { + kfree(priv); + } + } + return ret; +} + +static int maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_maps_op); +} + +struct file_operations proc_maps_operations = { + .open = maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + #ifdef CONFIG_NUMA extern int show_numa_map(struct seq_file *m, void *v); -struct seq_operations proc_pid_numa_maps_op = { +static struct seq_operations proc_pid_numa_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_numa_map }; + +static int numa_maps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_numa_maps_op); +} + +struct file_operations proc_numa_maps_operations = { + .open = numa_maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; #endif + +static int smaps_open(struct inode *inode, struct file *file) +{ + return do_maps_open(inode, file, &proc_pid_smaps_op); +} + +struct file_operations proc_smaps_operations = { + .open = smaps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 8f68827ed10..af69f28277b 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -156,9 +156,28 @@ static void *m_next(struct seq_file *m, void *v, loff_t *pos) { return NULL; } -struct seq_operations proc_pid_maps_op = { +static struct seq_operations proc_pid_maps_op = { .start = m_start, .next = m_next, .stop = m_stop, .show = show_map }; + +static int maps_open(struct inode *inode, struct file *file) +{ + int ret; + ret = seq_open(file, &proc_pid_maps_op); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = NULL; + } + return ret; +} + +struct file_operations proc_maps_operations = { + .open = maps_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index cf6e1cf4035..752cea12e30 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -1560,12 +1560,6 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t return res; } -static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf, - size_t count, loff_t pos) -{ - return generic_file_aio_write(iocb, buf, count, pos); -} - const struct file_operations reiserfs_file_operations = { .read = generic_file_read, .write = reiserfs_file_write, @@ -1575,7 +1569,7 @@ const struct file_operations reiserfs_file_operations = { .fsync = reiserfs_sync_file, .sendfile = 
generic_file_sendfile, .aio_read = generic_file_aio_read, - .aio_write = reiserfs_aio_write, + .aio_write = generic_file_aio_write, .splice_read = generic_file_splice_read, .splice_write = generic_file_splice_write, }; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index 1b73529b809..49d1a53dbef 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -834,8 +834,7 @@ static int write_ordered_buffers(spinlock_t * lock, get_bh(bh); if (test_set_buffer_locked(bh)) { if (!buffer_dirty(bh)) { - list_del_init(&jh->list); - list_add(&jh->list, &tmp); + list_move(&jh->list, &tmp); goto loop_next; } spin_unlock(lock); @@ -855,8 +854,7 @@ static int write_ordered_buffers(spinlock_t * lock, ret = -EIO; } if (buffer_dirty(bh)) { - list_del_init(&jh->list); - list_add(&jh->list, &tmp); + list_move(&jh->list, &tmp); add_to_chunk(&chunk, bh, lock, write_ordered_chunk); } else { reiserfs_free_jh(bh); diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c index c71dd2760d3..c8e96195b96 100644 --- a/fs/smbfs/request.c +++ b/fs/smbfs/request.c @@ -400,8 +400,7 @@ static int smb_request_send_req(struct smb_request *req) if (!(req->rq_flags & SMB_REQ_TRANSMITTED)) goto out; - list_del_init(&req->rq_queue); - list_add_tail(&req->rq_queue, &server->recvq); + list_move_tail(&req->rq_queue, &server->recvq); result = 1; out: return result; @@ -435,8 +434,7 @@ int smb_request_send_server(struct smb_sb_info *server) result = smb_request_send_req(req); if (result < 0) { server->conn_error = result; - list_del_init(&req->rq_queue); - list_add(&req->rq_queue, &server->xmitq); + list_move(&req->rq_queue, &server->xmitq); result = -EIO; goto out; } diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c index 3f71384020c..24577e2c489 100644 --- a/fs/smbfs/smbiod.c +++ b/fs/smbfs/smbiod.c @@ -193,8 +193,7 @@ int smbiod_retry(struct smb_sb_info *server) if (req->rq_flags & SMB_REQ_RETRY) { /* must move the request to the xmitq */ VERBOSE("retrying request %p on recvq\n", req); - list_del(&req->rq_queue); - list_add(&req->rq_queue, &server->xmitq); + list_move(&req->rq_queue, &server->xmitq); continue; } #endif diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 610b5bdbe75..61c42430cba 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -430,10 +430,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) i++; /* fallthrough */ default: - if (filp->f_pos == 2) { - list_del(q); - list_add(q, &parent_sd->s_children); - } + if (filp->f_pos == 2) + list_move(q, &parent_sd->s_children); + for (p=q->next; p!= &parent_sd->s_children; p=p->next) { struct sysfs_dirent *next; const char * name; @@ -455,8 +454,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir) dt_type(next)) < 0) return 0; - list_del(q); - list_add(q, p); + list_move(q, p); p = q; filp->f_pos++; } |
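The reiserfs, smbfs and sysfs hunks above are mechanical conversions of an open-coded unlink-and-relink pair to list_move()/list_move_tail(). The helpers are behaviourally equivalent to the removed two-line sequences (a list_del_init() followed by list_add() collapses the same way, since the re-add overwrites the re-initialised pointers); roughly:

    /* What the list_move() conversions replace, shown open-coded. */
    static inline void open_coded_list_move(struct list_head *entry, struct list_head *head)
    {
            list_del(entry);                /* unlink from wherever it currently is */
            list_add(entry, head);          /* relink just after head */
    }

    static inline void open_coded_list_move_tail(struct list_head *entry, struct list_head *head)
    {
            list_del(entry);
            list_add_tail(entry, head);     /* relink just before head */
    }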