summary | refs | log | tree | commit | diff | stats
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r-- fs/9p/mux.c 2
-rw-r--r-- fs/Kconfig 12
-rw-r--r-- fs/afs/cell.c 3
-rw-r--r-- fs/afs/kafsasyncd.c 9
-rw-r--r-- fs/afs/server.c 6
-rw-r--r-- fs/afs/vlocation.c 6
-rw-r--r-- fs/afs/vnode.c 3
-rw-r--r-- fs/aio.c 2
-rw-r--r-- fs/autofs4/expire.c 3
-rw-r--r-- fs/binfmt_flat.c 2
-rw-r--r-- fs/coda/psdev.c 2
-rw-r--r-- fs/coda/upcall.c 2
-rw-r--r-- fs/compat.c 16
-rw-r--r-- fs/compat_ioctl.c 1
-rw-r--r-- fs/configfs/dir.c 6
-rw-r--r-- fs/dcache.c 5
-rw-r--r-- fs/dquot.c 4
-rw-r--r-- fs/exec.c 147
-rw-r--r-- fs/ext3/super.c 6
-rw-r--r-- fs/jffs2/erase.c 15
-rw-r--r-- fs/jffs2/nodemgmt.c 3
-rw-r--r-- fs/jffs2/summary.c 2
-rw-r--r-- fs/jffs2/wbuf.c 3
-rw-r--r-- fs/jfs/jfs_extent.c 8
-rw-r--r-- fs/libfs.c 10
-rw-r--r-- fs/namespace.c 6
-rw-r--r-- fs/nfsd/nfs4state.c 3
-rw-r--r-- fs/nfsd/nfscache.c 3
-rw-r--r-- fs/ocfs2/dlm/dlmast.c 15
-rw-r--r-- fs/ocfs2/dlm/dlmcommon.h 63
-rw-r--r-- fs/ocfs2/dlm/dlmconvert.c 33
-rw-r--r-- fs/ocfs2/dlm/dlmdebug.c 6
-rw-r--r-- fs/ocfs2/dlm/dlmdebug.h 30
-rw-r--r-- fs/ocfs2/dlm/dlmdomain.c 101
-rw-r--r-- fs/ocfs2/dlm/dlmfs.c 6
-rw-r--r-- fs/ocfs2/dlm/dlmlock.c 71
-rw-r--r-- fs/ocfs2/dlm/dlmmaster.c 448
-rw-r--r-- fs/ocfs2/dlm/dlmrecovery.c 589
-rw-r--r-- fs/ocfs2/dlm/dlmthread.c 74
-rw-r--r-- fs/ocfs2/dlm/dlmunlock.c 13
-rw-r--r-- fs/ocfs2/dlm/userdlm.c 2
-rw-r--r-- fs/ocfs2/journal.c 3
-rw-r--r-- fs/openpromfs/inode.c 1158
-rw-r--r-- fs/pnode.c 9
-rw-r--r-- fs/proc/base.c 1086
-rw-r--r-- fs/proc/inode.c 11
-rw-r--r-- fs/proc/internal.h 22
-rw-r--r-- fs/proc/task_mmu.c 110
-rw-r--r-- fs/proc/task_nommu.c 21
-rw-r--r-- fs/reiserfs/file.c 8
-rw-r--r-- fs/reiserfs/journal.c 6
-rw-r--r-- fs/smbfs/request.c 6
-rw-r--r-- fs/smbfs/smbiod.c 3
-rw-r--r-- fs/sysfs/dir.c 10
54 files changed, 2175 insertions, 2019 deletions
diff --git a/fs/9p/mux.c b/fs/9p/mux.c
index f4407eb276c..12e1baa4508 100644
--- a/fs/9p/mux.c
+++ b/fs/9p/mux.c
@@ -712,7 +712,7 @@ static void v9fs_read_work(void *a)
* v9fs_send_request - send 9P request
* The function can sleep until the request is scheduled for sending.
* The function can be interrupted. Return from the function is not
- * a guarantee that the request is sent succesfully. Can return errors
+ * a guarantee that the request is sent successfully. Can return errors
* that can be retrieved by PTR_ERR macros.
*
* @m: mux data
diff --git a/fs/Kconfig b/fs/Kconfig
index 1cdc043922d..6c5051802bd 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -1490,7 +1490,12 @@ config NFSD
select LOCKD
select SUNRPC
select EXPORTFS
- select NFS_ACL_SUPPORT if NFSD_V3_ACL || NFSD_V2_ACL
+ select NFSD_V2_ACL if NFSD_V3_ACL
+ select NFS_ACL_SUPPORT if NFSD_V2_ACL
+ select NFSD_TCP if NFSD_V4
+ select CRYPTO_MD5 if NFSD_V4
+ select CRYPTO if NFSD_V4
+ select FS_POSIX_ACL if NFSD_V4
help
If you want your Linux box to act as an NFS *server*, so that other
computers on your local network which support NFS can access certain
@@ -1528,7 +1533,6 @@ config NFSD_V3
config NFSD_V3_ACL
bool "Provide server support for the NFSv3 ACL protocol extension"
depends on NFSD_V3
- select NFSD_V2_ACL
help
Implement the NFSv3 ACL protocol extension for manipulating POSIX
Access Control Lists on exported file systems. NFS clients should
@@ -1538,10 +1542,6 @@ config NFSD_V3_ACL
config NFSD_V4
bool "Provide NFSv4 server support (EXPERIMENTAL)"
depends on NFSD_V3 && EXPERIMENTAL
- select NFSD_TCP
- select CRYPTO_MD5
- select CRYPTO
- select FS_POSIX_ACL
help
If you would like to include the NFSv4 server as well as the NFSv2
and NFSv3 servers, say Y here. This feature is experimental, and
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 009a9ae88d6..bfc1fd22d5b 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -413,8 +413,7 @@ int afs_server_find_by_peer(const struct rxrpc_peer *peer,
/* we found it in the graveyard - resurrect it */
found_dead_server:
- list_del(&server->link);
- list_add_tail(&server->link, &cell->sv_list);
+ list_move_tail(&server->link, &cell->sv_list);
afs_get_server(server);
afs_kafstimod_del_timer(&server->timeout);
spin_unlock(&cell->sv_gylock);
diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c
index 7ac07d0d47b..f09a794f248 100644
--- a/fs/afs/kafsasyncd.c
+++ b/fs/afs/kafsasyncd.c
@@ -136,8 +136,7 @@ static int kafsasyncd(void *arg)
if (!list_empty(&kafsasyncd_async_attnq)) {
op = list_entry(kafsasyncd_async_attnq.next,
struct afs_async_op, link);
- list_del(&op->link);
- list_add_tail(&op->link,
+ list_move_tail(&op->link,
&kafsasyncd_async_busyq);
}
@@ -204,8 +203,7 @@ void afs_kafsasyncd_begin_op(struct afs_async_op *op)
init_waitqueue_entry(&op->waiter, kafsasyncd_task);
add_wait_queue(&op->call->waitq, &op->waiter);
- list_del(&op->link);
- list_add_tail(&op->link, &kafsasyncd_async_busyq);
+ list_move_tail(&op->link, &kafsasyncd_async_busyq);
spin_unlock(&kafsasyncd_async_lock);
@@ -223,8 +221,7 @@ void afs_kafsasyncd_attend_op(struct afs_async_op *op)
spin_lock(&kafsasyncd_async_lock);
- list_del(&op->link);
- list_add_tail(&op->link, &kafsasyncd_async_attnq);
+ list_move_tail(&op->link, &kafsasyncd_async_attnq);
spin_unlock(&kafsasyncd_async_lock);
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 62b093aa41c..22afaae1a4c 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -123,8 +123,7 @@ int afs_server_lookup(struct afs_cell *cell, const struct in_addr *addr,
resurrect_server:
_debug("resurrecting server");
- list_del(&zombie->link);
- list_add_tail(&zombie->link, &cell->sv_list);
+ list_move_tail(&zombie->link, &cell->sv_list);
afs_get_server(zombie);
afs_kafstimod_del_timer(&zombie->timeout);
spin_unlock(&cell->sv_gylock);
@@ -168,8 +167,7 @@ void afs_put_server(struct afs_server *server)
}
spin_lock(&cell->sv_gylock);
- list_del(&server->link);
- list_add_tail(&server->link, &cell->sv_graveyard);
+ list_move_tail(&server->link, &cell->sv_graveyard);
/* time out in 10 secs */
afs_kafstimod_add_timer(&server->timeout, 10 * HZ);
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index eced20618ec..331f730a1fb 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -326,8 +326,7 @@ int afs_vlocation_lookup(struct afs_cell *cell,
/* found in the graveyard - resurrect */
_debug("found in graveyard");
atomic_inc(&vlocation->usage);
- list_del(&vlocation->link);
- list_add_tail(&vlocation->link, &cell->vl_list);
+ list_move_tail(&vlocation->link, &cell->vl_list);
spin_unlock(&cell->vl_gylock);
afs_kafstimod_del_timer(&vlocation->timeout);
@@ -478,8 +477,7 @@ static void __afs_put_vlocation(struct afs_vlocation *vlocation)
}
/* move to graveyard queue */
- list_del(&vlocation->link);
- list_add_tail(&vlocation->link,&cell->vl_graveyard);
+ list_move_tail(&vlocation->link,&cell->vl_graveyard);
/* remove from pending timeout queue (refcounted if actually being
* updated) */
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index 9867fef3261..cf62da5d782 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -104,8 +104,7 @@ static void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
vnode->cb_expiry * HZ);
spin_lock(&afs_cb_hash_lock);
- list_del(&vnode->cb_hash_link);
- list_add_tail(&vnode->cb_hash_link,
+ list_move_tail(&vnode->cb_hash_link,
&afs_cb_hash(server, &vnode->fid));
spin_unlock(&afs_cb_hash_lock);
diff --git a/fs/aio.c b/fs/aio.c
index 8c34a62df7d..950630187ac 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -641,7 +641,7 @@ static inline int __queue_kicked_iocb(struct kiocb *iocb)
* invoked both for initial i/o submission and
* subsequent retries via the aio_kick_handler.
* Expects to be invoked with iocb->ki_ctx->lock
- * already held. The lock is released and reaquired
+ * already held. The lock is released and reacquired
* as needed during processing.
*
* Calls the iocb retry method (already setup for the
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index 4456d1daa40..8dbd44f10e9 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -376,8 +376,7 @@ next:
DPRINTK("returning %p %.*s",
expired, (int)expired->d_name.len, expired->d_name.name);
spin_lock(&dcache_lock);
- list_del(&expired->d_parent->d_subdirs);
- list_add(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
+ list_move(&expired->d_parent->d_subdirs, &expired->d_u.d_child);
spin_unlock(&dcache_lock);
return expired;
}
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index b1c902e319c..c94d52eafd1 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -510,7 +510,7 @@ static int load_flat_file(struct linux_binprm * bprm,
}
/* OK, This is the point of no return */
- set_personality(PER_LINUX);
+ set_personality(PER_LINUX_32BIT);
}
/*
diff --git a/fs/coda/psdev.c b/fs/coda/psdev.c
index 6c6771db36d..7caee8d8ea3 100644
--- a/fs/coda/psdev.c
+++ b/fs/coda/psdev.c
@@ -259,7 +259,7 @@ static ssize_t coda_psdev_read(struct file * file, char __user * buf,
/* If request was not a signal, enqueue and don't free */
if (!(req->uc_flags & REQ_ASYNC)) {
req->uc_flags |= REQ_READ;
- list_add(&(req->uc_chain), vcp->vc_processing.prev);
+ list_add_tail(&(req->uc_chain), &vcp->vc_processing);
goto out;
}
diff --git a/fs/coda/upcall.c b/fs/coda/upcall.c
index b040eba13a7..a5b5e631ba6 100644
--- a/fs/coda/upcall.c
+++ b/fs/coda/upcall.c
@@ -725,7 +725,7 @@ static int coda_upcall(struct coda_sb_info *sbi,
((union inputArgs *)buffer)->ih.unique = req->uc_unique;
/* Append msg to pending queue and poke Venus. */
- list_add(&(req->uc_chain), vcommp->vc_pending.prev);
+ list_add_tail(&(req->uc_chain), &vcommp->vc_pending);
wake_up_interruptible(&vcommp->vc_waitq);
/* We can be interrupted while we wait for Venus to process
diff --git a/fs/compat.c b/fs/compat.c
index 7e7e5bc4f3c..e31e9cf9664 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -55,6 +55,20 @@
extern void sigset_from_compat(sigset_t *set, compat_sigset_t *compat);
+int compat_log = 1;
+
+int compat_printk(const char *fmt, ...)
+{
+ va_list ap;
+ int ret;
+ if (!compat_log)
+ return 0;
+ va_start(ap, fmt);
+ ret = vprintk(fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
/*
* Not all architectures have sys_utime, so implement this in terms
* of sys_utimes.
@@ -359,7 +373,7 @@ static void compat_ioctl_error(struct file *filp, unsigned int fd,
sprintf(buf,"'%c'", (cmd>>24) & 0x3f);
if (!isprint(buf[1]))
sprintf(buf, "%02x", buf[1]);
- printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
+ compat_printk("ioctl32(%s:%d): Unknown cmd fd(%d) "
"cmd(%08x){%s} arg(%08x) on %s\n",
current->comm, current->pid,
(int)fd, (unsigned int)cmd, buf,
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 9eb9824dd33..d8ecfedef18 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -80,6 +80,7 @@
#include <net/bluetooth/rfcomm.h>
#include <linux/capi.h>
+#include <linux/gigaset_dev.h>
#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 5f952187fc5..207f8006fd6 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -1009,8 +1009,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
/* fallthrough */
default:
if (filp->f_pos == 2) {
- list_del(q);
- list_add(q, &parent_sd->s_children);
+ list_move(q, &parent_sd->s_children);
}
for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
struct configfs_dirent *next;
@@ -1033,8 +1032,7 @@ static int configfs_readdir(struct file * filp, void * dirent, filldir_t filldir
dt_type(next)) < 0)
return 0;
- list_del(q);
- list_add(q, p);
+ list_move(q, p);
p = q;
filp->f_pos++;
}
diff --git a/fs/dcache.c b/fs/dcache.c
index b85fda36053..48b44a714b3 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -522,8 +522,7 @@ void shrink_dcache_sb(struct super_block * sb)
dentry = list_entry(tmp, struct dentry, d_lru);
if (dentry->d_sb != sb)
continue;
- list_del(tmp);
- list_add(tmp, &dentry_unused);
+ list_move(tmp, &dentry_unused);
}
/*
@@ -638,7 +637,7 @@ resume:
* of the unused list for prune_dcache
*/
if (!atomic_read(&dentry->d_count)) {
- list_add(&dentry->d_lru, dentry_unused.prev);
+ list_add_tail(&dentry->d_lru, &dentry_unused);
dentry_stat.nr_unused++;
found++;
}
diff --git a/fs/dquot.c b/fs/dquot.c
index 81d87a413c6..0122a279106 100644
--- a/fs/dquot.c
+++ b/fs/dquot.c
@@ -250,7 +250,7 @@ static inline struct dquot *find_dquot(unsigned int hashent, struct super_block
/* Add a dquot to the tail of the free list */
static inline void put_dquot_last(struct dquot *dquot)
{
- list_add(&dquot->dq_free, free_dquots.prev);
+ list_add_tail(&dquot->dq_free, &free_dquots);
dqstats.free_dquots++;
}
@@ -266,7 +266,7 @@ static inline void put_inuse(struct dquot *dquot)
{
/* We add to the back of inuse list so we don't have to restart
* when traversing this list and we block */
- list_add(&dquot->dq_inuse, inuse_list.prev);
+ list_add_tail(&dquot->dq_inuse, &inuse_list);
dqstats.allocated_dquots++;
}
diff --git a/fs/exec.c b/fs/exec.c
index 0b88bf64614..c8494f513ea 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -666,8 +666,6 @@ static int de_thread(struct task_struct *tsk)
* and to assume its PID:
*/
if (!thread_group_leader(current)) {
- struct dentry *proc_dentry1, *proc_dentry2;
-
/*
* Wait for the thread group leader to be a zombie.
* It should already be zombie at this point, most
@@ -689,10 +687,6 @@ static int de_thread(struct task_struct *tsk)
*/
current->start_time = leader->start_time;
- spin_lock(&leader->proc_lock);
- spin_lock(&current->proc_lock);
- proc_dentry1 = proc_pid_unhash(current);
- proc_dentry2 = proc_pid_unhash(leader);
write_lock_irq(&tasklist_lock);
BUG_ON(leader->tgid != current->tgid);
@@ -713,7 +707,7 @@ static int de_thread(struct task_struct *tsk)
attach_pid(current, PIDTYPE_PID, current->pid);
attach_pid(current, PIDTYPE_PGID, current->signal->pgrp);
attach_pid(current, PIDTYPE_SID, current->signal->session);
- list_add_tail_rcu(&current->tasks, &init_task.tasks);
+ list_replace_rcu(&leader->tasks, &current->tasks);
current->group_leader = current;
leader->group_leader = current;
@@ -721,7 +715,6 @@ static int de_thread(struct task_struct *tsk)
/* Reduce leader to a thread */
detach_pid(leader, PIDTYPE_PGID);
detach_pid(leader, PIDTYPE_SID);
- list_del_init(&leader->tasks);
current->exit_signal = SIGCHLD;
@@ -729,10 +722,6 @@ static int de_thread(struct task_struct *tsk)
leader->exit_state = EXIT_DEAD;
write_unlock_irq(&tasklist_lock);
- spin_unlock(&leader->proc_lock);
- spin_unlock(&current->proc_lock);
- proc_pid_flush(proc_dentry1);
- proc_pid_flush(proc_dentry2);
}
/*
@@ -1379,67 +1368,102 @@ static void format_corename(char *corename, const char *pattern, long signr)
*out_ptr = 0;
}
-static void zap_threads (struct mm_struct *mm)
+static void zap_process(struct task_struct *start)
{
- struct task_struct *g, *p;
- struct task_struct *tsk = current;
- struct completion *vfork_done = tsk->vfork_done;
- int traced = 0;
+ struct task_struct *t;
- /*
- * Make sure nobody is waiting for us to release the VM,
- * otherwise we can deadlock when we wait on each other
- */
- if (vfork_done) {
- tsk->vfork_done = NULL;
- complete(vfork_done);
- }
+ start->signal->flags = SIGNAL_GROUP_EXIT;
+ start->signal->group_stop_count = 0;
- read_lock(&tasklist_lock);
- do_each_thread(g,p)
- if (mm == p->mm && p != tsk) {
- force_sig_specific(SIGKILL, p);
- mm->core_waiters++;
- if (unlikely(p->ptrace) &&
- unlikely(p->parent->mm == mm))
- traced = 1;
+ t = start;
+ do {
+ if (t != current && t->mm) {
+ t->mm->core_waiters++;
+ sigaddset(&t->pending.signal, SIGKILL);
+ signal_wake_up(t, 1);
}
- while_each_thread(g,p);
+ } while ((t = next_thread(t)) != start);
+}
- read_unlock(&tasklist_lock);
+static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
+ int exit_code)
+{
+ struct task_struct *g, *p;
+ unsigned long flags;
+ int err = -EAGAIN;
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ if (!(tsk->signal->flags & SIGNAL_GROUP_EXIT)) {
+ tsk->signal->group_exit_code = exit_code;
+ zap_process(tsk);
+ err = 0;
+ }
+ spin_unlock_irq(&tsk->sighand->siglock);
+ if (err)
+ return err;
- if (unlikely(traced)) {
- /*
- * We are zapping a thread and the thread it ptraces.
- * If the tracee went into a ptrace stop for exit tracing,
- * we could deadlock since the tracer is waiting for this
- * coredump to finish. Detach them so they can both die.
- */
- write_lock_irq(&tasklist_lock);
- do_each_thread(g,p) {
- if (mm == p->mm && p != tsk &&
- p->ptrace && p->parent->mm == mm) {
- __ptrace_detach(p, 0);
+ if (atomic_read(&mm->mm_users) == mm->core_waiters + 1)
+ goto done;
+
+ rcu_read_lock();
+ for_each_process(g) {
+ if (g == tsk->group_leader)
+ continue;
+
+ p = g;
+ do {
+ if (p->mm) {
+ if (p->mm == mm) {
+ /*
+ * p->sighand can't disappear, but
+ * may be changed by de_thread()
+ */
+ lock_task_sighand(p, &flags);
+ zap_process(p);
+ unlock_task_sighand(p, &flags);
+ }
+ break;
}
- } while_each_thread(g,p);
- write_unlock_irq(&tasklist_lock);
+ } while ((p = next_thread(p)) != g);
}
+ rcu_read_unlock();
+done:
+ return mm->core_waiters;
}
-static void coredump_wait(struct mm_struct *mm)
+static int coredump_wait(int exit_code)
{
- DECLARE_COMPLETION(startup_done);
+ struct task_struct *tsk = current;
+ struct mm_struct *mm = tsk->mm;
+ struct completion startup_done;
+ struct completion *vfork_done;
int core_waiters;
+ init_completion(&mm->core_done);
+ init_completion(&startup_done);
mm->core_startup_done = &startup_done;
- zap_threads(mm);
- core_waiters = mm->core_waiters;
+ core_waiters = zap_threads(tsk, mm, exit_code);
up_write(&mm->mmap_sem);
+ if (unlikely(core_waiters < 0))
+ goto fail;
+
+ /*
+ * Make sure nobody is waiting for us to release the VM,
+ * otherwise we can deadlock when we wait on each other
+ */
+ vfork_done = tsk->vfork_done;
+ if (vfork_done) {
+ tsk->vfork_done = NULL;
+ complete(vfork_done);
+ }
+
if (core_waiters)
wait_for_completion(&startup_done);
+fail:
BUG_ON(mm->core_waiters);
+ return core_waiters;
}
int do_coredump(long signr, int exit_code, struct pt_regs * regs)
@@ -1473,22 +1497,9 @@ int do_coredump(long signr, int exit_code, struct pt_regs * regs)
}
mm->dumpable = 0;
- retval = -EAGAIN;
- spin_lock_irq(&current->sighand->siglock);
- if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) {
- current->signal->flags = SIGNAL_GROUP_EXIT;
- current->signal->group_exit_code = exit_code;
- current->signal->group_stop_count = 0;
- retval = 0;
- }
- spin_unlock_irq(&current->sighand->siglock);
- if (retval) {
- up_write(&mm->mmap_sem);
+ retval = coredump_wait(exit_code);
+ if (retval < 0)
goto fail;
- }
-
- init_completion(&mm->core_done);
- coredump_wait(mm);
/*
* Clear any false indication of pending signals that might
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index b2891cc29db..b7483360a2d 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -630,7 +630,7 @@ enum {
Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
Opt_nouid32, Opt_nocheck, Opt_debug, Opt_oldalloc, Opt_orlov,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
- Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh,
+ Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
Opt_commit, Opt_journal_update, Opt_journal_inum, Opt_journal_dev,
Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
@@ -666,6 +666,7 @@ static match_table_t tokens = {
{Opt_noreservation, "noreservation"},
{Opt_noload, "noload"},
{Opt_nobh, "nobh"},
+ {Opt_bh, "bh"},
{Opt_commit, "commit=%u"},
{Opt_journal_update, "journal=update"},
{Opt_journal_inum, "journal=%u"},
@@ -1014,6 +1015,9 @@ clear_qf_name:
case Opt_nobh:
set_opt(sbi->s_mount_opt, NOBH);
break;
+ case Opt_bh:
+ clear_opt(sbi->s_mount_opt, NOBH);
+ break;
default:
printk (KERN_ERR
"EXT3-fs: Unrecognized mount option \"%s\" "
diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c
index 1862e8bc101..b8886f048ea 100644
--- a/fs/jffs2/erase.c
+++ b/fs/jffs2/erase.c
@@ -53,8 +53,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
if (!instr) {
printk(KERN_WARNING "kmalloc for struct erase_info in jffs2_erase_block failed. Refiling block for later\n");
spin_lock(&c->erase_completion_lock);
- list_del(&jeb->list);
- list_add(&jeb->list, &c->erase_pending_list);
+ list_move(&jeb->list, &c->erase_pending_list);
c->erasing_size -= c->sector_size;
c->dirty_size += c->sector_size;
jeb->dirty_size = c->sector_size;
@@ -86,8 +85,7 @@ static void jffs2_erase_block(struct jffs2_sb_info *c,
/* Erase failed immediately. Refile it on the list */
D1(printk(KERN_DEBUG "Erase at 0x%08x failed: %d. Refiling on erase_pending_list\n", jeb->offset, ret));
spin_lock(&c->erase_completion_lock);
- list_del(&jeb->list);
- list_add(&jeb->list, &c->erase_pending_list);
+ list_move(&jeb->list, &c->erase_pending_list);
c->erasing_size -= c->sector_size;
c->dirty_size += c->sector_size;
jeb->dirty_size = c->sector_size;
@@ -161,8 +159,7 @@ static void jffs2_erase_succeeded(struct jffs2_sb_info *c, struct jffs2_eraseblo
{
D1(printk(KERN_DEBUG "Erase completed successfully at 0x%08x\n", jeb->offset));
spin_lock(&c->erase_completion_lock);
- list_del(&jeb->list);
- list_add_tail(&jeb->list, &c->erase_complete_list);
+ list_move_tail(&jeb->list, &c->erase_complete_list);
spin_unlock(&c->erase_completion_lock);
/* Ensure that kupdated calls us again to mark them clean */
jffs2_erase_pending_trigger(c);
@@ -178,8 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) {
/* We'd like to give this block another try. */
spin_lock(&c->erase_completion_lock);
- list_del(&jeb->list);
- list_add(&jeb->list, &c->erase_pending_list);
+ list_move(&jeb->list, &c->erase_pending_list);
c->erasing_size -= c->sector_size;
c->dirty_size += c->sector_size;
jeb->dirty_size = c->sector_size;
@@ -191,8 +187,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock
spin_lock(&c->erase_completion_lock);
c->erasing_size -= c->sector_size;
c->bad_size += c->sector_size;
- list_del(&jeb->list);
- list_add(&jeb->list, &c->bad_list);
+ list_move(&jeb->list, &c->bad_list);
c->nr_erasing_blocks--;
spin_unlock(&c->erase_completion_lock);
wake_up(&c->erase_wait);
diff --git a/fs/jffs2/nodemgmt.c b/fs/jffs2/nodemgmt.c
index 8bedfd2ff68..ac0c350ed7d 100644
--- a/fs/jffs2/nodemgmt.c
+++ b/fs/jffs2/nodemgmt.c
@@ -211,8 +211,7 @@ static int jffs2_find_nextblock(struct jffs2_sb_info *c)
struct jffs2_eraseblock *ejeb;
ejeb = list_entry(c->erasable_list.next, struct jffs2_eraseblock, list);
- list_del(&ejeb->list);
- list_add_tail(&ejeb->list, &c->erase_pending_list);
+ list_move_tail(&ejeb->list, &c->erase_pending_list);
c->nr_erasing_blocks++;
jffs2_erase_pending_trigger(c);
D1(printk(KERN_DEBUG "jffs2_find_nextblock: Triggering erase of erasable block at 0x%08x\n",
diff --git a/fs/jffs2/summary.c b/fs/jffs2/summary.c
index 0b02fc79e4d..be1acc3dad9 100644
--- a/fs/jffs2/summary.c
+++ b/fs/jffs2/summary.c
@@ -43,7 +43,7 @@ int jffs2_sum_init(struct jffs2_sb_info *c)
return -ENOMEM;
}
- dbg_summary("returned succesfully\n");
+ dbg_summary("returned successfully\n");
return 0;
}
diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c
index a7f153f79ec..b9b700730df 100644
--- a/fs/jffs2/wbuf.c
+++ b/fs/jffs2/wbuf.c
@@ -495,8 +495,7 @@ static void jffs2_wbuf_recover(struct jffs2_sb_info *c)
/* Fix up the original jeb now it's on the bad_list */
if (first_raw == jeb->first_node) {
D1(printk(KERN_DEBUG "Failing block at %08x is now empty. Moving to erase_pending_list\n", jeb->offset));
- list_del(&jeb->list);
- list_add(&jeb->list, &c->erase_pending_list);
+ list_move(&jeb->list, &c->erase_pending_list);
c->nr_erasing_blocks++;
jffs2_erase_pending_trigger(c);
}
diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c
index 5549378358b..4d52593a5fc 100644
--- a/fs/jfs/jfs_extent.c
+++ b/fs/jfs/jfs_extent.c
@@ -126,7 +126,7 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, boolean_t abnr)
/* allocate the disk blocks for the extent. initially, extBalloc()
* will try to allocate disk blocks for the requested size (xlen).
- * if this fails (xlen contigious free blocks not avaliable), it'll
+ * if this fails (xlen contiguous free blocks not avaliable), it'll
* try to allocate a smaller number of blocks (producing a smaller
* extent), with this smaller number of blocks consisting of the
* requested number of blocks rounded down to the next smaller
@@ -493,7 +493,7 @@ int extFill(struct inode *ip, xad_t * xp)
*
* initially, we will try to allocate disk blocks for the
* requested size (nblocks). if this fails (nblocks
- * contigious free blocks not avaliable), we'll try to allocate
+ * contiguous free blocks not avaliable), we'll try to allocate
* a smaller number of blocks (producing a smaller extent), with
* this smaller number of blocks consisting of the requested
* number of blocks rounded down to the next smaller power of 2
@@ -529,7 +529,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
/* get the number of blocks to initially attempt to allocate.
* we'll first try the number of blocks requested unless this
- * number is greater than the maximum number of contigious free
+ * number is greater than the maximum number of contiguous free
* blocks in the map. in that case, we'll start off with the
* maximum free.
*/
@@ -586,7 +586,7 @@ extBalloc(struct inode *ip, s64 hint, s64 * nblocks, s64 * blkno)
* in place. if this fails, we'll try to move the extent
* to a new set of blocks. if moving the extent, we initially
* will try to allocate disk blocks for the requested size
- * (nnew). if this fails (nnew contigious free blocks not
+ * (nnew). if this fails (new contiguous free blocks not
* avaliable), we'll try to allocate a smaller number of
* blocks (producing a smaller extent), with this smaller
* number of blocks consisting of the requested number of
diff --git a/fs/libfs.c b/fs/libfs.c
index fc785d8befb..ac02ea602c3 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -149,10 +149,9 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
/* fallthrough */
default:
spin_lock(&dcache_lock);
- if (filp->f_pos == 2) {
- list_del(q);
- list_add(q, &dentry->d_subdirs);
- }
+ if (filp->f_pos == 2)
+ list_move(q, &dentry->d_subdirs);
+
for (p=q->next; p != &dentry->d_subdirs; p=p->next) {
struct dentry *next;
next = list_entry(p, struct dentry, d_u.d_child);
@@ -164,8 +163,7 @@ int dcache_readdir(struct file * filp, void * dirent, filldir_t filldir)
return 0;
spin_lock(&dcache_lock);
/* next is still alive */
- list_del(q);
- list_add(q, p);
+ list_move(q, p);
p = q;
filp->f_pos++;
}
diff --git a/fs/namespace.c b/fs/namespace.c
index 866430bb024..b3ed212ea41 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -526,10 +526,8 @@ void umount_tree(struct vfsmount *mnt, int propagate, struct list_head *kill)
{
struct vfsmount *p;
- for (p = mnt; p; p = next_mnt(p, mnt)) {
- list_del(&p->mnt_hash);
- list_add(&p->mnt_hash, kill);
- }
+ for (p = mnt; p; p = next_mnt(p, mnt))
+ list_move(&p->mnt_hash, kill);
if (propagate)
propagate_umount(kill);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 96c7578cbe1..1630b5670dc 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -529,8 +529,7 @@ move_to_confirmed(struct nfs4_client *clp)
dprintk("NFSD: move_to_confirm nfs4_client %p\n", clp);
list_del_init(&clp->cl_strhash);
- list_del_init(&clp->cl_idhash);
- list_add(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
+ list_move(&clp->cl_idhash, &conf_id_hashtbl[idhashval]);
strhashval = clientstr_hashval(clp->cl_recdir);
list_add(&clp->cl_strhash, &conf_str_hashtbl[strhashval]);
renew_client(clp);
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index d852ebb538e..fdf7cf3dfad 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -103,8 +103,7 @@ nfsd_cache_shutdown(void)
static void
lru_put_end(struct svc_cacherep *rp)
{
- list_del(&rp->c_lru);
- list_add_tail(&rp->c_lru, &lru_head);
+ list_move_tail(&rp->c_lru, &lru_head);
}
/*
diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c
index 355593dd8ef..42775e2bbe2 100644
--- a/fs/ocfs2/dlm/dlmast.c
+++ b/fs/ocfs2/dlm/dlmast.c
@@ -197,12 +197,14 @@ static void dlm_update_lvb(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
lock->ml.node == dlm->node_num ? "master" :
"remote");
memcpy(lksb->lvb, res->lvb, DLM_LVB_LEN);
- } else if (lksb->flags & DLM_LKSB_PUT_LVB) {
- mlog(0, "setting lvb from lockres for %s node\n",
- lock->ml.node == dlm->node_num ? "master" :
- "remote");
- memcpy(res->lvb, lksb->lvb, DLM_LVB_LEN);
}
+ /* Do nothing for lvb put requests - they should be done in
+ * place when the lock is downconverted - otherwise we risk
+ * racing gets and puts which could result in old lvb data
+ * being propagated. We leave the put flag set and clear it
+ * here. In the future we might want to clear it at the time
+ * the put is actually done.
+ */
spin_unlock(&res->spinlock);
}
@@ -381,8 +383,7 @@ do_ast:
ret = DLM_NORMAL;
if (past->type == DLM_AST) {
/* do not alter lock refcount. switching lists. */
- list_del_init(&lock->list);
- list_add_tail(&lock->list, &res->granted);
+ list_move_tail(&lock->list, &res->granted);
mlog(0, "ast: adding to granted list... type=%d, "
"convert_type=%d\n", lock->ml.type, lock->ml.convert_type);
if (lock->ml.convert_type != LKM_IVMODE) {
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 88cc43df18f..9bdc9cf6599 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -37,7 +37,17 @@
#define DLM_THREAD_SHUFFLE_INTERVAL 5 // flush everything every 5 passes
#define DLM_THREAD_MS 200 // flush at least every 200 ms
-#define DLM_HASH_BUCKETS (PAGE_SIZE / sizeof(struct hlist_head))
+#define DLM_HASH_SIZE_DEFAULT (1 << 14)
+#if DLM_HASH_SIZE_DEFAULT < PAGE_SIZE
+# define DLM_HASH_PAGES 1
+#else
+# define DLM_HASH_PAGES (DLM_HASH_SIZE_DEFAULT / PAGE_SIZE)
+#endif
+#define DLM_BUCKETS_PER_PAGE (PAGE_SIZE / sizeof(struct hlist_head))
+#define DLM_HASH_BUCKETS (DLM_HASH_PAGES * DLM_BUCKETS_PER_PAGE)
+
+/* Intended to make it easier for us to switch out hash functions */
+#define dlm_lockid_hash(_n, _l) full_name_hash(_n, _l)
enum dlm_ast_type {
DLM_AST = 0,
@@ -61,7 +71,8 @@ static inline int dlm_is_recovery_lock(const char *lock_name, int name_len)
return 0;
}
-#define DLM_RECO_STATE_ACTIVE 0x0001
+#define DLM_RECO_STATE_ACTIVE 0x0001
+#define DLM_RECO_STATE_FINALIZE 0x0002
struct dlm_recovery_ctxt
{
@@ -85,7 +96,7 @@ enum dlm_ctxt_state {
struct dlm_ctxt
{
struct list_head list;
- struct hlist_head *lockres_hash;
+ struct hlist_head **lockres_hash;
struct list_head dirty_list;
struct list_head purge_list;
struct list_head pending_asts;
@@ -120,6 +131,7 @@ struct dlm_ctxt
struct o2hb_callback_func dlm_hb_down;
struct task_struct *dlm_thread_task;
struct task_struct *dlm_reco_thread_task;
+ struct workqueue_struct *dlm_worker;
wait_queue_head_t dlm_thread_wq;
wait_queue_head_t dlm_reco_thread_wq;
wait_queue_head_t ast_wq;
@@ -132,6 +144,11 @@ struct dlm_ctxt
struct list_head dlm_eviction_callbacks;
};
+static inline struct hlist_head *dlm_lockres_hash(struct dlm_ctxt *dlm, unsigned i)
+{
+ return dlm->lockres_hash[(i / DLM_BUCKETS_PER_PAGE) % DLM_HASH_PAGES] + (i % DLM_BUCKETS_PER_PAGE);
+}
+
/* these keventd work queue items are for less-frequently
* called functions that cannot be directly called from the
* net message handlers for some reason, usually because
@@ -216,20 +233,29 @@ struct dlm_lock_resource
/* WARNING: Please see the comment in dlm_init_lockres before
* adding fields here. */
struct hlist_node hash_node;
+ struct qstr lockname;
struct kref refs;
- /* please keep these next 3 in this order
- * some funcs want to iterate over all lists */
+ /*
+ * Please keep granted, converting, and blocked in this order,
+ * as some funcs want to iterate over all lists.
+ *
+ * All four lists are protected by the hash's reference.
+ */
struct list_head granted;
struct list_head converting;
struct list_head blocked;
+ struct list_head purge;
+ /*
+ * These two lists require you to hold an additional reference
+ * while they are on the list.
+ */
struct list_head dirty;
struct list_head recovering; // dlm_recovery_ctxt.resources list
/* unused lock resources have their last_used stamped and are
* put on a list for the dlm thread to run. */
- struct list_head purge;
unsigned long last_used;
unsigned migration_pending:1;
@@ -238,7 +264,6 @@ struct dlm_lock_resource
wait_queue_head_t wq;
u8 owner; //node which owns the lock resource, or unknown
u16 state;
- struct qstr lockname;
char lvb[DLM_LVB_LEN];
};
@@ -300,6 +325,15 @@ enum dlm_lockres_list {
DLM_BLOCKED_LIST
};
+/*
+ * Return 1 if all DLM_LVB_LEN bytes of @lvb are zero, 0 as soon as a
+ * non-zero byte is found.  The buffer is only read, hence const.
+ */
+static inline int dlm_lvb_is_empty(const char *lvb)
+{
+	int i;
+
+	for (i = 0; i < DLM_LVB_LEN; i++) {
+		if (lvb[i])
+			return 0;
+	}
+	return 1;
+}
+
static inline struct list_head *
dlm_list_idx_to_ptr(struct dlm_lock_resource *res, enum dlm_lockres_list idx)
{
@@ -609,7 +643,8 @@ struct dlm_finalize_reco
{
u8 node_idx;
u8 dead_node;
- __be16 pad1;
+ u8 flags;
+ u8 pad1;
__be32 pad2;
};
@@ -676,6 +711,7 @@ void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
+int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
void dlm_put(struct dlm_ctxt *dlm);
struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
@@ -687,14 +723,20 @@ void dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res);
void dlm_purge_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *lockres);
-void dlm_lockres_get(struct dlm_lock_resource *res);
+static inline void dlm_lockres_get(struct dlm_lock_resource *res)
+{
+ /* Take one reference on res (dropped again via dlm_lockres_put()).
+ * This is called on every lookup, so it might be worth inlining. */
+ kref_get(&res->refs);
+}
void dlm_lockres_put(struct dlm_lock_resource *res);
void __dlm_unhash_lockres(struct dlm_lock_resource *res);
void __dlm_insert_lockres(struct dlm_ctxt *dlm,
struct dlm_lock_resource *res);
struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
const char *name,
- unsigned int len);
+ unsigned int len,
+ unsigned int hash);
struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
const char *name,
unsigned int len);
@@ -819,6 +861,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm,
u8 dead_node);
int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock);
+int __dlm_lockres_unused(struct dlm_lock_resource *res);
static inline const char * dlm_lock_mode_name(int mode)
{
diff --git a/fs/ocfs2/dlm/dlmconvert.c b/fs/ocfs2/dlm/dlmconvert.c
index 8285228d9e3..c764dc8e40a 100644
--- a/fs/ocfs2/dlm/dlmconvert.c
+++ b/fs/ocfs2/dlm/dlmconvert.c
@@ -214,6 +214,9 @@ grant:
if (lock->ml.node == dlm->node_num)
mlog(0, "doing in-place convert for nonlocal lock\n");
lock->ml.type = type;
+ if (lock->lksb->flags & DLM_LKSB_PUT_LVB)
+ memcpy(res->lvb, lock->lksb->lvb, DLM_LVB_LEN);
+
status = DLM_NORMAL;
*call_ast = 1;
goto unlock_exit;
@@ -231,8 +234,7 @@ switch_queues:
lock->ml.convert_type = type;
/* do not alter lock refcount. switching lists. */
- list_del_init(&lock->list);
- list_add_tail(&lock->list, &res->converting);
+ list_move_tail(&lock->list, &res->converting);
unlock_exit:
spin_unlock(&lock->spinlock);
@@ -248,8 +250,7 @@ void dlm_revert_pending_convert(struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
/* do not alter lock refcount. switching lists. */
- list_del_init(&lock->list);
- list_add_tail(&lock->list, &res->granted);
+ list_move_tail(&lock->list, &res->granted);
lock->ml.convert_type = LKM_IVMODE;
lock->lksb->flags &= ~(DLM_LKSB_GET_LVB|DLM_LKSB_PUT_LVB);
}
@@ -294,8 +295,7 @@ enum dlm_status dlmconvert_remote(struct dlm_ctxt *dlm,
res->state |= DLM_LOCK_RES_IN_PROGRESS;
/* move lock to local convert queue */
/* do not alter lock refcount. switching lists. */
- list_del_init(&lock->list);
- list_add_tail(&lock->list, &res->converting);
+ list_move_tail(&lock->list, &res->converting);
lock->convert_pending = 1;
lock->ml.convert_type = type;
@@ -464,6 +464,12 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
}
spin_lock(&res->spinlock);
+ status = __dlm_lockres_state_to_status(res);
+ if (status != DLM_NORMAL) {
+ spin_unlock(&res->spinlock);
+ dlm_error(status);
+ goto leave;
+ }
list_for_each(iter, &res->granted) {
lock = list_entry(iter, struct dlm_lock, list);
if (lock->ml.cookie == cnv->cookie &&
@@ -473,6 +479,21 @@ int dlm_convert_lock_handler(struct o2net_msg *msg, u32 len, void *data)
}
lock = NULL;
}
+ if (!lock) {
+ __dlm_print_one_lock_resource(res);
+ list_for_each(iter, &res->granted) {
+ lock = list_entry(iter, struct dlm_lock, list);
+ if (lock->ml.node == cnv->node_idx) {
+ mlog(ML_ERROR, "There is something here "
+ "for node %u, lock->ml.cookie=%llu, "
+ "cnv->cookie=%llu\n", cnv->node_idx,
+ (unsigned long long)lock->ml.cookie,
+ (unsigned long long)cnv->cookie);
+ break;
+ }
+ }
+ lock = NULL;
+ }
spin_unlock(&res->spinlock);
if (!lock) {
status = DLM_IVLOCKID;
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index c7eae5d3324..3f6c8d88f7a 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -37,10 +37,8 @@
#include "dlmapi.h"
#include "dlmcommon.h"
-#include "dlmdebug.h"
#include "dlmdomain.h"
-#include "dlmdebug.h"
#define MLOG_MASK_PREFIX ML_DLM
#include "cluster/masklog.h"
@@ -120,6 +118,7 @@ void dlm_print_one_lock(struct dlm_lock *lockid)
}
EXPORT_SYMBOL_GPL(dlm_print_one_lock);
+#if 0
void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
{
struct dlm_lock_resource *res;
@@ -136,12 +135,13 @@ void dlm_dump_lock_resources(struct dlm_ctxt *dlm)
spin_lock(&dlm->spinlock);
for (i=0; i<DLM_HASH_BUCKETS; i++) {
- bucket = &(dlm->lockres_hash[i]);
+ bucket = dlm_lockres_hash(dlm, i);
hlist_for_each_entry(res, iter, bucket, hash_node)
dlm_print_one_lock_resource(res);
}
spin_unlock(&dlm->spinlock);
}
+#endif /* 0 */
static const char *dlm_errnames[] = {
[DLM_NORMAL] = "DLM_NORMAL",
diff --git a/fs/ocfs2/dlm/dlmdebug.h b/fs/ocfs2/dlm/dlmdebug.h
deleted file mode 100644
index 6858510c3cc..00000000000
--- a/fs/ocfs2/dlm/dlmdebug.h
+++ /dev/null
@@ -1,30 +0,0 @@
-/* -*- mode: c; c-basic-offset: 8; -*-
- * vim: noexpandtab sw=8 ts=8 sts=0:
- *
- * dlmdebug.h
- *
- * Copyright (C) 2004 Oracle. All rights reserved.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public
- * License as published by the Free Software Foundation; either
- * version 2 of the License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 021110-1307, USA.
- *
- */
-
-#ifndef DLMDEBUG_H
-#define DLMDEBUG_H
-
-void dlm_dump_lock_resources(struct dlm_ctxt *dlm);
-
-#endif
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 8f3a9e3106f..ba27c5c5e95 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -41,7 +41,6 @@
#include "dlmapi.h"
#include "dlmcommon.h"
-#include "dlmdebug.h"
#include "dlmdomain.h"
#include "dlmver.h"
@@ -49,6 +48,33 @@
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_DOMAIN)
#include "cluster/masklog.h"
+/*
+ * Release a page vector: free_page() each of the first @pages entries of
+ * @vec, highest index first, then kfree() the pointer array itself.
+ */
+static void dlm_free_pagevec(void **vec, int pages)
+{
+	while (pages != 0) {
+		pages--;
+		free_page((unsigned long)vec[pages]);
+	}
+	kfree(vec);
+}
+
+/*
+ * Allocate an array of @pages page pointers and one free page per slot,
+ * all with GFP_KERNEL.  Returns the array, or NULL if the array or any
+ * page could not be allocated; pages obtained before the failure are
+ * released again on the error path.
+ */
+static void **dlm_alloc_pagevec(int pages)
+{
+	void **vec = kmalloc(pages * sizeof(void *), GFP_KERNEL);
+	int i;
+
+	if (!vec)
+		return NULL;
+
+	for (i = 0; i < pages; i++)
+		if (!(vec[i] = (void *)__get_free_page(GFP_KERNEL)))
+			goto out_free;
+
+	/* Both macro values are cast so that they match %lu whatever type
+	 * their definitions evaluate to. */
+	mlog(0, "Allocated DLM hash pagevec; %d pages (%lu expected), %lu buckets per page\n",
+	     pages, (unsigned long)DLM_HASH_PAGES,
+	     (unsigned long)DLM_BUCKETS_PER_PAGE);
+	return vec;
+out_free:
+	/* only slots [0, i) hold pages at this point */
+	dlm_free_pagevec(vec, i);
+	return NULL;
+}
+
/*
*
* spinlock lock ordering: if multiple locks are needed, obey this ordering:
@@ -90,8 +116,7 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
assert_spin_locked(&dlm->spinlock);
q = &res->lockname;
- q->hash = full_name_hash(q->name, q->len);
- bucket = &(dlm->lockres_hash[q->hash % DLM_HASH_BUCKETS]);
+ bucket = dlm_lockres_hash(dlm, q->hash);
/* get a reference for our hashtable */
dlm_lockres_get(res);
@@ -100,34 +125,32 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
}
struct dlm_lock_resource * __dlm_lookup_lockres(struct dlm_ctxt *dlm,
- const char *name,
- unsigned int len)
+ const char *name,
+ unsigned int len,
+ unsigned int hash)
{
- unsigned int hash;
- struct hlist_node *iter;
- struct dlm_lock_resource *tmpres=NULL;
struct hlist_head *bucket;
+ struct hlist_node *list;
mlog_entry("%.*s\n", len, name);
assert_spin_locked(&dlm->spinlock);
- hash = full_name_hash(name, len);
-
- bucket = &(dlm->lockres_hash[hash % DLM_HASH_BUCKETS]);
-
- /* check for pre-existing lock */
- hlist_for_each(iter, bucket) {
- tmpres = hlist_entry(iter, struct dlm_lock_resource, hash_node);
- if (tmpres->lockname.len == len &&
- memcmp(tmpres->lockname.name, name, len) == 0) {
- dlm_lockres_get(tmpres);
- break;
- }
+ bucket = dlm_lockres_hash(dlm, hash);
- tmpres = NULL;
+ hlist_for_each(list, bucket) {
+ struct dlm_lock_resource *res = hlist_entry(list,
+ struct dlm_lock_resource, hash_node);
+ if (res->lockname.name[0] != name[0])
+ continue;
+ if (unlikely(res->lockname.len != len))
+ continue;
+ if (memcmp(res->lockname.name + 1, name + 1, len - 1))
+ continue;
+ dlm_lockres_get(res);
+ return res;
}
- return tmpres;
+ return NULL;
}
struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
@@ -135,9 +158,10 @@ struct dlm_lock_resource * dlm_lookup_lockres(struct dlm_ctxt *dlm,
unsigned int len)
{
struct dlm_lock_resource *res;
+ unsigned int hash = dlm_lockid_hash(name, len);
spin_lock(&dlm->spinlock);
- res = __dlm_lookup_lockres(dlm, name, len);
+ res = __dlm_lookup_lockres(dlm, name, len, hash);
spin_unlock(&dlm->spinlock);
return res;
}
@@ -194,7 +218,7 @@ static int dlm_wait_on_domain_helper(const char *domain)
static void dlm_free_ctxt_mem(struct dlm_ctxt *dlm)
{
if (dlm->lockres_hash)
- free_page((unsigned long) dlm->lockres_hash);
+ dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES);
if (dlm->name)
kfree(dlm->name);
@@ -278,11 +302,21 @@ int dlm_domain_fully_joined(struct dlm_ctxt *dlm)
return ret;
}
+/*
+ * Tear down the domain's workqueue if one exists: flush queued work,
+ * destroy the queue, and clear the pointer so a repeated call is a no-op.
+ */
+static void dlm_destroy_dlm_worker(struct dlm_ctxt *dlm)
+{
+	if (!dlm->dlm_worker)
+		return;
+
+	flush_workqueue(dlm->dlm_worker);
+	destroy_workqueue(dlm->dlm_worker);
+	dlm->dlm_worker = NULL;
+}
+
static void dlm_complete_dlm_shutdown(struct dlm_ctxt *dlm)
{
dlm_unregister_domain_handlers(dlm);
dlm_complete_thread(dlm);
dlm_complete_recovery_thread(dlm);
+ dlm_destroy_dlm_worker(dlm);
/* We've left the domain. Now we can take ourselves out of the
* list and allow the kref stuff to help us free the
@@ -304,8 +338,8 @@ static void dlm_migrate_all_locks(struct dlm_ctxt *dlm)
restart:
spin_lock(&dlm->spinlock);
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
- while (!hlist_empty(&dlm->lockres_hash[i])) {
- res = hlist_entry(dlm->lockres_hash[i].first,
+ while (!hlist_empty(dlm_lockres_hash(dlm, i))) {
+ res = hlist_entry(dlm_lockres_hash(dlm, i)->first,
struct dlm_lock_resource, hash_node);
/* need reference when manually grabbing lockres */
dlm_lockres_get(res);
@@ -1126,6 +1160,13 @@ static int dlm_join_domain(struct dlm_ctxt *dlm)
goto bail;
}
+ dlm->dlm_worker = create_singlethread_workqueue("dlm_wq");
+ if (!dlm->dlm_worker) {
+ status = -ENOMEM;
+ mlog_errno(status);
+ goto bail;
+ }
+
do {
unsigned int backoff;
status = dlm_try_to_join_domain(dlm);
@@ -1166,6 +1207,7 @@ bail:
dlm_unregister_domain_handlers(dlm);
dlm_complete_thread(dlm);
dlm_complete_recovery_thread(dlm);
+ dlm_destroy_dlm_worker(dlm);
}
return status;
@@ -1191,7 +1233,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
goto leave;
}
- dlm->lockres_hash = (struct hlist_head *) __get_free_page(GFP_KERNEL);
+ dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES);
if (!dlm->lockres_hash) {
mlog_errno(-ENOMEM);
kfree(dlm->name);
@@ -1200,8 +1242,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
goto leave;
}
- for (i=0; i<DLM_HASH_BUCKETS; i++)
- INIT_HLIST_HEAD(&dlm->lockres_hash[i]);
+ for (i = 0; i < DLM_HASH_BUCKETS; i++)
+ INIT_HLIST_HEAD(dlm_lockres_hash(dlm, i));
strcpy(dlm->name, domain);
dlm->key = key;
@@ -1231,6 +1273,7 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain,
dlm->dlm_thread_task = NULL;
dlm->dlm_reco_thread_task = NULL;
+ dlm->dlm_worker = NULL;
init_waitqueue_head(&dlm->dlm_thread_wq);
init_waitqueue_head(&dlm->dlm_reco_thread_wq);
init_waitqueue_head(&dlm->reco.event);
diff --git a/fs/ocfs2/dlm/dlmfs.c b/fs/ocfs2/dlm/dlmfs.c
index 7273d9fa6ba..033ad170123 100644
--- a/fs/ocfs2/dlm/dlmfs.c
+++ b/fs/ocfs2/dlm/dlmfs.c
@@ -116,7 +116,7 @@ static int dlmfs_file_open(struct inode *inode,
* doesn't make sense for LVB writes. */
file->f_flags &= ~O_APPEND;
- fp = kmalloc(sizeof(*fp), GFP_KERNEL);
+ fp = kmalloc(sizeof(*fp), GFP_NOFS);
if (!fp) {
status = -ENOMEM;
goto bail;
@@ -196,7 +196,7 @@ static ssize_t dlmfs_file_read(struct file *filp,
else
readlen = count - *ppos;
- lvb_buf = kmalloc(readlen, GFP_KERNEL);
+ lvb_buf = kmalloc(readlen, GFP_NOFS);
if (!lvb_buf)
return -ENOMEM;
@@ -240,7 +240,7 @@ static ssize_t dlmfs_file_write(struct file *filp,
else
writelen = count - *ppos;
- lvb_buf = kmalloc(writelen, GFP_KERNEL);
+ lvb_buf = kmalloc(writelen, GFP_NOFS);
if (!lvb_buf)
return -ENOMEM;
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 6fea28318d6..d6f89577e25 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -201,6 +201,7 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
struct dlm_lock *lock, int flags)
{
enum dlm_status status = DLM_DENIED;
+ int lockres_changed = 1;
mlog_entry("type=%d\n", lock->ml.type);
mlog(0, "lockres %.*s, flags = 0x%x\n", res->lockname.len,
@@ -226,8 +227,25 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
lock->lock_pending = 0;
if (status != DLM_NORMAL) {
- if (status != DLM_NOTQUEUED)
+ if (status == DLM_RECOVERING &&
+ dlm_is_recovery_lock(res->lockname.name,
+ res->lockname.len)) {
+ /* recovery lock was mastered by dead node.
+ * we need to have calc_usage shoot down this
+ * lockres and completely remaster it. */
+ mlog(0, "%s: recovery lock was owned by "
+ "dead node %u, remaster it now.\n",
+ dlm->name, res->owner);
+ } else if (status != DLM_NOTQUEUED) {
+ /*
+ * DO NOT call calc_usage, as this would unhash
+ * the remote lockres before we ever get to use
+ * it. treat as if we never made any change to
+ * the lockres.
+ */
+ lockres_changed = 0;
dlm_error(status);
+ }
dlm_revert_pending_lock(res, lock);
dlm_lock_put(lock);
} else if (dlm_is_recovery_lock(res->lockname.name,
@@ -239,12 +257,12 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
mlog(0, "%s: $RECOVERY lock for this node (%u) is "
"mastered by %u; got lock, manually granting (no ast)\n",
dlm->name, dlm->node_num, res->owner);
- list_del_init(&lock->list);
- list_add_tail(&lock->list, &res->granted);
+ list_move_tail(&lock->list, &res->granted);
}
spin_unlock(&res->spinlock);
- dlm_lockres_calc_usage(dlm, res);
+ if (lockres_changed)
+ dlm_lockres_calc_usage(dlm, res);
wake_up(&res->wq);
return status;
@@ -281,6 +299,14 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
if (tmpret >= 0) {
// successfully sent and received
ret = status; // this is already a dlm_status
+ if (ret == DLM_REJECTED) {
+ mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres "
+ "no longer owned by %u. that node is coming back "
+ "up currently.\n", dlm->name, create.namelen,
+ create.name, res->owner);
+ dlm_print_one_lock_resource(res);
+ BUG();
+ }
} else {
mlog_errno(tmpret);
if (dlm_is_host_down(tmpret)) {
@@ -382,13 +408,13 @@ struct dlm_lock * dlm_new_lock(int type, u8 node, u64 cookie,
struct dlm_lock *lock;
int kernel_allocated = 0;
- lock = kcalloc(1, sizeof(*lock), GFP_KERNEL);
+ lock = kcalloc(1, sizeof(*lock), GFP_NOFS);
if (!lock)
return NULL;
if (!lksb) {
/* zero memory only if kernel-allocated */
- lksb = kcalloc(1, sizeof(*lksb), GFP_KERNEL);
+ lksb = kcalloc(1, sizeof(*lksb), GFP_NOFS);
if (!lksb) {
kfree(lock);
return NULL;
@@ -429,11 +455,16 @@ int dlm_create_lock_handler(struct o2net_msg *msg, u32 len, void *data)
if (!dlm_grab(dlm))
return DLM_REJECTED;
- mlog_bug_on_msg(!dlm_domain_fully_joined(dlm),
- "Domain %s not fully joined!\n", dlm->name);
-
name = create->name;
namelen = create->namelen;
+ status = DLM_REJECTED;
+ if (!dlm_domain_fully_joined(dlm)) {
+ mlog(ML_ERROR, "Domain %s not fully joined, but node %u is "
+ "sending a create_lock message for lock %.*s!\n",
+ dlm->name, create->node_idx, namelen, name);
+ dlm_error(status);
+ goto leave;
+ }
status = DLM_IVBUFLEN;
if (namelen > DLM_LOCKID_NAME_MAX) {
@@ -669,18 +700,22 @@ retry_lock:
msleep(100);
/* no waiting for dlm_reco_thread */
if (recovery) {
- if (status == DLM_RECOVERING) {
- mlog(0, "%s: got RECOVERING "
- "for $REOCVERY lock, master "
- "was %u\n", dlm->name,
- res->owner);
- dlm_wait_for_node_death(dlm, res->owner,
- DLM_NODE_DEATH_WAIT_MAX);
- }
+ if (status != DLM_RECOVERING)
+ goto retry_lock;
+
+ mlog(0, "%s: got RECOVERING "
+ "for $RECOVERY lock, master "
+ "was %u\n", dlm->name,
+ res->owner);
+ /* wait to see the node go down, then
+ * drop down and allow the lockres to
+ * get cleaned up. need to remaster. */
+ dlm_wait_for_node_death(dlm, res->owner,
+ DLM_NODE_DEATH_WAIT_MAX);
} else {
dlm_wait_for_recovery(dlm);
+ goto retry_lock;
}
- goto retry_lock;
}
if (status != DLM_NORMAL) {
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 940be4c13b1..1b8346dd057 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -47,7 +47,6 @@
#include "dlmapi.h"
#include "dlmcommon.h"
-#include "dlmdebug.h"
#include "dlmdomain.h"
#define MLOG_MASK_PREFIX (ML_DLM|ML_DLM_MASTER)
@@ -74,6 +73,7 @@ struct dlm_master_list_entry
wait_queue_head_t wq;
atomic_t woken;
struct kref mle_refs;
+ int inuse;
unsigned long maybe_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long vote_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
unsigned long response_map[BITS_TO_LONGS(O2NM_MAX_NODES)];
@@ -127,18 +127,30 @@ static inline int dlm_mle_equal(struct dlm_ctxt *dlm,
return 1;
}
-#if 0
-/* Code here is included but defined out as it aids debugging */
+#define dlm_print_nodemap(m) _dlm_print_nodemap(m,#m)
+/*
+ * printk the set bits of @map as "<mapname>=[ n1 n2 ... ]".  No newline is
+ * emitted, so the caller can string several maps onto one output line.
+ */
+static void _dlm_print_nodemap(unsigned long *map, const char *mapname)
+{
+	int node = 0;
+
+	printk("%s=[ ", mapname);
+	while (node < O2NM_MAX_NODES) {
+		if (test_bit(node, map))
+			printk("%d ", node);
+		node++;
+	}
+	printk("]");
+}
-void dlm_print_one_mle(struct dlm_master_list_entry *mle)
+static void dlm_print_one_mle(struct dlm_master_list_entry *mle)
{
- int i = 0, refs;
+ int refs;
char *type;
char attached;
u8 master;
unsigned int namelen;
const char *name;
struct kref *k;
+ unsigned long *maybe = mle->maybe_map,
+ *vote = mle->vote_map,
+ *resp = mle->response_map,
+ *node = mle->node_map;
k = &mle->mle_refs;
if (mle->type == DLM_MLE_BLOCK)
@@ -159,18 +171,29 @@ void dlm_print_one_mle(struct dlm_master_list_entry *mle)
name = mle->u.res->lockname.name;
}
- mlog(ML_NOTICE, " #%3d: %3s %3d %3u %3u %c (%d)%.*s\n",
- i, type, refs, master, mle->new_master, attached,
- namelen, namelen, name);
+ mlog(ML_NOTICE, "%.*s: %3s refs=%3d mas=%3u new=%3u evt=%c inuse=%d ",
+ namelen, name, type, refs, master, mle->new_master, attached,
+ mle->inuse);
+ dlm_print_nodemap(maybe);
+ printk(", ");
+ dlm_print_nodemap(vote);
+ printk(", ");
+ dlm_print_nodemap(resp);
+ printk(", ");
+ dlm_print_nodemap(node);
+ printk(", ");
+ printk("\n");
}
+#if 0
+/* Code here is included but defined out as it aids debugging */
+
static void dlm_dump_mles(struct dlm_ctxt *dlm)
{
struct dlm_master_list_entry *mle;
struct list_head *iter;
mlog(ML_NOTICE, "dumping all mles for domain %s:\n", dlm->name);
- mlog(ML_NOTICE, " ####: type refs owner new events? lockname nodemap votemap respmap maybemap\n");
spin_lock(&dlm->master_lock);
list_for_each(iter, &dlm->master_list) {
mle = list_entry(iter, struct dlm_master_list_entry, list);
@@ -314,6 +337,31 @@ static inline void dlm_mle_detach_hb_events(struct dlm_ctxt *dlm,
spin_unlock(&dlm->spinlock);
}
+/*
+ * Mark @mle as in use: bump its inuse counter and take an extra kref.
+ * Both dlm->spinlock and dlm->master_lock must already be held.
+ */
+static void dlm_get_mle_inuse(struct dlm_master_list_entry *mle)
+{
+	struct dlm_ctxt *dlm = mle->dlm;
+
+	assert_spin_locked(&dlm->spinlock);
+	assert_spin_locked(&dlm->master_lock);
+
+	mle->inuse++;
+	kref_get(&mle->mle_refs);
+}
+
+/*
+ * Counterpart of dlm_get_mle_inuse(): decrement the inuse counter and drop
+ * the matching reference.  Acquires dlm->spinlock and then dlm->master_lock
+ * itself, releasing them in reverse order before returning.
+ */
+static void dlm_put_mle_inuse(struct dlm_master_list_entry *mle)
+{
+	struct dlm_ctxt *dlm = mle->dlm;
+
+	spin_lock(&dlm->spinlock);
+	spin_lock(&dlm->master_lock);
+
+	mle->inuse--;
+	__dlm_put_mle(mle);
+
+	spin_unlock(&dlm->master_lock);
+	spin_unlock(&dlm->spinlock);
+}
+
/* remove from list and free */
static void __dlm_put_mle(struct dlm_master_list_entry *mle)
{
@@ -322,9 +370,14 @@ static void __dlm_put_mle(struct dlm_master_list_entry *mle)
assert_spin_locked(&dlm->spinlock);
assert_spin_locked(&dlm->master_lock);
- BUG_ON(!atomic_read(&mle->mle_refs.refcount));
-
- kref_put(&mle->mle_refs, dlm_mle_release);
+ if (!atomic_read(&mle->mle_refs.refcount)) {
+ /* this may or may not crash, but who cares.
+ * it's a BUG. */
+ mlog(ML_ERROR, "bad mle: %p\n", mle);
+ dlm_print_one_mle(mle);
+ BUG();
+ } else
+ kref_put(&mle->mle_refs, dlm_mle_release);
}
@@ -367,6 +420,7 @@ static void dlm_init_mle(struct dlm_master_list_entry *mle,
memset(mle->response_map, 0, sizeof(mle->response_map));
mle->master = O2NM_MAX_NODES;
mle->new_master = O2NM_MAX_NODES;
+ mle->inuse = 0;
if (mle->type == DLM_MLE_MASTER) {
BUG_ON(!res);
@@ -564,6 +618,28 @@ static void dlm_lockres_release(struct kref *kref)
mlog(0, "destroying lockres %.*s\n", res->lockname.len,
res->lockname.name);
+ if (!hlist_unhashed(&res->hash_node) ||
+ !list_empty(&res->granted) ||
+ !list_empty(&res->converting) ||
+ !list_empty(&res->blocked) ||
+ !list_empty(&res->dirty) ||
+ !list_empty(&res->recovering) ||
+ !list_empty(&res->purge)) {
+ mlog(ML_ERROR,
+ "Going to BUG for resource %.*s."
+ " We're on a list! [%c%c%c%c%c%c%c]\n",
+ res->lockname.len, res->lockname.name,
+ !hlist_unhashed(&res->hash_node) ? 'H' : ' ',
+ !list_empty(&res->granted) ? 'G' : ' ',
+ !list_empty(&res->converting) ? 'C' : ' ',
+ !list_empty(&res->blocked) ? 'B' : ' ',
+ !list_empty(&res->dirty) ? 'D' : ' ',
+ !list_empty(&res->recovering) ? 'R' : ' ',
+ !list_empty(&res->purge) ? 'P' : ' ');
+
+ dlm_print_one_lock_resource(res);
+ }
+
/* By the time we're ready to blow this guy away, we shouldn't
* be on any lists. */
BUG_ON(!hlist_unhashed(&res->hash_node));
@@ -579,11 +655,6 @@ static void dlm_lockres_release(struct kref *kref)
kfree(res);
}
-void dlm_lockres_get(struct dlm_lock_resource *res)
-{
- kref_get(&res->refs);
-}
-
void dlm_lockres_put(struct dlm_lock_resource *res)
{
kref_put(&res->refs, dlm_lockres_release);
@@ -603,7 +674,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm,
memcpy(qname, name, namelen);
res->lockname.len = namelen;
- res->lockname.hash = full_name_hash(name, namelen);
+ res->lockname.hash = dlm_lockid_hash(name, namelen);
init_waitqueue_head(&res->wq);
spin_lock_init(&res->spinlock);
@@ -637,11 +708,11 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
{
struct dlm_lock_resource *res;
- res = kmalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
+ res = kmalloc(sizeof(struct dlm_lock_resource), GFP_NOFS);
if (!res)
return NULL;
- res->lockname.name = kmalloc(namelen, GFP_KERNEL);
+ res->lockname.name = kmalloc(namelen, GFP_NOFS);
if (!res->lockname.name) {
kfree(res);
return NULL;
@@ -677,19 +748,20 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
int blocked = 0;
int ret, nodenum;
struct dlm_node_iter iter;
- unsigned int namelen;
+ unsigned int namelen, hash;
int tries = 0;
int bit, wait_on_recovery = 0;
BUG_ON(!lockid);
namelen = strlen(lockid);
+ hash = dlm_lockid_hash(lockid, namelen);
mlog(0, "get lockres %s (len %d)\n", lockid, namelen);
lookup:
spin_lock(&dlm->spinlock);
- tmpres = __dlm_lookup_lockres(dlm, lockid, namelen);
+ tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash);
if (tmpres) {
spin_unlock(&dlm->spinlock);
mlog(0, "found in hash!\n");
@@ -704,7 +776,7 @@ lookup:
mlog(0, "allocating a new resource\n");
/* nothing found and we need to allocate one. */
alloc_mle = (struct dlm_master_list_entry *)
- kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
+ kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!alloc_mle)
goto leave;
res = dlm_new_lockres(dlm, lockid, namelen);
@@ -790,10 +862,11 @@ lookup:
* if so, the creator of the BLOCK may try to put the last
* ref at this time in the assert master handler, so we
* need an extra one to keep from a bad ptr deref. */
- dlm_get_mle(mle);
+ dlm_get_mle_inuse(mle);
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
+redo_request:
while (wait_on_recovery) {
/* any cluster changes that occurred after dropping the
* dlm spinlock would be detectable be a change on the mle,
@@ -812,7 +885,7 @@ lookup:
}
dlm_kick_recovery_thread(dlm);
- msleep(100);
+ msleep(1000);
dlm_wait_for_recovery(dlm);
spin_lock(&dlm->spinlock);
@@ -825,13 +898,15 @@ lookup:
} else
wait_on_recovery = 0;
spin_unlock(&dlm->spinlock);
+
+ if (wait_on_recovery)
+ dlm_wait_for_node_recovery(dlm, bit, 10000);
}
/* must wait for lock to be mastered elsewhere */
if (blocked)
goto wait;
-redo_request:
ret = -EINVAL;
dlm_node_iter_init(mle->vote_map, &iter);
while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
@@ -856,6 +931,7 @@ wait:
/* keep going until the response map includes all nodes */
ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
if (ret < 0) {
+ wait_on_recovery = 1;
mlog(0, "%s:%.*s: node map changed, redo the "
"master request now, blocked=%d\n",
dlm->name, res->lockname.len,
@@ -866,7 +942,7 @@ wait:
dlm->name, res->lockname.len,
res->lockname.name, blocked);
dlm_print_one_lock_resource(res);
- /* dlm_print_one_mle(mle); */
+ dlm_print_one_mle(mle);
tries = 0;
}
goto redo_request;
@@ -880,7 +956,7 @@ wait:
dlm_mle_detach_hb_events(dlm, mle);
dlm_put_mle(mle);
/* put the extra ref */
- dlm_put_mle(mle);
+ dlm_put_mle_inuse(mle);
wake_waiters:
spin_lock(&res->spinlock);
@@ -921,12 +997,14 @@ recheck:
spin_unlock(&res->spinlock);
/* this will cause the master to re-assert across
* the whole cluster, freeing up mles */
- ret = dlm_do_master_request(mle, res->owner);
- if (ret < 0) {
- /* give recovery a chance to run */
- mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
- msleep(500);
- goto recheck;
+ if (res->owner != dlm->node_num) {
+ ret = dlm_do_master_request(mle, res->owner);
+ if (ret < 0) {
+ /* give recovery a chance to run */
+ mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret);
+ msleep(500);
+ goto recheck;
+ }
}
ret = 0;
goto leave;
@@ -962,6 +1040,12 @@ recheck:
"rechecking now\n", dlm->name, res->lockname.len,
res->lockname.name);
goto recheck;
+ } else {
+ if (!voting_done) {
+ mlog(0, "map not changed and voting not done "
+ "for %s:%.*s\n", dlm->name, res->lockname.len,
+ res->lockname.name);
+ }
}
if (m != O2NM_MAX_NODES) {
@@ -1129,18 +1213,6 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
set_bit(node, mle->vote_map);
} else {
mlog(ML_ERROR, "node down! %d\n", node);
-
- /* if the node wasn't involved in mastery skip it,
- * but clear it out from the maps so that it will
- * not affect mastery of this lockres */
- clear_bit(node, mle->response_map);
- clear_bit(node, mle->vote_map);
- if (!test_bit(node, mle->maybe_map))
- goto next;
-
- /* if we're already blocked on lock mastery, and the
- * dead node wasn't the expected master, or there is
- * another node in the maybe_map, keep waiting */
if (blocked) {
int lowest = find_next_bit(mle->maybe_map,
O2NM_MAX_NODES, 0);
@@ -1148,54 +1220,53 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
/* act like it was never there */
clear_bit(node, mle->maybe_map);
- if (node != lowest)
- goto next;
-
- mlog(ML_ERROR, "expected master %u died while "
- "this node was blocked waiting on it!\n",
- node);
- lowest = find_next_bit(mle->maybe_map,
- O2NM_MAX_NODES,
- lowest+1);
- if (lowest < O2NM_MAX_NODES) {
- mlog(0, "still blocked. waiting "
- "on %u now\n", lowest);
- goto next;
+ if (node == lowest) {
+ mlog(0, "expected master %u died"
+ " while this node was blocked "
+ "waiting on it!\n", node);
+ lowest = find_next_bit(mle->maybe_map,
+ O2NM_MAX_NODES,
+ lowest+1);
+ if (lowest < O2NM_MAX_NODES) {
+ mlog(0, "%s:%.*s:still "
+ "blocked. waiting on %u "
+ "now\n", dlm->name,
+ res->lockname.len,
+ res->lockname.name,
+ lowest);
+ } else {
+ /* mle is an MLE_BLOCK, but
+ * there is now nothing left to
+ * block on. we need to return
+ * all the way back out and try
+ * again with an MLE_MASTER.
+ * dlm_do_local_recovery_cleanup
+ * has already run, so the mle
+ * refcount is ok */
+ mlog(0, "%s:%.*s: no "
+ "longer blocking. try to "
+ "master this here\n",
+ dlm->name,
+ res->lockname.len,
+ res->lockname.name);
+ mle->type = DLM_MLE_MASTER;
+ mle->u.res = res;
+ }
}
-
- /* mle is an MLE_BLOCK, but there is now
- * nothing left to block on. we need to return
- * all the way back out and try again with
- * an MLE_MASTER. dlm_do_local_recovery_cleanup
- * has already run, so the mle refcount is ok */
- mlog(0, "no longer blocking. we can "
- "try to master this here\n");
- mle->type = DLM_MLE_MASTER;
- memset(mle->maybe_map, 0,
- sizeof(mle->maybe_map));
- memset(mle->response_map, 0,
- sizeof(mle->maybe_map));
- memcpy(mle->vote_map, mle->node_map,
- sizeof(mle->node_map));
- mle->u.res = res;
- set_bit(dlm->node_num, mle->maybe_map);
-
- ret = -EAGAIN;
- goto next;
}
- clear_bit(node, mle->maybe_map);
- if (node > dlm->node_num)
- goto next;
-
- mlog(0, "dead node in map!\n");
- /* yuck. go back and re-contact all nodes
- * in the vote_map, removing this node. */
- memset(mle->response_map, 0,
- sizeof(mle->response_map));
+ /* now blank out everything, as if we had never
+ * contacted anyone */
+ memset(mle->maybe_map, 0, sizeof(mle->maybe_map));
+ memset(mle->response_map, 0, sizeof(mle->response_map));
+ /* reset the vote_map to the current node_map */
+ memcpy(mle->vote_map, mle->node_map,
+ sizeof(mle->node_map));
+ /* put myself into the maybe map */
+ if (mle->type != DLM_MLE_BLOCK)
+ set_bit(dlm->node_num, mle->maybe_map);
}
ret = -EAGAIN;
-next:
node = dlm_bitmap_diff_iter_next(&bdi, &sc);
}
return ret;
@@ -1316,7 +1387,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
struct dlm_master_request *request = (struct dlm_master_request *) msg->buf;
struct dlm_master_list_entry *mle = NULL, *tmpmle = NULL;
char *name;
- unsigned int namelen;
+ unsigned int namelen, hash;
int found, ret;
int set_maybe;
int dispatch_assert = 0;
@@ -1331,6 +1402,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
name = request->name;
namelen = request->namelen;
+ hash = dlm_lockid_hash(name, namelen);
if (namelen > DLM_LOCKID_NAME_MAX) {
response = DLM_IVBUFLEN;
@@ -1339,7 +1411,7 @@ int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data)
way_up_top:
spin_lock(&dlm->spinlock);
- res = __dlm_lookup_lockres(dlm, name, namelen);
+ res = __dlm_lookup_lockres(dlm, name, namelen, hash);
if (res) {
spin_unlock(&dlm->spinlock);
@@ -1459,21 +1531,18 @@ way_up_top:
spin_unlock(&dlm->spinlock);
mle = (struct dlm_master_list_entry *)
- kmem_cache_alloc(dlm_mle_cache, GFP_KERNEL);
+ kmem_cache_alloc(dlm_mle_cache, GFP_NOFS);
if (!mle) {
response = DLM_MASTER_RESP_ERROR;
mlog_errno(-ENOMEM);
goto send_response;
}
- spin_lock(&dlm->spinlock);
- dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL,
- name, namelen);
- spin_unlock(&dlm->spinlock);
goto way_up_top;
}
// mlog(0, "this is second time thru, already allocated, "
// "add the block.\n");
+ dlm_init_mle(mle, DLM_MLE_BLOCK, dlm, NULL, name, namelen);
set_bit(request->node_idx, mle->maybe_map);
list_add(&mle->list, &dlm->master_list);
response = DLM_MASTER_RESP_NO;
@@ -1556,6 +1625,8 @@ again:
dlm_node_iter_init(nodemap, &iter);
while ((to = dlm_node_iter_next(&iter)) >= 0) {
int r = 0;
+ struct dlm_master_list_entry *mle = NULL;
+
mlog(0, "sending assert master to %d (%.*s)\n", to,
namelen, lockname);
memset(&assert, 0, sizeof(assert));
@@ -1567,20 +1638,28 @@ again:
tmpret = o2net_send_message(DLM_ASSERT_MASTER_MSG, dlm->key,
&assert, sizeof(assert), to, &r);
if (tmpret < 0) {
- mlog(ML_ERROR, "assert_master returned %d!\n", tmpret);
+ mlog(0, "assert_master returned %d!\n", tmpret);
if (!dlm_is_host_down(tmpret)) {
- mlog(ML_ERROR, "unhandled error!\n");
+ mlog(ML_ERROR, "unhandled error=%d!\n", tmpret);
BUG();
}
/* a node died. finish out the rest of the nodes. */
- mlog(ML_ERROR, "link to %d went down!\n", to);
+ mlog(0, "link to %d went down!\n", to);
/* any nonzero status return will do */
ret = tmpret;
} else if (r < 0) {
/* ok, something horribly messed. kill thyself. */
mlog(ML_ERROR,"during assert master of %.*s to %u, "
"got %d.\n", namelen, lockname, to, r);
- dlm_dump_lock_resources(dlm);
+ spin_lock(&dlm->spinlock);
+ spin_lock(&dlm->master_lock);
+ if (dlm_find_mle(dlm, &mle, (char *)lockname,
+ namelen)) {
+ dlm_print_one_mle(mle);
+ __dlm_put_mle(mle);
+ }
+ spin_unlock(&dlm->master_lock);
+ spin_unlock(&dlm->spinlock);
BUG();
} else if (r == EAGAIN) {
mlog(0, "%.*s: node %u create mles on other "
@@ -1612,7 +1691,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
struct dlm_assert_master *assert = (struct dlm_assert_master *)msg->buf;
struct dlm_lock_resource *res = NULL;
char *name;
- unsigned int namelen;
+ unsigned int namelen, hash;
u32 flags;
int master_request = 0;
int ret = 0;
@@ -1622,6 +1701,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
name = assert->name;
namelen = assert->namelen;
+ hash = dlm_lockid_hash(name, namelen);
flags = be32_to_cpu(assert->flags);
if (namelen > DLM_LOCKID_NAME_MAX) {
@@ -1646,7 +1726,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
if (bit >= O2NM_MAX_NODES) {
/* not necessarily an error, though less likely.
* could be master just re-asserting. */
- mlog(ML_ERROR, "no bits set in the maybe_map, but %u "
+ mlog(0, "no bits set in the maybe_map, but %u "
"is asserting! (%.*s)\n", assert->node_idx,
namelen, name);
} else if (bit != assert->node_idx) {
@@ -1658,19 +1738,36 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
* number winning the mastery will respond
* YES to mastery requests, but this node
* had no way of knowing. let it pass. */
- mlog(ML_ERROR, "%u is the lowest node, "
+ mlog(0, "%u is the lowest node, "
"%u is asserting. (%.*s) %u must "
"have begun after %u won.\n", bit,
assert->node_idx, namelen, name, bit,
assert->node_idx);
}
}
+ if (mle->type == DLM_MLE_MIGRATION) {
+ if (flags & DLM_ASSERT_MASTER_MLE_CLEANUP) {
+ mlog(0, "%s:%.*s: got cleanup assert"
+ " from %u for migration\n",
+ dlm->name, namelen, name,
+ assert->node_idx);
+ } else if (!(flags & DLM_ASSERT_MASTER_FINISH_MIGRATION)) {
+ mlog(0, "%s:%.*s: got unrelated assert"
+ " from %u for migration, ignoring\n",
+ dlm->name, namelen, name,
+ assert->node_idx);
+ __dlm_put_mle(mle);
+ spin_unlock(&dlm->master_lock);
+ spin_unlock(&dlm->spinlock);
+ goto done;
+ }
+ }
}
spin_unlock(&dlm->master_lock);
/* ok everything checks out with the MLE
* now check to see if there is a lockres */
- res = __dlm_lookup_lockres(dlm, name, namelen);
+ res = __dlm_lookup_lockres(dlm, name, namelen, hash);
if (res) {
spin_lock(&res->spinlock);
if (res->state & DLM_LOCK_RES_RECOVERING) {
@@ -1679,7 +1776,8 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data)
goto kill;
}
if (!mle) {
- if (res->owner != assert->node_idx) {
+ if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
+ res->owner != assert->node_idx) {
mlog(ML_ERROR, "assert_master from "
"%u, but current owner is "
"%u! (%.*s)\n",
@@ -1732,6 +1830,7 @@ ok:
if (mle) {
int extra_ref = 0;
int nn = -1;
+ int rr, err = 0;
spin_lock(&mle->spinlock);
if (mle->type == DLM_MLE_BLOCK || mle->type == DLM_MLE_MIGRATION)
@@ -1751,27 +1850,64 @@ ok:
wake_up(&mle->wq);
spin_unlock(&mle->spinlock);
- if (mle->type == DLM_MLE_MIGRATION && res) {
- mlog(0, "finishing off migration of lockres %.*s, "
- "from %u to %u\n",
- res->lockname.len, res->lockname.name,
- dlm->node_num, mle->new_master);
+ if (res) {
spin_lock(&res->spinlock);
- res->state &= ~DLM_LOCK_RES_MIGRATING;
- dlm_change_lockres_owner(dlm, res, mle->new_master);
- BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
+ if (mle->type == DLM_MLE_MIGRATION) {
+ mlog(0, "finishing off migration of lockres %.*s, "
+ "from %u to %u\n",
+ res->lockname.len, res->lockname.name,
+ dlm->node_num, mle->new_master);
+ res->state &= ~DLM_LOCK_RES_MIGRATING;
+ dlm_change_lockres_owner(dlm, res, mle->new_master);
+ BUG_ON(res->state & DLM_LOCK_RES_DIRTY);
+ } else {
+ dlm_change_lockres_owner(dlm, res, mle->master);
+ }
spin_unlock(&res->spinlock);
}
- /* master is known, detach if not already detached */
- dlm_mle_detach_hb_events(dlm, mle);
- dlm_put_mle(mle);
-
+
+ /* master is known, detach if not already detached.
+ * ensures that only one assert_master call will happen
+ * on this mle. */
+ spin_lock(&dlm->spinlock);
+ spin_lock(&dlm->master_lock);
+
+ rr = atomic_read(&mle->mle_refs.refcount);
+ if (mle->inuse > 0) {
+ if (extra_ref && rr < 3)
+ err = 1;
+ else if (!extra_ref && rr < 2)
+ err = 1;
+ } else {
+ if (extra_ref && rr < 2)
+ err = 1;
+ else if (!extra_ref && rr < 1)
+ err = 1;
+ }
+ if (err) {
+ mlog(ML_ERROR, "%s:%.*s: got assert master from %u "
+ "that will mess up this node, refs=%d, extra=%d, "
+ "inuse=%d\n", dlm->name, namelen, name,
+ assert->node_idx, rr, extra_ref, mle->inuse);
+ dlm_print_one_mle(mle);
+ }
+ list_del_init(&mle->list);
+ __dlm_mle_detach_hb_events(dlm, mle);
+ __dlm_put_mle(mle);
if (extra_ref) {
/* the assert master message now balances the extra
* ref given by the master / migration request message.
* if this is the last put, it will be removed
* from the list. */
- dlm_put_mle(mle);
+ __dlm_put_mle(mle);
+ }
+ spin_unlock(&dlm->master_lock);
+ spin_unlock(&dlm->spinlock);
+ } else if (res) {
+ if (res->owner != assert->node_idx) {
+ mlog(0, "assert_master from %u, but current "
+ "owner is %u (%.*s), no mle\n", assert->node_idx,
+ res->owner, namelen, name);
}
}
@@ -1788,12 +1924,12 @@ done:
kill:
/* kill the caller! */
+ mlog(ML_ERROR, "Bad message received from another node. Dumping state "
+ "and killing the other node now! This node is OK and can continue.\n");
+ __dlm_print_one_lock_resource(res);
spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
dlm_lockres_put(res);
- mlog(ML_ERROR, "Bad message received from another node. Dumping state "
- "and killing the other node now! This node is OK and can continue.\n");
- dlm_dump_lock_resources(dlm);
dlm_put(dlm);
return -EINVAL;
}
@@ -1803,7 +1939,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
int ignore_higher, u8 request_from, u32 flags)
{
struct dlm_work_item *item;
- item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+ item = kcalloc(1, sizeof(*item), GFP_NOFS);
if (!item)
return -ENOMEM;
@@ -1825,7 +1961,7 @@ int dlm_dispatch_assert_master(struct dlm_ctxt *dlm,
list_add_tail(&item->list, &dlm->work_list);
spin_unlock(&dlm->work_lock);
- schedule_work(&dlm->dispatched_work);
+ queue_work(dlm->dlm_worker, &dlm->dispatched_work);
return 0;
}
@@ -1866,6 +2002,23 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
}
}
+ /*
+ * If we're migrating this lock to someone else, we are no
+ * longer allowed to assert our own mastery. OTOH, we need to
+ * prevent migration from starting while we're still asserting
+ * our dominance. The reserved ast delays migration.
+ */
+ spin_lock(&res->spinlock);
+ if (res->state & DLM_LOCK_RES_MIGRATING) {
+ mlog(0, "Someone asked us to assert mastery, but we're "
+ "in the middle of migration. Skipping assert, "
+ "the new master will handle that.\n");
+ spin_unlock(&res->spinlock);
+ goto put;
+ } else
+ __dlm_lockres_reserve_ast(res);
+ spin_unlock(&res->spinlock);
+
/* this call now finishes out the nodemap
* even if one or more nodes die */
mlog(0, "worker about to master %.*s here, this=%u\n",
@@ -1875,9 +2028,14 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data)
nodemap, flags);
if (ret < 0) {
/* no need to restart, we are done */
- mlog_errno(ret);
+ if (!dlm_is_host_down(ret))
+ mlog_errno(ret);
}
+ /* Ok, we've asserted ourselves. Let's let migration start. */
+ dlm_lockres_release_ast(dlm, res);
+
+put:
dlm_lockres_put(res);
mlog(0, "finished with dlm_assert_master_worker\n");
@@ -1916,6 +2074,7 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm,
BUG();
/* host is down, so answer for that node would be
* DLM_LOCK_RES_OWNER_UNKNOWN. continue. */
+ ret = 0;
}
if (master != DLM_LOCK_RES_OWNER_UNKNOWN) {
@@ -2016,14 +2175,14 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
*/
ret = -ENOMEM;
- mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_KERNEL);
+ mres = (struct dlm_migratable_lockres *) __get_free_page(GFP_NOFS);
if (!mres) {
mlog_errno(ret);
goto leave;
}
mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
- GFP_KERNEL);
+ GFP_NOFS);
if (!mle) {
mlog_errno(ret);
goto leave;
@@ -2117,7 +2276,7 @@ fail:
* take both dlm->spinlock and dlm->master_lock */
spin_lock(&dlm->spinlock);
spin_lock(&dlm->master_lock);
- dlm_get_mle(mle);
+ dlm_get_mle_inuse(mle);
spin_unlock(&dlm->master_lock);
spin_unlock(&dlm->spinlock);
@@ -2134,7 +2293,10 @@ fail:
/* migration failed, detach and clean up mle */
dlm_mle_detach_hb_events(dlm, mle);
dlm_put_mle(mle);
- dlm_put_mle(mle);
+ dlm_put_mle_inuse(mle);
+ spin_lock(&res->spinlock);
+ res->state &= ~DLM_LOCK_RES_MIGRATING;
+ spin_unlock(&res->spinlock);
goto leave;
}
@@ -2164,8 +2326,8 @@ fail:
/* avoid hang during shutdown when migrating lockres
* to a node which also goes down */
if (dlm_is_node_dead(dlm, target)) {
- mlog(0, "%s:%.*s: expected migration target %u "
- "is no longer up. restarting.\n",
+ mlog(0, "%s:%.*s: expected migration "
+ "target %u is no longer up, restarting\n",
dlm->name, res->lockname.len,
res->lockname.name, target);
ret = -ERESTARTSYS;
@@ -2175,7 +2337,10 @@ fail:
/* migration failed, detach and clean up mle */
dlm_mle_detach_hb_events(dlm, mle);
dlm_put_mle(mle);
- dlm_put_mle(mle);
+ dlm_put_mle_inuse(mle);
+ spin_lock(&res->spinlock);
+ res->state &= ~DLM_LOCK_RES_MIGRATING;
+ spin_unlock(&res->spinlock);
goto leave;
}
/* TODO: if node died: stop, clean up, return error */
@@ -2191,7 +2356,7 @@ fail:
/* master is known, detach if not already detached */
dlm_mle_detach_hb_events(dlm, mle);
- dlm_put_mle(mle);
+ dlm_put_mle_inuse(mle);
ret = 0;
dlm_lockres_calc_usage(dlm, res);
@@ -2462,7 +2627,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
struct dlm_migrate_request *migrate = (struct dlm_migrate_request *) msg->buf;
struct dlm_master_list_entry *mle = NULL, *oldmle = NULL;
const char *name;
- unsigned int namelen;
+ unsigned int namelen, hash;
int ret = 0;
if (!dlm_grab(dlm))
@@ -2470,10 +2635,11 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
name = migrate->name;
namelen = migrate->namelen;
+ hash = dlm_lockid_hash(name, namelen);
/* preallocate.. if this fails, abort */
mle = (struct dlm_master_list_entry *) kmem_cache_alloc(dlm_mle_cache,
- GFP_KERNEL);
+ GFP_NOFS);
if (!mle) {
ret = -ENOMEM;
@@ -2482,7 +2648,7 @@ int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data)
/* check for pre-existing lock */
spin_lock(&dlm->spinlock);
- res = __dlm_lookup_lockres(dlm, name, namelen);
+ res = __dlm_lookup_lockres(dlm, name, namelen, hash);
spin_lock(&dlm->master_lock);
if (res) {
@@ -2580,6 +2746,7 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm,
/* remove it from the list so that only one
* mle will be found */
list_del_init(&tmp->list);
+ __dlm_mle_detach_hb_events(dlm, mle);
}
spin_unlock(&tmp->spinlock);
}
@@ -2601,6 +2768,7 @@ void dlm_clean_master_list(struct dlm_ctxt *dlm, u8 dead_node)
struct list_head *iter, *iter2;
struct dlm_master_list_entry *mle;
struct dlm_lock_resource *res;
+ unsigned int hash;
mlog_entry("dlm=%s, dead node=%u\n", dlm->name, dead_node);
top:
@@ -2640,7 +2808,7 @@ top:
* may result in the mle being unlinked and
* freed, but there may still be a process
* waiting in the dlmlock path which is fine. */
- mlog(ML_ERROR, "node %u was expected master\n",
+ mlog(0, "node %u was expected master\n",
dead_node);
atomic_set(&mle->woken, 1);
spin_unlock(&mle->spinlock);
@@ -2673,19 +2841,21 @@ top:
/* remove from the list early. NOTE: unlinking
* list_head while in list_for_each_safe */
+ __dlm_mle_detach_hb_events(dlm, mle);
spin_lock(&mle->spinlock);
list_del_init(&mle->list);
atomic_set(&mle->woken, 1);
spin_unlock(&mle->spinlock);
wake_up(&mle->wq);
- mlog(0, "node %u died during migration from "
- "%u to %u!\n", dead_node,
+ mlog(0, "%s: node %u died during migration from "
+ "%u to %u!\n", dlm->name, dead_node,
mle->master, mle->new_master);
/* if there is a lockres associated with this
* mle, find it and set its owner to UNKNOWN */
+ hash = dlm_lockid_hash(mle->u.name.name, mle->u.name.len);
res = __dlm_lookup_lockres(dlm, mle->u.name.name,
- mle->u.name.len);
+ mle->u.name.len, hash);
if (res) {
/* unfortunately if we hit this rare case, our
* lock ordering is messed. we need to drop
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 805cbabac05..da399013516 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -115,12 +115,37 @@ static u64 dlm_get_next_mig_cookie(void)
return c;
}
+static inline void dlm_set_reco_dead_node(struct dlm_ctxt *dlm,
+ u8 dead_node)
+{
+ assert_spin_locked(&dlm->spinlock);
+ if (dlm->reco.dead_node != dead_node)
+ mlog(0, "%s: changing dead_node from %u to %u\n",
+ dlm->name, dlm->reco.dead_node, dead_node);
+ dlm->reco.dead_node = dead_node;
+}
+
+static inline void dlm_set_reco_master(struct dlm_ctxt *dlm,
+ u8 master)
+{
+ assert_spin_locked(&dlm->spinlock);
+ mlog(0, "%s: changing new_master from %u to %u\n",
+ dlm->name, dlm->reco.new_master, master);
+ dlm->reco.new_master = master;
+}
+
+static inline void __dlm_reset_recovery(struct dlm_ctxt *dlm)
+{
+ assert_spin_locked(&dlm->spinlock);
+ clear_bit(dlm->reco.dead_node, dlm->recovery_map);
+ dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
+ dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
+}
+
static inline void dlm_reset_recovery(struct dlm_ctxt *dlm)
{
spin_lock(&dlm->spinlock);
- clear_bit(dlm->reco.dead_node, dlm->recovery_map);
- dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
- dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+ __dlm_reset_recovery(dlm);
spin_unlock(&dlm->spinlock);
}
@@ -132,12 +157,21 @@ void dlm_dispatch_work(void *data)
struct list_head *iter, *iter2;
struct dlm_work_item *item;
dlm_workfunc_t *workfunc;
+ int tot=0;
+
+ if (!dlm_joined(dlm))
+ return;
spin_lock(&dlm->work_lock);
list_splice_init(&dlm->work_list, &tmp_list);
spin_unlock(&dlm->work_lock);
list_for_each_safe(iter, iter2, &tmp_list) {
+ tot++;
+ }
+ mlog(0, "%s: work thread has %d work items\n", dlm->name, tot);
+
+ list_for_each_safe(iter, iter2, &tmp_list) {
item = list_entry(iter, struct dlm_work_item, list);
workfunc = item->func;
list_del_init(&item->list);
@@ -220,6 +254,52 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm)
*
*/
+static void dlm_print_reco_node_status(struct dlm_ctxt *dlm)
+{
+ struct dlm_reco_node_data *ndata;
+ struct dlm_lock_resource *res;
+
+ mlog(ML_NOTICE, "%s(%d): recovery info, state=%s, dead=%u, master=%u\n",
+ dlm->name, dlm->dlm_reco_thread_task->pid,
+ dlm->reco.state & DLM_RECO_STATE_ACTIVE ? "ACTIVE" : "inactive",
+ dlm->reco.dead_node, dlm->reco.new_master);
+
+ list_for_each_entry(ndata, &dlm->reco.node_data, list) {
+ char *st = "unknown";
+ switch (ndata->state) {
+ case DLM_RECO_NODE_DATA_INIT:
+ st = "init";
+ break;
+ case DLM_RECO_NODE_DATA_REQUESTING:
+ st = "requesting";
+ break;
+ case DLM_RECO_NODE_DATA_DEAD:
+ st = "dead";
+ break;
+ case DLM_RECO_NODE_DATA_RECEIVING:
+ st = "receiving";
+ break;
+ case DLM_RECO_NODE_DATA_REQUESTED:
+ st = "requested";
+ break;
+ case DLM_RECO_NODE_DATA_DONE:
+ st = "done";
+ break;
+ case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+ st = "finalize-sent";
+ break;
+ default:
+ st = "bad";
+ break;
+ }
+ mlog(ML_NOTICE, "%s: reco state, node %u, state=%s\n",
+ dlm->name, ndata->node_num, st);
+ }
+ list_for_each_entry(res, &dlm->reco.resources, recovering) {
+ mlog(ML_NOTICE, "%s: lockres %.*s on recovering list\n",
+ dlm->name, res->lockname.len, res->lockname.name);
+ }
+}
#define DLM_RECO_THREAD_TIMEOUT_MS (5 * 1000)
@@ -267,11 +347,23 @@ int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
{
int dead;
spin_lock(&dlm->spinlock);
- dead = test_bit(node, dlm->domain_map);
+ dead = !test_bit(node, dlm->domain_map);
spin_unlock(&dlm->spinlock);
return dead;
}
+/* returns true if node is not set in the recovery map
+ * (contrast dlm_is_node_dead, which checks the domain map) */
+static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
+{
+ int recovered;
+ spin_lock(&dlm->spinlock);
+ recovered = !test_bit(node, dlm->recovery_map);
+ spin_unlock(&dlm->spinlock);
+ return recovered;
+}
+
+
int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
{
if (timeout) {
@@ -290,6 +382,24 @@ int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
return 0;
}
+int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
+{
+ if (timeout) {
+ mlog(0, "%s: waiting %dms for notification of "
+ "recovery of node %u\n", dlm->name, timeout, node);
+ wait_event_timeout(dlm->dlm_reco_thread_wq,
+ dlm_is_node_recovered(dlm, node),
+ msecs_to_jiffies(timeout));
+ } else {
+ mlog(0, "%s: waiting indefinitely for notification "
+ "of recovery of node %u\n", dlm->name, node);
+ wait_event(dlm->dlm_reco_thread_wq,
+ dlm_is_node_recovered(dlm, node));
+ }
+ /* for now, return 0 */
+ return 0;
+}
+
/* callers of the top-level api calls (dlmlock/dlmunlock) should
* block on the dlm->reco.event when recovery is in progress.
* the dlm recovery thread will set this state when it begins
@@ -308,6 +418,13 @@ static int dlm_in_recovery(struct dlm_ctxt *dlm)
void dlm_wait_for_recovery(struct dlm_ctxt *dlm)
{
+ if (dlm_in_recovery(dlm)) {
+ mlog(0, "%s: reco thread %d in recovery: "
+ "state=%d, master=%u, dead=%u\n",
+ dlm->name, dlm->dlm_reco_thread_task->pid,
+ dlm->reco.state, dlm->reco.new_master,
+ dlm->reco.dead_node);
+ }
wait_event(dlm->reco.event, !dlm_in_recovery(dlm));
}
@@ -341,7 +458,7 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
mlog(0, "new master %u died while recovering %u!\n",
dlm->reco.new_master, dlm->reco.dead_node);
/* unset the new_master, leave dead_node */
- dlm->reco.new_master = O2NM_INVALID_NODE_NUM;
+ dlm_set_reco_master(dlm, O2NM_INVALID_NODE_NUM);
}
/* select a target to recover */
@@ -350,14 +467,14 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
bit = find_next_bit (dlm->recovery_map, O2NM_MAX_NODES+1, 0);
if (bit >= O2NM_MAX_NODES || bit < 0)
- dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+ dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
else
- dlm->reco.dead_node = bit;
+ dlm_set_reco_dead_node(dlm, bit);
} else if (!test_bit(dlm->reco.dead_node, dlm->recovery_map)) {
/* BUG? */
mlog(ML_ERROR, "dead_node %u no longer in recovery map!\n",
dlm->reco.dead_node);
- dlm->reco.dead_node = O2NM_INVALID_NODE_NUM;
+ dlm_set_reco_dead_node(dlm, O2NM_INVALID_NODE_NUM);
}
if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
@@ -366,7 +483,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
/* return to main thread loop and sleep. */
return 0;
}
- mlog(0, "recovery thread found node %u in the recovery map!\n",
+ mlog(0, "%s(%d):recovery thread found node %u in the recovery map!\n",
+ dlm->name, dlm->dlm_reco_thread_task->pid,
dlm->reco.dead_node);
spin_unlock(&dlm->spinlock);
@@ -389,8 +507,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
}
mlog(0, "another node will master this recovery session.\n");
}
- mlog(0, "dlm=%s, new_master=%u, this node=%u, dead_node=%u\n",
- dlm->name, dlm->reco.new_master,
+ mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n",
+ dlm->name, dlm->dlm_reco_thread_task->pid, dlm->reco.new_master,
dlm->node_num, dlm->reco.dead_node);
/* it is safe to start everything back up here
@@ -402,11 +520,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
return 0;
master_here:
- mlog(0, "mastering recovery of %s:%u here(this=%u)!\n",
+ mlog(0, "(%d) mastering recovery of %s:%u here(this=%u)!\n",
+ dlm->dlm_reco_thread_task->pid,
dlm->name, dlm->reco.dead_node, dlm->node_num);
status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
if (status < 0) {
+ /* we should never hit this anymore */
mlog(ML_ERROR, "error %d remastering locks for node %u, "
"retrying.\n", status, dlm->reco.dead_node);
/* yield a bit to allow any final network messages
@@ -433,9 +553,16 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
int destroy = 0;
int pass = 0;
- status = dlm_init_recovery_area(dlm, dead_node);
- if (status < 0)
- goto leave;
+ do {
+ /* we have become recovery master. there is no escaping
+ * this, so just keep trying until we get it. */
+ status = dlm_init_recovery_area(dlm, dead_node);
+ if (status < 0) {
+ mlog(ML_ERROR, "%s: failed to alloc recovery area, "
+ "retrying\n", dlm->name);
+ msleep(1000);
+ }
+ } while (status != 0);
/* safe to access the node data list without a lock, since this
* process is the only one to change the list */
@@ -452,16 +579,36 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
continue;
}
- status = dlm_request_all_locks(dlm, ndata->node_num, dead_node);
- if (status < 0) {
- mlog_errno(status);
- if (dlm_is_host_down(status))
- ndata->state = DLM_RECO_NODE_DATA_DEAD;
- else {
- destroy = 1;
- goto leave;
+ do {
+ status = dlm_request_all_locks(dlm, ndata->node_num,
+ dead_node);
+ if (status < 0) {
+ mlog_errno(status);
+ if (dlm_is_host_down(status)) {
+ /* node died, ignore it for recovery */
+ status = 0;
+ ndata->state = DLM_RECO_NODE_DATA_DEAD;
+ /* wait for the domain map to catch up
+ * with the network state. */
+ wait_event_timeout(dlm->dlm_reco_thread_wq,
+ dlm_is_node_dead(dlm,
+ ndata->node_num),
+ msecs_to_jiffies(1000));
+ mlog(0, "waited 1 sec for %u, "
+ "dead? %s\n", ndata->node_num,
+ dlm_is_node_dead(dlm, ndata->node_num) ?
+ "yes" : "no");
+ } else {
+ /* -ENOMEM on the other node */
+ mlog(0, "%s: node %u returned "
+ "%d during recovery, retrying "
+ "after a short wait\n",
+ dlm->name, ndata->node_num,
+ status);
+ msleep(100);
+ }
}
- }
+ } while (status != 0);
switch (ndata->state) {
case DLM_RECO_NODE_DATA_INIT:
@@ -473,10 +620,9 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
mlog(0, "node %u died after requesting "
"recovery info for node %u\n",
ndata->node_num, dead_node);
- // start all over
- destroy = 1;
- status = -EAGAIN;
- goto leave;
+ /* fine. don't need this node's info.
+ * continue without it. */
+ break;
case DLM_RECO_NODE_DATA_REQUESTING:
ndata->state = DLM_RECO_NODE_DATA_REQUESTED;
mlog(0, "now receiving recovery data from "
@@ -520,35 +666,26 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
BUG();
break;
case DLM_RECO_NODE_DATA_DEAD:
- mlog(ML_NOTICE, "node %u died after "
+ mlog(0, "node %u died after "
"requesting recovery info for "
"node %u\n", ndata->node_num,
dead_node);
- spin_unlock(&dlm_reco_state_lock);
- // start all over
- destroy = 1;
- status = -EAGAIN;
- /* instead of spinning like crazy here,
- * wait for the domain map to catch up
- * with the network state. otherwise this
- * can be hit hundreds of times before
- * the node is really seen as dead. */
- wait_event_timeout(dlm->dlm_reco_thread_wq,
- dlm_is_node_dead(dlm,
- ndata->node_num),
- msecs_to_jiffies(1000));
- mlog(0, "waited 1 sec for %u, "
- "dead? %s\n", ndata->node_num,
- dlm_is_node_dead(dlm, ndata->node_num) ?
- "yes" : "no");
- goto leave;
+ break;
case DLM_RECO_NODE_DATA_RECEIVING:
case DLM_RECO_NODE_DATA_REQUESTED:
+ mlog(0, "%s: node %u still in state %s\n",
+ dlm->name, ndata->node_num,
+ ndata->state==DLM_RECO_NODE_DATA_RECEIVING ?
+ "receiving" : "requested");
all_nodes_done = 0;
break;
case DLM_RECO_NODE_DATA_DONE:
+ mlog(0, "%s: node %u state is done\n",
+ dlm->name, ndata->node_num);
break;
case DLM_RECO_NODE_DATA_FINALIZE_SENT:
+ mlog(0, "%s: node %u state is finalize\n",
+ dlm->name, ndata->node_num);
break;
}
}
@@ -578,7 +715,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
jiffies, dlm->reco.dead_node,
dlm->node_num, dlm->reco.new_master);
destroy = 1;
- status = ret;
+ status = 0;
/* rescan everything marked dirty along the way */
dlm_kick_thread(dlm, NULL);
break;
@@ -591,7 +728,6 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
}
-leave:
if (destroy)
dlm_destroy_recovery_area(dlm, dead_node);
@@ -617,7 +753,7 @@ static int dlm_init_recovery_area(struct dlm_ctxt *dlm, u8 dead_node)
}
BUG_ON(num == dead_node);
- ndata = kcalloc(1, sizeof(*ndata), GFP_KERNEL);
+ ndata = kcalloc(1, sizeof(*ndata), GFP_NOFS);
if (!ndata) {
dlm_destroy_recovery_area(dlm, dead_node);
return -ENOMEM;
@@ -691,16 +827,25 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
if (!dlm_grab(dlm))
return -EINVAL;
+ if (lr->dead_node != dlm->reco.dead_node) {
+ mlog(ML_ERROR, "%s: node %u sent dead_node=%u, but local "
+ "dead_node is %u\n", dlm->name, lr->node_idx,
+ lr->dead_node, dlm->reco.dead_node);
+ dlm_print_reco_node_status(dlm);
+ /* this is a hack */
+ dlm_put(dlm);
+ return -ENOMEM;
+ }
BUG_ON(lr->dead_node != dlm->reco.dead_node);
- item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+ item = kcalloc(1, sizeof(*item), GFP_NOFS);
if (!item) {
dlm_put(dlm);
return -ENOMEM;
}
/* this will get freed by dlm_request_all_locks_worker */
- buf = (char *) __get_free_page(GFP_KERNEL);
+ buf = (char *) __get_free_page(GFP_NOFS);
if (!buf) {
kfree(item);
dlm_put(dlm);
@@ -715,7 +860,7 @@ int dlm_request_all_locks_handler(struct o2net_msg *msg, u32 len, void *data)
spin_lock(&dlm->work_lock);
list_add_tail(&item->list, &dlm->work_list);
spin_unlock(&dlm->work_lock);
- schedule_work(&dlm->dispatched_work);
+ queue_work(dlm->dlm_worker, &dlm->dispatched_work);
dlm_put(dlm);
return 0;
@@ -730,32 +875,34 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
struct list_head *iter;
int ret;
u8 dead_node, reco_master;
+ int skip_all_done = 0;
dlm = item->dlm;
dead_node = item->u.ral.dead_node;
reco_master = item->u.ral.reco_master;
mres = (struct dlm_migratable_lockres *)data;
+ mlog(0, "%s: recovery worker started, dead=%u, master=%u\n",
+ dlm->name, dead_node, reco_master);
+
if (dead_node != dlm->reco.dead_node ||
reco_master != dlm->reco.new_master) {
- /* show extra debug info if the recovery state is messed */
- mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), "
- "request(dead=%u, master=%u)\n",
- dlm->name, dlm->reco.dead_node, dlm->reco.new_master,
- dead_node, reco_master);
- mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u "
- "entry[0]={c=%u:%llu,l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",
- dlm->name, mres->lockname_len, mres->lockname, mres->master,
- mres->num_locks, mres->total_locks, mres->flags,
- dlm_get_lock_cookie_node(mres->ml[0].cookie),
- dlm_get_lock_cookie_seq(mres->ml[0].cookie),
- mres->ml[0].list, mres->ml[0].flags,
- mres->ml[0].type, mres->ml[0].convert_type,
- mres->ml[0].highest_blocked, mres->ml[0].node);
- BUG();
+ /* worker could have been created before the recovery master
+ * died. if so, do not continue, but do not error. */
+ if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
+ mlog(ML_NOTICE, "%s: will not send recovery state, "
+ "recovery master %u died, thread=(dead=%u,mas=%u)"
+ " current=(dead=%u,mas=%u)\n", dlm->name,
+ reco_master, dead_node, reco_master,
+ dlm->reco.dead_node, dlm->reco.new_master);
+ } else {
+ mlog(ML_NOTICE, "%s: reco state invalid: reco(dead=%u, "
+ "master=%u), request(dead=%u, master=%u)\n",
+ dlm->name, dlm->reco.dead_node,
+ dlm->reco.new_master, dead_node, reco_master);
+ }
+ goto leave;
}
- BUG_ON(dead_node != dlm->reco.dead_node);
- BUG_ON(reco_master != dlm->reco.new_master);
/* lock resources should have already been moved to the
* dlm->reco.resources list. now move items from that list
@@ -766,12 +913,20 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
dlm_move_reco_locks_to_list(dlm, &resources, dead_node);
/* now we can begin blasting lockreses without the dlm lock */
+
+ /* any errors returned will be due to the new_master dying,
+ * the dlm_reco_thread should detect this */
list_for_each(iter, &resources) {
res = list_entry (iter, struct dlm_lock_resource, recovering);
ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
DLM_MRES_RECOVERY);
- if (ret < 0)
- mlog_errno(ret);
+ if (ret < 0) {
+ mlog(ML_ERROR, "%s: node %u went down while sending "
+ "recovery state for dead node %u, ret=%d\n", dlm->name,
+ reco_master, dead_node, ret);
+ skip_all_done = 1;
+ break;
+ }
}
/* move the resources back to the list */
@@ -779,10 +934,15 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
list_splice_init(&resources, &dlm->reco.resources);
spin_unlock(&dlm->spinlock);
- ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
- if (ret < 0)
- mlog_errno(ret);
-
+ if (!skip_all_done) {
+ ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
+ if (ret < 0) {
+ mlog(ML_ERROR, "%s: node %u went down while sending "
+ "recovery all-done for dead node %u, ret=%d\n",
+ dlm->name, reco_master, dead_node, ret);
+ }
+ }
+leave:
free_page((unsigned long)data);
}
@@ -801,8 +961,14 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
sizeof(done_msg), send_to, &tmpret);
- /* negative status is ignored by the caller */
- if (ret >= 0)
+ if (ret < 0) {
+ if (!dlm_is_host_down(ret)) {
+ mlog_errno(ret);
+ mlog(ML_ERROR, "%s: unknown error sending data-done "
+ "to %u\n", dlm->name, send_to);
+ BUG();
+ }
+ } else
ret = tmpret;
return ret;
}
@@ -822,7 +988,11 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
mlog(0, "got DATA DONE: dead_node=%u, reco.dead_node=%u, "
"node_idx=%u, this node=%u\n", done->dead_node,
dlm->reco.dead_node, done->node_idx, dlm->node_num);
- BUG_ON(done->dead_node != dlm->reco.dead_node);
+
+ mlog_bug_on_msg((done->dead_node != dlm->reco.dead_node),
+ "Got DATA DONE: dead_node=%u, reco.dead_node=%u, "
+ "node_idx=%u, this node=%u\n", done->dead_node,
+ dlm->reco.dead_node, done->node_idx, dlm->node_num);
spin_lock(&dlm_reco_state_lock);
list_for_each(iter, &dlm->reco.node_data) {
@@ -905,13 +1075,11 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
mlog(0, "found lockres owned by dead node while "
"doing recovery for node %u. sending it.\n",
dead_node);
- list_del_init(&res->recovering);
- list_add_tail(&res->recovering, list);
+ list_move_tail(&res->recovering, list);
} else if (res->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
mlog(0, "found UNKNOWN owner while doing recovery "
"for node %u. sending it.\n", dead_node);
- list_del_init(&res->recovering);
- list_add_tail(&res->recovering, list);
+ list_move_tail(&res->recovering, list);
}
}
spin_unlock(&dlm->spinlock);
@@ -1023,8 +1191,9 @@ static int dlm_add_lock_to_array(struct dlm_lock *lock,
ml->type == LKM_PRMODE) {
/* if it is already set, this had better be a PR
* and it has to match */
- if (mres->lvb[0] && (ml->type == LKM_EXMODE ||
- memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
+ if (!dlm_lvb_is_empty(mres->lvb) &&
+ (ml->type == LKM_EXMODE ||
+ memcmp(mres->lvb, lock->lksb->lvb, DLM_LVB_LEN))) {
mlog(ML_ERROR, "mismatched lvbs!\n");
__dlm_print_one_lock_resource(lock->lockres);
BUG();
@@ -1083,22 +1252,25 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
* we must send it immediately. */
ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
res, total_locks);
- if (ret < 0) {
- // TODO
- mlog(ML_ERROR, "dlm_send_mig_lockres_msg "
- "returned %d, TODO\n", ret);
- BUG();
- }
+ if (ret < 0)
+ goto error;
}
}
/* flush any remaining locks */
ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
- if (ret < 0) {
- // TODO
- mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, "
- "TODO\n", ret);
+ if (ret < 0)
+ goto error;
+ return ret;
+
+error:
+ mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n",
+ dlm->name, ret);
+ if (!dlm_is_host_down(ret))
BUG();
- }
+ mlog(0, "%s: node %u went down while sending %s "
+ "lockres %.*s\n", dlm->name, send_to,
+ flags & DLM_MRES_RECOVERY ? "recovery" : "migration",
+ res->lockname.len, res->lockname.name);
return ret;
}
@@ -1146,8 +1318,8 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
mlog(0, "all done flag. all lockres data received!\n");
ret = -ENOMEM;
- buf = kmalloc(be16_to_cpu(msg->data_len), GFP_KERNEL);
- item = kcalloc(1, sizeof(*item), GFP_KERNEL);
+ buf = kmalloc(be16_to_cpu(msg->data_len), GFP_NOFS);
+ item = kcalloc(1, sizeof(*item), GFP_NOFS);
if (!buf || !item)
goto leave;
@@ -1238,7 +1410,7 @@ int dlm_mig_lockres_handler(struct o2net_msg *msg, u32 len, void *data)
spin_lock(&dlm->work_lock);
list_add_tail(&item->list, &dlm->work_list);
spin_unlock(&dlm->work_lock);
- schedule_work(&dlm->dispatched_work);
+ queue_work(dlm->dlm_worker, &dlm->dispatched_work);
leave:
dlm_put(dlm);
@@ -1406,6 +1578,7 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
struct dlm_ctxt *dlm = data;
struct dlm_master_requery *req = (struct dlm_master_requery *)msg->buf;
struct dlm_lock_resource *res = NULL;
+ unsigned int hash;
int master = DLM_LOCK_RES_OWNER_UNKNOWN;
u32 flags = DLM_ASSERT_MASTER_REQUERY;
@@ -1415,8 +1588,10 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data)
return master;
}
+ hash = dlm_lockid_hash(req->name, req->namelen);
+
spin_lock(&dlm->spinlock);
- res = __dlm_lookup_lockres(dlm, req->name, req->namelen);
+ res = __dlm_lookup_lockres(dlm, req->name, req->namelen, hash);
if (res) {
spin_lock(&res->spinlock);
master = res->owner;
@@ -1483,7 +1658,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
struct dlm_lock *newlock = NULL;
struct dlm_lockstatus *lksb = NULL;
int ret = 0;
- int i;
+ int i, bad;
struct list_head *iter;
struct dlm_lock *lock = NULL;
@@ -1529,8 +1704,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
/* move the lock to its proper place */
/* do not alter lock refcount. switching lists. */
- list_del_init(&lock->list);
- list_add_tail(&lock->list, queue);
+ list_move_tail(&lock->list, queue);
spin_unlock(&res->spinlock);
mlog(0, "just reordered a local lock!\n");
@@ -1553,28 +1727,48 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
}
lksb->flags |= (ml->flags &
(DLM_LKSB_PUT_LVB|DLM_LKSB_GET_LVB));
-
- if (mres->lvb[0]) {
+
+ if (ml->type == LKM_NLMODE)
+ goto skip_lvb;
+
+ if (!dlm_lvb_is_empty(mres->lvb)) {
if (lksb->flags & DLM_LKSB_PUT_LVB) {
/* other node was trying to update
* lvb when node died. recreate the
* lksb with the updated lvb. */
memcpy(lksb->lvb, mres->lvb, DLM_LVB_LEN);
+ /* the lock resource lvb update must happen
+ * NOW, before the spinlock is dropped.
+ * we no longer wait for the AST to update
+ * the lvb. */
+ memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
} else {
/* otherwise, the node is sending its
* most recent valid lvb info */
BUG_ON(ml->type != LKM_EXMODE &&
ml->type != LKM_PRMODE);
- if (res->lvb[0] && (ml->type == LKM_EXMODE ||
- memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
- mlog(ML_ERROR, "received bad lvb!\n");
- __dlm_print_one_lock_resource(res);
- BUG();
+ if (!dlm_lvb_is_empty(res->lvb) &&
+ (ml->type == LKM_EXMODE ||
+ memcmp(res->lvb, mres->lvb, DLM_LVB_LEN))) {
+ int i;
+ mlog(ML_ERROR, "%s:%.*s: received bad "
+ "lvb! type=%d\n", dlm->name,
+ res->lockname.len,
+ res->lockname.name, ml->type);
+ printk("lockres lvb=[");
+ for (i=0; i<DLM_LVB_LEN; i++)
+ printk("%02x", res->lvb[i]);
+ printk("]\nmigrated lvb=[");
+ for (i=0; i<DLM_LVB_LEN; i++)
+ printk("%02x", mres->lvb[i]);
+ printk("]\n");
+ dlm_print_one_lock_resource(res);
+ BUG();
}
memcpy(res->lvb, mres->lvb, DLM_LVB_LEN);
}
}
-
+skip_lvb:
/* NOTE:
* wrt lock queue ordering and recovery:
@@ -1592,9 +1786,33 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
* relative to each other, but clearly *not*
* preserved relative to locks from other nodes.
*/
+ bad = 0;
spin_lock(&res->spinlock);
- dlm_lock_get(newlock);
- list_add_tail(&newlock->list, queue);
+ list_for_each_entry(lock, queue, list) {
+ if (lock->ml.cookie == ml->cookie) {
+ u64 c = lock->ml.cookie;
+ mlog(ML_ERROR, "%s:%.*s: %u:%llu: lock already "
+ "exists on this lockres!\n", dlm->name,
+ res->lockname.len, res->lockname.name,
+ dlm_get_lock_cookie_node(c),
+ dlm_get_lock_cookie_seq(c));
+
+ mlog(ML_NOTICE, "sent lock: type=%d, conv=%d, "
+ "node=%u, cookie=%u:%llu, queue=%d\n",
+ ml->type, ml->convert_type, ml->node,
+ dlm_get_lock_cookie_node(ml->cookie),
+ dlm_get_lock_cookie_seq(ml->cookie),
+ ml->list);
+
+ __dlm_print_one_lock_resource(res);
+ bad = 1;
+ break;
+ }
+ }
+ if (!bad) {
+ dlm_lock_get(newlock);
+ list_add_tail(&newlock->list, queue);
+ }
spin_unlock(&res->spinlock);
}
mlog(0, "done running all the locks\n");
@@ -1618,8 +1836,14 @@ void dlm_move_lockres_to_recovery_list(struct dlm_ctxt *dlm,
struct dlm_lock *lock;
res->state |= DLM_LOCK_RES_RECOVERING;
- if (!list_empty(&res->recovering))
+ if (!list_empty(&res->recovering)) {
+ mlog(0,
+ "Recovering res %s:%.*s, is already on recovery list!\n",
+ dlm->name, res->lockname.len, res->lockname.name);
list_del_init(&res->recovering);
+ }
+ /* We need to hold a reference while on the recovery list */
+ dlm_lockres_get(res);
list_add_tail(&res->recovering, &dlm->reco.resources);
/* find any pending locks and put them back on proper list */
@@ -1708,9 +1932,11 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
spin_lock(&res->spinlock);
dlm_change_lockres_owner(dlm, res, new_master);
res->state &= ~DLM_LOCK_RES_RECOVERING;
- __dlm_dirty_lockres(dlm, res);
+ if (!__dlm_lockres_unused(res))
+ __dlm_dirty_lockres(dlm, res);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
+ dlm_lockres_put(res);
}
}
@@ -1719,7 +1945,7 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
* the RECOVERING state and set the owner
* if necessary */
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
- bucket = &(dlm->lockres_hash[i]);
+ bucket = dlm_lockres_hash(dlm, i);
hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
if (res->state & DLM_LOCK_RES_RECOVERING) {
if (res->owner == dead_node) {
@@ -1743,11 +1969,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
dlm->name, res->lockname.len,
res->lockname.name, res->owner);
list_del_init(&res->recovering);
+ dlm_lockres_put(res);
}
spin_lock(&res->spinlock);
dlm_change_lockres_owner(dlm, res, new_master);
res->state &= ~DLM_LOCK_RES_RECOVERING;
- __dlm_dirty_lockres(dlm, res);
+ if (!__dlm_lockres_unused(res))
+ __dlm_dirty_lockres(dlm, res);
spin_unlock(&res->spinlock);
wake_up(&res->wq);
}
@@ -1884,7 +2112,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
* need to be fired as a result.
*/
for (i = 0; i < DLM_HASH_BUCKETS; i++) {
- bucket = &(dlm->lockres_hash[i]);
+ bucket = dlm_lockres_hash(dlm, i);
hlist_for_each_entry(res, iter, bucket, hash_node) {
/* always prune any $RECOVERY entries for dead nodes,
* otherwise hangs can occur during later recovery */
@@ -1924,6 +2152,20 @@ static void __dlm_hb_node_down(struct dlm_ctxt *dlm, int idx)
{
assert_spin_locked(&dlm->spinlock);
+ if (dlm->reco.new_master == idx) {
+ mlog(0, "%s: recovery master %d just died\n",
+ dlm->name, idx);
+ if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+ /* finalize1 was reached, so it is safe to clear
+ * the new_master and dead_node. that recovery
+ * is complete. */
+ mlog(0, "%s: dead master %d had reached "
+ "finalize1 state, clearing\n", dlm->name, idx);
+ dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+ __dlm_reset_recovery(dlm);
+ }
+ }
+
/* check to see if the node is already considered dead */
if (!test_bit(idx, dlm->live_nodes_map)) {
mlog(0, "for domain %s, node %d is already dead. "
@@ -2087,7 +2329,7 @@ again:
/* set the new_master to this node */
spin_lock(&dlm->spinlock);
- dlm->reco.new_master = dlm->node_num;
+ dlm_set_reco_master(dlm, dlm->node_num);
spin_unlock(&dlm->spinlock);
}
@@ -2125,6 +2367,10 @@ again:
mlog(0, "%s: reco master %u is ready to recover %u\n",
dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
status = -EEXIST;
+ } else if (ret == DLM_RECOVERING) {
+ mlog(0, "dlm=%s dlmlock says master node died (this=%u)\n",
+ dlm->name, dlm->node_num);
+ goto again;
} else {
struct dlm_lock_resource *res;
@@ -2156,7 +2402,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
mlog_entry("%u\n", dead_node);
- mlog(0, "dead node is %u\n", dead_node);
+ mlog(0, "%s: dead node is %u\n", dlm->name, dead_node);
spin_lock(&dlm->spinlock);
dlm_node_iter_init(dlm->domain_map, &iter);
@@ -2214,6 +2460,14 @@ retry:
* another ENOMEM */
msleep(100);
goto retry;
+ } else if (ret == EAGAIN) {
+ mlog(0, "%s: trying to start recovery of node "
+ "%u, but node %u is waiting for last recovery "
+ "to complete, backoff for a bit\n", dlm->name,
+ dead_node, nodenum);
+ /* TODO Look into replacing msleep with cond_resched() */
+ msleep(100);
+ goto retry;
}
}
@@ -2229,8 +2483,20 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
if (!dlm_grab(dlm))
return 0;
- mlog(0, "node %u wants to recover node %u\n",
- br->node_idx, br->dead_node);
+ spin_lock(&dlm->spinlock);
+ if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+ mlog(0, "%s: node %u wants to recover node %u (%u:%u) "
+ "but this node is in finalize state, waiting on finalize2\n",
+ dlm->name, br->node_idx, br->dead_node,
+ dlm->reco.dead_node, dlm->reco.new_master);
+ spin_unlock(&dlm->spinlock);
+ return EAGAIN;
+ }
+ spin_unlock(&dlm->spinlock);
+
+ mlog(0, "%s: node %u wants to recover node %u (%u:%u)\n",
+ dlm->name, br->node_idx, br->dead_node,
+ dlm->reco.dead_node, dlm->reco.new_master);
dlm_fire_domain_eviction_callbacks(dlm, br->dead_node);
@@ -2252,8 +2518,8 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
"node %u changing it to %u\n", dlm->name,
dlm->reco.dead_node, br->node_idx, br->dead_node);
}
- dlm->reco.new_master = br->node_idx;
- dlm->reco.dead_node = br->dead_node;
+ dlm_set_reco_master(dlm, br->node_idx);
+ dlm_set_reco_dead_node(dlm, br->dead_node);
if (!test_bit(br->dead_node, dlm->recovery_map)) {
mlog(0, "recovery master %u sees %u as dead, but this "
"node has not yet. marking %u as dead\n",
@@ -2272,10 +2538,16 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
spin_unlock(&dlm->spinlock);
dlm_kick_recovery_thread(dlm);
+
+ mlog(0, "%s: recovery started by node %u, for %u (%u:%u)\n",
+ dlm->name, br->node_idx, br->dead_node,
+ dlm->reco.dead_node, dlm->reco.new_master);
+
dlm_put(dlm);
return 0;
}
+#define DLM_FINALIZE_STAGE2 0x01
static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
{
int ret = 0;
@@ -2283,25 +2555,31 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
struct dlm_node_iter iter;
int nodenum;
int status;
+ int stage = 1;
- mlog(0, "finishing recovery for node %s:%u\n",
- dlm->name, dlm->reco.dead_node);
+ mlog(0, "finishing recovery for node %s:%u, "
+ "stage %d\n", dlm->name, dlm->reco.dead_node, stage);
spin_lock(&dlm->spinlock);
dlm_node_iter_init(dlm->domain_map, &iter);
spin_unlock(&dlm->spinlock);
+stage2:
memset(&fr, 0, sizeof(fr));
fr.node_idx = dlm->node_num;
fr.dead_node = dlm->reco.dead_node;
+ if (stage == 2)
+ fr.flags |= DLM_FINALIZE_STAGE2;
while ((nodenum = dlm_node_iter_next(&iter)) >= 0) {
if (nodenum == dlm->node_num)
continue;
ret = o2net_send_message(DLM_FINALIZE_RECO_MSG, dlm->key,
&fr, sizeof(fr), nodenum, &status);
- if (ret >= 0) {
+ if (ret >= 0)
ret = status;
+ if (ret < 0) {
+ mlog_errno(ret);
if (dlm_is_host_down(ret)) {
/* this has no effect on this recovery
* session, so set the status to zero to
@@ -2309,13 +2587,17 @@ static int dlm_send_finalize_reco_message(struct dlm_ctxt *dlm)
mlog(ML_ERROR, "node %u went down after this "
"node finished recovery.\n", nodenum);
ret = 0;
+ continue;
}
- }
- if (ret < 0) {
- mlog_errno(ret);
break;
}
}
+ if (stage == 1) {
+ /* reset the node_iter back to the top and send finalize2 */
+ iter.curnode = -1;
+ stage = 2;
+ goto stage2;
+ }
return ret;
}
@@ -2324,14 +2606,19 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
{
struct dlm_ctxt *dlm = data;
struct dlm_finalize_reco *fr = (struct dlm_finalize_reco *)msg->buf;
+ int stage = 1;
/* ok to return 0, domain has gone away */
if (!dlm_grab(dlm))
return 0;
- mlog(0, "node %u finalizing recovery of node %u\n",
- fr->node_idx, fr->dead_node);
+ if (fr->flags & DLM_FINALIZE_STAGE2)
+ stage = 2;
+ mlog(0, "%s: node %u finalizing recovery stage%d of "
+ "node %u (%u:%u)\n", dlm->name, fr->node_idx, stage,
+ fr->dead_node, dlm->reco.dead_node, dlm->reco.new_master);
+
spin_lock(&dlm->spinlock);
if (dlm->reco.new_master != fr->node_idx) {
@@ -2347,13 +2634,41 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data)
BUG();
}
- dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
-
- spin_unlock(&dlm->spinlock);
+ switch (stage) {
+ case 1:
+ dlm_finish_local_lockres_recovery(dlm, fr->dead_node, fr->node_idx);
+ if (dlm->reco.state & DLM_RECO_STATE_FINALIZE) {
+ mlog(ML_ERROR, "%s: received finalize1 from "
+ "new master %u for dead node %u, but "
+ "this node has already received it!\n",
+ dlm->name, fr->node_idx, fr->dead_node);
+ dlm_print_reco_node_status(dlm);
+ BUG();
+ }
+ dlm->reco.state |= DLM_RECO_STATE_FINALIZE;
+ spin_unlock(&dlm->spinlock);
+ break;
+ case 2:
+ if (!(dlm->reco.state & DLM_RECO_STATE_FINALIZE)) {
+ mlog(ML_ERROR, "%s: received finalize2 from "
+ "new master %u for dead node %u, but "
+ "this node did not have finalize1!\n",
+ dlm->name, fr->node_idx, fr->dead_node);
+ dlm_print_reco_node_status(dlm);
+ BUG();
+ }
+ dlm->reco.state &= ~DLM_RECO_STATE_FINALIZE;
+ spin_unlock(&dlm->spinlock);
+ dlm_reset_recovery(dlm);
+ dlm_kick_recovery_thread(dlm);
+ break;
+ default:
+ BUG();
+ }
- dlm_reset_recovery(dlm);
+ mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n",
+ dlm->name, fr->node_idx, dlm->reco.dead_node, dlm->reco.new_master);
- dlm_kick_recovery_thread(dlm);
dlm_put(dlm);
return 0;
}
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 5be9d14f12c..0c822f3ffb0 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -39,6 +39,7 @@
#include <linux/inet.h>
#include <linux/timer.h>
#include <linux/kthread.h>
+#include <linux/delay.h>
#include "cluster/heartbeat.h"
@@ -53,6 +54,8 @@
#include "cluster/masklog.h"
static int dlm_thread(void *data);
+static void dlm_purge_lockres_now(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *lockres);
static void dlm_flush_asts(struct dlm_ctxt *dlm);
@@ -80,7 +83,7 @@ repeat:
}
-static int __dlm_lockres_unused(struct dlm_lock_resource *res)
+int __dlm_lockres_unused(struct dlm_lock_resource *res)
{
if (list_empty(&res->granted) &&
list_empty(&res->converting) &&
@@ -103,6 +106,20 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
assert_spin_locked(&res->spinlock);
if (__dlm_lockres_unused(res)){
+ /* For now, just keep any resource we master */
+ if (res->owner == dlm->node_num)
+ {
+ if (!list_empty(&res->purge)) {
+ mlog(0, "we master %s:%.*s, but it is on "
+ "the purge list. Removing\n",
+ dlm->name, res->lockname.len,
+ res->lockname.name);
+ list_del_init(&res->purge);
+ dlm->purge_count--;
+ }
+ return;
+ }
+
if (list_empty(&res->purge)) {
mlog(0, "putting lockres %.*s from purge list\n",
res->lockname.len, res->lockname.name);
@@ -110,10 +127,23 @@ void __dlm_lockres_calc_usage(struct dlm_ctxt *dlm,
res->last_used = jiffies;
list_add_tail(&res->purge, &dlm->purge_list);
dlm->purge_count++;
+
+ /* if this node is not the owner, there is
+ * no way to keep track of who the owner could be.
+ * unhash it to avoid serious problems. */
+ if (res->owner != dlm->node_num) {
+ mlog(0, "%s:%.*s: doing immediate "
+ "purge of lockres owned by %u\n",
+ dlm->name, res->lockname.len,
+ res->lockname.name, res->owner);
+
+ dlm_purge_lockres_now(dlm, res);
+ }
}
} else if (!list_empty(&res->purge)) {
- mlog(0, "removing lockres %.*s from purge list\n",
- res->lockname.len, res->lockname.name);
+ mlog(0, "removing lockres %.*s from purge list, "
+ "owner=%u\n", res->lockname.len, res->lockname.name,
+ res->owner);
list_del_init(&res->purge);
dlm->purge_count--;
@@ -165,6 +195,7 @@ again:
} else if (ret < 0) {
mlog(ML_NOTICE, "lockres %.*s: migrate failed, retrying\n",
lockres->lockname.len, lockres->lockname.name);
+ msleep(100);
goto again;
}
@@ -178,6 +209,24 @@ finish:
__dlm_unhash_lockres(lockres);
}
+/* make an unused lockres go away immediately.
+ * as soon as the dlm spinlock is dropped, this lockres
+ * will not be found. kfree still happens on last put. */
+static void dlm_purge_lockres_now(struct dlm_ctxt *dlm,
+ struct dlm_lock_resource *lockres)
+{
+ assert_spin_locked(&dlm->spinlock);
+ assert_spin_locked(&lockres->spinlock);
+
+ BUG_ON(!__dlm_lockres_unused(lockres));
+
+ if (!list_empty(&lockres->purge)) {
+ list_del_init(&lockres->purge);
+ dlm->purge_count--;
+ }
+ __dlm_unhash_lockres(lockres);
+}
+
static void dlm_run_purge_list(struct dlm_ctxt *dlm,
int purge_now)
{
@@ -318,8 +367,7 @@ converting:
target->ml.type = target->ml.convert_type;
target->ml.convert_type = LKM_IVMODE;
- list_del_init(&target->list);
- list_add_tail(&target->list, &res->granted);
+ list_move_tail(&target->list, &res->granted);
BUG_ON(!target->lksb);
target->lksb->status = DLM_NORMAL;
@@ -380,8 +428,7 @@ blocked:
target->ml.type, target->ml.node);
// target->ml.type is already correct
- list_del_init(&target->list);
- list_add_tail(&target->list, &res->granted);
+ list_move_tail(&target->list, &res->granted);
BUG_ON(!target->lksb);
target->lksb->status = DLM_NORMAL;
@@ -422,6 +469,8 @@ void __dlm_dirty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
/* don't shuffle secondary queues */
if ((res->owner == dlm->node_num) &&
!(res->state & DLM_LOCK_RES_DIRTY)) {
+ /* ref for dirty_list */
+ dlm_lockres_get(res);
list_add_tail(&res->dirty, &dlm->dirty_list);
res->state |= DLM_LOCK_RES_DIRTY;
}
@@ -606,6 +655,8 @@ static int dlm_thread(void *data)
list_del_init(&res->dirty);
spin_unlock(&res->spinlock);
spin_unlock(&dlm->spinlock);
+ /* Drop dirty_list ref */
+ dlm_lockres_put(res);
/* lockres can be re-dirtied/re-added to the
* dirty_list in this gap, but that is ok */
@@ -642,8 +693,9 @@ static int dlm_thread(void *data)
* spinlock and do NOT have the dlm lock.
* safe to reserve/queue asts and run the lists. */
- mlog(0, "calling dlm_shuffle_lists with dlm=%p, "
- "res=%p\n", dlm, res);
+ mlog(0, "calling dlm_shuffle_lists with dlm=%s, "
+ "res=%.*s\n", dlm->name,
+ res->lockname.len, res->lockname.name);
/* called while holding lockres lock */
dlm_shuffle_lists(dlm, res);
@@ -657,6 +709,8 @@ in_progress:
/* if the lock was in-progress, stick
* it on the back of the list */
if (delay) {
+ /* ref for dirty_list */
+ dlm_lockres_get(res);
spin_lock(&res->spinlock);
list_add_tail(&res->dirty, &dlm->dirty_list);
res->state |= DLM_LOCK_RES_DIRTY;
@@ -677,7 +731,7 @@ in_progress:
/* yield and continue right away if there is more work to do */
if (!n) {
- yield();
+ cond_resched();
continue;
}
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index 7b1a2754267..b0c3134f4f7 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -271,8 +271,7 @@ void dlm_commit_pending_unlock(struct dlm_lock_resource *res,
void dlm_commit_pending_cancel(struct dlm_lock_resource *res,
struct dlm_lock *lock)
{
- list_del_init(&lock->list);
- list_add_tail(&lock->list, &res->granted);
+ list_move_tail(&lock->list, &res->granted);
lock->ml.convert_type = LKM_IVMODE;
}
@@ -319,6 +318,16 @@ static enum dlm_status dlm_send_remote_unlock_request(struct dlm_ctxt *dlm,
mlog_entry("%.*s\n", res->lockname.len, res->lockname.name);
+ if (owner == dlm->node_num) {
+ /* ended up trying to contact ourself. this means
+ * that the lockres had been remote but became local
+ * via a migration. just retry it, now as local */
+ mlog(0, "%s:%.*s: this node became the master due to a "
+ "migration, re-evaluate now\n", dlm->name,
+ res->lockname.len, res->lockname.name);
+ return DLM_FORWARD;
+ }
+
memset(&unlock, 0, sizeof(unlock));
unlock.node_idx = dlm->node_num;
unlock.flags = cpu_to_be32(flags);
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
index 74ca4e5f976..e641b084b34 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -672,7 +672,7 @@ struct dlm_ctxt *user_dlm_register_context(struct qstr *name)
u32 dlm_key;
char *domain;
- domain = kmalloc(name->len + 1, GFP_KERNEL);
+ domain = kmalloc(name->len + 1, GFP_NOFS);
if (!domain) {
mlog_errno(-ENOMEM);
return ERR_PTR(-ENOMEM);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index eebc3cfa6be..3fe8781c22c 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -222,8 +222,7 @@ void ocfs2_handle_add_inode(struct ocfs2_journal_handle *handle,
BUG_ON(!list_empty(&OCFS2_I(inode)->ip_handle_list));
OCFS2_I(inode)->ip_handle = handle;
- list_del(&(OCFS2_I(inode)->ip_handle_list));
- list_add_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
+ list_move_tail(&(OCFS2_I(inode)->ip_handle_list), &(handle->inode_list));
}
static void ocfs2_handle_unlock_inodes(struct ocfs2_journal_handle *handle)
diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c
index efc7c91128a..93a56bd4a2b 100644
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -1,5 +1,4 @@
-/* $Id: inode.c,v 1.15 2001/11/12 09:43:39 davem Exp $
- * openpromfs.c: /proc/openprom handling routines
+/* inode.c: /proc/openprom handling routines
*
* Copyright (C) 1996-1999 Jakub Jelinek (jakub@redhat.com)
* Copyright (C) 1998 Eddie C. Dost (ecd@skynet.be)
@@ -12,762 +11,245 @@
#include <linux/openprom_fs.h>
#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/seq_file.h>
#include <asm/openprom.h>
#include <asm/oplib.h>
+#include <asm/prom.h>
#include <asm/uaccess.h>
-#define ALIASES_NNODES 64
-
-typedef struct {
- u16 parent;
- u16 next;
- u16 child;
- u16 first_prop;
- u32 node;
-} openpromfs_node;
-
-typedef struct {
-#define OPP_STRING 0x10
-#define OPP_STRINGLIST 0x20
-#define OPP_BINARY 0x40
-#define OPP_HEXSTRING 0x80
-#define OPP_DIRTY 0x01
-#define OPP_QUOTED 0x02
-#define OPP_NOTQUOTED 0x04
-#define OPP_ASCIIZ 0x08
- u32 flag;
- u32 alloclen;
- u32 len;
- char *value;
- char name[8];
-} openprom_property;
-
-static openpromfs_node *nodes;
-static int alloced;
-static u16 last_node;
-static u16 first_prop;
-static u16 options = 0xffff;
-static u16 aliases = 0xffff;
-static int aliases_nodes;
-static char *alias_names [ALIASES_NNODES];
-
-#define OPENPROM_ROOT_INO 16
-#define OPENPROM_FIRST_INO OPENPROM_ROOT_INO
-#define NODE(ino) nodes[ino - OPENPROM_FIRST_INO]
-#define NODE2INO(node) (node + OPENPROM_FIRST_INO)
-#define NODEP2INO(no) (no + OPENPROM_FIRST_INO + last_node)
-
-static int openpromfs_create (struct inode *, struct dentry *, int, struct nameidata *);
-static int openpromfs_readdir(struct file *, void *, filldir_t);
-static struct dentry *openpromfs_lookup(struct inode *, struct dentry *dentry, struct nameidata *nd);
-static int openpromfs_unlink (struct inode *, struct dentry *dentry);
+static DEFINE_MUTEX(op_mutex);
-static inline u16 ptr_nod(void *p)
-{
- return (long)p & 0xFFFF;
-}
+#define OPENPROM_ROOT_INO 0
-static ssize_t nodenum_read(struct file *file, char __user *buf,
- size_t count, loff_t *ppos)
+enum op_inode_type {
+ op_inode_node,
+ op_inode_prop,
+};
+
+union op_inode_data {
+ struct device_node *node;
+ struct property *prop;
+};
+
+struct op_inode_info {
+ struct inode vfs_inode;
+ enum op_inode_type type;
+ union op_inode_data u;
+};
+
+static inline struct op_inode_info *OP_I(struct inode *inode)
{
- struct inode *inode = file->f_dentry->d_inode;
- char buffer[10];
-
- if (count < 0 || !inode->u.generic_ip)
- return -EINVAL;
- sprintf (buffer, "%8.8lx\n", (long)inode->u.generic_ip);
- if (file->f_pos >= 9)
- return 0;
- if (count > 9 - file->f_pos)
- count = 9 - file->f_pos;
- if (copy_to_user(buf, buffer + file->f_pos, count))
- return -EFAULT;
- *ppos += count;
- return count;
+ return container_of(inode, struct op_inode_info, vfs_inode);
}
-static ssize_t property_read(struct file *filp, char __user *buf,
- size_t count, loff_t *ppos)
+static int is_string(unsigned char *p, int len)
{
- struct inode *inode = filp->f_dentry->d_inode;
- int i, j, k;
- u32 node;
- char *p, *s;
- u32 *q;
- openprom_property *op;
- char buffer[64];
-
- if (!filp->private_data) {
- node = nodes[ptr_nod(inode->u.generic_ip)].node;
- i = ((u32)(long)inode->u.generic_ip) >> 16;
- if (ptr_nod(inode->u.generic_ip) == aliases) {
- if (i >= aliases_nodes)
- p = NULL;
- else
- p = alias_names [i];
- } else
- for (p = prom_firstprop (node, buffer);
- i && p && *p;
- p = prom_nextprop (node, p, buffer), i--)
- /* nothing */ ;
- if (!p || !*p)
- return -EIO;
- i = prom_getproplen (node, p);
- if (i < 0) {
- if (ptr_nod(inode->u.generic_ip) == aliases)
- i = 0;
- else
- return -EIO;
- }
- k = i;
- if (i < 64) i = 64;
- filp->private_data = kmalloc (sizeof (openprom_property)
- + (j = strlen (p)) + 2 * i,
- GFP_KERNEL);
- if (!filp->private_data)
- return -ENOMEM;
- op = filp->private_data;
- op->flag = 0;
- op->alloclen = 2 * i;
- strcpy (op->name, p);
- op->value = (char *)(((unsigned long)(op->name + j + 4)) & ~3);
- op->len = k;
- if (k && prom_getproperty (node, p, op->value, i) < 0)
- return -EIO;
- op->value [k] = 0;
- if (k) {
- for (s = NULL, p = op->value; p < op->value + k; p++) {
- if ((*p >= ' ' && *p <= '~') || *p == '\n') {
- op->flag |= OPP_STRING;
- s = p;
- continue;
- }
- if (p > op->value && !*p && s == p - 1) {
- if (p < op->value + k - 1)
- op->flag |= OPP_STRINGLIST;
- else
- op->flag |= OPP_ASCIIZ;
- continue;
- }
- if (k == 1 && !*p) {
- op->flag |= (OPP_STRING|OPP_ASCIIZ);
- break;
- }
- op->flag &= ~(OPP_STRING|OPP_STRINGLIST);
- if (k & 3)
- op->flag |= OPP_HEXSTRING;
- else
- op->flag |= OPP_BINARY;
- break;
- }
- if (op->flag & OPP_STRINGLIST)
- op->flag &= ~(OPP_STRING);
- if (op->flag & OPP_ASCIIZ)
- op->len--;
- }
- } else
- op = filp->private_data;
- if (!count || !(op->len || (op->flag & OPP_ASCIIZ)))
- return 0;
- if (*ppos >= 0xffffff || count >= 0xffffff)
- return -EINVAL;
- if (op->flag & OPP_STRINGLIST) {
- for (k = 0, p = op->value; p < op->value + op->len; p++)
- if (!*p)
- k++;
- i = op->len + 4 * k + 3;
- } else if (op->flag & OPP_STRING) {
- i = op->len + 3;
- } else if (op->flag & OPP_BINARY) {
- i = (op->len * 9) >> 2;
- } else {
- i = (op->len << 1) + 1;
- }
- k = *ppos;
- if (k >= i) return 0;
- if (count > i - k) count = i - k;
- if (op->flag & OPP_STRING) {
- if (!k) {
- if (put_user('\'', buf))
- return -EFAULT;
- k++;
- count--;
- }
+ int i;
- if (k + count >= i - 2)
- j = i - 2 - k;
- else
- j = count;
-
- if (j >= 0) {
- if (copy_to_user(buf + k - *ppos,
- op->value + k - 1, j))
- return -EFAULT;
- count -= j;
- k += j;
- }
+ for (i = 0; i < len; i++) {
+ unsigned char val = p[i];
- if (count) {
- if (put_user('\'', &buf [k++ - *ppos]))
- return -EFAULT;
- }
- if (count > 1) {
- if (put_user('\n', &buf [k++ - *ppos]))
- return -EFAULT;
- }
- } else if (op->flag & OPP_STRINGLIST) {
- char *tmp;
-
- tmp = kmalloc (i, GFP_KERNEL);
- if (!tmp)
- return -ENOMEM;
-
- s = tmp;
- *s++ = '\'';
- for (p = op->value; p < op->value + op->len; p++) {
- if (!*p) {
- strcpy(s, "' + '");
- s += 5;
- continue;
- }
- *s++ = *p;
- }
- strcpy(s, "'\n");
-
- if (copy_to_user(buf, tmp + k, count))
- return -EFAULT;
-
- kfree(tmp);
- k += count;
-
- } else if (op->flag & OPP_BINARY) {
- char buffer[10];
- u32 *first, *last;
- int first_off, last_cnt;
-
- first = ((u32 *)op->value) + k / 9;
- first_off = k % 9;
- last = ((u32 *)op->value) + (k + count - 1) / 9;
- last_cnt = (k + count) % 9;
- if (!last_cnt) last_cnt = 9;
-
- if (first == last) {
- sprintf (buffer, "%08x.", *first);
- if (copy_to_user(buf, buffer + first_off,
- last_cnt - first_off))
- return -EFAULT;
- buf += last_cnt - first_off;
- } else {
- for (q = first; q <= last; q++) {
- sprintf (buffer, "%08x.", *q);
- if (q == first) {
- if (copy_to_user(buf, buffer + first_off,
- 9 - first_off))
- return -EFAULT;
- buf += 9 - first_off;
- } else if (q == last) {
- if (copy_to_user(buf, buffer, last_cnt))
- return -EFAULT;
- buf += last_cnt;
- } else {
- if (copy_to_user(buf, buffer, 9))
- return -EFAULT;
- buf += 9;
- }
- }
- }
+ if ((i && !val) ||
+ (val >= ' ' && val <= '~'))
+ continue;
- if (last == (u32 *)(op->value + op->len - 4) && last_cnt == 9) {
- if (put_user('\n', (buf - 1)))
- return -EFAULT;
- }
+ return 0;
+ }
- k += count;
+ return 1;
+}
- } else if (op->flag & OPP_HEXSTRING) {
- char buffer[3];
+static int property_show(struct seq_file *f, void *v)
+{
+ struct property *prop = f->private;
+ void *pval;
+ int len;
- if ((k < i - 1) && (k & 1)) {
- sprintf (buffer, "%02x",
- (unsigned char) *(op->value + (k >> 1)) & 0xff);
- if (put_user(buffer[1], &buf[k++ - *ppos]))
- return -EFAULT;
- count--;
- }
+ len = prop->length;
+ pval = prop->value;
- for (; (count > 1) && (k < i - 1); k += 2) {
- sprintf (buffer, "%02x",
- (unsigned char) *(op->value + (k >> 1)) & 0xff);
- if (copy_to_user(buf + k - *ppos, buffer, 2))
- return -EFAULT;
- count -= 2;
- }
+ if (is_string(pval, len)) {
+ while (len > 0) {
+ int n = strlen(pval);
- if (count && (k < i - 1)) {
- sprintf (buffer, "%02x",
- (unsigned char) *(op->value + (k >> 1)) & 0xff);
- if (put_user(buffer[0], &buf[k++ - *ppos]))
- return -EFAULT;
- count--;
- }
+ seq_printf(f, "%s", (char *) pval);
- if (count) {
- if (put_user('\n', &buf [k++ - *ppos]))
- return -EFAULT;
- }
- }
- count = k - *ppos;
- *ppos = k;
- return count;
-}
+ /* Skip over the NULL byte too. */
+ pval += n + 1;
+ len -= n + 1;
-static ssize_t property_write(struct file *filp, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- int i, j, k;
- char *p;
- u32 *q;
- void *b;
- openprom_property *op;
-
- if (*ppos >= 0xffffff || count >= 0xffffff)
- return -EINVAL;
- if (!filp->private_data) {
- i = property_read (filp, NULL, 0, NULL);
- if (i)
- return i;
- }
- k = *ppos;
- op = filp->private_data;
- if (!(op->flag & OPP_STRING)) {
- u32 *first, *last;
- int first_off, last_cnt;
- u32 mask, mask2;
- char tmp [9];
- int forcelen = 0;
-
- j = k % 9;
- for (i = 0; i < count; i++, j++) {
- if (j == 9) j = 0;
- if (!j) {
- char ctmp;
- if (get_user(ctmp, &buf[i]))
- return -EFAULT;
- if (ctmp != '.') {
- if (ctmp != '\n') {
- if (op->flag & OPP_BINARY)
- return -EINVAL;
- else
- goto write_try_string;
- } else {
- count = i + 1;
- forcelen = 1;
- break;
- }
- }
- } else {
- char ctmp;
- if (get_user(ctmp, &buf[i]))
- return -EFAULT;
- if (ctmp < '0' ||
- (ctmp > '9' && ctmp < 'A') ||
- (ctmp > 'F' && ctmp < 'a') ||
- ctmp > 'f') {
- if (op->flag & OPP_BINARY)
- return -EINVAL;
- else
- goto write_try_string;
- }
- }
- }
- op->flag |= OPP_BINARY;
- tmp [8] = 0;
- i = ((count + k + 8) / 9) << 2;
- if (op->alloclen <= i) {
- b = kmalloc (sizeof (openprom_property) + 2 * i,
- GFP_KERNEL);
- if (!b)
- return -ENOMEM;
- memcpy (b, filp->private_data,
- sizeof (openprom_property)
- + strlen (op->name) + op->alloclen);
- memset (b + sizeof (openprom_property)
- + strlen (op->name) + op->alloclen,
- 0, 2 * i - op->alloclen);
- op = b;
- op->alloclen = 2*i;
- b = filp->private_data;
- filp->private_data = op;
- kfree (b);
+ if (len > 0)
+ seq_printf(f, " + ");
}
- first = ((u32 *)op->value) + (k / 9);
- first_off = k % 9;
- last = (u32 *)(op->value + i);
- last_cnt = (k + count) % 9;
- if (first + 1 == last) {
- memset (tmp, '0', 8);
- if (copy_from_user(tmp + first_off, buf,
- (count + first_off > 8) ?
- 8 - first_off : count))
- return -EFAULT;
- mask = 0xffffffff;
- mask2 = 0xffffffff;
- for (j = 0; j < first_off; j++)
- mask >>= 1;
- for (j = 8 - count - first_off; j > 0; j--)
- mask2 <<= 1;
- mask &= mask2;
- if (mask) {
- *first &= ~mask;
- *first |= simple_strtoul (tmp, NULL, 16);
- op->flag |= OPP_DIRTY;
+ } else {
+ if (len & 3) {
+ while (len) {
+ len--;
+ if (len)
+ seq_printf(f, "%02x.",
+ *(unsigned char *) pval);
+ else
+ seq_printf(f, "%02x",
+ *(unsigned char *) pval);
+ pval++;
}
} else {
- op->flag |= OPP_DIRTY;
- for (q = first; q < last; q++) {
- if (q == first) {
- if (first_off < 8) {
- memset (tmp, '0', 8);
- if (copy_from_user(tmp + first_off,
- buf,
- 8 - first_off))
- return -EFAULT;
- mask = 0xffffffff;
- for (j = 0; j < first_off; j++)
- mask >>= 1;
- *q &= ~mask;
- *q |= simple_strtoul (tmp,NULL,16);
- }
- buf += 9;
- } else if ((q == last - 1) && last_cnt
- && (last_cnt < 8)) {
- memset (tmp, '0', 8);
- if (copy_from_user(tmp, buf, last_cnt))
- return -EFAULT;
- mask = 0xffffffff;
- for (j = 0; j < 8 - last_cnt; j++)
- mask <<= 1;
- *q &= ~mask;
- *q |= simple_strtoul (tmp, NULL, 16);
- buf += last_cnt;
- } else {
- char tchars[2 * sizeof(long) + 1];
-
- if (copy_from_user(tchars, buf, sizeof(tchars) - 1))
- return -EFAULT;
- tchars[sizeof(tchars) - 1] = '\0';
- *q = simple_strtoul (tchars, NULL, 16);
- buf += 9;
- }
- }
- }
- if (!forcelen) {
- if (op->len < i)
- op->len = i;
- } else
- op->len = i;
- *ppos += count;
- }
-write_try_string:
- if (!(op->flag & OPP_BINARY)) {
- if (!(op->flag & (OPP_QUOTED | OPP_NOTQUOTED))) {
- char ctmp;
-
- /* No way, if somebody starts writing from the middle,
- * we don't know whether he uses quotes around or not
- */
- if (k > 0)
- return -EINVAL;
- if (get_user(ctmp, buf))
- return -EFAULT;
- if (ctmp == '\'') {
- op->flag |= OPP_QUOTED;
- buf++;
- count--;
- (*ppos)++;
- if (!count) {
- op->flag |= OPP_STRING;
- return 1;
- }
- } else
- op->flag |= OPP_NOTQUOTED;
- }
- op->flag |= OPP_STRING;
- if (op->alloclen <= count + *ppos) {
- b = kmalloc (sizeof (openprom_property)
- + 2 * (count + *ppos), GFP_KERNEL);
- if (!b)
- return -ENOMEM;
- memcpy (b, filp->private_data,
- sizeof (openprom_property)
- + strlen (op->name) + op->alloclen);
- memset (b + sizeof (openprom_property)
- + strlen (op->name) + op->alloclen,
- 0, 2*(count - *ppos) - op->alloclen);
- op = b;
- op->alloclen = 2*(count + *ppos);
- b = filp->private_data;
- filp->private_data = op;
- kfree (b);
- }
- p = op->value + *ppos - ((op->flag & OPP_QUOTED) ? 1 : 0);
- if (copy_from_user(p, buf, count))
- return -EFAULT;
- op->flag |= OPP_DIRTY;
- for (i = 0; i < count; i++, p++)
- if (*p == '\n') {
- *p = 0;
- break;
+ while (len >= 4) {
+ len -= 4;
+
+ if (len)
+ seq_printf(f, "%08x.",
+ *(unsigned int *) pval);
+ else
+ seq_printf(f, "%08x",
+ *(unsigned int *) pval);
+ pval += 4;
}
- if (i < count) {
- op->len = p - op->value;
- *ppos += i + 1;
- if ((p > op->value) && (op->flag & OPP_QUOTED)
- && (*(p - 1) == '\''))
- op->len--;
- } else {
- if (p - op->value > op->len)
- op->len = p - op->value;
- *ppos += count;
}
}
- return *ppos - k;
+ seq_printf(f, "\n");
+
+ return 0;
}
-int property_release (struct inode *inode, struct file *filp)
+static void *property_start(struct seq_file *f, loff_t *pos)
{
- openprom_property *op = filp->private_data;
- int error;
- u32 node;
-
- if (!op)
- return 0;
- lock_kernel();
- node = nodes[ptr_nod(inode->u.generic_ip)].node;
- if (ptr_nod(inode->u.generic_ip) == aliases) {
- if ((op->flag & OPP_DIRTY) && (op->flag & OPP_STRING)) {
- char *p = op->name;
- int i = (op->value - op->name) - strlen (op->name) - 1;
- op->value [op->len] = 0;
- *(op->value - 1) = ' ';
- if (i) {
- for (p = op->value - i - 2; p >= op->name; p--)
- p[i] = *p;
- p = op->name + i;
- }
- memcpy (p - 8, "nvalias ", 8);
- prom_feval (p - 8);
- }
- } else if (op->flag & OPP_DIRTY) {
- if (op->flag & OPP_STRING) {
- op->value [op->len] = 0;
- error = prom_setprop (node, op->name,
- op->value, op->len + 1);
- if (error <= 0)
- printk (KERN_WARNING "openpromfs: "
- "Couldn't write property %s\n",
- op->name);
- } else if ((op->flag & OPP_BINARY) || !op->len) {
- error = prom_setprop (node, op->name,
- op->value, op->len);
- if (error <= 0)
- printk (KERN_WARNING "openpromfs: "
- "Couldn't write property %s\n",
- op->name);
- } else {
- printk (KERN_WARNING "openpromfs: "
- "Unknown property type of %s\n",
- op->name);
- }
+ if (*pos == 0)
+ return pos;
+ return NULL;
+}
+
+static void *property_next(struct seq_file *f, void *v, loff_t *pos)
+{
+ (*pos)++;
+ return NULL;
+}
+
+static void property_stop(struct seq_file *f, void *v)
+{
+ /* Nothing to do */
+}
+
+static struct seq_operations property_op = {
+ .start = property_start,
+ .next = property_next,
+ .stop = property_stop,
+ .show = property_show
+};
+
+static int property_open(struct inode *inode, struct file *file)
+{
+ struct op_inode_info *oi = OP_I(inode);
+ int ret;
+
+ BUG_ON(oi->type != op_inode_prop);
+
+ ret = seq_open(file, &property_op);
+ if (!ret) {
+ struct seq_file *m = file->private_data;
+ m->private = oi->u.prop;
}
- unlock_kernel();
- kfree (filp->private_data);
- return 0;
+ return ret;
}
static const struct file_operations openpromfs_prop_ops = {
- .read = property_read,
- .write = property_write,
- .release = property_release,
+ .open = property_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
};
-static const struct file_operations openpromfs_nodenum_ops = {
- .read = nodenum_read,
-};
+static int openpromfs_readdir(struct file *, void *, filldir_t);
static const struct file_operations openprom_operations = {
.read = generic_read_dir,
.readdir = openpromfs_readdir,
};
-static struct inode_operations openprom_alias_inode_operations = {
- .create = openpromfs_create,
- .lookup = openpromfs_lookup,
- .unlink = openpromfs_unlink,
-};
+static struct dentry *openpromfs_lookup(struct inode *, struct dentry *, struct nameidata *);
static struct inode_operations openprom_inode_operations = {
.lookup = openpromfs_lookup,
};
-static int lookup_children(u16 n, const char * name, int len)
+static struct dentry *openpromfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
{
- int ret;
- u16 node;
- for (; n != 0xffff; n = nodes[n].next) {
- node = nodes[n].child;
- if (node != 0xffff) {
- char buffer[128];
- int i;
- char *p;
-
- while (node != 0xffff) {
- if (prom_getname (nodes[node].node,
- buffer, 128) >= 0) {
- i = strlen (buffer);
- if ((len == i)
- && !strncmp (buffer, name, len))
- return NODE2INO(node);
- p = strchr (buffer, '@');
- if (p && (len == p - buffer)
- && !strncmp (buffer, name, len))
- return NODE2INO(node);
- }
- node = nodes[node].next;
- }
- } else
- continue;
- ret = lookup_children (nodes[n].child, name, len);
- if (ret) return ret;
- }
- return 0;
-}
-
-static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentry, struct nameidata *nd)
-{
- int ino = 0;
-#define OPFSL_DIR 0
-#define OPFSL_PROPERTY 1
-#define OPFSL_NODENUM 2
- int type = 0;
- char buffer[128];
- char *p;
+ struct op_inode_info *ent_oi, *oi = OP_I(dir);
+ struct device_node *dp, *child;
+ struct property *prop;
+ enum op_inode_type ent_type;
+ union op_inode_data ent_data;
const char *name;
- u32 n;
- u16 dirnode;
- unsigned int len;
- int i;
struct inode *inode;
- char buffer2[64];
+ unsigned int ino;
+ int len;
- inode = NULL;
+ BUG_ON(oi->type != op_inode_node);
+
+ dp = oi->u.node;
+
name = dentry->d_name.name;
len = dentry->d_name.len;
- lock_kernel();
- if (name [0] == '.' && len == 5 && !strncmp (name + 1, "node", 4)) {
- ino = NODEP2INO(NODE(dir->i_ino).first_prop);
- type = OPFSL_NODENUM;
- }
- if (!ino) {
- u16 node = NODE(dir->i_ino).child;
- while (node != 0xffff) {
- if (prom_getname (nodes[node].node, buffer, 128) >= 0) {
- i = strlen (buffer);
- if (len == i && !strncmp (buffer, name, len)) {
- ino = NODE2INO(node);
- type = OPFSL_DIR;
- break;
- }
- p = strchr (buffer, '@');
- if (p && (len == p - buffer)
- && !strncmp (buffer, name, len)) {
- ino = NODE2INO(node);
- type = OPFSL_DIR;
- break;
- }
- }
- node = nodes[node].next;
- }
- }
- n = NODE(dir->i_ino).node;
- dirnode = dir->i_ino - OPENPROM_FIRST_INO;
- if (!ino) {
- int j = NODEP2INO(NODE(dir->i_ino).first_prop);
- if (dirnode != aliases) {
- for (p = prom_firstprop (n, buffer2);
- p && *p;
- p = prom_nextprop (n, p, buffer2)) {
- j++;
- if ((len == strlen (p))
- && !strncmp (p, name, len)) {
- ino = j;
- type = OPFSL_PROPERTY;
- break;
- }
- }
- } else {
- int k;
- for (k = 0; k < aliases_nodes; k++) {
- j++;
- if (alias_names [k]
- && (len == strlen (alias_names [k]))
- && !strncmp (alias_names [k], name, len)) {
- ino = j;
- type = OPFSL_PROPERTY;
- break;
- }
- }
+
+ mutex_lock(&op_mutex);
+
+ child = dp->child;
+ while (child) {
+ int n = strlen(child->path_component_name);
+
+ if (len == n &&
+ !strncmp(child->path_component_name, name, len)) {
+ ent_type = op_inode_node;
+ ent_data.node = child;
+ ino = child->unique_id;
+ goto found;
}
+ child = child->sibling;
}
- if (!ino) {
- ino = lookup_children (NODE(dir->i_ino).child, name, len);
- if (ino)
- type = OPFSL_DIR;
- else {
- unlock_kernel();
- return ERR_PTR(-ENOENT);
+
+ prop = dp->properties;
+ while (prop) {
+ int n = strlen(prop->name);
+
+ if (len == n && !strncmp(prop->name, name, len)) {
+ ent_type = op_inode_prop;
+ ent_data.prop = prop;
+ ino = prop->unique_id;
+ goto found;
}
+
+ prop = prop->next;
}
- inode = iget (dir->i_sb, ino);
- unlock_kernel();
+
+ mutex_unlock(&op_mutex);
+ return ERR_PTR(-ENOENT);
+
+found:
+ inode = iget(dir->i_sb, ino);
+ mutex_unlock(&op_mutex);
if (!inode)
return ERR_PTR(-EINVAL);
- switch (type) {
- case OPFSL_DIR:
+ ent_oi = OP_I(inode);
+ ent_oi->type = ent_type;
+ ent_oi->u = ent_data;
+
+ switch (ent_type) {
+ case op_inode_node:
inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO;
- if (ino == OPENPROM_FIRST_INO + aliases) {
- inode->i_mode |= S_IWUSR;
- inode->i_op = &openprom_alias_inode_operations;
- } else
- inode->i_op = &openprom_inode_operations;
+ inode->i_op = &openprom_inode_operations;
inode->i_fop = &openprom_operations;
inode->i_nlink = 2;
break;
- case OPFSL_NODENUM:
- inode->i_mode = S_IFREG | S_IRUGO;
- inode->i_fop = &openpromfs_nodenum_ops;
- inode->i_nlink = 1;
- inode->u.generic_ip = (void *)(long)(n);
- break;
- case OPFSL_PROPERTY:
- if ((dirnode == options) && (len == 17)
- && !strncmp (name, "security-password", 17))
+ case op_inode_prop:
+ if (!strcmp(dp->name, "options") && (len == 17) &&
+ !strncmp (name, "security-password", 17))
inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR;
- else {
+ else
inode->i_mode = S_IFREG | S_IRUGO;
- if (dirnode == options || dirnode == aliases) {
- if (len != 4 || strncmp (name, "name", 4))
- inode->i_mode |= S_IWUSR;
- }
- }
inode->i_fop = &openpromfs_prop_ops;
inode->i_nlink = 1;
- if (inode->i_size < 0)
- inode->i_size = 0;
- inode->u.generic_ip = (void *)(long)(((u16)dirnode) |
- (((u16)(ino - NODEP2INO(NODE(dir->i_ino).first_prop) - 1)) << 16));
+ inode->i_size = ent_oi->u.prop->length;
break;
}
@@ -781,237 +263,89 @@ static struct dentry *openpromfs_lookup(struct inode * dir, struct dentry *dentr
static int openpromfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
struct inode *inode = filp->f_dentry->d_inode;
+ struct op_inode_info *oi = OP_I(inode);
+ struct device_node *dp = oi->u.node;
+ struct device_node *child;
+ struct property *prop;
unsigned int ino;
- u32 n;
- int i, j;
- char buffer[128];
- u16 node;
- char *p;
- char buffer2[64];
-
- lock_kernel();
+ int i;
+
+ mutex_lock(&op_mutex);
ino = inode->i_ino;
i = filp->f_pos;
switch (i) {
case 0:
- if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0) goto out;
+ if (filldir(dirent, ".", 1, i, ino, DT_DIR) < 0)
+ goto out;
i++;
filp->f_pos++;
/* fall thru */
case 1:
- if (filldir(dirent, "..", 2, i,
- (NODE(ino).parent == 0xffff) ?
- OPENPROM_ROOT_INO : NODE2INO(NODE(ino).parent), DT_DIR) < 0)
+ if (filldir(dirent, "..", 2, i,
+ (dp->parent == NULL ?
+ OPENPROM_ROOT_INO :
+ dp->parent->unique_id), DT_DIR) < 0)
goto out;
i++;
filp->f_pos++;
/* fall thru */
default:
i -= 2;
- node = NODE(ino).child;
- while (i && node != 0xffff) {
- node = nodes[node].next;
+
+ /* First, the children nodes as directories. */
+ child = dp->child;
+ while (i && child) {
+ child = child->sibling;
i--;
}
- while (node != 0xffff) {
- if (prom_getname (nodes[node].node, buffer, 128) < 0)
- goto out;
- if (filldir(dirent, buffer, strlen(buffer),
- filp->f_pos, NODE2INO(node), DT_DIR) < 0)
+ while (child) {
+ if (filldir(dirent,
+ child->path_component_name,
+ strlen(child->path_component_name),
+ filp->f_pos, child->unique_id, DT_DIR) < 0)
goto out;
+
filp->f_pos++;
- node = nodes[node].next;
+ child = child->sibling;
}
- j = NODEP2INO(NODE(ino).first_prop);
- if (!i) {
- if (filldir(dirent, ".node", 5, filp->f_pos, j, DT_REG) < 0)
+
+ /* Next, the properties as files. */
+ prop = dp->properties;
+ while (i && prop) {
+ prop = prop->next;
+ i--;
+ }
+ while (prop) {
+ if (filldir(dirent, prop->name, strlen(prop->name),
+ filp->f_pos, prop->unique_id, DT_REG) < 0)
goto out;
+
filp->f_pos++;
- } else
- i--;
- n = NODE(ino).node;
- if (ino == OPENPROM_FIRST_INO + aliases) {
- for (j++; i < aliases_nodes; i++, j++) {
- if (alias_names [i]) {
- if (filldir (dirent, alias_names [i],
- strlen (alias_names [i]),
- filp->f_pos, j, DT_REG) < 0) goto out;
- filp->f_pos++;
- }
- }
- } else {
- for (p = prom_firstprop (n, buffer2);
- p && *p;
- p = prom_nextprop (n, p, buffer2)) {
- j++;
- if (i) i--;
- else {
- if (filldir(dirent, p, strlen(p),
- filp->f_pos, j, DT_REG) < 0)
- goto out;
- filp->f_pos++;
- }
- }
+ prop = prop->next;
}
}
out:
- unlock_kernel();
- return 0;
-}
-
-static int openpromfs_create (struct inode *dir, struct dentry *dentry, int mode,
- struct nameidata *nd)
-{
- char *p;
- struct inode *inode;
-
- if (!dir)
- return -ENOENT;
- if (dentry->d_name.len > 256)
- return -EINVAL;
- p = kmalloc (dentry->d_name.len + 1, GFP_KERNEL);
- if (!p)
- return -ENOMEM;
- strncpy (p, dentry->d_name.name, dentry->d_name.len);
- p [dentry->d_name.len] = 0;
- lock_kernel();
- if (aliases_nodes == ALIASES_NNODES) {
- kfree(p);
- unlock_kernel();
- return -EIO;
- }
- alias_names [aliases_nodes++] = p;
- inode = iget (dir->i_sb,
- NODEP2INO(NODE(dir->i_ino).first_prop) + aliases_nodes);
- if (!inode) {
- unlock_kernel();
- return -EINVAL;
- }
- inode->i_mode = S_IFREG | S_IRUGO | S_IWUSR;
- inode->i_fop = &openpromfs_prop_ops;
- inode->i_nlink = 1;
- if (inode->i_size < 0) inode->i_size = 0;
- inode->u.generic_ip = (void *)(long)(((u16)aliases) |
- (((u16)(aliases_nodes - 1)) << 16));
- unlock_kernel();
- d_instantiate(dentry, inode);
+ mutex_unlock(&op_mutex);
return 0;
}
-static int openpromfs_unlink (struct inode *dir, struct dentry *dentry)
-{
- unsigned int len;
- char *p;
- const char *name;
- int i;
-
- name = dentry->d_name.name;
- len = dentry->d_name.len;
- lock_kernel();
- for (i = 0; i < aliases_nodes; i++)
- if ((strlen (alias_names [i]) == len)
- && !strncmp (name, alias_names[i], len)) {
- char buffer[512];
-
- p = alias_names [i];
- alias_names [i] = NULL;
- kfree (p);
- strcpy (buffer, "nvunalias ");
- memcpy (buffer + 10, name, len);
- buffer [10 + len] = 0;
- prom_feval (buffer);
- }
- unlock_kernel();
- return 0;
-}
+static kmem_cache_t *op_inode_cachep;
-/* {{{ init section */
-static int __init check_space (u16 n)
+static struct inode *openprom_alloc_inode(struct super_block *sb)
{
- unsigned long pages;
+ struct op_inode_info *oi;
- if ((1 << alloced) * PAGE_SIZE < (n + 2) * sizeof(openpromfs_node)) {
- pages = __get_free_pages (GFP_KERNEL, alloced + 1);
- if (!pages)
- return -1;
+ oi = kmem_cache_alloc(op_inode_cachep, SLAB_KERNEL);
+ if (!oi)
+ return NULL;
- if (nodes) {
- memcpy ((char *)pages, nodes,
- (1 << alloced) * PAGE_SIZE);
- free_pages ((unsigned long)nodes, alloced);
- }
- alloced++;
- nodes = (openpromfs_node *)pages;
- }
- return 0;
+ return &oi->vfs_inode;
}
-static u16 __init get_nodes (u16 parent, u32 node)
+static void openprom_destroy_inode(struct inode *inode)
{
- char *p;
- u16 n = last_node++, i;
- char buffer[64];
-
- if (check_space (n) < 0)
- return 0xffff;
- nodes[n].parent = parent;
- nodes[n].node = node;
- nodes[n].next = 0xffff;
- nodes[n].child = 0xffff;
- nodes[n].first_prop = first_prop++;
- if (!parent) {
- char buffer[8];
- int j;
-
- if ((j = prom_getproperty (node, "name", buffer, 8)) >= 0) {
- buffer[j] = 0;
- if (!strcmp (buffer, "options"))
- options = n;
- else if (!strcmp (buffer, "aliases"))
- aliases = n;
- }
- }
- if (n != aliases)
- for (p = prom_firstprop (node, buffer);
- p && p != (char *)-1 && *p;
- p = prom_nextprop (node, p, buffer))
- first_prop++;
- else {
- char *q;
- for (p = prom_firstprop (node, buffer);
- p && p != (char *)-1 && *p;
- p = prom_nextprop (node, p, buffer)) {
- if (aliases_nodes == ALIASES_NNODES)
- break;
- for (i = 0; i < aliases_nodes; i++)
- if (!strcmp (p, alias_names [i]))
- break;
- if (i < aliases_nodes)
- continue;
- q = kmalloc (strlen (p) + 1, GFP_KERNEL);
- if (!q)
- return 0xffff;
- strcpy (q, p);
- alias_names [aliases_nodes++] = q;
- }
- first_prop += ALIASES_NNODES;
- }
- node = prom_getchild (node);
- if (node) {
- parent = get_nodes (n, node);
- if (parent == 0xffff)
- return 0xffff;
- nodes[n].child = parent;
- while ((node = prom_getsibling (node)) != 0) {
- i = get_nodes (n, node);
- if (i == 0xffff)
- return 0xffff;
- nodes[parent].next = i;
- parent = i;
- }
- }
- return n;
+ kmem_cache_free(op_inode_cachep, OP_I(inode));
}
static void openprom_read_inode(struct inode * inode)
@@ -1031,6 +365,8 @@ static int openprom_remount(struct super_block *sb, int *flags, char *data)
}
static struct super_operations openprom_sops = {
+ .alloc_inode = openprom_alloc_inode,
+ .destroy_inode = openprom_destroy_inode,
.read_inode = openprom_read_inode,
.statfs = simple_statfs,
.remount_fs = openprom_remount,
@@ -1038,7 +374,8 @@ static struct super_operations openprom_sops = {
static int openprom_fill_super(struct super_block *s, void *data, int silent)
{
- struct inode * root_inode;
+ struct inode *root_inode;
+ struct op_inode_info *oi;
s->s_flags |= MS_NOATIME;
s->s_blocksize = 1024;
@@ -1049,6 +386,11 @@ static int openprom_fill_super(struct super_block *s, void *data, int silent)
root_inode = iget(s, OPENPROM_ROOT_INO);
if (!root_inode)
goto out_no_root;
+
+ oi = OP_I(root_inode);
+ oi->type = op_inode_node;
+ oi->u.node = of_find_node_by_path("/");
+
s->s_root = d_alloc_root(root_inode);
if (!s->s_root)
goto out_no_root;
@@ -1073,29 +415,39 @@ static struct file_system_type openprom_fs_type = {
.kill_sb = kill_anon_super,
};
+static void op_inode_init_once(void *data, kmem_cache_t * cachep, unsigned long flags)
+{
+ struct op_inode_info *oi = (struct op_inode_info *) data;
+
+ if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
+ SLAB_CTOR_CONSTRUCTOR)
+ inode_init_once(&oi->vfs_inode);
+}
+
static int __init init_openprom_fs(void)
{
- nodes = (openpromfs_node *)__get_free_pages(GFP_KERNEL, 0);
- if (!nodes) {
- printk (KERN_WARNING "openpromfs: can't get free page\n");
- return -EIO;
- }
- if (get_nodes (0xffff, prom_root_node) == 0xffff) {
- printk (KERN_WARNING "openpromfs: couldn't setup tree\n");
- return -EIO;
- }
- nodes[last_node].first_prop = first_prop;
- return register_filesystem(&openprom_fs_type);
+ int err;
+
+ op_inode_cachep = kmem_cache_create("op_inode_cache",
+ sizeof(struct op_inode_info),
+ 0,
+ (SLAB_RECLAIM_ACCOUNT |
+ SLAB_MEM_SPREAD),
+ op_inode_init_once, NULL);
+ if (!op_inode_cachep)
+ return -ENOMEM;
+
+ err = register_filesystem(&openprom_fs_type);
+ if (err)
+ kmem_cache_destroy(op_inode_cachep);
+
+ return err;
}
static void __exit exit_openprom_fs(void)
{
- int i;
unregister_filesystem(&openprom_fs_type);
- free_pages ((unsigned long)nodes, alloced);
- for (i = 0; i < aliases_nodes; i++)
- kfree (alias_names [i]);
- nodes = NULL;
+ kmem_cache_destroy(op_inode_cachep);
}
module_init(init_openprom_fs)
diff --git a/fs/pnode.c b/fs/pnode.c
index 37b568ed0e0..da42ee61c1d 100644
--- a/fs/pnode.c
+++ b/fs/pnode.c
@@ -53,8 +53,7 @@ static int do_make_slave(struct vfsmount *mnt)
if (master) {
list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave)
slave_mnt->mnt_master = master;
- list_del(&mnt->mnt_slave);
- list_add(&mnt->mnt_slave, &master->mnt_slave_list);
+ list_move(&mnt->mnt_slave, &master->mnt_slave_list);
list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev);
INIT_LIST_HEAD(&mnt->mnt_slave_list);
} else {
@@ -283,10 +282,8 @@ static void __propagate_umount(struct vfsmount *mnt)
* umount the child only if the child has no
* other children
*/
- if (child && list_empty(&child->mnt_mounts)) {
- list_del(&child->mnt_hash);
- list_add_tail(&child->mnt_hash, &mnt->mnt_hash);
- }
+ if (child && list_empty(&child->mnt_mounts))
+ list_move_tail(&child->mnt_hash, &mnt->mnt_hash);
}
}
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 6afff725a8c..6ba7785319d 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -74,6 +74,16 @@
#include <linux/poll.h>
#include "internal.h"
+/* NOTE:
+ * Implementing inode permission operations in /proc is almost
+ * certainly an error. Permission checks need to happen during
+ * each system call not at open time. The reason is that most of
+ * what we wish to check for permissions in /proc varies at runtime.
+ *
+ * The classic example of a problem is opening file descriptors
+ * in /proc for a task before it execs a suid executable.
+ */
+
/*
* For hysterical raisins we keep the same inumbers as in the old procfs.
* Feel free to change the macro below - just keep the range distinct from
@@ -121,6 +131,8 @@ enum pid_directory_inos {
PROC_TGID_ATTR_PREV,
PROC_TGID_ATTR_EXEC,
PROC_TGID_ATTR_FSCREATE,
+ PROC_TGID_ATTR_KEYCREATE,
+ PROC_TGID_ATTR_SOCKCREATE,
#endif
#ifdef CONFIG_AUDITSYSCALL
PROC_TGID_LOGINUID,
@@ -162,6 +174,8 @@ enum pid_directory_inos {
PROC_TID_ATTR_PREV,
PROC_TID_ATTR_EXEC,
PROC_TID_ATTR_FSCREATE,
+ PROC_TID_ATTR_KEYCREATE,
+ PROC_TID_ATTR_SOCKCREATE,
#endif
#ifdef CONFIG_AUDITSYSCALL
PROC_TID_LOGINUID,
@@ -173,6 +187,9 @@ enum pid_directory_inos {
PROC_TID_FD_DIR = 0x8000, /* 0x8000-0xffff */
};
+/* Worst case buffer size needed for holding an integer. */
+#define PROC_NUMBUF 10
+
struct pid_entry {
int type;
int len;
@@ -275,6 +292,8 @@ static struct pid_entry tgid_attr_stuff[] = {
E(PROC_TGID_ATTR_PREV, "prev", S_IFREG|S_IRUGO),
E(PROC_TGID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO),
E(PROC_TGID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO),
+ E(PROC_TGID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO),
+ E(PROC_TGID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO),
{0,0,NULL,0}
};
static struct pid_entry tid_attr_stuff[] = {
@@ -282,6 +301,8 @@ static struct pid_entry tid_attr_stuff[] = {
E(PROC_TID_ATTR_PREV, "prev", S_IFREG|S_IRUGO),
E(PROC_TID_ATTR_EXEC, "exec", S_IFREG|S_IRUGO|S_IWUGO),
E(PROC_TID_ATTR_FSCREATE, "fscreate", S_IFREG|S_IRUGO|S_IWUGO),
+ E(PROC_TID_ATTR_KEYCREATE, "keycreate", S_IFREG|S_IRUGO|S_IWUGO),
+ E(PROC_TID_ATTR_SOCKCREATE, "sockcreate", S_IFREG|S_IRUGO|S_IWUGO),
{0,0,NULL,0}
};
#endif
@@ -290,12 +311,15 @@ static struct pid_entry tid_attr_stuff[] = {
static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
{
- struct task_struct *task = proc_task(inode);
- struct files_struct *files;
+ struct task_struct *task = get_proc_task(inode);
+ struct files_struct *files = NULL;
struct file *file;
- int fd = proc_type(inode) - PROC_TID_FD_DIR;
+ int fd = proc_fd(inode);
- files = get_files_struct(task);
+ if (task) {
+ files = get_files_struct(task);
+ put_task_struct(task);
+ }
if (files) {
/*
* We are not taking a ref to the file structure, so we must
@@ -327,29 +351,33 @@ static struct fs_struct *get_fs_struct(struct task_struct *task)
return fs;
}
-static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
+static int get_nr_threads(struct task_struct *tsk)
{
- struct fs_struct *fs = get_fs_struct(proc_task(inode));
- int result = -ENOENT;
- if (fs) {
- read_lock(&fs->lock);
- *mnt = mntget(fs->pwdmnt);
- *dentry = dget(fs->pwd);
- read_unlock(&fs->lock);
- result = 0;
- put_fs_struct(fs);
+ /* Must be called with the rcu_read_lock held */
+ unsigned long flags;
+ int count = 0;
+
+ if (lock_task_sighand(tsk, &flags)) {
+ count = atomic_read(&tsk->signal->count);
+ unlock_task_sighand(tsk, &flags);
}
- return result;
+ return count;
}
-static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
+static int proc_cwd_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
{
- struct fs_struct *fs = get_fs_struct(proc_task(inode));
+ struct task_struct *task = get_proc_task(inode);
+ struct fs_struct *fs = NULL;
int result = -ENOENT;
+
+ if (task) {
+ fs = get_fs_struct(task);
+ put_task_struct(task);
+ }
if (fs) {
read_lock(&fs->lock);
- *mnt = mntget(fs->rootmnt);
- *dentry = dget(fs->root);
+ *mnt = mntget(fs->pwdmnt);
+ *dentry = dget(fs->pwd);
read_unlock(&fs->lock);
result = 0;
put_fs_struct(fs);
@@ -357,42 +385,16 @@ static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vf
return result;
}
-
-/* Same as proc_root_link, but this addionally tries to get fs from other
- * threads in the group */
-static int proc_task_root_link(struct inode *inode, struct dentry **dentry,
- struct vfsmount **mnt)
+static int proc_root_link(struct inode *inode, struct dentry **dentry, struct vfsmount **mnt)
{
- struct fs_struct *fs;
+ struct task_struct *task = get_proc_task(inode);
+ struct fs_struct *fs = NULL;
int result = -ENOENT;
- struct task_struct *leader = proc_task(inode);
- task_lock(leader);
- fs = leader->fs;
- if (fs) {
- atomic_inc(&fs->count);
- task_unlock(leader);
- } else {
- /* Try to get fs from other threads */
- task_unlock(leader);
- read_lock(&tasklist_lock);
- if (pid_alive(leader)) {
- struct task_struct *task = leader;
-
- while ((task = next_thread(task)) != leader) {
- task_lock(task);
- fs = task->fs;
- if (fs) {
- atomic_inc(&fs->count);
- task_unlock(task);
- break;
- }
- task_unlock(task);
- }
- }
- read_unlock(&tasklist_lock);
+ if (task) {
+ fs = get_fs_struct(task);
+ put_task_struct(task);
}
-
if (fs) {
read_lock(&fs->lock);
*mnt = mntget(fs->rootmnt);
@@ -404,7 +406,6 @@ static int proc_task_root_link(struct inode *inode, struct dentry **dentry,
return result;
}
-
#define MAY_PTRACE(task) \
(task == current || \
(task->parent == current && \
@@ -535,142 +536,22 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
/************************************************************************/
/* permission checks */
-
-/* If the process being read is separated by chroot from the reading process,
- * don't let the reader access the threads.
- *
- * note: this does dput(root) and mntput(vfsmnt) on exit.
- */
-static int proc_check_chroot(struct dentry *root, struct vfsmount *vfsmnt)
-{
- struct dentry *de, *base;
- struct vfsmount *our_vfsmnt, *mnt;
- int res = 0;
-
- read_lock(&current->fs->lock);
- our_vfsmnt = mntget(current->fs->rootmnt);
- base = dget(current->fs->root);
- read_unlock(&current->fs->lock);
-
- spin_lock(&vfsmount_lock);
- de = root;
- mnt = vfsmnt;
-
- while (mnt != our_vfsmnt) {
- if (mnt == mnt->mnt_parent)
- goto out;
- de = mnt->mnt_mountpoint;
- mnt = mnt->mnt_parent;
- }
-
- if (!is_subdir(de, base))
- goto out;
- spin_unlock(&vfsmount_lock);
-
-exit:
- dput(base);
- mntput(our_vfsmnt);
- dput(root);
- mntput(vfsmnt);
- return res;
-out:
- spin_unlock(&vfsmount_lock);
- res = -EACCES;
- goto exit;
-}
-
-static int proc_check_root(struct inode *inode)
-{
- struct dentry *root;
- struct vfsmount *vfsmnt;
-
- if (proc_root_link(inode, &root, &vfsmnt)) /* Ewww... */
- return -ENOENT;
- return proc_check_chroot(root, vfsmnt);
-}
-
-static int proc_permission(struct inode *inode, int mask, struct nameidata *nd)
-{
- if (generic_permission(inode, mask, NULL) != 0)
- return -EACCES;
- return proc_check_root(inode);
-}
-
-static int proc_task_permission(struct inode *inode, int mask, struct nameidata *nd)
-{
- struct dentry *root;
- struct vfsmount *vfsmnt;
-
- if (generic_permission(inode, mask, NULL) != 0)
- return -EACCES;
-
- if (proc_task_root_link(inode, &root, &vfsmnt))
- return -ENOENT;
-
- return proc_check_chroot(root, vfsmnt);
-}
-
-extern struct seq_operations proc_pid_maps_op;
-static int maps_open(struct inode *inode, struct file *file)
-{
- struct task_struct *task = proc_task(inode);
- int ret = seq_open(file, &proc_pid_maps_op);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = task;
- }
- return ret;
-}
-
-static struct file_operations proc_maps_operations = {
- .open = maps_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-
-#ifdef CONFIG_NUMA
-extern struct seq_operations proc_pid_numa_maps_op;
-static int numa_maps_open(struct inode *inode, struct file *file)
-{
- struct task_struct *task = proc_task(inode);
- int ret = seq_open(file, &proc_pid_numa_maps_op);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = task;
- }
- return ret;
-}
-
-static struct file_operations proc_numa_maps_operations = {
- .open = numa_maps_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-#endif
-
-#ifdef CONFIG_MMU
-extern struct seq_operations proc_pid_smaps_op;
-static int smaps_open(struct inode *inode, struct file *file)
+static int proc_fd_access_allowed(struct inode *inode)
{
- struct task_struct *task = proc_task(inode);
- int ret = seq_open(file, &proc_pid_smaps_op);
- if (!ret) {
- struct seq_file *m = file->private_data;
- m->private = task;
+ struct task_struct *task;
+ int allowed = 0;
+ /* Allow access to a task's file descriptors if it is us or we
+ * may use ptrace attach to the process and find out that
+ * information.
+ */
+ task = get_proc_task(inode);
+ if (task) {
+ allowed = ptrace_may_attach(task);
+ put_task_struct(task);
}
- return ret;
+ return allowed;
}
-static struct file_operations proc_smaps_operations = {
- .open = smaps_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
-#endif
-
extern struct seq_operations mounts_op;
struct proc_mounts {
struct seq_file m;
@@ -679,16 +560,19 @@ struct proc_mounts {
static int mounts_open(struct inode *inode, struct file *file)
{
- struct task_struct *task = proc_task(inode);
- struct namespace *namespace;
+ struct task_struct *task = get_proc_task(inode);
+ struct namespace *namespace = NULL;
struct proc_mounts *p;
int ret = -EINVAL;
- task_lock(task);
- namespace = task->namespace;
- if (namespace)
- get_namespace(namespace);
- task_unlock(task);
+ if (task) {
+ task_lock(task);
+ namespace = task->namespace;
+ if (namespace)
+ get_namespace(namespace);
+ task_unlock(task);
+ put_task_struct(task);
+ }
if (namespace) {
ret = -ENOMEM;
@@ -745,17 +629,21 @@ static struct file_operations proc_mounts_operations = {
extern struct seq_operations mountstats_op;
static int mountstats_open(struct inode *inode, struct file *file)
{
- struct task_struct *task = proc_task(inode);
int ret = seq_open(file, &mountstats_op);
if (!ret) {
struct seq_file *m = file->private_data;
- struct namespace *namespace;
- task_lock(task);
- namespace = task->namespace;
- if (namespace)
- get_namespace(namespace);
- task_unlock(task);
+ struct namespace *namespace = NULL;
+ struct task_struct *task = get_proc_task(inode);
+
+ if (task) {
+ task_lock(task);
+ namespace = task->namespace;
+ if (namespace)
+ get_namespace(namespace);
+ task_unlock(task);
+ put_task_struct(task);
+ }
if (namespace)
m->private = namespace;
@@ -782,18 +670,27 @@ static ssize_t proc_info_read(struct file * file, char __user * buf,
struct inode * inode = file->f_dentry->d_inode;
unsigned long page;
ssize_t length;
- struct task_struct *task = proc_task(inode);
+ struct task_struct *task = get_proc_task(inode);
+
+ length = -ESRCH;
+ if (!task)
+ goto out_no_task;
if (count > PROC_BLOCK_SIZE)
count = PROC_BLOCK_SIZE;
+
+ length = -ENOMEM;
if (!(page = __get_free_page(GFP_KERNEL)))
- return -ENOMEM;
+ goto out;
length = PROC_I(inode)->op.proc_read(task, (char*)page);
if (length >= 0)
length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
free_page(page);
+out:
+ put_task_struct(task);
+out_no_task:
return length;
}
@@ -810,12 +707,15 @@ static int mem_open(struct inode* inode, struct file* file)
static ssize_t mem_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task = proc_task(file->f_dentry->d_inode);
+ struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
char *page;
unsigned long src = *ppos;
int ret = -ESRCH;
struct mm_struct *mm;
+ if (!task)
+ goto out_no_task;
+
if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
goto out;
@@ -865,6 +765,8 @@ out_put:
out_free:
free_page((unsigned long) page);
out:
+ put_task_struct(task);
+out_no_task:
return ret;
}
@@ -877,15 +779,20 @@ static ssize_t mem_write(struct file * file, const char * buf,
{
int copied = 0;
char *page;
- struct task_struct *task = proc_task(file->f_dentry->d_inode);
+ struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
unsigned long dst = *ppos;
+ copied = -ESRCH;
+ if (!task)
+ goto out_no_task;
+
if (!MAY_PTRACE(task) || !ptrace_may_attach(task))
- return -ESRCH;
+ goto out;
+ copied = -ENOMEM;
page = (char *)__get_free_page(GFP_USER);
if (!page)
- return -ENOMEM;
+ goto out;
while (count > 0) {
int this_len, retval;
@@ -908,6 +815,9 @@ static ssize_t mem_write(struct file * file, const char * buf,
}
*ppos = dst;
free_page((unsigned long) page);
+out:
+ put_task_struct(task);
+out_no_task:
return copied;
}
#endif
@@ -938,13 +848,18 @@ static struct file_operations proc_mem_operations = {
static ssize_t oom_adjust_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task = proc_task(file->f_dentry->d_inode);
- char buffer[8];
+ struct task_struct *task = get_proc_task(file->f_dentry->d_inode);
+ char buffer[PROC_NUMBUF];
size_t len;
- int oom_adjust = task->oomkilladj;
+ int oom_adjust;
loff_t __ppos = *ppos;
- len = sprintf(buffer, "%i\n", oom_adjust);
+ if (!task)
+ return -ESRCH;
+ oom_adjust = task->oomkilladj;
+ put_task_struct(task);
+
+ len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust);
if (__ppos >= len)
return 0;
if (count > len-__ppos)
@@ -958,15 +873,15 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf,
static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *task = proc_task(file->f_dentry->d_inode);
- char buffer[8], *end;
+ struct task_struct *task;
+ char buffer[PROC_NUMBUF], *end;
int oom_adjust;
if (!capable(CAP_SYS_RESOURCE))
return -EPERM;
- memset(buffer, 0, 8);
- if (count > 6)
- count = 6;
+ memset(buffer, 0, sizeof(buffer));
+ if (count > sizeof(buffer) - 1)
+ count = sizeof(buffer) - 1;
if (copy_from_user(buffer, buf, count))
return -EFAULT;
oom_adjust = simple_strtol(buffer, &end, 0);
@@ -974,7 +889,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf,
return -EINVAL;
if (*end == '\n')
end++;
+ task = get_proc_task(file->f_dentry->d_inode);
+ if (!task)
+ return -ESRCH;
task->oomkilladj = oom_adjust;
+ put_task_struct(task);
if (end - buffer == 0)
return -EIO;
return end - buffer;
@@ -985,22 +904,21 @@ static struct file_operations proc_oom_adjust_operations = {
.write = oom_adjust_write,
};
-static struct inode_operations proc_mem_inode_operations = {
- .permission = proc_permission,
-};
-
#ifdef CONFIG_AUDITSYSCALL
#define TMPBUFLEN 21
static ssize_t proc_loginuid_read(struct file * file, char __user * buf,
size_t count, loff_t *ppos)
{
struct inode * inode = file->f_dentry->d_inode;
- struct task_struct *task = proc_task(inode);
+ struct task_struct *task = get_proc_task(inode);
ssize_t length;
char tmpbuf[TMPBUFLEN];
+ if (!task)
+ return -ESRCH;
length = scnprintf(tmpbuf, TMPBUFLEN, "%u",
audit_get_loginuid(task->audit_context));
+ put_task_struct(task);
return simple_read_from_buffer(buf, count, ppos, tmpbuf, length);
}
@@ -1010,13 +928,12 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
struct inode * inode = file->f_dentry->d_inode;
char *page, *tmp;
ssize_t length;
- struct task_struct *task = proc_task(inode);
uid_t loginuid;
if (!capable(CAP_AUDIT_CONTROL))
return -EPERM;
- if (current != task)
+ if (current != pid_task(proc_pid(inode), PIDTYPE_PID))
return -EPERM;
if (count >= PAGE_SIZE)
@@ -1040,7 +957,7 @@ static ssize_t proc_loginuid_write(struct file * file, const char __user * buf,
goto out_free_page;
}
- length = audit_set_loginuid(task, loginuid);
+ length = audit_set_loginuid(current, loginuid);
if (likely(length == 0))
length = count;
@@ -1059,13 +976,16 @@ static struct file_operations proc_loginuid_operations = {
static ssize_t seccomp_read(struct file *file, char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *tsk = proc_task(file->f_dentry->d_inode);
+ struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
char __buf[20];
loff_t __ppos = *ppos;
size_t len;
+ if (!tsk)
+ return -ESRCH;
/* no need to print the trailing zero, so use only len */
len = sprintf(__buf, "%u\n", tsk->seccomp.mode);
+ put_task_struct(tsk);
if (__ppos >= len)
return 0;
if (count > len - __ppos)
@@ -1079,29 +999,43 @@ static ssize_t seccomp_read(struct file *file, char __user *buf,
static ssize_t seccomp_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
- struct task_struct *tsk = proc_task(file->f_dentry->d_inode);
+ struct task_struct *tsk = get_proc_task(file->f_dentry->d_inode);
char __buf[20], *end;
unsigned int seccomp_mode;
+ ssize_t result;
+
+ result = -ESRCH;
+ if (!tsk)
+ goto out_no_task;
/* can set it only once to be even more secure */
+ result = -EPERM;
if (unlikely(tsk->seccomp.mode))
- return -EPERM;
+ goto out;
+ result = -EFAULT;
memset(__buf, 0, sizeof(__buf));
count = min(count, sizeof(__buf) - 1);
if (copy_from_user(__buf, buf, count))
- return -EFAULT;
+ goto out;
+
seccomp_mode = simple_strtoul(__buf, &end, 0);
if (*end == '\n')
end++;
+ result = -EINVAL;
if (seccomp_mode && seccomp_mode <= NR_SECCOMP_MODES) {
tsk->seccomp.mode = seccomp_mode;
set_tsk_thread_flag(tsk, TIF_SECCOMP);
} else
- return -EINVAL;
+ goto out;
+ result = -EIO;
if (unlikely(!(end - __buf)))
- return -EIO;
- return end - __buf;
+ goto out;
+ result = end - __buf;
+out:
+ put_task_struct(tsk);
+out_no_task:
+ return result;
}
static struct file_operations proc_seccomp_operations = {
@@ -1118,10 +1052,8 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
/* We don't need a base pointer in the /proc filesystem */
path_release(nd);
- if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE))
- goto out;
- error = proc_check_root(inode);
- if (error)
+ /* Are we allowed to snoop on the task's file descriptors? */
+ if (!proc_fd_access_allowed(inode))
goto out;
error = PROC_I(inode)->op.proc_get_link(inode, &nd->dentry, &nd->mnt);
@@ -1163,12 +1095,8 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
struct dentry *de;
struct vfsmount *mnt = NULL;
- lock_kernel();
-
- if (current->fsuid != inode->i_uid && !capable(CAP_DAC_OVERRIDE))
- goto out;
- error = proc_check_root(inode);
- if (error)
+ /* Are we allowed to snoop on the task's file descriptors? */
+ if (!proc_fd_access_allowed(inode))
goto out;
error = PROC_I(inode)->op.proc_get_link(inode, &de, &mnt);
@@ -1179,7 +1107,6 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b
dput(de);
mntput(mnt);
out:
- unlock_kernel();
return error;
}
@@ -1188,21 +1115,20 @@ static struct inode_operations proc_pid_link_inode_operations = {
.follow_link = proc_pid_follow_link
};
-#define NUMBUF 10
-
static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
{
- struct inode *inode = filp->f_dentry->d_inode;
- struct task_struct *p = proc_task(inode);
+ struct dentry *dentry = filp->f_dentry;
+ struct inode *inode = dentry->d_inode;
+ struct task_struct *p = get_proc_task(inode);
unsigned int fd, tid, ino;
int retval;
- char buf[NUMBUF];
+ char buf[PROC_NUMBUF];
struct files_struct * files;
struct fdtable *fdt;
retval = -ENOENT;
- if (!pid_alive(p))
- goto out;
+ if (!p)
+ goto out_no_task;
retval = 0;
tid = p->pid;
@@ -1213,7 +1139,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
goto out;
filp->f_pos++;
case 1:
- ino = fake_ino(tid, PROC_TID_INO);
+ ino = parent_ino(dentry);
if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0)
goto out;
filp->f_pos++;
@@ -1232,7 +1158,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
continue;
rcu_read_unlock();
- j = NUMBUF;
+ j = PROC_NUMBUF;
i = fd;
do {
j--;
@@ -1241,7 +1167,7 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
} while (i);
ino = fake_ino(tid, PROC_TID_FD_DIR + fd);
- if (filldir(dirent, buf+j, NUMBUF-j, fd+2, ino, DT_LNK) < 0) {
+ if (filldir(dirent, buf+j, PROC_NUMBUF-j, fd+2, ino, DT_LNK) < 0) {
rcu_read_lock();
break;
}
@@ -1251,6 +1177,8 @@ static int proc_readfd(struct file * filp, void * dirent, filldir_t filldir)
put_files_struct(files);
}
out:
+ put_task_struct(p);
+out_no_task:
return retval;
}
@@ -1262,16 +1190,18 @@ static int proc_pident_readdir(struct file *filp,
int pid;
struct dentry *dentry = filp->f_dentry;
struct inode *inode = dentry->d_inode;
+ struct task_struct *task = get_proc_task(inode);
struct pid_entry *p;
ino_t ino;
int ret;
ret = -ENOENT;
- if (!pid_alive(proc_task(inode)))
+ if (!task)
goto out;
ret = 0;
- pid = proc_task(inode)->pid;
+ pid = task->pid;
+ put_task_struct(task);
i = filp->f_pos;
switch (i) {
case 0:
@@ -1354,22 +1284,19 @@ static struct inode *proc_pid_make_inode(struct super_block * sb, struct task_st
/* Common stuff */
ei = PROC_I(inode);
- ei->task = NULL;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
inode->i_ino = fake_ino(task->pid, ino);
- if (!pid_alive(task))
- goto out_unlock;
-
/*
* grab the reference to task.
*/
- get_task_struct(task);
- ei->task = task;
- ei->type = ino;
+ ei->pid = get_pid(task->pids[PIDTYPE_PID].pid);
+ if (!ei->pid)
+ goto out_unlock;
+
inode->i_uid = 0;
inode->i_gid = 0;
- if (ino == PROC_TGID_INO || ino == PROC_TID_INO || task_dumpable(task)) {
+ if (task_dumpable(task)) {
inode->i_uid = task->euid;
inode->i_gid = task->egid;
}
@@ -1379,7 +1306,6 @@ out:
return inode;
out_unlock:
- ei->pde = NULL;
iput(inode);
return NULL;
}
@@ -1393,13 +1319,21 @@ out_unlock:
*
* Rewrite the inode's ownerships here because the owning task may have
* performed a setuid(), etc.
+ *
+ * Before the /proc/pid/status file was created the only way to read
+ * the effective uid of a process was to stat /proc/pid. Reading
+ * /proc/pid/status is slow enough that procps and other packages
+ * kept stating /proc/pid. To keep the rules in /proc simple I have
+ * made this apply to all per process world readable and executable
+ * directories.
*/
static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
{
struct inode *inode = dentry->d_inode;
- struct task_struct *task = proc_task(inode);
- if (pid_alive(task)) {
- if (proc_type(inode) == PROC_TGID_INO || proc_type(inode) == PROC_TID_INO || task_dumpable(task)) {
+ struct task_struct *task = get_proc_task(inode);
+ if (task) {
+ if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
+ task_dumpable(task)) {
inode->i_uid = task->euid;
inode->i_gid = task->egid;
} else {
@@ -1407,59 +1341,75 @@ static int pid_revalidate(struct dentry *dentry, struct nameidata *nd)
inode->i_gid = 0;
}
security_task_to_inode(task, inode);
+ put_task_struct(task);
return 1;
}
d_drop(dentry);
return 0;
}
+static int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+ struct inode *inode = dentry->d_inode;
+ struct task_struct *task;
+ generic_fillattr(inode, stat);
+
+ rcu_read_lock();
+ stat->uid = 0;
+ stat->gid = 0;
+ task = pid_task(proc_pid(inode), PIDTYPE_PID);
+ if (task) {
+ if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) ||
+ task_dumpable(task)) {
+ stat->uid = task->euid;
+ stat->gid = task->egid;
+ }
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
{
struct inode *inode = dentry->d_inode;
- struct task_struct *task = proc_task(inode);
- int fd = proc_type(inode) - PROC_TID_FD_DIR;
+ struct task_struct *task = get_proc_task(inode);
+ int fd = proc_fd(inode);
struct files_struct *files;
- files = get_files_struct(task);
- if (files) {
- rcu_read_lock();
- if (fcheck_files(files, fd)) {
+ if (task) {
+ files = get_files_struct(task);
+ if (files) {
+ rcu_read_lock();
+ if (fcheck_files(files, fd)) {
+ rcu_read_unlock();
+ put_files_struct(files);
+ if (task_dumpable(task)) {
+ inode->i_uid = task->euid;
+ inode->i_gid = task->egid;
+ } else {
+ inode->i_uid = 0;
+ inode->i_gid = 0;
+ }
+ security_task_to_inode(task, inode);
+ put_task_struct(task);
+ return 1;
+ }
rcu_read_unlock();
put_files_struct(files);
- if (task_dumpable(task)) {
- inode->i_uid = task->euid;
- inode->i_gid = task->egid;
- } else {
- inode->i_uid = 0;
- inode->i_gid = 0;
- }
- security_task_to_inode(task, inode);
- return 1;
}
- rcu_read_unlock();
- put_files_struct(files);
+ put_task_struct(task);
}
d_drop(dentry);
return 0;
}
-static void pid_base_iput(struct dentry *dentry, struct inode *inode)
-{
- struct task_struct *task = proc_task(inode);
- spin_lock(&task->proc_lock);
- if (task->proc_dentry == dentry)
- task->proc_dentry = NULL;
- spin_unlock(&task->proc_lock);
- iput(inode);
-}
-
static int pid_delete_dentry(struct dentry * dentry)
{
/* Is the task we represent dead?
* If so, then don't put the dentry on the lru list,
* kill it immediately.
*/
- return !pid_alive(proc_task(dentry->d_inode));
+ return !proc_pid(dentry->d_inode)->tasks[PIDTYPE_PID].first;
}
static struct dentry_operations tid_fd_dentry_operations =
@@ -1474,13 +1424,6 @@ static struct dentry_operations pid_dentry_operations =
.d_delete = pid_delete_dentry,
};
-static struct dentry_operations pid_base_dentry_operations =
-{
- .d_revalidate = pid_revalidate,
- .d_iput = pid_base_iput,
- .d_delete = pid_delete_dentry,
-};
-
/* Lookups */
static unsigned name_to_int(struct dentry *dentry)
@@ -1508,22 +1451,24 @@ out:
/* SMP-safe */
static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, struct nameidata *nd)
{
- struct task_struct *task = proc_task(dir);
+ struct task_struct *task = get_proc_task(dir);
unsigned fd = name_to_int(dentry);
+ struct dentry *result = ERR_PTR(-ENOENT);
struct file * file;
struct files_struct * files;
struct inode *inode;
struct proc_inode *ei;
+ if (!task)
+ goto out_no_task;
if (fd == ~0U)
goto out;
- if (!pid_alive(task))
- goto out;
inode = proc_pid_make_inode(dir->i_sb, task, PROC_TID_FD_DIR+fd);
if (!inode)
goto out;
ei = PROC_I(inode);
+ ei->fd = fd;
files = get_files_struct(task);
if (!files)
goto out_unlock;
@@ -1548,19 +1493,25 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry,
ei->op.proc_get_link = proc_fd_link;
dentry->d_op = &tid_fd_dentry_operations;
d_add(dentry, inode);
- return NULL;
+ /* Close the race of the process dying before we return the dentry */
+ if (tid_fd_revalidate(dentry, NULL))
+ result = NULL;
+out:
+ put_task_struct(task);
+out_no_task:
+ return result;
out_unlock2:
spin_unlock(&files->file_lock);
put_files_struct(files);
out_unlock:
iput(inode);
-out:
- return ERR_PTR(-ENOENT);
+ goto out;
}
static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir);
static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd);
+static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat);
static struct file_operations proc_fd_operations = {
.read = generic_read_dir,
@@ -1577,12 +1528,11 @@ static struct file_operations proc_task_operations = {
*/
static struct inode_operations proc_fd_inode_operations = {
.lookup = proc_lookupfd,
- .permission = proc_permission,
};
static struct inode_operations proc_task_inode_operations = {
.lookup = proc_task_lookup,
- .permission = proc_task_permission,
+ .getattr = proc_task_getattr,
};
#ifdef CONFIG_SECURITY
@@ -1592,12 +1542,17 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
struct inode * inode = file->f_dentry->d_inode;
unsigned long page;
ssize_t length;
- struct task_struct *task = proc_task(inode);
+ struct task_struct *task = get_proc_task(inode);
+
+ length = -ESRCH;
+ if (!task)
+ goto out_no_task;
if (count > PAGE_SIZE)
count = PAGE_SIZE;
+ length = -ENOMEM;
if (!(page = __get_free_page(GFP_KERNEL)))
- return -ENOMEM;
+ goto out;
length = security_getprocattr(task,
(char*)file->f_dentry->d_name.name,
@@ -1605,6 +1560,9 @@ static ssize_t proc_pid_attr_read(struct file * file, char __user * buf,
if (length >= 0)
length = simple_read_from_buffer(buf, count, ppos, (char *)page, length);
free_page(page);
+out:
+ put_task_struct(task);
+out_no_task:
return length;
}
@@ -1614,26 +1572,36 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
struct inode * inode = file->f_dentry->d_inode;
char *page;
ssize_t length;
- struct task_struct *task = proc_task(inode);
+ struct task_struct *task = get_proc_task(inode);
+ length = -ESRCH;
+ if (!task)
+ goto out_no_task;
if (count > PAGE_SIZE)
count = PAGE_SIZE;
- if (*ppos != 0) {
- /* No partial writes. */
- return -EINVAL;
- }
+
+ /* No partial writes. */
+ length = -EINVAL;
+ if (*ppos != 0)
+ goto out;
+
+ length = -ENOMEM;
page = (char*)__get_free_page(GFP_USER);
if (!page)
- return -ENOMEM;
+ goto out;
+
length = -EFAULT;
if (copy_from_user(page, buf, count))
- goto out;
+ goto out_free;
length = security_setprocattr(task,
(char*)file->f_dentry->d_name.name,
(void*)page, count);
-out:
+out_free:
free_page((unsigned long) page);
+out:
+ put_task_struct(task);
+out_no_task:
return length;
}
@@ -1648,24 +1616,22 @@ static struct file_operations proc_tgid_attr_operations;
static struct inode_operations proc_tgid_attr_inode_operations;
#endif
-static int get_tid_list(int index, unsigned int *tids, struct inode *dir);
-
/* SMP-safe */
static struct dentry *proc_pident_lookup(struct inode *dir,
struct dentry *dentry,
struct pid_entry *ents)
{
struct inode *inode;
- int error;
- struct task_struct *task = proc_task(dir);
+ struct dentry *error;
+ struct task_struct *task = get_proc_task(dir);
struct pid_entry *p;
struct proc_inode *ei;
- error = -ENOENT;
+ error = ERR_PTR(-ENOENT);
inode = NULL;
- if (!pid_alive(task))
- goto out;
+ if (!task)
+ goto out_no_task;
for (p = ents; p->name; p++) {
if (p->len != dentry->d_name.len)
@@ -1676,7 +1642,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
if (!p->name)
goto out;
- error = -EINVAL;
+ error = ERR_PTR(-EINVAL);
inode = proc_pid_make_inode(dir->i_sb, task, p->type);
if (!inode)
goto out;
@@ -1689,7 +1655,7 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
*/
switch(p->type) {
case PROC_TGID_TASK:
- inode->i_nlink = 2 + get_tid_list(2, NULL, dir);
+ inode->i_nlink = 2;
inode->i_op = &proc_task_inode_operations;
inode->i_fop = &proc_task_operations;
break;
@@ -1759,7 +1725,6 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
#endif
case PROC_TID_MEM:
case PROC_TGID_MEM:
- inode->i_op = &proc_mem_inode_operations;
inode->i_fop = &proc_mem_operations;
break;
#ifdef CONFIG_SECCOMP
@@ -1801,6 +1766,10 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
case PROC_TGID_ATTR_EXEC:
case PROC_TID_ATTR_FSCREATE:
case PROC_TGID_ATTR_FSCREATE:
+ case PROC_TID_ATTR_KEYCREATE:
+ case PROC_TGID_ATTR_KEYCREATE:
+ case PROC_TID_ATTR_SOCKCREATE:
+ case PROC_TGID_ATTR_SOCKCREATE:
inode->i_fop = &proc_pid_attr_operations;
break;
#endif
@@ -1842,14 +1811,18 @@ static struct dentry *proc_pident_lookup(struct inode *dir,
default:
printk("procfs: impossible type (%d)",p->type);
iput(inode);
- return ERR_PTR(-EINVAL);
+ error = ERR_PTR(-EINVAL);
+ goto out;
}
dentry->d_op = &pid_dentry_operations;
d_add(dentry, inode);
- return NULL;
-
+ /* Close the race of the process dying before we return the dentry */
+ if (pid_revalidate(dentry, NULL))
+ error = NULL;
out:
- return ERR_PTR(error);
+ put_task_struct(task);
+out_no_task:
+ return error;
}
static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd){
@@ -1872,10 +1845,12 @@ static struct file_operations proc_tid_base_operations = {
static struct inode_operations proc_tgid_base_inode_operations = {
.lookup = proc_tgid_base_lookup,
+ .getattr = pid_getattr,
};
static struct inode_operations proc_tid_base_inode_operations = {
.lookup = proc_tid_base_lookup,
+ .getattr = pid_getattr,
};
#ifdef CONFIG_SECURITY
@@ -1917,10 +1892,12 @@ static struct dentry *proc_tid_attr_lookup(struct inode *dir,
static struct inode_operations proc_tgid_attr_inode_operations = {
.lookup = proc_tgid_attr_lookup,
+ .getattr = pid_getattr,
};
static struct inode_operations proc_tid_attr_inode_operations = {
.lookup = proc_tid_attr_lookup,
+ .getattr = pid_getattr,
};
#endif
@@ -1930,14 +1907,14 @@ static struct inode_operations proc_tid_attr_inode_operations = {
static int proc_self_readlink(struct dentry *dentry, char __user *buffer,
int buflen)
{
- char tmp[30];
+ char tmp[PROC_NUMBUF];
sprintf(tmp, "%d", current->tgid);
return vfs_readlink(dentry,buffer,buflen,tmp);
}
static void *proc_self_follow_link(struct dentry *dentry, struct nameidata *nd)
{
- char tmp[30];
+ char tmp[PROC_NUMBUF];
sprintf(tmp, "%d", current->tgid);
return ERR_PTR(vfs_follow_link(nd,tmp));
}
@@ -1948,67 +1925,80 @@ static struct inode_operations proc_self_inode_operations = {
};
/**
- * proc_pid_unhash - Unhash /proc/@pid entry from the dcache.
- * @p: task that should be flushed.
+ * proc_flush_task - Remove dcache entries for @task from the /proc dcache.
+ *
+ * @task: task that should be flushed.
+ *
+ * Looks in the dcache for
+ * /proc/@pid
+ * /proc/@tgid/task/@pid
+ * if either directory is present flushes it and all of its children
+ * from the dcache.
*
- * Drops the /proc/@pid dcache entry from the hash chains.
+ * It is safe and reasonable to cache /proc entries for a task until
+ * that task exits. After that they just clog up the dcache with
+ * useless entries, possibly causing useful dcache entries to be
+ * flushed instead. This routine is provided to flush those useless
+ * dcache entries at process exit time.
*
- * Dropping /proc/@pid entries and detach_pid must be synchroneous,
- * otherwise e.g. /proc/@pid/exe might point to the wrong executable,
- * if the pid value is immediately reused. This is enforced by
- * - caller must acquire spin_lock(p->proc_lock)
- * - must be called before detach_pid()
- * - proc_pid_lookup acquires proc_lock, and checks that
- * the target is not dead by looking at the attach count
- * of PIDTYPE_PID.
+ * NOTE: This routine is just an optimization, so it does not guarantee
+ * that no dcache entries will exist at process exit time; it
+ * just makes it very unlikely that any will persist.
*/
-
-struct dentry *proc_pid_unhash(struct task_struct *p)
+void proc_flush_task(struct task_struct *task)
{
- struct dentry *proc_dentry;
+ struct dentry *dentry, *leader, *dir;
+ char buf[PROC_NUMBUF];
+ struct qstr name;
+
+ name.name = buf;
+ name.len = snprintf(buf, sizeof(buf), "%d", task->pid);
+ dentry = d_hash_and_lookup(proc_mnt->mnt_root, &name);
+ if (dentry) {
+ shrink_dcache_parent(dentry);
+ d_drop(dentry);
+ dput(dentry);
+ }
- proc_dentry = p->proc_dentry;
- if (proc_dentry != NULL) {
+ if (thread_group_leader(task))
+ goto out;
- spin_lock(&dcache_lock);
- spin_lock(&proc_dentry->d_lock);
- if (!d_unhashed(proc_dentry)) {
- dget_locked(proc_dentry);
- __d_drop(proc_dentry);
- spin_unlock(&proc_dentry->d_lock);
- } else {
- spin_unlock(&proc_dentry->d_lock);
- proc_dentry = NULL;
- }
- spin_unlock(&dcache_lock);
- }
- return proc_dentry;
-}
+ name.name = buf;
+ name.len = snprintf(buf, sizeof(buf), "%d", task->tgid);
+ leader = d_hash_and_lookup(proc_mnt->mnt_root, &name);
+ if (!leader)
+ goto out;
-/**
- * proc_pid_flush - recover memory used by stale /proc/@pid/x entries
- * @proc_dentry: directoy to prune.
- *
- * Shrink the /proc directory that was used by the just killed thread.
- */
-
-void proc_pid_flush(struct dentry *proc_dentry)
-{
- might_sleep();
- if(proc_dentry != NULL) {
- shrink_dcache_parent(proc_dentry);
- dput(proc_dentry);
+ name.name = "task";
+ name.len = strlen(name.name);
+ dir = d_hash_and_lookup(leader, &name);
+ if (!dir)
+ goto out_put_leader;
+
+ name.name = buf;
+ name.len = snprintf(buf, sizeof(buf), "%d", task->pid);
+ dentry = d_hash_and_lookup(dir, &name);
+ if (dentry) {
+ shrink_dcache_parent(dentry);
+ d_drop(dentry);
+ dput(dentry);
}
+
+ dput(dir);
+out_put_leader:
+ dput(leader);
+out:
+ return;
}
/* SMP-safe */
struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
{
+ struct dentry *result = ERR_PTR(-ENOENT);
struct task_struct *task;
struct inode *inode;
struct proc_inode *ei;
unsigned tgid;
- int died;
if (dentry->d_name.len == 4 && !memcmp(dentry->d_name.name,"self",4)) {
inode = new_inode(dir->i_sb);
@@ -2029,21 +2019,18 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct
if (tgid == ~0U)
goto out;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
task = find_task_by_pid(tgid);
if (task)
get_task_struct(task);
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
if (!task)
goto out;
inode = proc_pid_make_inode(dir->i_sb, task, PROC_TGID_INO);
+ if (!inode)
+ goto out_put_task;
-
- if (!inode) {
- put_task_struct(task);
- goto out;
- }
inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO;
inode->i_op = &proc_tgid_base_inode_operations;
inode->i_fop = &proc_tgid_base_operations;
@@ -2054,45 +2041,40 @@ struct dentry *proc_pid_lookup(struct inode *dir, struct dentry * dentry, struct
inode->i_nlink = 4;
#endif
- dentry->d_op = &pid_base_dentry_operations;
+ dentry->d_op = &pid_dentry_operations;
- died = 0;
d_add(dentry, inode);
- spin_lock(&task->proc_lock);
- task->proc_dentry = dentry;
- if (!pid_alive(task)) {
- dentry = proc_pid_unhash(task);
- died = 1;
- }
- spin_unlock(&task->proc_lock);
+ /* Close the race of the process dying before we return the dentry */
+ if (pid_revalidate(dentry, NULL))
+ result = NULL;
+out_put_task:
put_task_struct(task);
- if (died) {
- proc_pid_flush(dentry);
- goto out;
- }
- return NULL;
out:
- return ERR_PTR(-ENOENT);
+ return result;
}
/* SMP-safe */
static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry, struct nameidata *nd)
{
+ struct dentry *result = ERR_PTR(-ENOENT);
struct task_struct *task;
- struct task_struct *leader = proc_task(dir);
+ struct task_struct *leader = get_proc_task(dir);
struct inode *inode;
unsigned tid;
+ if (!leader)
+ goto out_no_task;
+
tid = name_to_int(dentry);
if (tid == ~0U)
goto out;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
task = find_task_by_pid(tid);
if (task)
get_task_struct(task);
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
if (!task)
goto out;
if (leader->tgid != task->tgid)
@@ -2113,101 +2095,95 @@ static struct dentry *proc_task_lookup(struct inode *dir, struct dentry * dentry
inode->i_nlink = 3;
#endif
- dentry->d_op = &pid_base_dentry_operations;
+ dentry->d_op = &pid_dentry_operations;
d_add(dentry, inode);
+ /* Close the race of the process dying before we return the dentry */
+ if (pid_revalidate(dentry, NULL))
+ result = NULL;
- put_task_struct(task);
- return NULL;
out_drop_task:
put_task_struct(task);
out:
- return ERR_PTR(-ENOENT);
+ put_task_struct(leader);
+out_no_task:
+ return result;
}
-#define PROC_NUMBUF 10
-#define PROC_MAXPIDS 20
-
/*
- * Get a few tgid's to return for filldir - we need to hold the
- * tasklist lock while doing this, and we must release it before
- * we actually do the filldir itself, so we use a temp buffer..
+ * Find the first tgid to return to user space.
+ *
+ * Usually this is just whatever follows &init_task, but if the user's
+ * buffer was too small to hold the full list or there was a seek into
+ * the middle of the directory we have more work to do.
+ *
+ * In the case of a short read we start with find_task_by_pid.
+ *
+ * In the case of a seek we start with &init_task and walk nr
+ * threads past it.
*/
-static int get_tgid_list(int index, unsigned long version, unsigned int *tgids)
-{
- struct task_struct *p;
- int nr_tgids = 0;
-
- index--;
- read_lock(&tasklist_lock);
- p = NULL;
- if (version) {
- p = find_task_by_pid(version);
- if (p && !thread_group_leader(p))
- p = NULL;
+static struct task_struct *first_tgid(int tgid, unsigned int nr)
+{
+ struct task_struct *pos;
+ rcu_read_lock();
+ if (tgid && nr) {
+ pos = find_task_by_pid(tgid);
+ if (pos && thread_group_leader(pos))
+ goto found;
}
+ /* If nr exceeds the number of processes get out quickly */
+ pos = NULL;
+ if (nr && nr >= nr_processes())
+ goto done;
- if (p)
- index = 0;
- else
- p = next_task(&init_task);
-
- for ( ; p != &init_task; p = next_task(p)) {
- int tgid = p->pid;
- if (!pid_alive(p))
- continue;
- if (--index >= 0)
- continue;
- tgids[nr_tgids] = tgid;
- nr_tgids++;
- if (nr_tgids >= PROC_MAXPIDS)
- break;
+ /* If we haven't found our starting place yet start with
+ * the init_task and walk nr tasks forward.
+ */
+ for (pos = next_task(&init_task); nr > 0; --nr) {
+ pos = next_task(pos);
+ if (pos == &init_task) {
+ pos = NULL;
+ goto done;
+ }
}
- read_unlock(&tasklist_lock);
- return nr_tgids;
+found:
+ get_task_struct(pos);
+done:
+ rcu_read_unlock();
+ return pos;
}
/*
- * Get a few tid's to return for filldir - we need to hold the
- * tasklist lock while doing this, and we must release it before
- * we actually do the filldir itself, so we use a temp buffer..
+ * Find the next task in the task list.
+ * Return NULL if we loop or there is any error.
+ *
+ * The reference to the input task_struct is released.
*/
-static int get_tid_list(int index, unsigned int *tids, struct inode *dir)
-{
- struct task_struct *leader_task = proc_task(dir);
- struct task_struct *task = leader_task;
- int nr_tids = 0;
-
- index -= 2;
- read_lock(&tasklist_lock);
- /*
- * The starting point task (leader_task) might be an already
- * unlinked task, which cannot be used to access the task-list
- * via next_thread().
- */
- if (pid_alive(task)) do {
- int tid = task->pid;
-
- if (--index >= 0)
- continue;
- if (tids != NULL)
- tids[nr_tids] = tid;
- nr_tids++;
- if (nr_tids >= PROC_MAXPIDS)
- break;
- } while ((task = next_thread(task)) != leader_task);
- read_unlock(&tasklist_lock);
- return nr_tids;
+static struct task_struct *next_tgid(struct task_struct *start)
+{
+ struct task_struct *pos;
+ rcu_read_lock();
+ pos = start;
+ if (pid_alive(start))
+ pos = next_task(start);
+ if (pid_alive(pos) && (pos != &init_task)) {
+ get_task_struct(pos);
+ goto done;
+ }
+ pos = NULL;
+done:
+ rcu_read_unlock();
+ put_task_struct(start);
+ return pos;
}
/* for the /proc/ directory itself, after non-process stuff has been done */
int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
- unsigned int tgid_array[PROC_MAXPIDS];
char buf[PROC_NUMBUF];
unsigned int nr = filp->f_pos - FIRST_PROCESS_ENTRY;
- unsigned int nr_tgids, i;
- int next_tgid;
+ struct task_struct *task;
+ int tgid;
if (!nr) {
ino_t ino = fake_ino(0,PROC_TGID_INO);
@@ -2216,63 +2192,116 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
filp->f_pos++;
nr++;
}
+ nr -= 1;
/* f_version caches the tgid value that the last readdir call couldn't
* return. lseek aka telldir automagically resets f_version to 0.
*/
- next_tgid = filp->f_version;
+ tgid = filp->f_version;
filp->f_version = 0;
- for (;;) {
- nr_tgids = get_tgid_list(nr, next_tgid, tgid_array);
- if (!nr_tgids) {
- /* no more entries ! */
+ for (task = first_tgid(tgid, nr);
+ task;
+ task = next_tgid(task), filp->f_pos++) {
+ int len;
+ ino_t ino;
+ tgid = task->pid;
+ len = snprintf(buf, sizeof(buf), "%d", tgid);
+ ino = fake_ino(tgid, PROC_TGID_INO);
+ if (filldir(dirent, buf, len, filp->f_pos, ino, DT_DIR) < 0) {
+ /* returning this tgid failed, save it as the first
+ * pid for the next readdir call */
+ filp->f_version = tgid;
+ put_task_struct(task);
break;
}
- next_tgid = 0;
+ }
+ return 0;
+}
- /* do not use the last found pid, reserve it for next_tgid */
- if (nr_tgids == PROC_MAXPIDS) {
- nr_tgids--;
- next_tgid = tgid_array[nr_tgids];
- }
+/*
+ * Find the first tid of a thread group to return to user space.
+ *
+ * Usually this is just the thread group leader, but if the user's
+ * buffer was too small or there was a seek into the middle of the
+ * directory we have more work to do.
+ *
+ * In the case of a short read we start with find_task_by_pid.
+ *
+ * In the case of a seek we start with the leader and walk nr
+ * threads past it.
+ */
+static struct task_struct *first_tid(struct task_struct *leader,
+ int tid, int nr)
+{
+ struct task_struct *pos;
- for (i=0;i<nr_tgids;i++) {
- int tgid = tgid_array[i];
- ino_t ino = fake_ino(tgid,PROC_TGID_INO);
- unsigned long j = PROC_NUMBUF;
+ rcu_read_lock();
+ /* Attempt to start with the pid of a thread */
+ if (tid && (nr > 0)) {
+ pos = find_task_by_pid(tid);
+ if (pos && (pos->group_leader == leader))
+ goto found;
+ }
- do
- buf[--j] = '0' + (tgid % 10);
- while ((tgid /= 10) != 0);
+ /* If nr exceeds the number of threads there is nothing to do */
+ pos = NULL;
+ if (nr && nr >= get_nr_threads(leader))
+ goto out;
- if (filldir(dirent, buf+j, PROC_NUMBUF-j, filp->f_pos, ino, DT_DIR) < 0) {
- /* returning this tgid failed, save it as the first
- * pid for the next readir call */
- filp->f_version = tgid_array[i];
- goto out;
- }
- filp->f_pos++;
- nr++;
+ /* If we haven't found our starting place yet start
+ * with the leader and walk nr threads forward.
+ */
+ for (pos = leader; nr > 0; --nr) {
+ pos = next_thread(pos);
+ if (pos == leader) {
+ pos = NULL;
+ goto out;
}
}
+found:
+ get_task_struct(pos);
out:
- return 0;
+ rcu_read_unlock();
+ return pos;
+}
+
+/*
+ * Advance to the thread that follows start in its thread group.
+ * Returns NULL when start is no longer alive or when the walk has
+ * come back around to the thread group leader.
+ *
+ * The reference on start is dropped; a reference is taken on the
+ * task that is returned.
+ */
+static struct task_struct *next_tid(struct task_struct *start)
+{
+	struct task_struct *next = NULL;
+
+	rcu_read_lock();
+	if (!pid_alive(start))
+		goto out;
+	next = next_thread(start);
+	if (!thread_group_leader(next))
+		get_task_struct(next);
+	else
+		next = NULL;
+out:
+	rcu_read_unlock();
+	put_task_struct(start);
+	return next;
 }
/* for the /proc/TGID/task/ directories */
static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldir)
{
- unsigned int tid_array[PROC_MAXPIDS];
char buf[PROC_NUMBUF];
- unsigned int nr_tids, i;
struct dentry *dentry = filp->f_dentry;
struct inode *inode = dentry->d_inode;
+ struct task_struct *leader = get_proc_task(inode);
+ struct task_struct *task;
int retval = -ENOENT;
ino_t ino;
+ int tid;
unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */
- if (!pid_alive(proc_task(inode)))
- goto out;
+ if (!leader)
+ goto out_no_task;
retval = 0;
switch (pos) {
@@ -2290,24 +2319,45 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi
/* fall through */
}
- nr_tids = get_tid_list(pos, tid_array, inode);
- inode->i_nlink = pos + nr_tids;
-
- for (i = 0; i < nr_tids; i++) {
- unsigned long j = PROC_NUMBUF;
- int tid = tid_array[i];
-
- ino = fake_ino(tid,PROC_TID_INO);
-
- do
- buf[--j] = '0' + (tid % 10);
- while ((tid /= 10) != 0);
-
- if (filldir(dirent, buf+j, PROC_NUMBUF-j, pos, ino, DT_DIR) < 0)
+	/* f_version caches the tid value that the last readdir call couldn't
+	 * return. lseek aka telldir automagically resets f_version to 0.
+	 */
+	tid = filp->f_version;
+	filp->f_version = 0;
+	/* pos counts the leading entries handled by the switch above, so the
+	 * thread index handed to first_tid() is pos - 2.  next_tid() drops the
+	 * reference on the task it is given.
+	 */
+	for (task = first_tid(leader, tid, pos - 2);
+	     task;
+	     task = next_tid(task), pos++) {
+		int len;
+		tid = task->pid;
+		len = snprintf(buf, sizeof(buf), "%d", tid);
+		ino = fake_ino(tid, PROC_TID_INO);
+		/* The "< 0" test belongs to filldir()'s return value; DT_DIR is
+		 * the d_type argument and must be passed through unchanged.
+		 */
+		if (filldir(dirent, buf, len, pos, ino, DT_DIR) < 0) {
+			/* returning this tid failed, save it as the first
+			 * tid for the next readdir call */
+			filp->f_version = tid;
+			put_task_struct(task);
 			break;
-		pos++;
+		}
 	}
out:
filp->f_pos = pos;
+ put_task_struct(leader);
+out_no_task:
return retval;
}
+
+/*
+ * Fill in stat for a proc task inode: the generic inode attributes,
+ * plus get_nr_threads() added to nlink while the task can still be
+ * looked up.  Always returns 0.
+ */
+static int proc_task_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
+{
+	struct inode *inode = dentry->d_inode;
+	struct task_struct *task = get_proc_task(inode);
+
+	generic_fillattr(inode, stat);
+	/* Task already gone: report the generic attributes only */
+	if (!task)
+		return 0;
+
+	rcu_read_lock();
+	stat->nlink += get_nr_threads(task);
+	rcu_read_unlock();
+	put_task_struct(task);
+	return 0;
+}
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 722b9c46311..6dcef089e18 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -58,14 +58,11 @@ static void de_put(struct proc_dir_entry *de)
static void proc_delete_inode(struct inode *inode)
{
struct proc_dir_entry *de;
- struct task_struct *tsk;
truncate_inode_pages(&inode->i_data, 0);
- /* Let go of any associated process */
- tsk = PROC_I(inode)->task;
- if (tsk)
- put_task_struct(tsk);
+ /* Stop tracking associated processes */
+ put_pid(PROC_I(inode)->pid);
/* Let go of any associated proc directory entry */
de = PROC_I(inode)->pde;
@@ -94,8 +91,8 @@ static struct inode *proc_alloc_inode(struct super_block *sb)
ei = (struct proc_inode *)kmem_cache_alloc(proc_inode_cachep, SLAB_KERNEL);
if (!ei)
return NULL;
- ei->task = NULL;
- ei->type = 0;
+ ei->pid = NULL;
+ ei->fd = 0;
ei->op.proc_get_link = NULL;
ei->pde = NULL;
inode = &ei->vfs_inode;
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index 0502f17b860..146a434ba94 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -37,16 +37,30 @@ extern int proc_tgid_stat(struct task_struct *, char *);
extern int proc_pid_status(struct task_struct *, char *);
extern int proc_pid_statm(struct task_struct *, char *);
+/* File operations for the per-task maps, numa_maps and smaps files.
+ * Defined in fs/proc/task_mmu.c; fs/proc/task_nommu.c provides its own
+ * proc_maps_operations. */
+extern struct file_operations proc_maps_operations;
+extern struct file_operations proc_numa_maps_operations;
+extern struct file_operations proc_smaps_operations;
+
void free_proc_entry(struct proc_dir_entry *de);
int proc_init_inodecache(void);
-static inline struct task_struct *proc_task(struct inode *inode)
+/* Return the struct pid recorded in the proc inode; no reference is taken. */
+static inline struct pid *proc_pid(struct inode *inode)
+{
+	struct proc_inode *ei = PROC_I(inode);
+
+	return ei->pid;
+}
+
+static inline struct task_struct *get_proc_task(struct inode *inode)
{
- return PROC_I(inode)->task;
+ return get_pid_task(proc_pid(inode), PIDTYPE_PID);
}
-static inline int proc_type(struct inode *inode)
+static inline int proc_fd(struct inode *inode)
{
- return PROC_I(inode)->type;
+ return PROC_I(inode)->fd;
}
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 91b7c15ab37..0137ec4c136 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -75,9 +75,13 @@ int proc_exe_link(struct inode *inode, struct dentry **dentry, struct vfsmount *
{
struct vm_area_struct * vma;
int result = -ENOENT;
- struct task_struct *task = proc_task(inode);
- struct mm_struct * mm = get_task_mm(task);
+ struct task_struct *task = get_proc_task(inode);
+ struct mm_struct * mm = NULL;
+ if (task) {
+ mm = get_task_mm(task);
+ put_task_struct(task);
+ }
if (!mm)
goto out;
down_read(&mm->mmap_sem);
@@ -120,7 +124,8 @@ struct mem_size_stats
static int show_map_internal(struct seq_file *m, void *v, struct mem_size_stats *mss)
{
- struct task_struct *task = m->private;
+ struct proc_maps_private *priv = m->private;
+ struct task_struct *task = priv->task;
struct vm_area_struct *vma = v;
struct mm_struct *mm = vma->vm_mm;
struct file *file = vma->vm_file;
@@ -295,12 +300,16 @@ static int show_smap(struct seq_file *m, void *v)
static void *m_start(struct seq_file *m, loff_t *pos)
{
- struct task_struct *task = m->private;
+ struct proc_maps_private *priv = m->private;
unsigned long last_addr = m->version;
struct mm_struct *mm;
- struct vm_area_struct *vma, *tail_vma;
+ struct vm_area_struct *vma, *tail_vma = NULL;
loff_t l = *pos;
+ /* Clear the per syscall fields in priv */
+ priv->task = NULL;
+ priv->tail_vma = NULL;
+
/*
* We remember last_addr rather than next_addr to hit with
* mmap_cache most of the time. We have zero last_addr at
@@ -311,11 +320,15 @@ static void *m_start(struct seq_file *m, loff_t *pos)
if (last_addr == -1UL)
return NULL;
- mm = get_task_mm(task);
+ priv->task = get_pid_task(priv->pid, PIDTYPE_PID);
+ if (!priv->task)
+ return NULL;
+
+ mm = get_task_mm(priv->task);
if (!mm)
return NULL;
- tail_vma = get_gate_vma(task);
+ priv->tail_vma = tail_vma = get_gate_vma(priv->task);
down_read(&mm->mmap_sem);
/* Start with last addr hint */
@@ -350,11 +363,9 @@ out:
return tail_vma;
}
-static void m_stop(struct seq_file *m, void *v)
+static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma)
{
- struct task_struct *task = m->private;
- struct vm_area_struct *vma = v;
- if (vma && vma != get_gate_vma(task)) {
+ if (vma && vma != priv->tail_vma) {
struct mm_struct *mm = vma->vm_mm;
up_read(&mm->mmap_sem);
mmput(mm);
@@ -363,38 +374,103 @@ static void m_stop(struct seq_file *m, void *v)
static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct task_struct *task = m->private;
+ struct proc_maps_private *priv = m->private;
struct vm_area_struct *vma = v;
- struct vm_area_struct *tail_vma = get_gate_vma(task);
+ struct vm_area_struct *tail_vma = priv->tail_vma;
(*pos)++;
if (vma && (vma != tail_vma) && vma->vm_next)
return vma->vm_next;
- m_stop(m, v);
+ vma_stop(priv, vma);
return (vma != tail_vma)? tail_vma: NULL;
}
-struct seq_operations proc_pid_maps_op = {
+/*
+ * seq_file ->stop: release whatever vma_stop() still has to release
+ * for v, then drop the task reference that m_start stored in
+ * priv->task (it is NULL when m_start bailed out early).
+ */
+static void m_stop(struct seq_file *m, void *v)
+{
+	struct proc_maps_private *priv = m->private;
+
+	vma_stop(priv, v);
+	if (priv->task)
+		put_task_struct(priv->task);
+}
+
+static struct seq_operations proc_pid_maps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = show_map
};
-struct seq_operations proc_pid_smaps_op = {
+static struct seq_operations proc_pid_smaps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = show_smap
};
+/*
+ * Common open helper for the maps-style seq files: allocate a zeroed
+ * proc_maps_private, record the inode's struct pid in it and attach it
+ * to the seq_file opened with ops.  Returns 0 on success, -ENOMEM if
+ * the allocation fails, or the error from seq_open() (the private
+ * data is freed again in that case).
+ */
+static int do_maps_open(struct inode *inode, struct file *file,
+			struct seq_operations *ops)
+{
+	struct proc_maps_private *priv;
+	struct seq_file *m;
+	int ret;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	priv->pid = proc_pid(inode);
+	ret = seq_open(file, ops);
+	if (ret) {
+		kfree(priv);
+		return ret;
+	}
+
+	/* seq_open() succeeded, so file->private_data is the seq_file */
+	m = file->private_data;
+	m->private = priv;
+	return 0;
+}
+
+/* open() handler for the maps file: do_maps_open() with proc_pid_maps_op. */
+static int maps_open(struct inode *inode, struct file *file)
+{
+ return do_maps_open(inode, file, &proc_pid_maps_op);
+}
+
+/*
+ * File operations for the maps file, built on the seq_file helpers.
+ * maps_open() hangs a kzalloc'd proc_maps_private on m->private, which
+ * is why release is seq_release_private rather than seq_release
+ * (presumably it frees m->private -- confirm against seq_file.c).
+ */
+struct file_operations proc_maps_operations = {
+ .open = maps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
+
#ifdef CONFIG_NUMA
extern int show_numa_map(struct seq_file *m, void *v);
-struct seq_operations proc_pid_numa_maps_op = {
+static struct seq_operations proc_pid_numa_maps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = show_numa_map
};
+
+/* open() handler for the numa_maps file: do_maps_open() with proc_pid_numa_maps_op. */
+static int numa_maps_open(struct inode *inode, struct file *file)
+{
+ return do_maps_open(inode, file, &proc_pid_numa_maps_op);
+}
+
+/*
+ * File operations for the numa_maps file (only built under CONFIG_NUMA).
+ * Same seq_file wiring as the maps file; the private data set up by
+ * do_maps_open() is handed to seq_release_private on release.
+ */
+struct file_operations proc_numa_maps_operations = {
+ .open = numa_maps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
#endif
+
+/* open() handler for the smaps file: do_maps_open() with proc_pid_smaps_op. */
+static int smaps_open(struct inode *inode, struct file *file)
+{
+ return do_maps_open(inode, file, &proc_pid_smaps_op);
+}
+
+/*
+ * File operations for the smaps file.  Same seq_file wiring as the maps
+ * file; the private data set up by do_maps_open() is handed to
+ * seq_release_private on release.
+ */
+struct file_operations proc_smaps_operations = {
+ .open = smaps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release_private,
+};
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 8f68827ed10..af69f28277b 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -156,9 +156,28 @@ static void *m_next(struct seq_file *m, void *v, loff_t *pos)
{
return NULL;
}
-struct seq_operations proc_pid_maps_op = {
+static struct seq_operations proc_pid_maps_op = {
.start = m_start,
.next = m_next,
.stop = m_stop,
.show = show_map
};
+
+/*
+ * open() handler for the maps file in this variant: open the seq_file
+ * with proc_pid_maps_op and leave its private pointer NULL.
+ * Returns 0 or the error from seq_open().
+ */
+static int maps_open(struct inode *inode, struct file *file)
+{
+	struct seq_file *m;
+	int ret = seq_open(file, &proc_pid_maps_op);
+
+	if (ret)
+		return ret;
+
+	m = file->private_data;
+	m->private = NULL;
+	return 0;
+}
+
+/*
+ * File operations for the maps file in this variant.  maps_open() sets
+ * m->private to NULL, so plain seq_release is used on release.
+ */
+struct file_operations proc_maps_operations = {
+ .open = maps_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
+};
+
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index cf6e1cf4035..752cea12e30 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -1560,12 +1560,6 @@ static ssize_t reiserfs_file_write(struct file *file, /* the file we are going t
return res;
}
-static ssize_t reiserfs_aio_write(struct kiocb *iocb, const char __user * buf,
- size_t count, loff_t pos)
-{
- return generic_file_aio_write(iocb, buf, count, pos);
-}
-
const struct file_operations reiserfs_file_operations = {
.read = generic_file_read,
.write = reiserfs_file_write,
@@ -1575,7 +1569,7 @@ const struct file_operations reiserfs_file_operations = {
.fsync = reiserfs_sync_file,
.sendfile = generic_file_sendfile,
.aio_read = generic_file_aio_read,
- .aio_write = reiserfs_aio_write,
+ .aio_write = generic_file_aio_write,
.splice_read = generic_file_splice_read,
.splice_write = generic_file_splice_write,
};
diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c
index 1b73529b809..49d1a53dbef 100644
--- a/fs/reiserfs/journal.c
+++ b/fs/reiserfs/journal.c
@@ -834,8 +834,7 @@ static int write_ordered_buffers(spinlock_t * lock,
get_bh(bh);
if (test_set_buffer_locked(bh)) {
if (!buffer_dirty(bh)) {
- list_del_init(&jh->list);
- list_add(&jh->list, &tmp);
+ list_move(&jh->list, &tmp);
goto loop_next;
}
spin_unlock(lock);
@@ -855,8 +854,7 @@ static int write_ordered_buffers(spinlock_t * lock,
ret = -EIO;
}
if (buffer_dirty(bh)) {
- list_del_init(&jh->list);
- list_add(&jh->list, &tmp);
+ list_move(&jh->list, &tmp);
add_to_chunk(&chunk, bh, lock, write_ordered_chunk);
} else {
reiserfs_free_jh(bh);
diff --git a/fs/smbfs/request.c b/fs/smbfs/request.c
index c71dd2760d3..c8e96195b96 100644
--- a/fs/smbfs/request.c
+++ b/fs/smbfs/request.c
@@ -400,8 +400,7 @@ static int smb_request_send_req(struct smb_request *req)
if (!(req->rq_flags & SMB_REQ_TRANSMITTED))
goto out;
- list_del_init(&req->rq_queue);
- list_add_tail(&req->rq_queue, &server->recvq);
+ list_move_tail(&req->rq_queue, &server->recvq);
result = 1;
out:
return result;
@@ -435,8 +434,7 @@ int smb_request_send_server(struct smb_sb_info *server)
result = smb_request_send_req(req);
if (result < 0) {
server->conn_error = result;
- list_del_init(&req->rq_queue);
- list_add(&req->rq_queue, &server->xmitq);
+ list_move(&req->rq_queue, &server->xmitq);
result = -EIO;
goto out;
}
diff --git a/fs/smbfs/smbiod.c b/fs/smbfs/smbiod.c
index 3f71384020c..24577e2c489 100644
--- a/fs/smbfs/smbiod.c
+++ b/fs/smbfs/smbiod.c
@@ -193,8 +193,7 @@ int smbiod_retry(struct smb_sb_info *server)
if (req->rq_flags & SMB_REQ_RETRY) {
/* must move the request to the xmitq */
VERBOSE("retrying request %p on recvq\n", req);
- list_del(&req->rq_queue);
- list_add(&req->rq_queue, &server->xmitq);
+ list_move(&req->rq_queue, &server->xmitq);
continue;
}
#endif
diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c
index 610b5bdbe75..61c42430cba 100644
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -430,10 +430,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
i++;
/* fallthrough */
default:
- if (filp->f_pos == 2) {
- list_del(q);
- list_add(q, &parent_sd->s_children);
- }
+ if (filp->f_pos == 2)
+ list_move(q, &parent_sd->s_children);
+
for (p=q->next; p!= &parent_sd->s_children; p=p->next) {
struct sysfs_dirent *next;
const char * name;
@@ -455,8 +454,7 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
dt_type(next)) < 0)
return 0;
- list_del(q);
- list_add(q, p);
+ list_move(q, p);
p = q;
filp->f_pos++;
}