From 528da3e9e237059a84a2625e942811cf824a6efd Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Fri, 12 Jun 2009 16:04:26 -0400
Subject: inotify: inotify_destroy_mark_entry could get called twice

inotify_destroy_mark_entry could get called twice for the same mark since it
is called directly in inotify_rm_watch and when the mark is being destroyed for
another reason.  As an example assume that the file being watched was just
deleted so inotify_destroy_mark_entry would get called from the path
fsnotify_inoderemove() -> fsnotify_destroy_marks_by_inode() ->
fsnotify_destroy_mark_entry() -> inotify_destroy_mark_entry().  If this
happened at the same time as userspace tried to remove a watch via
inotify_rm_watch we could attempt to remove the mark from the idr twice and
could thus double dec the ref cnt and potentially could be in a use after
free/double free situation.  The fix is to have inotify_rm_watch use the
generic recursive safe fsnotify_destroy_mark_by_entry() so we are sure the
inotify_destroy_mark_entry() function can only be called one.

This patch also renames the function to inotify_ingored_remove_idr() so it is
clear what is actually going on in the function.

Hopefully this fixes:
[   20.342058] idr_remove called for id=20 which is not allocated.
[   20.348000] Pid: 1860, comm: udevd Not tainted 2.6.30-tip #1077
[   20.353933] Call Trace:
[   20.356410]  [<ffffffff811a82b7>] idr_remove+0x115/0x18f
[   20.361737]  [<ffffffff8134259d>] ? _spin_lock+0x6d/0x75
[   20.367061]  [<ffffffff8111640a>] ? inotify_destroy_mark_entry+0xa3/0xcf
[   20.373771]  [<ffffffff8111641e>] inotify_destroy_mark_entry+0xb7/0xcf
[   20.380306]  [<ffffffff81115913>] inotify_freeing_mark+0xe/0x10
[   20.386238]  [<ffffffff8111410d>] fsnotify_destroy_mark_by_entry+0x143/0x170
[   20.393293]  [<ffffffff811163a3>] inotify_destroy_mark_entry+0x3c/0xcf
[   20.399829]  [<ffffffff811164d1>] sys_inotify_rm_watch+0x9b/0xc6
[   20.405850]  [<ffffffff8100bcdb>] system_call_fastpath+0x16/0x1b

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Eric Paris <eparis@redhat.com>
Tested-by: Peter Ziljlstra <peterz@infradead.org>
---
 fs/notify/inotify/inotify.h          |  3 ++-
 fs/notify/inotify/inotify_fsnotify.c |  2 +-
 fs/notify/inotify/inotify_user.c     | 32 +++++---------------------------
 3 files changed, 8 insertions(+), 29 deletions(-)

(limited to 'fs/notify')
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index ea2605a58b8..f234f3a4c8c 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -15,7 +15,8 @@ struct inotify_inode_mark_entry {
 	int wd;
 };
 
-extern void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group);
+extern void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
+					   struct fsnotify_group *group);
 extern void inotify_free_event_priv(struct fsnotify_event_private_data *event_priv);
 
 extern const struct fsnotify_ops inotify_fsnotify_ops;
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 7ef75b83247..47cd258fd24 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -81,7 +81,7 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
 
 static void inotify_freeing_mark(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
 {
-	inotify_destroy_mark_entry(entry, group);
+	inotify_ignored_and_remove_idr(entry, group);
 }
 
 static bool inotify_should_send_event(struct fsnotify_group *group, struct inode *inode, __u32 mask)
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 982a412ac5b..ff231ad2389 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -363,39 +363,17 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
 }
 
 /*
- * When, for whatever reason, inotify is done with a mark (or what used to be a
- * watch) we need to remove that watch from the idr and we need to send IN_IGNORED
- * for the given wd.
- *
- * There is a bit of recursion here.  The loop looks like:
- * 	inotify_destroy_mark_entry -> fsnotify_destroy_mark_by_entry ->
- *	inotify_freeing_mark -> inotify_destory_mark_entry -> restart
- * But the loop is broken in 2 places.  fsnotify_destroy_mark_by_entry sets
- * entry->group = NULL before the call to inotify_freeing_mark, so the if (egroup)
- * test below will not call back to fsnotify again.  But even if that test wasn't
- * there this would still be safe since fsnotify_destroy_mark_by_entry() is
- * safe from recursion.
+ * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the
+ * internal reference help on the mark because it is in the idr.
  */
-void inotify_destroy_mark_entry(struct fsnotify_mark_entry *entry, struct fsnotify_group *group)
+void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
+				    struct fsnotify_group *group)
 {
 	struct inotify_inode_mark_entry *ientry;
 	struct inotify_event_private_data *event_priv;
 	struct fsnotify_event_private_data *fsn_event_priv;
-	struct fsnotify_group *egroup;
 	struct idr *idr;
 
-	spin_lock(&entry->lock);
-	egroup = entry->group;
-
-	/* if egroup we aren't really done and something might still send events
-	 * for this inode, on the callback we'll send the IN_IGNORED */
-	if (egroup) {
-		spin_unlock(&entry->lock);
-		fsnotify_destroy_mark_by_entry(entry);
-		return;
-	}
-	spin_unlock(&entry->lock);
-
 	ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
 
 	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
@@ -699,7 +677,7 @@ SYSCALL_DEFINE2(inotify_rm_watch, int, fd, __s32, wd)
 	fsnotify_get_mark(entry);
 	spin_unlock(&group->inotify_data.idr_lock);
 
-	inotify_destroy_mark_entry(entry, group);
+	fsnotify_destroy_mark_by_entry(entry);
 	fsnotify_put_mark(entry);
 
 out:
-- 
cgit v1.2.3-70-g09d2


From bdae997f44535ac4ebe1e055ffe59eeee946f453 Mon Sep 17 00:00:00 2001
From: Keith Packard <keithp@keithp.com>
Date: Wed, 1 Jul 2009 21:56:38 -0700
Subject: fs/notify/inotify: decrement user inotify count on close

The per-user inotify_devs value is incremented each time a new file is
allocated, but never decremented. This led to inotify_init failing after a
limited number of calls.

Signed-off-by: Keith Packard <keithp@keithp.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'fs/notify')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index ff231ad2389..ff27a296584 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -296,12 +296,15 @@ static int inotify_fasync(int fd, struct file *file, int on)
 static int inotify_release(struct inode *ignored, struct file *file)
 {
 	struct fsnotify_group *group = file->private_data;
+	struct user_struct *user = group->inotify_data.user;
 
 	fsnotify_clear_marks_by_group(group);
 
 	/* free this group, matching get was inotify_init->fsnotify_obtain_group */
 	fsnotify_put_group(group);
 
+	atomic_dec(&user->inotify_devs);
+
 	return 0;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 5549f7cdf84c02939fd368d0842aa2f472bb6e98 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 7 Jul 2009 10:28:23 -0400
Subject: inotify: drop user watch count when a watch is removed

The inotify rewrite forgot to drop the inotify watch use cound when a watch
was removed.  This means that a single inotify fd can only ever register a
maximum of /proc/sys/fs/max_user_watches even if some of those had been
freed.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs/notify')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index ff27a296584..1a870f9157b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -404,6 +404,8 @@ skip_send_ignore:
 
 	/* removed from idr, drop that reference */
 	fsnotify_put_mark(entry);
+
+	atomic_dec(&group->inotify_data.user->inotify_watches);
 }
 
 /* ding dong the mark is dead */
-- 
cgit v1.2.3-70-g09d2


From 75fe2b26394c59c8e16bd7b76f4be5d048103ad1 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 7 Jul 2009 10:28:23 -0400
Subject: inotify: do not leak inode marks in inotify_add_watch

inotify_add_watch had a couple of problems.  The biggest being that if
inotify_add_watch was called on the same inode twice (to update or change the
event mask) a refence was taken on the original inode mark by
fsnotify_find_mark_entry but was not being dropped at the end of the
inotify_add_watch call.  Thus if inotify_rm_watch was called although the mark
was removed from the inode, the refcnt wouldn't hit zero and we would leak
memory.

Reported-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'fs/notify')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 1a870f9157b..aff4214f16c 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -463,9 +463,6 @@ retry:
 			goto out_err;
 
 		spin_lock(&group->inotify_data.idr_lock);
-		/* if entry is added to the idr we keep the reference obtained
-		 * through fsnotify_mark_add.  remember to drop this reference
-		 * when entry is removed from idr */
 		ret = idr_get_new_above(&group->inotify_data.idr, entry,
 					++group->inotify_data.last_wd,
 					&ientry->wd);
@@ -476,8 +473,13 @@ retry:
 			goto out_err;
 		}
 		atomic_inc(&group->inotify_data.user->inotify_watches);
+
+		/* we put the mark on the idr, take a reference */
+		fsnotify_get_mark(entry);
 	}
 
+	ret = ientry->wd;
+
 	spin_lock(&entry->lock);
 
 	old_mask = entry->mask;
@@ -508,7 +510,11 @@ retry:
 			fsnotify_recalc_group_mask(group);
 	}
 
-	return ientry->wd;
+	/* this either matches fsnotify_find_mark_entry, or init_mark_entry
+	 * depending on which path we took... */
+	fsnotify_put_mark(entry);
+
+	return ret;
 
 out_err:
 	/* see this isn't supposed to happen, just kill the watch */
-- 
cgit v1.2.3-70-g09d2


From 7e790dd5fc937bc8d2400c30a05e32a9e9eef276 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 7 Jul 2009 10:28:24 -0400
Subject: inotify: fix error paths in inotify_update_watch

inotify_update_watch could leave things in a horrid state on a number of
error paths.  We could try to remove idr entries that didn't exist, we
could send an IN_IGNORED to userspace for watches that don't exist, and a
bit of other stupidity.  Clean these up by doing the idr addition before we
put the mark on the inode since we can clean that up on error and getting
off the inode's mark list is hard.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 79 +++++++++++++++++++++++++---------------
 1 file changed, 49 insertions(+), 30 deletions(-)

(limited to 'fs/notify')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index aff4214f16c..726118a5845 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -365,6 +365,17 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
 	return error;
 }
 
+static void inotify_remove_from_idr(struct fsnotify_group *group,
+				    struct inotify_inode_mark_entry *ientry)
+{
+	struct idr *idr;
+
+	spin_lock(&group->inotify_data.idr_lock);
+	idr = &group->inotify_data.idr;
+	idr_remove(idr, ientry->wd);
+	spin_unlock(&group->inotify_data.idr_lock);
+	ientry->wd = -1;
+}
 /*
  * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the
  * internal reference help on the mark because it is in the idr.
@@ -375,7 +386,6 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 	struct inotify_inode_mark_entry *ientry;
 	struct inotify_event_private_data *event_priv;
 	struct fsnotify_event_private_data *fsn_event_priv;
-	struct idr *idr;
 
 	ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
 
@@ -397,10 +407,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 skip_send_ignore:
 
 	/* remove this entry from the idr */
-	spin_lock(&group->inotify_data.idr_lock);
-	idr = &group->inotify_data.idr;
-	idr_remove(idr, ientry->wd);
-	spin_unlock(&group->inotify_data.idr_lock);
+	inotify_remove_from_idr(group, ientry);
 
 	/* removed from idr, drop that reference */
 	fsnotify_put_mark(entry);
@@ -420,6 +427,7 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
 {
 	struct fsnotify_mark_entry *entry = NULL;
 	struct inotify_inode_mark_entry *ientry;
+	struct inotify_inode_mark_entry *tmp_ientry;
 	int ret = 0;
 	int add = (arg & IN_MASK_ADD);
 	__u32 mask;
@@ -430,50 +438,60 @@ static int inotify_update_watch(struct fsnotify_group *group, struct inode *inod
 	if (unlikely(!mask))
 		return -EINVAL;
 
-	ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
-	if (unlikely(!ientry))
+	tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
+	if (unlikely(!tmp_ientry))
 		return -ENOMEM;
 	/* we set the mask at the end after attaching it */
-	fsnotify_init_mark(&ientry->fsn_entry, inotify_free_mark);
-	ientry->wd = 0;
+	fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
+	tmp_ientry->wd = -1;
 
 find_entry:
 	spin_lock(&inode->i_lock);
 	entry = fsnotify_find_mark_entry(group, inode);
 	spin_unlock(&inode->i_lock);
 	if (entry) {
-		kmem_cache_free(inotify_inode_mark_cachep, ientry);
 		ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
 	} else {
-		if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) {
-			ret = -ENOSPC;
-			goto out_err;
-		}
-
-		ret = fsnotify_add_mark(&ientry->fsn_entry, group, inode);
-		if (ret == -EEXIST)
-			goto find_entry;
-		else if (ret)
+		ret = -ENOSPC;
+		if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
 			goto out_err;
-
-		entry = &ientry->fsn_entry;
 retry:
 		ret = -ENOMEM;
 		if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
 			goto out_err;
 
 		spin_lock(&group->inotify_data.idr_lock);
-		ret = idr_get_new_above(&group->inotify_data.idr, entry,
-					++group->inotify_data.last_wd,
-					&ientry->wd);
+		ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
+					group->inotify_data.last_wd,
+					&tmp_ientry->wd);
 		spin_unlock(&group->inotify_data.idr_lock);
 		if (ret) {
 			if (ret == -EAGAIN)
 				goto retry;
 			goto out_err;
 		}
+
+		ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
+		if (ret) {
+			inotify_remove_from_idr(group, tmp_ientry);
+			if (ret == -EEXIST)
+				goto find_entry;
+			goto out_err;
+		}
+
+		/* tmp_ientry has been added to the inode, so we are all set up.
+		 * now we just need to make sure tmp_ientry doesn't get freed and
+		 * we need to set up entry and ientry so the generic code can
+		 * do its thing. */
+		ientry = tmp_ientry;
+		entry = &ientry->fsn_entry;
+		tmp_ientry = NULL;
+
 		atomic_inc(&group->inotify_data.user->inotify_watches);
 
+		/* update the idr hint */
+		group->inotify_data.last_wd = ientry->wd;
+
 		/* we put the mark on the idr, take a reference */
 		fsnotify_get_mark(entry);
 	}
@@ -514,14 +532,15 @@ retry:
 	 * depending on which path we took... */
 	fsnotify_put_mark(entry);
 
-	return ret;
-
 out_err:
-	/* see this isn't supposed to happen, just kill the watch */
-	if (entry) {
-		fsnotify_destroy_mark_by_entry(entry);
-		fsnotify_put_mark(entry);
+	/* could be an error, could be that we found an existing mark */
+	if (tmp_ientry) {
+		/* on the idr but didn't make it on the inode */
+		if (tmp_ientry->wd != -1)
+			inotify_remove_from_idr(group, tmp_ientry);
+		kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
 	}
+
 	return ret;
 }
 
-- 
cgit v1.2.3-70-g09d2


From 520dc2a526fd681337883b6ff1ddcf7c23b1b063 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 13 Jul 2009 15:56:54 -0400
Subject: fsnotify: use def_bool in kconfig instead of letting the user choose

fsnotify doens't give the user anything.  If someone chooses inotify or
dnotify it should build fsnotify, if they don't select one it shouldn't be
built.  This patch changes fsnotify to be a def_bool=n and makes everything
else select it.  Also fixes the issue people complained about on lwn where
gdm hung because they didn't have inotify and they didn't get the inotify
build option.....

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/Kconfig         | 12 +-----------
 fs/notify/dnotify/Kconfig |  2 +-
 fs/notify/inotify/Kconfig |  2 +-
 3 files changed, 3 insertions(+), 13 deletions(-)

(limited to 'fs/notify')

diff --git a/fs/notify/Kconfig b/fs/notify/Kconfig
index 31dac7e3b0f..dffbb0911d0 100644
--- a/fs/notify/Kconfig
+++ b/fs/notify/Kconfig
@@ -1,15 +1,5 @@
 config FSNOTIFY
-	bool "Filesystem notification backend"
-	default y
-	---help---
-	   fsnotify is a backend for filesystem notification.  fsnotify does
-	   not provide any userspace interface but does provide the basis
-	   needed for other notification schemes such as dnotify, inotify,
-	   and fanotify.
-
-	   Say Y here to enable fsnotify suport.
-
-	   If unsure, say Y.
+	def_bool n
 
 source "fs/notify/dnotify/Kconfig"
 source "fs/notify/inotify/Kconfig"
diff --git a/fs/notify/dnotify/Kconfig b/fs/notify/dnotify/Kconfig
index 904ff8d5405..f9c1ca139d8 100644
--- a/fs/notify/dnotify/Kconfig
+++ b/fs/notify/dnotify/Kconfig
@@ -1,6 +1,6 @@
 config DNOTIFY
 	bool "Dnotify support"
-	depends on FSNOTIFY
+	select FSNOTIFY
 	default y
 	help
 	  Dnotify is a directory-based per-fd file change notification system
diff --git a/fs/notify/inotify/Kconfig b/fs/notify/inotify/Kconfig
index 5356884289a..3e56dbffe72 100644
--- a/fs/notify/inotify/Kconfig
+++ b/fs/notify/inotify/Kconfig
@@ -15,7 +15,7 @@ config INOTIFY
 
 config INOTIFY_USER
 	bool "Inotify support for userspace"
-	depends on FSNOTIFY
+	select FSNOTIFY
 	default y
 	---help---
 	  Say Y here to enable inotify support for userspace, including the
-- 
cgit v1.2.3-70-g09d2


From 4a148ba988988b9c400ad0f2cbccc155289b954b Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 13 Jul 2009 15:56:55 -0400
Subject: inotify: check filename before dropping repeat events

inotify drops events if the last event on the queue is the same as the
current event.  But it does 2 things wrong.  First it is comparing old->inode
with new->inode.  But after an event if put on the queue the ->inode is no
longer allowed to be used.  It's possible between the last event and this new
event the inode could be reused and we would falsely match the inode's memory
address between two differing events.

The second problem is that when a file is removed fsnotify is passed the
negative dentry for the removed object rather than the postive dentry from
immediately before the removal.  This mean the (broken) inotify tail drop code
was matching the NULL ->inode of differing events.

The fix is to check the file name which is stored with events when doing the
tail drop instead of wrongly checking the address of the stored ->inode.

Reported-by: Scott James Remnant <scott@ubuntu.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/notification.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'fs/notify')

diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 959b73e756f..69391fe8efb 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -136,10 +136,15 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
 {
 	if ((old->mask == new->mask) &&
 	    (old->to_tell == new->to_tell) &&
-	    (old->data_type == new->data_type)) {
+	    (old->data_type == new->data_type) &&
+	    (old->name_len == new->name_len)) {
 		switch (old->data_type) {
 		case (FSNOTIFY_EVENT_INODE):
-			if (old->inode == new->inode)
+			/* remember, after old was put on the wait_q we aren't
+			 * allowed to look at the inode any more, only thing
+			 * left to check was if the file_name is the same */
+			if (old->name_len &&
+			    !strcmp(old->file_name, new->file_name))
 				return true;
 			break;
 		case (FSNOTIFY_EVENT_PATH):
-- 
cgit v1.2.3-70-g09d2


From c05594b62125c528d93af3a78229793aae36df7f Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 13 Jul 2009 15:56:55 -0400
Subject: fsnotify: fix inotify tail drop check with path entries

fsnotify drops new events when they are the same as the tail event on the
queue to be sent to userspace.  The problem is that if the event comes with
a path we forget to break out of the switch statement and fall into the
code path which matches on events that do not have any type of file backed
information (things like IN_UNMOUNT and IN_Q_OVERFLOW).  The problem is
that this code thinks all such events should be dropped.  Fix is to add a
break.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/notification.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs/notify')

diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 69391fe8efb..2b20feaf263 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -151,6 +151,7 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
 			if ((old->path.mnt == new->path.mnt) &&
 			    (old->path.dentry == new->path.dentry))
 				return true;
+			break;
 		case (FSNOTIFY_EVENT_NONE):
 			return true;
 		};
-- 
cgit v1.2.3-70-g09d2


From f44aebcc566d1d6275f7191867b9633dc11de2ee Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Wed, 15 Jul 2009 15:49:52 -0400
Subject: inotify: use GFP_NOFS under potential memory pressure

inotify can have a watchs removed under filesystem reclaim.

=================================
[ INFO: inconsistent lock state ]
2.6.31-rc2 #16
---------------------------------
inconsistent {IN-RECLAIM_FS-W} -> {RECLAIM_FS-ON-W} usage.
khubd/217 [HC0[0]:SC0[0]:HE1:SE1] takes:
 (iprune_mutex){+.+.?.}, at: [<c10ba899>] invalidate_inodes+0x20/0xe3
{IN-RECLAIM_FS-W} state was registered at:
  [<c10536ab>] __lock_acquire+0x2c9/0xac4
  [<c1053f45>] lock_acquire+0x9f/0xc2
  [<c1308872>] __mutex_lock_common+0x2d/0x323
  [<c1308c00>] mutex_lock_nested+0x2e/0x36
  [<c10ba6ff>] shrink_icache_memory+0x38/0x1b2
  [<c108bfb6>] shrink_slab+0xe2/0x13c
  [<c108c3e1>] kswapd+0x3d1/0x55d
  [<c10449b5>] kthread+0x66/0x6b
  [<c1003fdf>] kernel_thread_helper+0x7/0x10
  [<ffffffff>] 0xffffffff

Two things are needed to fix this.  First we need a method to tell
fsnotify_create_event() to use GFP_NOFS and second we need to stop using
one global IN_IGNORED event and allocate them one at a time.  This solves
current issues with multiple IN_IGNORED on a queue having tail drop
problems and simplifies the allocations since we don't have to worry about
two tasks opperating on the IGNORED event concurrently.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/fsnotify.c             |  4 +++-
 fs/notify/inotify/inotify_user.c | 18 ++++++++++++------
 fs/notify/notification.c         |  9 +++++----
 include/linux/fsnotify_backend.h |  2 +-
 4 files changed, 21 insertions(+), 12 deletions(-)

(limited to 'fs/notify')

diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c
index ec2f7bd7681..037e878e03f 100644
--- a/fs/notify/fsnotify.c
+++ b/fs/notify/fsnotify.c
@@ -159,7 +159,9 @@ void fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, const
 			if (!group->ops->should_send_event(group, to_tell, mask))
 				continue;
 			if (!event) {
-				event = fsnotify_create_event(to_tell, mask, data, data_is, file_name, cookie);
+				event = fsnotify_create_event(to_tell, mask, data,
+							      data_is, file_name, cookie,
+							      GFP_KERNEL);
 				/* shit, we OOM'd and now we can't tell, maybe
 				 * someday someone else will want to do something
 				 * here */
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 726118a5845..f30d9bbc2e1 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -57,7 +57,6 @@ int inotify_max_user_watches __read_mostly;
 
 static struct kmem_cache *inotify_inode_mark_cachep __read_mostly;
 struct kmem_cache *event_priv_cachep __read_mostly;
-static struct fsnotify_event *inotify_ignored_event;
 
 /*
  * When inotify registers a new group it increments this and uses that
@@ -384,12 +383,19 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 				    struct fsnotify_group *group)
 {
 	struct inotify_inode_mark_entry *ientry;
+	struct fsnotify_event *ignored_event;
 	struct inotify_event_private_data *event_priv;
 	struct fsnotify_event_private_data *fsn_event_priv;
 
+	ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
+					      FSNOTIFY_EVENT_NONE, NULL, 0,
+					      GFP_NOFS);
+	if (!ignored_event)
+		return;
+
 	ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
 
-	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_KERNEL);
+	event_priv = kmem_cache_alloc(event_priv_cachep, GFP_NOFS);
 	if (unlikely(!event_priv))
 		goto skip_send_ignore;
 
@@ -398,7 +404,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 	fsn_event_priv->group = group;
 	event_priv->wd = ientry->wd;
 
-	fsnotify_add_notify_event(group, inotify_ignored_event, fsn_event_priv);
+	fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
 
 	/* did the private data get added? */
 	if (list_empty(&fsn_event_priv->event_list))
@@ -406,6 +412,9 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 
 skip_send_ignore:
 
+	/* matches the reference taken when the event was created */
+	fsnotify_put_event(ignored_event);
+
 	/* remove this entry from the idr */
 	inotify_remove_from_idr(group, ientry);
 
@@ -748,9 +757,6 @@ static int __init inotify_user_setup(void)
 
 	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark_entry, SLAB_PANIC);
 	event_priv_cachep = KMEM_CACHE(inotify_event_private_data, SLAB_PANIC);
-	inotify_ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0);
-	if (!inotify_ignored_event)
-		panic("unable to allocate the inotify ignored event\n");
 
 	inotify_max_queued_events = 16384;
 	inotify_max_user_instances = 128;
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 2b20feaf263..521368574e9 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -153,7 +153,7 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
 				return true;
 			break;
 		case (FSNOTIFY_EVENT_NONE):
-			return true;
+			return false;
 		};
 	}
 	return false;
@@ -345,18 +345,19 @@ static void initialize_event(struct fsnotify_event *event)
  * @name the filename, if available
  */
 struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask, void *data,
-					     int data_type, const char *name, u32 cookie)
+					     int data_type, const char *name, u32 cookie,
+					     gfp_t gfp)
 {
 	struct fsnotify_event *event;
 
-	event = kmem_cache_alloc(fsnotify_event_cachep, GFP_KERNEL);
+	event = kmem_cache_alloc(fsnotify_event_cachep, gfp);
 	if (!event)
 		return NULL;
 
 	initialize_event(event);
 
 	if (name) {
-		event->file_name = kstrdup(name, GFP_KERNEL);
+		event->file_name = kstrdup(name, gfp);
 		if (!event->file_name) {
 			kmem_cache_free(fsnotify_event_cachep, event);
 			return NULL;
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 6c3de999fb3..4d6f47b5118 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -352,7 +352,7 @@ extern void fsnotify_unmount_inodes(struct list_head *list);
 /* put here because inotify does some weird stuff when destroying watches */
 extern struct fsnotify_event *fsnotify_create_event(struct inode *to_tell, __u32 mask,
 						    void *data, int data_is, const char *name,
-						    u32 cookie);
+						    u32 cookie, gfp_t gfp);
 
 #else
 
-- 
cgit v1.2.3-70-g09d2


From eef3a116be11d35396efb2a8cc7345fd3221e294 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Sun, 16 Aug 2009 21:51:44 -0400
Subject: notify: unused event private race

inotify decides if private data it passed to get added to an event was
used by checking list_empty().  But it's possible that the event may
have been dequeued and the private event removed so it would look empty.

The fix is to use the return code from fsnotify_add_notify_event rather
than looking at the list.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/inotify/inotify_fsnotify.c | 13 +++++++------
 fs/notify/inotify/inotify_user.c     |  7 +++----
 fs/notify/notification.c             |  7 +++----
 3 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'fs/notify')

diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 47cd258fd24..5dcbafe72d7 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -62,13 +62,14 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev
 	event_priv->wd = wd;
 
 	ret = fsnotify_add_notify_event(group, event, fsn_event_priv);
-	/* EEXIST is not an error */
-	if (ret == -EEXIST)
-		ret = 0;
-
-	/* did event_priv get attached? */
-	if (list_empty(&fsn_event_priv->event_list))
+	if (ret) {
 		inotify_free_event_priv(fsn_event_priv);
+		/* EEXIST says we tail matched, EOVERFLOW isn't something
+		 * to report up the stack. */
+		if ((ret == -EEXIST) ||
+		    (ret == -EOVERFLOW))
+			ret = 0;
+	}
 
 	/*
 	 * If we hold the entry until after the event is on the queue
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index f30d9bbc2e1..c172a7a17b1 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -386,6 +386,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 	struct fsnotify_event *ignored_event;
 	struct inotify_event_private_data *event_priv;
 	struct fsnotify_event_private_data *fsn_event_priv;
+	int ret;
 
 	ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL,
 					      FSNOTIFY_EVENT_NONE, NULL, 0,
@@ -404,10 +405,8 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 	fsn_event_priv->group = group;
 	event_priv->wd = ientry->wd;
 
-	fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
-
-	/* did the private data get added? */
-	if (list_empty(&fsn_event_priv->event_list))
+	ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv);
+	if (ret)
 		inotify_free_event_priv(fsn_event_priv);
 
 skip_send_ignore:
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 521368574e9..74b3cf30bc6 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -171,9 +171,7 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even
 	struct list_head *list = &group->notification_list;
 	struct fsnotify_event_holder *last_holder;
 	struct fsnotify_event *last_event;
-
-	/* easy to tell if priv was attached to the event */
-	INIT_LIST_HEAD(&priv->event_list);
+	int ret = 0;
 
 	/*
 	 * There is one fsnotify_event_holder embedded inside each fsnotify_event.
@@ -194,6 +192,7 @@ alloc_holder:
 
 	if (group->q_len >= group->max_events) {
 		event = &q_overflow_event;
+		ret = -EOVERFLOW;
 		/* sorry, no private data on the overflow event */
 		priv = NULL;
 	}
@@ -235,7 +234,7 @@ alloc_holder:
 	mutex_unlock(&group->notification_mutex);
 
 	wake_up(&group->notification_waitq);
-	return 0;
+	return ret;
 }
 
 /*
-- 
cgit v1.2.3-70-g09d2


From cd94c8bbef8d4b796a7ed4c551355a334604fd36 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Sun, 16 Aug 2009 21:51:49 -0400
Subject: inotify: tail drop inotify q_overflow events

In f44aebcc the tail drop logic of events with no file backing
(q_overflow and in_ignored) was reversed so IN_IGNORED events would
never be tail dropped.  This now means that Q_OVERFLOW events are NOT
tail dropped.  The fix is to not tail drop IN_IGNORED, but to tail drop
Q_OVERFLOW.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/notification.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs/notify')

diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 74b3cf30bc6..3816d5750dd 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -153,6 +153,10 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new
 				return true;
 			break;
 		case (FSNOTIFY_EVENT_NONE):
+			if (old->mask & FS_Q_OVERFLOW)
+				return true;
+			else if (old->mask & FS_IN_IGNORED)
+				return false;
 			return false;
 		};
 	}
-- 
cgit v1.2.3-70-g09d2


From 08e53fcb0db34baca3db84a457b6d67faabee4c6 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Sun, 16 Aug 2009 21:51:55 -0400
Subject: inotify: start watch descriptor count at 1

The inotify_add_watch man page specifies that inotify_add_watch() will
return a non-negative integer.  However, historically the inotify
watches started at 1, not at 0.

Turns out that the inotifywait program provided by the inotify-tools
package doesn't properly handle a 0 watch descriptor.  In 7e790dd5 we
changed from starting at 1 to starting at 0.  This patch starts at 1,
just like in previous kernels, but also just like in previous kernels
it's possible for it to wrap back to 0.  This preserves the kernel
functionality exactly like it was before the patch (neither method broke
the spec)

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/inotify/inotify_user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs/notify')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index c172a7a17b1..dc32ed8323b 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -567,7 +567,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign
 
 	spin_lock_init(&group->inotify_data.idr_lock);
 	idr_init(&group->inotify_data.idr);
-	group->inotify_data.last_wd = 0;
+	group->inotify_data.last_wd = 1;
 	group->inotify_data.user = user;
 	group->inotify_data.fa = NULL;
 
-- 
cgit v1.2.3-70-g09d2


From 52cef7555adf5ca09b3b7283097466759120d901 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 24 Aug 2009 16:03:35 -0400
Subject: inotify: seperate new watch creation updating existing watches

There is nothing known wrong with the inotify watch addition/modification
but this patch seperates the two code paths to make them each easy to
verify as correct.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 172 +++++++++++++++++++++++----------------
 1 file changed, 103 insertions(+), 69 deletions(-)

(limited to 'fs/notify')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index dc32ed8323b..d8f73c25307 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -431,80 +431,29 @@ static void inotify_free_mark(struct fsnotify_mark_entry *entry)
 	kmem_cache_free(inotify_inode_mark_cachep, ientry);
 }
 
-static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
+static int inotify_update_existing_watch(struct fsnotify_group *group,
+					 struct inode *inode,
+					 u32 arg)
 {
-	struct fsnotify_mark_entry *entry = NULL;
+	struct fsnotify_mark_entry *entry;
 	struct inotify_inode_mark_entry *ientry;
-	struct inotify_inode_mark_entry *tmp_ientry;
-	int ret = 0;
-	int add = (arg & IN_MASK_ADD);
-	__u32 mask;
 	__u32 old_mask, new_mask;
+	__u32 mask;
+	int add = (arg & IN_MASK_ADD);
+	int ret;
 
 	/* don't allow invalid bits: we don't want flags set */
 	mask = inotify_arg_to_mask(arg);
 	if (unlikely(!mask))
 		return -EINVAL;
 
-	tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
-	if (unlikely(!tmp_ientry))
-		return -ENOMEM;
-	/* we set the mask at the end after attaching it */
-	fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
-	tmp_ientry->wd = -1;
-
-find_entry:
 	spin_lock(&inode->i_lock);
 	entry = fsnotify_find_mark_entry(group, inode);
 	spin_unlock(&inode->i_lock);
-	if (entry) {
-		ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
-	} else {
-		ret = -ENOSPC;
-		if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
-			goto out_err;
-retry:
-		ret = -ENOMEM;
-		if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
-			goto out_err;
-
-		spin_lock(&group->inotify_data.idr_lock);
-		ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
-					group->inotify_data.last_wd,
-					&tmp_ientry->wd);
-		spin_unlock(&group->inotify_data.idr_lock);
-		if (ret) {
-			if (ret == -EAGAIN)
-				goto retry;
-			goto out_err;
-		}
+	if (!entry)
+		return -ENOENT;
 
-		ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
-		if (ret) {
-			inotify_remove_from_idr(group, tmp_ientry);
-			if (ret == -EEXIST)
-				goto find_entry;
-			goto out_err;
-		}
-
-		/* tmp_ientry has been added to the inode, so we are all set up.
-		 * now we just need to make sure tmp_ientry doesn't get freed and
-		 * we need to set up entry and ientry so the generic code can
-		 * do its thing. */
-		ientry = tmp_ientry;
-		entry = &ientry->fsn_entry;
-		tmp_ientry = NULL;
-
-		atomic_inc(&group->inotify_data.user->inotify_watches);
-
-		/* update the idr hint */
-		group->inotify_data.last_wd = ientry->wd;
-
-		/* we put the mark on the idr, take a reference */
-		fsnotify_get_mark(entry);
-	}
-
-	ret = ientry->wd;
+	ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
 
 	spin_lock(&entry->lock);
 
@@ -536,18 +485,103 @@ retry:
 			fsnotify_recalc_group_mask(group);
 	}
 
-	/* this either matches fsnotify_find_mark_entry, or init_mark_entry
-	 * depending on which path we took... */
+	/* return the wd */
+	ret = ientry->wd;
+
+	/* match the get from fsnotify_find_mark_entry() */
 	fsnotify_put_mark(entry);
 
+	return ret;
+}
+
+static int inotify_new_watch(struct fsnotify_group *group,
+			     struct inode *inode,
+			     u32 arg)
+{
+	struct inotify_inode_mark_entry *tmp_ientry;
+	__u32 mask;
+	int ret;
+
+	/* don't allow invalid bits: we don't want flags set */
+	mask = inotify_arg_to_mask(arg);
+	if (unlikely(!mask))
+		return -EINVAL;
+
+	tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL);
+	if (unlikely(!tmp_ientry))
+		return -ENOMEM;
+
+	fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark);
+	tmp_ientry->fsn_entry.mask = mask;
+	tmp_ientry->wd = -1;
+
+	ret = -ENOSPC;
+	if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches)
+		goto out_err;
+retry:
+	ret = -ENOMEM;
+	if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL)))
+		goto out_err;
+
+	spin_lock(&group->inotify_data.idr_lock);
+	ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry,
+				group->inotify_data.last_wd,
+				&tmp_ientry->wd);
+	spin_unlock(&group->inotify_data.idr_lock);
+	if (ret) {
+		/* idr was out of memory allocate and try again */
+		if (ret == -EAGAIN)
+			goto retry;
+		goto out_err;
+	}
+
+	/* we are on the idr, now get on the inode */
+	ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
+	if (ret) {
+		/* we failed to get on the inode, get off the idr */
+		inotify_remove_from_idr(group, tmp_ientry);
+		goto out_err;
+	}
+
+	/* we put the mark on the idr, take a reference */
+	fsnotify_get_mark(&tmp_ientry->fsn_entry);
+
+	/* update the idr hint, who cares about races, it's just a hint */
+	group->inotify_data.last_wd = tmp_ientry->wd;
+
+	/* increment the number of watches the user has */
+	atomic_inc(&group->inotify_data.user->inotify_watches);
+
+	/* return the watch descriptor for this new entry */
+	ret = tmp_ientry->wd;
+
+	/* match the ref from fsnotify_init_markentry() */
+	fsnotify_put_mark(&tmp_ientry->fsn_entry);
+
 out_err:
-	/* could be an error, could be that we found an existing mark */
-	if (tmp_ientry) {
-		/* on the idr but didn't make it on the inode */
-		if (tmp_ientry->wd != -1)
-			inotify_remove_from_idr(group, tmp_ientry);
+	if (ret < 0)
 		kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
-	}
+
+	return ret;
+}
+
+static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg)
+{
+	int ret = 0;
+
+retry:
+	/* try to update and existing watch with the new arg */
+	ret = inotify_update_existing_watch(group, inode, arg);
+	/* no mark present, try to add a new one */
+	if (ret == -ENOENT)
+		ret = inotify_new_watch(group, inode, arg);
+	/*
+	 * inotify_new_watch could race with another thread which did an
+	 * inotify_new_watch between the update_existing and the add watch
+	 * here, go back and try to update an existing mark again.
+	 */
+	if (ret == -EEXIST)
+		goto retry;
 
 	return ret;
 }
-- 
cgit v1.2.3-70-g09d2


From cf4374267fbe966e8e4e7db68f5dc7b267439780 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 24 Aug 2009 16:03:35 -0400
Subject: inotify: do not BUG on idr entries at inotify destruction

If an inotify watch is left in the idr when an fsnotify group is destroyed
this will lead to a BUG.  This is not a dangerous situation and really
indicates a programming bug and leak of memory.  This patch changes it to
use a WARN and a printk rather than killing people's boxes.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_fsnotify.c | 33 +++++++++++++++++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

(limited to 'fs/notify')

diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 5dcbafe72d7..c9ee67b442e 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -105,16 +105,45 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode
 	return send;
 }
 
+/*
+ * This is NEVER supposed to be called.  Inotify marks should either have been
+ * removed from the idr when the watch was removed or in the
+ * fsnotify_destroy_mark_by_group() call when the inotify instance was being
+ * torn down.  This is only called if the idr is about to be freed but there
+ * are still marks in it.
+ */
 static int idr_callback(int id, void *p, void *data)
 {
-	BUG();
+	struct fsnotify_mark_entry *entry;
+	struct inotify_inode_mark_entry *ientry;
+	static bool warned = false;
+
+	if (warned)
+		return 0;
+
+	warned = false;
+	entry = p;
+	ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
+
+	WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in "
+		"idr.  Probably leaking memory\n", id, p, data);
+
+	/*
+	 * I'm taking the liberty of assuming that the mark in question is a
+	 * valid address and I'm dereferencing it.  This might help to figure
+	 * out why we got here and the panic is no worse than the original
+	 * BUG() that was here.
+	 */
+	if (entry)
+		printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n",
+			entry->group, entry->inode, ientry->wd);
 	return 0;
 }
 
 static void inotify_free_group_priv(struct fsnotify_group *group)
 {
 	/* ideally the idr is empty and we won't hit the BUG in teh callback */
-	idr_for_each(&group->inotify_data.idr, idr_callback, NULL);
+	idr_for_each(&group->inotify_data.idr, idr_callback, group);
 	idr_remove_all(&group->inotify_data.idr);
 	idr_destroy(&group->inotify_data.idr);
 }
-- 
cgit v1.2.3-70-g09d2


From dead537dd8a1c9495322c1d6f7c780697f474af0 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 24 Aug 2009 16:03:35 -0400
Subject: inotify: fix locking around inotify watching in the idr

The are races around the idr storage of inotify watches.  It's possible
that a watch could be found from sys_inotify_rm_watch() in the idr, but it
could be removed from the idr before that code does it's removal.  Move the
locking and the refcnt'ing so that these have to happen atomically.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 50 ++++++++++++++++++++++++++++++++--------
 1 file changed, 40 insertions(+), 10 deletions(-)

(limited to 'fs/notify')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index d8f73c25307..ce1f5823e2c 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -364,20 +364,53 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
 	return error;
 }
 
+/*
+ * Remove the mark from the idr (if present) and drop the reference
+ * on the mark because it was in the idr.
+ */
 static void inotify_remove_from_idr(struct fsnotify_group *group,
 				    struct inotify_inode_mark_entry *ientry)
 {
 	struct idr *idr;
+	struct fsnotify_mark_entry *entry;
+	struct inotify_inode_mark_entry *found_ientry;
+	int wd;
 
 	spin_lock(&group->inotify_data.idr_lock);
 	idr = &group->inotify_data.idr;
-	idr_remove(idr, ientry->wd);
-	spin_unlock(&group->inotify_data.idr_lock);
+	wd = ientry->wd;
+
+	if (wd == -1)
+		goto out;
+
+	entry = idr_find(&group->inotify_data.idr, wd);
+	if (unlikely(!entry))
+		goto out;
+
+	found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry);
+	if (unlikely(found_ientry != ientry)) {
+		/* We found an entry in the idr with the right wd, but it's
+		 * not the entry we were told to remove.  eparis seriously
+		 * fucked up somewhere. */
+		WARN_ON(1);
+		ientry->wd = -1;
+		goto out;
+	}
+
+	/* One ref for being in the idr, one ref held by the caller */
+	BUG_ON(atomic_read(&entry->refcnt) < 2);
+
+	idr_remove(idr, wd);
 	ientry->wd = -1;
+
+	/* removed from the idr, drop that ref */
+	fsnotify_put_mark(entry);
+out:
+	spin_unlock(&group->inotify_data.idr_lock);
 }
+
 /*
- * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the
- * internal reference help on the mark because it is in the idr.
+ * Send IN_IGNORED for this wd, remove this wd from the idr.
  */
 void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry,
 				    struct fsnotify_group *group)
@@ -417,9 +450,6 @@ skip_send_ignore:
 	/* remove this entry from the idr */
 	inotify_remove_from_idr(group, ientry);
 
-	/* removed from idr, drop that reference */
-	fsnotify_put_mark(entry);
-
 	atomic_dec(&group->inotify_data.user->inotify_watches);
 }
 
@@ -535,6 +565,9 @@ retry:
 		goto out_err;
 	}
 
+	/* we put the mark on the idr, take a reference */
+	fsnotify_get_mark(&tmp_ientry->fsn_entry);
+
 	/* we are on the idr, now get on the inode */
 	ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode);
 	if (ret) {
@@ -543,9 +576,6 @@ retry:
 		goto out_err;
 	}
 
-	/* we put the mark on the idr, take a reference */
-	fsnotify_get_mark(&tmp_ientry->fsn_entry);
-
 	/* update the idr hint, who cares about races, it's just a hint */
 	group->inotify_data.last_wd = tmp_ientry->wd;
 
-- 
cgit v1.2.3-70-g09d2


From 0db501bd0610ee0c0aca84d927f90bcccd09e2bd Mon Sep 17 00:00:00 2001
From: "Eric W. Biederman" <ebiederm@xmission.com>
Date: Thu, 27 Aug 2009 03:20:04 -0700
Subject: inotify: Ensure we alwasy write the terminating NULL.

Before the rewrite copy_event_to_user always wrote a terqminating '\0'
byte to user space after the filename.  Since the rewrite that
terminating byte was skipped if your filename is exactly a multiple of
event_size.  Ouch!

So add one byte to name_size before we round up and use clear_user to
set userspace to zero like /dev/zero does instead of copying the
strange nul_inotify_event.  I can't quite convince myself len_to_zero
will never exceed 16 and even if it doesn't clear_user should be more
efficient and a more accurate reflection of what the code is trying to
do.

Signed-off-by: Eric W. Biederman <ebiederm@aristanetworks.com>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs/notify')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index ce1f5823e2c..0e781bc88d1 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -47,9 +47,6 @@
 
 static struct vfsmount *inotify_mnt __read_mostly;
 
-/* this just sits here and wastes global memory.  used to just pad userspace messages with zeros */
-static struct inotify_event nul_inotify_event;
-
 /* these are configurable via /proc/sys/fs/inotify/ */
 static int inotify_max_user_instances __read_mostly;
 static int inotify_max_queued_events __read_mostly;
@@ -199,8 +196,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 		inotify_free_event_priv(fsn_priv);
 	}
 
-	/* round up event->name_len so it is a multiple of event_size */
-	name_len = roundup(event->name_len, event_size);
+	/* round up event->name_len so it is a multiple of event_size
+	 * plus an extra byte for the terminating '\0'.
+	 */
+	name_len = roundup(event->name_len + 1, event_size);
 	inotify_event.len = name_len;
 
 	inotify_event.mask = inotify_mask_to_arg(event->mask);
@@ -224,8 +223,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 			return -EFAULT;
 		buf += event->name_len;
 
-		/* fill userspace with 0's from nul_inotify_event */
-		if (copy_to_user(buf, &nul_inotify_event, len_to_zero))
+		/* fill userspace with 0's */
+		if (clear_user(buf, len_to_zero))
 			return -EFAULT;
 		buf += len_to_zero;
 		event_size += name_len;
-- 
cgit v1.2.3-70-g09d2


From b962e7312ae87006aed6f68ceee94bdf8db08338 Mon Sep 17 00:00:00 2001
From: Brian Rogers <brian@xyzw.org>
Date: Fri, 28 Aug 2009 10:00:05 -0400
Subject: inotify: do not send a block of zeros when no pathname is available

When an event has no pathname, there's no need to pad it with a null byte and
therefore generate an inotify_event sized block of zeros. This fixes a
regression introduced by commit 0db501bd0610ee0c0aca84d927f90bcccd09e2bd where
my system wouldn't finish booting because some process was being confused by
this.

Signed-off-by: Brian Rogers <brian@xyzw.org>
Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs/notify')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 0e781bc88d1..b547ae17b46 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -180,7 +180,7 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 	struct fsnotify_event_private_data *fsn_priv;
 	struct inotify_event_private_data *priv;
 	size_t event_size = sizeof(struct inotify_event);
-	size_t name_len;
+	size_t name_len = 0;
 
 	/* we get the inotify watch descriptor from the event private data */
 	spin_lock(&event->lock);
@@ -196,10 +196,12 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group,
 		inotify_free_event_priv(fsn_priv);
 	}
 
-	/* round up event->name_len so it is a multiple of event_size
+	/*
+	 * round up event->name_len so it is a multiple of event_size
 	 * plus an extra byte for the terminating '\0'.
 	 */
-	name_len = roundup(event->name_len + 1, event_size);
+	if (event->name_len)
+		name_len = roundup(event->name_len + 1, event_size);
 	inotify_event.len = name_len;
 
 	inotify_event.mask = inotify_mask_to_arg(event->mask);
-- 
cgit v1.2.3-70-g09d2


From 83cb10f0ef3c96162be92339ccf8c0c9c9f2d13e Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Fri, 28 Aug 2009 11:57:55 -0400
Subject: inotify: fix length reporting and size checking

0db501bd0610ee0c0 introduced a regresion in that it now sends a nul
terminator but the length accounting when checking for space or
reporting to userspace did not take this into account.  This corrects
all of the rounding logic.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'fs/notify')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index b547ae17b46..6111670b257 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -154,7 +154,8 @@ static struct fsnotify_event *get_one_event(struct fsnotify_group *group,
 
 	event = fsnotify_peek_notify_event(group);
 
-	event_size += roundup(event->name_len, event_size);
+	if (event->name_len)
+		event_size += roundup(event->name_len + 1, event_size);
 
 	if (event_size > count)
 		return ERR_PTR(-EINVAL);
@@ -327,8 +328,9 @@ static long inotify_ioctl(struct file *file, unsigned int cmd,
 		list_for_each_entry(holder, &group->notification_list, event_list) {
 			event = holder->event;
 			send_len += sizeof(struct inotify_event);
-			send_len += roundup(event->name_len,
-					     sizeof(struct inotify_event));
+			if (event->name_len)
+				send_len += roundup(event->name_len + 1,
+						sizeof(struct inotify_event));
 		}
 		mutex_unlock(&group->notification_mutex);
 		ret = put_user(send_len, (int __user *) p);
-- 
cgit v1.2.3-70-g09d2


From 750a8870fe4016ef3091fc97e084d58c613c2cc7 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Fri, 28 Aug 2009 12:50:47 -0400
Subject: inotify: update the group mask on mark addition

Seperating the addition and update of marks in inotify resulted in a
regression in that inotify never gets events.  The inotify group mask is
always 0.  This mask should be updated any time a new mark is added.

Signed-off-by: Eric Paris <eparis@redhat.com>
---
 fs/notify/inotify/inotify_user.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs/notify')

diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 6111670b257..dcd2040d330 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -591,6 +591,10 @@ retry:
 	/* match the ref from fsnotify_init_markentry() */
 	fsnotify_put_mark(&tmp_ientry->fsn_entry);
 
+	/* if this mark added a new event update the group mask */
+	if (mask & ~group->mask)
+		fsnotify_recalc_group_mask(group);
+
 out_err:
 	if (ret < 0)
 		kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry);
-- 
cgit v1.2.3-70-g09d2