From 1edf223485c42c99655dcd001db1e46ad5e5d2d7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:06:57 -0800 Subject: mm/page-writeback.c: make determine_dirtyable_memory static again The tracing ring-buffer used this function briefly, but not anymore. Make it local to the writeback code again. Also, move the function so that no forward declaration needs to be reintroduced. Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Reviewed-by: Michal Hocko Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/writeback.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index a378c295851..34a005515fe 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -138,8 +138,6 @@ extern int vm_highmem_is_dirtyable; extern int block_dump; extern int laptop_mode; -extern unsigned long determine_dirtyable_memory(void); - extern int dirty_background_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); -- cgit v1.2.3-70-g09d2 From cc59850ef940e4ee6a765d28b439b9bafe07cf63 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:07:04 -0800 Subject: mm: add free_hot_cold_page_list() helper This patch adds a helper, free_hot_cold_page_list(), to free a list of 0-order pages. It frees pages directly from the list, without a temporary page-vector. It also calls trace_mm_pagevec_free() to simulate pagevec_free() behaviour. bloat-o-meter: add/remove: 1/1 grow/shrink: 1/3 up/down: 267/-295 (-28)
function                     old     new   delta
free_hot_cold_page_list        -     264    +264
get_page_from_freelist     2129    2132      +3
__pagevec_free              243     239      -4
split_free_page             380     373      -7
release_pages               606     510     -96
free_page_list              188       -    -188
Signed-off-by: Konstantin Khlebnikov Cc: Mel Gorman Cc: KOSAKI Motohiro Acked-by: Minchan Kim Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 1 + mm/page_alloc.c | 13 +++++++++++++ mm/swap.c | 14 +++----------- mm/vmscan.c | 20 +------------------- 4 files changed, 18 insertions(+), 30 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 3a76faf6a3e..656295865d5 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -358,6 +358,7 @@ void *alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); extern void __free_pages(struct page *page, unsigned int order); extern void free_pages(unsigned long addr, unsigned int order); extern void free_hot_cold_page(struct page *page, int cold); +extern void free_hot_cold_page_list(struct list_head *list, int cold); #define __free_page(page) __free_pages((page), 0) #define free_page(addr) free_pages((addr), 0) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7990ca154d1..cd0c95c6cc9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1188,6 +1188,19 @@ out: local_irq_restore(flags); } +/* + * Free a list of 0-order pages + */ +void free_hot_cold_page_list(struct list_head *list, int cold) +{ + struct page *page, *next; + + list_for_each_entry_safe(page, next, list, lru) { + trace_mm_pagevec_free(page, cold); + free_hot_cold_page(page, cold); + } +} + /* * split_page takes a non-compound higher-order page, and splits it into * n (1<<order) sub-pages: page[0..n] * Each sub-page must be freed individually. */ diff --git a/mm/swap.c b/mm/swap.c --- a/mm/swap.c +++ b/mm/swap.c @@ ... @@ void release_pages(struct page **pages, int nr, int cold) { int i; - struct pagevec pages_to_free; + LIST_HEAD(pages_to_free); struct zone *zone = NULL; unsigned long uninitialized_var(flags); - pagevec_init(&pages_to_free, cold); for (i = 0; i < nr; i++) { struct page *page = pages[i]; @@ ... @@ del_page_from_lru(zone, page); } - if (!pagevec_add(&pages_to_free, page)) { - if (zone) { - spin_unlock_irqrestore(&zone->lru_lock, flags); - zone = NULL; - } - __pagevec_free(&pages_to_free); - pagevec_reinit(&pages_to_free); - } + list_add(&page->lru, &pages_to_free); } if (zone) spin_unlock_irqrestore(&zone->lru_lock, flags); - 
pagevec_free(&pages_to_free); + free_hot_cold_page_list(&pages_to_free, cold); } EXPORT_SYMBOL(release_pages); diff --git a/mm/vmscan.c b/mm/vmscan.c index 753a2dc300b..3d571df41c7 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -734,24 +734,6 @@ static enum page_references page_check_references(struct page *page, return PAGEREF_RECLAIM; } -static noinline_for_stack void free_page_list(struct list_head *free_pages) -{ - struct pagevec freed_pvec; - struct page *page, *tmp; - - pagevec_init(&freed_pvec, 1); - - list_for_each_entry_safe(page, tmp, free_pages, lru) { - list_del(&page->lru); - if (!pagevec_add(&freed_pvec, page)) { - __pagevec_free(&freed_pvec); - pagevec_reinit(&freed_pvec); - } - } - - pagevec_free(&freed_pvec); -} - /* * shrink_page_list() returns the number of reclaimed pages */ @@ -1015,7 +997,7 @@ keep_lumpy: if (nr_dirty && nr_dirty == nr_congested && scanning_global_lru(sc)) zone_set_flag(zone, ZONE_CONGESTED); - free_page_list(&free_pages); + free_hot_cold_page_list(&free_pages, 1); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); -- cgit v1.2.3-70-g09d2 From da066ad3570b88e7dee82e76a06ee9a7adffcf0d Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 10 Jan 2012 15:07:06 -0800 Subject: mm: remove unused pagevec_free It is not exported and now nobody uses it. Signed-off-by: Konstantin Khlebnikov Cc: Mel Gorman Cc: KOSAKI Motohiro Reviewed-by: Minchan Kim Acked-by: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 7 ------- mm/page_alloc.c | 10 ---------- 2 files changed, 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index bab82f4c571..ed17024d2eb 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -21,7 +21,6 @@ struct pagevec { }; void __pagevec_release(struct pagevec *pvec); -void __pagevec_free(struct pagevec *pvec); void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru); void pagevec_strip(struct pagevec *pvec); unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping, @@ -67,12 +66,6 @@ static inline void pagevec_release(struct pagevec *pvec) __pagevec_release(pvec); } -static inline void pagevec_free(struct pagevec *pvec) -{ - if (pagevec_count(pvec)) - __pagevec_free(pvec); -} - static inline void __pagevec_lru_add_anon(struct pagevec *pvec) { ____pagevec_lru_add(pvec, LRU_INACTIVE_ANON); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index cd0c95c6cc9..6c77efbca5b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2319,16 +2319,6 @@ unsigned long get_zeroed_page(gfp_t gfp_mask) } EXPORT_SYMBOL(get_zeroed_page); -void __pagevec_free(struct pagevec *pvec) -{ - int i = pagevec_count(pvec); - - while (--i >= 0) { - trace_mm_pagevec_free(pvec->pages[i], pvec->cold); - free_hot_cold_page(pvec->pages[i], pvec->cold); - } -} - void __free_pages(struct page *page, unsigned int order) { if (put_page_testzero(page)) { -- cgit v1.2.3-70-g09d2 From f90ac3982a78d36f894824636beeef13361d7c59 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Jan 2012 15:07:15 -0800 Subject: mm: avoid livelock on !__GFP_FS allocations Colin Cross reported: Under the following conditions, __alloc_pages_slowpath can loop forever:
gfp_mask & __GFP_WAIT is true
gfp_mask & __GFP_FS is false
reclaim and compaction make no progress
order <= PAGE_ALLOC_COSTLY_ORDER
These conditions happen very often during suspend and resume, when pm_restrict_gfp_mask() effectively converts all GFP_KERNEL allocations 
into __GFP_WAIT. The oom killer is not run because gfp_mask & __GFP_FS is false, but should_alloc_retry will always return true when order is less than PAGE_ALLOC_COSTLY_ORDER. In his fix, he avoided retrying the allocation if reclaim made no progress and __GFP_FS was not set. The problem is that this would result in GFP_NOIO allocations failing that previously succeeded which would be very unfortunate. The big difference between GFP_NOIO and suspend converting GFP_KERNEL to behave like GFP_NOIO is that normally flushers will be cleaning pages and kswapd reclaims pages allowing GFP_NOIO to succeed after a short delay. The same does not necessarily apply during suspend as the storage device may be suspended. This patch special cases the suspend case to fail the page allocation if reclaim cannot make progress and adds some documentation on how gfp_allowed_mask is currently used. Failing allocations like this may cause suspend to abort but that is better than a livelock. [mgorman@suse.de: Rework fix to be suspend specific] [rientjes@google.com: Move suspended device check to should_alloc_retry] Reported-by: Colin Cross Signed-off-by: Mel Gorman Acked-by: David Rientjes Cc: Minchan Kim Cc: Pekka Enberg Cc: KAMEZAWA Hiroyuki Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 16 ++++++++++++++++ mm/page_alloc.c | 30 ++++++++++++++++++++++-------- mm/swapfile.c | 6 +++--- 3 files changed, 41 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 656295865d5..91812df1351 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -368,9 +368,25 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp); void drain_all_pages(void); void drain_local_pages(void *dummy); +/* + * gfp_allowed_mask is set to GFP_BOOT_MASK during early boot to restrict what + * GFP flags are used before interrupts are enabled. Once interrupts are + * enabled, it is set to __GFP_BITS_MASK while the system is running. During + * hibernation, it is used by PM to avoid I/O during memory allocation while + * devices are suspended. + */ extern gfp_t gfp_allowed_mask; extern void pm_restrict_gfp_mask(void); extern void pm_restore_gfp_mask(void); +#ifdef CONFIG_PM_SLEEP +extern bool pm_suspended_storage(void); +#else +static inline bool pm_suspended_storage(void) +{ + return false; +} +#endif /* CONFIG_PM_SLEEP */ + #endif /* __LINUX_GFP_H */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 671e6c94fed..3cba4b67203 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -127,6 +127,13 @@ void pm_restrict_gfp_mask(void) saved_gfp_mask = gfp_allowed_mask; gfp_allowed_mask &= ~GFP_IOFS; } + +bool pm_suspended_storage(void) +{ + if ((gfp_allowed_mask & GFP_IOFS) == GFP_IOFS) + return false; + return true; +} #endif /* CONFIG_PM_SLEEP */ #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE @@ -1786,12 +1793,25 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) static inline int should_alloc_retry(gfp_t gfp_mask, unsigned int order, + unsigned long did_some_progress, unsigned long pages_reclaimed) { /* Do not loop if specifically requested */ if (gfp_mask & __GFP_NORETRY) return 0; + /* Always retry if specifically requested */ + if (gfp_mask & __GFP_NOFAIL) + return 1; + + /* + * Suspend converts GFP_KERNEL to __GFP_WAIT which can prevent reclaim + * making forward progress without invoking OOM. Suspend also disables + * storage devices so kswapd will not help. Bail if we are suspending. 
+ */ + if (!did_some_progress && pm_suspended_storage()) + return 0; + /* * In this implementation, order <= PAGE_ALLOC_COSTLY_ORDER * means __GFP_NOFAIL, but that may not be true in other @@ -1810,13 +1830,6 @@ should_alloc_retry(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_REPEAT && pages_reclaimed < (1 << order)) return 1; - /* - * Don't let big-order allocations loop unless the caller - * explicitly requests that. - */ - if (gfp_mask & __GFP_NOFAIL) - return 1; - return 0; } @@ -2209,7 +2222,8 @@ rebalance: /* Check if we should retry the allocation */ pages_reclaimed += did_some_progress; - if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { + if (should_alloc_retry(gfp_mask, order, did_some_progress, + pages_reclaimed)) { /* Wait for some write requests to complete then retry */ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; diff --git a/mm/swapfile.c b/mm/swapfile.c index b1cd1206072..9520592d423 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -667,10 +667,10 @@ int try_to_free_swap(struct page *page) * original page might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * - * Hibernation clears bits from gfp_allowed_mask to prevent - * memory reclaim from writing to disk, so check that here. + * Hibernation suspends storage while it is writing the image + * to disk so check that here. */ - if (!(gfp_allowed_mask & __GFP_IO)) + if (pm_suspended_storage()) return 0; delete_from_swap_cache(page); -- cgit v1.2.3-70-g09d2 From 1399ff86f2a2bbacbbe68fa00c5f8c752b344723 Mon Sep 17 00:00:00 2001 From: David Daney Date: Tue, 10 Jan 2012 15:07:25 -0800 Subject: kernel.h: add BUILD_BUG() macro We can place this in definitions that we expect the compiler to remove by dead code elimination. If this assertion fails, we get a nice error message at build time. The GCC function attribute error("message") was added in version 4.3, so we define a new macro __linktime_error(message) to expand to this for GCC-4.3 and later. This will give us an error diagnostic from the compiler on the line that fails. For other compilers __linktime_error(message) expands to nothing, and we have to be content with a link time error, but at least we will still get a build error. BUILD_BUG() expands to the undefined function __build_bug_failed() and will fail at link time if the compiler ever emits code for it. On GCC-4.3 and later, attribute((error())) is used so that the failure will be noted at compile time instead. 
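As an illustration (not part of the patch; the helper and its constraints below are hypothetical), BUILD_BUG() is intended for branches that constant folding must prove dead, so the call to the error-attributed function is never emitted:

	#include <linux/kernel.h>

	/* Hypothetical helper: only mask widths of 8, 16 or 32 are supported. */
	static inline u32 supported_mask(int bits)
	{
		if (bits != 8 && bits != 16 && bits != 32)
			BUILD_BUG();	/* surviving call == build failure */
		return bits == 32 ? 0xffffffffu : (1u << bits) - 1;
	}

When every caller passes a constant 8, 16 or 32, the offending branch is eliminated and the build succeeds; any other value leaves the __build_bug_failed() call in the object file and the build breaks, at compile time on GCC 4.3 and later, at link time elsewhere.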
Signed-off-by: David Daney Acked-by: David Rientjes Cc: DM Cc: Ralf Baechle Acked-by: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compiler-gcc4.h | 1 + include/linux/compiler.h | 4 +++- include/linux/kernel.h | 16 ++++++++++++++++ 3 files changed, 20 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/compiler-gcc4.h b/include/linux/compiler-gcc4.h index dfadc96e9d6..2f4079175af 100644 --- a/include/linux/compiler-gcc4.h +++ b/include/linux/compiler-gcc4.h @@ -29,6 +29,7 @@ the kernel context */ #define __cold __attribute__((__cold__)) +#define __linktime_error(message) __attribute__((__error__(message))) #if __GNUC_MINOR__ >= 5 /* diff --git a/include/linux/compiler.h b/include/linux/compiler.h index 320d6c94ff8..4a243546d14 100644 --- a/include/linux/compiler.h +++ b/include/linux/compiler.h @@ -293,7 +293,9 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect); #ifndef __compiletime_error # define __compiletime_error(message) #endif - +#ifndef __linktime_error +# define __linktime_error(message) +#endif /* * Prevent the compiler from merging or refetching accesses. The compiler * is also forbidden from reordering successive instances of ACCESS_ONCE(), diff --git a/include/linux/kernel.h b/include/linux/kernel.h index e8b1597b5cf..f48e8a52854 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -665,6 +665,7 @@ static inline void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { } #define BUILD_BUG_ON_ZERO(e) (0) #define BUILD_BUG_ON_NULL(e) ((void*)0) #define BUILD_BUG_ON(condition) +#define BUILD_BUG() (0) #else /* __CHECKER__ */ /* Force a compilation error if a constant expression is not a power of 2 */ @@ -703,6 +704,21 @@ extern int __build_bug_on_failed; if (condition) __build_bug_on_failed = 1; \ } while(0) #endif + +/** + * BUILD_BUG - break compile if used. + * + * If you have some code that you expect the compiler to eliminate at + * build time, you should use BUILD_BUG to detect if it is + * unexpectedly used. + */ +#define BUILD_BUG() \ + do { \ + extern void __build_bug_failed(void) \ + __linktime_error("BUILD_BUG failed"); \ + __build_bug_failed(); \ + } while (0) + #endif /* __CHECKER__ */ /* Trap pasters of __FUNCTION__ at compile-time */ -- cgit v1.2.3-70-g09d2 From c0a32fc5a2e470d0b02597b23ad79a317735253e Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Tue, 10 Jan 2012 15:07:28 -0800 Subject: mm: more intensive memory corruption debugging With CONFIG_DEBUG_PAGEALLOC configured, the CPU will generate an exception on access (read,write) to an unallocated page, which permits us to catch code which corrupts memory. However the kernel is trying to maximise memory usage, hence there are usually few free pages in the system and buggy code usually corrupts some crucial data. This patch changes the buddy allocator to keep more free/protected pages and to interlace free/protected and allocated pages to increase the probability of catching corruption. When the kernel is compiled with CONFIG_DEBUG_PAGEALLOC, debug_guardpage_minorder defines the minimum order used by the page allocator to grant a request. The requested size will be returned with the remaining pages used as guard pages. The default value of debug_guardpage_minorder is zero: no change from current behaviour. [akpm@linux-foundation.org: tweak documentation, s/flg/flag/] Signed-off-by: Stanislaw Gruszka Cc: Mel Gorman Cc: Andrea Arcangeli Cc: "Rafael J. 
Wysocki" Cc: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/kernel-parameters.txt | 19 ++++++++++ include/linux/mm.h | 17 +++++++++ include/linux/page-debug-flags.h | 4 +- mm/Kconfig.debug | 5 +++ mm/page_alloc.c | 75 ++++++++++++++++++++++++++++++++++--- 5 files changed, 113 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 7b2e5c5eefa..7ed7030e772 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -623,6 +623,25 @@ bytes respectively. Such letter suffixes can also be entirely omitted. no_debug_objects [KNL] Disable object debugging + debug_guardpage_minorder= + [KNL] When CONFIG_DEBUG_PAGEALLOC is set, this + parameter allows control of the order of pages that will + be intentionally kept free (and hence protected) by the + buddy allocator. Bigger value increase the probability + of catching random memory corruption, but reduce the + amount of memory for normal system use. The maximum + possible value is MAX_ORDER/2. Setting this parameter + to 1 or 2 should be enough to identify most random + memory corruption problems caused by bugs in kernel or + driver code when a CPU writes to (or reads from) a + random memory location. Note that there exists a class + of memory corruptions problems caused by buggy H/W or + F/W or by drivers badly programing DMA (basically when + memory is written at bus level and the CPU MMU is + bypassed) which are not detectable by + CONFIG_DEBUG_PAGEALLOC, hence this option will not help + tracking down these problems. + debugpat [X86] Enable PAT debugging decnet.addr= [HW,NET] diff --git a/include/linux/mm.h b/include/linux/mm.h index 5d9b4c9813b..5568553a41f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1618,5 +1618,22 @@ extern void copy_user_huge_page(struct page *dst, struct page *src, unsigned int pages_per_huge_page); #endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ +#ifdef CONFIG_DEBUG_PAGEALLOC +extern unsigned int _debug_guardpage_minorder; + +static inline unsigned int debug_guardpage_minorder(void) +{ + return _debug_guardpage_minorder; +} + +static inline bool page_is_guard(struct page *page) +{ + return test_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} +#else +static inline unsigned int debug_guardpage_minorder(void) { return 0; } +static inline bool page_is_guard(struct page *page) { return false; } +#endif /* CONFIG_DEBUG_PAGEALLOC */ + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/page-debug-flags.h b/include/linux/page-debug-flags.h index b0638fd91e9..22691f61404 100644 --- a/include/linux/page-debug-flags.h +++ b/include/linux/page-debug-flags.h @@ -13,6 +13,7 @@ enum page_debug_flags { PAGE_DEBUG_FLAG_POISON, /* Page is poisoned */ + PAGE_DEBUG_FLAG_GUARD, }; /* @@ -21,7 +22,8 @@ enum page_debug_flags { */ #ifdef CONFIG_WANT_PAGE_DEBUG_FLAGS -#if !defined(CONFIG_PAGE_POISONING) \ +#if !defined(CONFIG_PAGE_POISONING) && \ + !defined(CONFIG_PAGE_GUARD) \ /* && !defined(CONFIG_PAGE_DEBUG_SOMETHING_ELSE) && ... */ #error WANT_PAGE_DEBUG_FLAGS is turned on with no debug features! 
#endif diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 8b1a477162d..4b2443254de 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -4,6 +4,7 @@ config DEBUG_PAGEALLOC depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC depends on !KMEMCHECK select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC + select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC ---help--- Unmap pages from the kernel linear mapping after free_pages(). This results in a large slowdown, but helps to find certain types @@ -22,3 +23,7 @@ config WANT_PAGE_DEBUG_FLAGS config PAGE_POISONING bool select WANT_PAGE_DEBUG_FLAGS + +config PAGE_GUARD + bool + select WANT_PAGE_DEBUG_FLAGS diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3cba4b67203..93baebcc06f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -57,6 +57,7 @@ #include #include #include +#include #include #include @@ -388,6 +389,37 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) clear_highpage(page + i); } +#ifdef CONFIG_DEBUG_PAGEALLOC +unsigned int _debug_guardpage_minorder; + +static int __init debug_guardpage_minorder_setup(char *buf) +{ + unsigned long res; + + if (kstrtoul(buf, 10, &res) < 0 || res > MAX_ORDER / 2) { + printk(KERN_ERR "Bad debug_guardpage_minorder value\n"); + return 0; + } + _debug_guardpage_minorder = res; + printk(KERN_INFO "Setting debug_guardpage_minorder to %lu\n", res); + return 0; +} +__setup("debug_guardpage_minorder=", debug_guardpage_minorder_setup); + +static inline void set_page_guard_flag(struct page *page) +{ + __set_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} + +static inline void clear_page_guard_flag(struct page *page) +{ + __clear_bit(PAGE_DEBUG_FLAG_GUARD, &page->debug_flags); +} +#else +static inline void set_page_guard_flag(struct page *page) { } +static inline void clear_page_guard_flag(struct page *page) { } +#endif + static inline void set_page_order(struct page *page, int order) { set_page_private(page, order); @@ -445,6 +477,11 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, if (page_zone_id(page) != page_zone_id(buddy)) return 0; + if (page_is_guard(buddy) && page_order(buddy) == order) { + VM_BUG_ON(page_count(buddy) != 0); + return 1; + } + if (PageBuddy(buddy) && page_order(buddy) == order) { VM_BUG_ON(page_count(buddy) != 0); return 1; @@ -501,11 +538,19 @@ static inline void __free_one_page(struct page *page, buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) break; - - /* Our buddy is free, merge with it and move up one order. */ - list_del(&buddy->lru); - zone->free_area[order].nr_free--; - rmv_page_order(buddy); + /* + * Our buddy is free or it is CONFIG_DEBUG_PAGEALLOC guard page, + * merge with it and move up one order. + */ + if (page_is_guard(buddy)) { + clear_page_guard_flag(buddy); + set_page_private(page, 0); + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); + } else { + list_del(&buddy->lru); + zone->free_area[order].nr_free--; + rmv_page_order(buddy); + } combined_idx = buddy_idx & page_idx; page = page + (combined_idx - page_idx); page_idx = combined_idx; @@ -731,6 +776,23 @@ static inline void expand(struct zone *zone, struct page *page, high--; size >>= 1; VM_BUG_ON(bad_range(zone, &page[size])); + +#ifdef CONFIG_DEBUG_PAGEALLOC + if (high < debug_guardpage_minorder()) { + /* + * Mark as guard pages (or page), that will allow to + * merge back to allocator when buddy will be freed. 
+ * Corresponding page table entries will not be touched, + * pages will stay not present in virtual address space + */ + INIT_LIST_HEAD(&page[size].lru); + set_page_guard_flag(&page[size]); + set_page_private(&page[size], high); + /* Guard pages are not available for any usage */ + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << high)); + continue; + } +#endif list_add(&page[size].lru, &area->free_list[migratetype]); area->nr_free++; set_page_order(&page[size], high); @@ -1754,7 +1816,8 @@ void warn_alloc_failed(gfp_t gfp_mask, int order, const char *fmt, ...) { unsigned int filter = SHOW_MEM_FILTER_NODES; - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) + if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs) || + debug_guardpage_minorder() > 0) return; /* -- cgit v1.2.3-70-g09d2 From f6d7e0cb3ecc248e98fa11d83253f6174bd7e085 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 10 Jan 2012 15:07:38 -0800 Subject: mm, debug: test for online nid when allocating on single node Calling alloc_pages_exact_node() means the allocation only passes the zonelist of a single node into the page allocator. If that node isn't online, its zonelist may never have been initialized, causing a strange oops that may not immediately be clear. I recently debugged an issue where node 0 wasn't online and an allocator was passing 0 to alloc_pages_exact_node() and it resulted in a NULL pointer on zonelist->_zonerefs. If CONFIG_DEBUG_VM is enabled, though, it would be nice to catch this a bit earlier. Signed-off-by: David Rientjes Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 91812df1351..66f172fdf5f 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -313,7 +313,7 @@ static inline struct page *alloc_pages_node(int nid, gfp_t gfp_mask, static inline struct page *alloc_pages_exact_node(int nid, gfp_t gfp_mask, unsigned int order) { - VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES); + VM_BUG_ON(nid < 0 || nid >= MAX_NUMNODES || !node_online(nid)); return __alloc_pages(gfp_mask, order, node_zonelist(nid, gfp_mask)); } -- cgit v1.2.3-70-g09d2 From ab8fabd46f811d5153d8a0cd2fac9a0d41fb593d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:07:42 -0800 Subject: mm: exclude reserved pages from dirtyable memory Per-zone dirty limits try to distribute page cache pages allocated for writing across zones in proportion to the individual zone sizes, to reduce the likelihood of reclaim having to write back individual pages from the LRU lists in order to make progress. This patch: The amount of dirtyable pages should not include the full number of free pages: there is a number of reserved pages that the page allocator and kswapd always try to keep free. The closer (reclaimable pages - dirty pages) is to the number of reserved pages, the more likely it becomes for reclaim to run into dirty pages:

  +----------+ ---
  |   anon   |  |
  +----------+  |
  |          |  |
  |          |  -- dirty limit new    -- flusher new
  |   file   |  |                     |
  |          |  |                     |
  |          |  -- dirty limit old    -- flusher old
  |          |                        |
  +----------+                       --- reclaim
  | reserved |
  +----------+
  |  kernel  |
  +----------+

This patch introduces a per-zone dirty reserve that takes both the lowmem reserve as well as the high watermark of the zone into account, and a global sum of those per-zone values that is subtracted from the global amount of dirtyable pages. 
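To make the arithmetic concrete, here is a minimal sketch of the global calculation once the reserve exists (it mirrors the determine_dirtyable_memory() hunk below; the trailing "+ 1" comes from the surrounding function body, which the hunk does not show, so treat it as an assumption):

	/* Sketch: pages the VM allows to be dirtied, after this patch. */
	static unsigned long determine_dirtyable_memory(void)
	{
		unsigned long x;

		x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() -
			dirty_balance_reserve;	/* new: exclude per-zone reserves */

		if (!vm_highmem_is_dirtyable)
			x -= highmem_dirtyable_memory(x);

		return x + 1;	/* never report zero dirtyable pages */
	}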
The lowmem reserve is unavailable to page cache allocations and kswapd tries to keep the high watermark free. We don't want to end up in a situation where reclaim has to clean pages in order to balance zones. Not treating reserved pages as dirtyable on a global level is only a conceptual fix. In reality, dirty pages are not distributed equally across zones and reclaim runs into dirty pages on a regular basis. But it is important to get this right before tackling the problem on a per-zone level, where the distance between reclaim and the dirty pages is mostly much smaller in absolute numbers. [akpm@linux-foundation.org: fix highmem build] Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: Michal Hocko Reviewed-by: Minchan Kim Acked-by: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Christoph Hellwig Cc: Wu Fengguang Cc: Dave Chinner Cc: Jan Kara Cc: Shaohua Li Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 6 ++++++ include/linux/swap.h | 1 + mm/page-writeback.c | 5 +++-- mm/page_alloc.c | 19 +++++++++++++++++++ 4 files changed, 29 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3ac040f1936..ca6ca92418a 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -317,6 +317,12 @@ struct zone { */ unsigned long lowmem_reserve[MAX_NR_ZONES]; + /* + * This is a per-zone reserve of pages that should not be + * considered dirtyable memory. + */ + unsigned long dirty_balance_reserve; + #ifdef CONFIG_NUMA int node; /* diff --git a/include/linux/swap.h b/include/linux/swap.h index 1e22e126d2a..06061a7f8e6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -207,6 +207,7 @@ struct swap_list_t { /* linux/mm/page_alloc.c */ extern unsigned long totalram_pages; extern unsigned long totalreserve_pages; +extern unsigned long dirty_balance_reserve; extern unsigned int nr_free_buffer_pages(void); extern unsigned int nr_free_pagecache_pages(void); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c081bf62202..9ab6de82d8e 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -157,7 +157,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; x += zone_page_state(z, NR_FREE_PAGES) + - zone_reclaimable_pages(z); + zone_reclaimable_pages(z) - z->dirty_balance_reserve; } /* * Make sure that the number of highmem pages is never larger @@ -181,7 +181,8 @@ static unsigned long determine_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); + x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() - + dirty_balance_reserve; if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 93baebcc06f..2cb9eb71e28 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -97,6 +97,14 @@ EXPORT_SYMBOL(node_states); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; +/* + * When calculating the number of globally allowed dirty pages, there + * is a certain number of per-zone reserves that should not be + * considered dirtyable memory. This is the sum of those reserves + * over all existing zones that contribute dirtyable memory. 
+ */ +unsigned long dirty_balance_reserve __read_mostly; + int percpu_pagelist_fraction; gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; @@ -4822,8 +4830,19 @@ static void calculate_totalreserve_pages(void) if (max > zone->present_pages) max = zone->present_pages; reserve_pages += max; + /* + * Lowmem reserves are not available to + * GFP_HIGHUSER page cache allocations and + * kswapd tries to balance zones to their high + * watermark. As a result, neither should be + * regarded as dirtyable memory, to prevent a + * situation where reclaim has to clean pages + * in order to balance the zones. + */ + zone->dirty_balance_reserve = max; } } + dirty_balance_reserve = reserve_pages; totalreserve_pages = reserve_pages; } -- cgit v1.2.3-70-g09d2 From a756cf5908530e8b40bdf569eb48b40139e8d7fd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:07:49 -0800 Subject: mm: try to distribute dirty pages fairly across zones The maximum number of dirty pages that exist in the system at any time is determined by a number of pages considered dirtyable and a user-configured percentage of those, or an absolute number in bytes. This number of dirtyable pages is the sum of memory provided by all the zones in the system minus their lowmem reserves and high watermarks, so that the system can retain a healthy number of free pages without having to reclaim dirty pages. But there is a flaw in that we have a zoned page allocator which does not care about the global state but rather the state of individual memory zones. And right now there is nothing that prevents one zone from filling up with dirty pages while other zones are spared, which frequently leads to situations where kswapd, in order to restore the watermark of free pages, does indeed have to write pages from that zone's LRU list. This can interfere so badly with IO from the flusher threads that major filesystems (btrfs, xfs, ext4) mostly ignore write requests from reclaim already, taking away the VM's only possibility to keep such a zone balanced, aside from hoping the flushers will soon clean pages from that zone. Enter per-zone dirty limits. They are to a zone's dirtyable memory what the global limit is to the global amount of dirtyable memory, and try to make sure that no single zone receives more than its fair share of the globally allowed dirty pages in the first place. As the number of pages considered dirtyable excludes the zones' lowmem reserves and high watermarks, the maximum number of dirty pages in a zone is such that the zone can always be balanced without requiring page cleaning. As this is a placement decision in the page allocator and pages are dirtied only after the allocation, this patch allows allocators to pass __GFP_WRITE when they know in advance that the page will be written to and become dirty soon. The page allocator will then attempt to allocate from the first zone of the zonelist - which on NUMA is determined by the task's NUMA memory policy - that has not exceeded its dirty limit. At first glance, it would appear that the diversion to lower zones can increase pressure on them, but this is not the case. With a full high zone, allocations will be diverted to lower zones eventually, so it is more of a shift in timing of the lower zone allocations. 
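As a caller-side sketch (an assumption for illustration: this patch defines the flag, but the hunks here do not convert any callers), an allocation for a page-cache page that is about to be dirtied would pass the new flag so zone selection can honour the per-zone dirty limit:

	/* Allocate a page-cache page that will be written immediately.
	 * __GFP_WRITE lets the allocator skip zones that are already at
	 * their dirty limit (see the zone_dirty_ok() check below). */
	struct page *page;

	page = __page_cache_alloc(mapping_gfp_mask(mapping) | __GFP_WRITE);
	if (!page)
		return -ENOMEM;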
Workloads that previously could fit their dirty pages completely in the higher zone may be forced to allocate from lower zones, but the amount of pages that "spill over" are limited themselves by the lower zones' dirty constraints, and thus unlikely to become a problem. For now, the problem of unfair dirty page distribution remains for NUMA configurations where the zones allowed for allocation are in sum not big enough to trigger the global dirty limits, wake up the flusher threads and remedy the situation. Because of this, an allocation that could not succeed on any of the considered zones is allowed to ignore the dirty limits before going into direct reclaim or even failing the allocation, until a future patch changes the global dirty throttling and flusher thread activation so that they take individual zone states into account. Test results:

15M DMA + 3246M DMA32 + 504 Normal = 3765M memory
40% dirty ratio
16G USB thumb drive
10 runs of dd if=/dev/zero of=disk/zeroes bs=32k count=$((10 << 15))

               seconds            nr_vmscan_write
                   (stddev)            min|     median|        max
xfs
vanilla:    549.747(  3.492)       0.000|      0.000|      0.000
patched:    550.996(  3.802)       0.000|      0.000|      0.000

fuse-ntfs
vanilla:   1183.094( 53.178)   54349.000|  59341.000|  65163.000
patched:    558.049( 17.914)       0.000|      0.000|     43.000

btrfs
vanilla:    573.679( 14.015)  156657.000| 460178.000| 606926.000
patched:    563.365( 11.368)       0.000|      0.000|   1362.000

ext4
vanilla:    561.197( 15.782)       0.000|2725438.000|4143837.000
patched:    568.806( 17.496)       0.000|      0.000|      0.000

Signed-off-by: Johannes Weiner Reviewed-by: Minchan Kim Acked-by: Mel Gorman Reviewed-by: Michal Hocko Tested-by: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Shaohua Li Cc: Rik van Riel Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 4 ++- include/linux/writeback.h | 1 + mm/page-writeback.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++ mm/page_alloc.c | 29 +++++++++++++++++ 4 files changed, 115 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 66f172fdf5f..581e74b7df9 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -36,6 +36,7 @@ struct vm_area_struct; #endif #define ___GFP_NO_KSWAPD 0x400000u #define ___GFP_OTHER_NODE 0x800000u +#define ___GFP_WRITE 0x1000000u /* * GFP bitmasks.. @@ -85,6 +86,7 @@ struct vm_area_struct; #define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */ +#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) /* Allocator intends to dirty page */ /* * This may seem redundant, but it's a way of annotating false positives vs. 
@@ -92,7 +94,7 @@ struct vm_area_struct; */ #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 24 /* Room for N __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 25 /* Room for N __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 34a005515fe..6dff4730497 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -124,6 +124,7 @@ void laptop_mode_timer_fn(unsigned long data); static inline void laptop_sync_completion(void) { } #endif void throttle_vm_writeout(gfp_t gfp_mask); +bool zone_dirty_ok(struct zone *zone); extern unsigned long global_dirty_limit; diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 433fa990fe8..5cdd4f2b0c9 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -147,6 +147,24 @@ static struct prop_descriptor vm_completions; * clamping level. */ +/* + * In a memory zone, there is a certain amount of pages we consider + * available for the page cache, which is essentially the number of + * free and reclaimable pages, minus some zone reserves to protect + * lowmem and the ability to uphold the zone's watermarks without + * requiring writeback. + * + * This number of dirtyable pages is the base value of which the + * user-configurable dirty ratio is the effective number of pages that + * are allowed to be actually dirtied. Per individual zone, or + * globally by using the sum of dirtyable pages over all zones. + * + * Because the user is allowed to specify the dirty limit globally as + * absolute number of bytes, calculating the per-zone dirty limit can + * require translating the configured limit into a percentage of + * global dirtyable memory first. + */ + static unsigned long highmem_dirtyable_memory(unsigned long total) { #ifdef CONFIG_HIGHMEM @@ -232,6 +250,70 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) trace_global_dirty_state(background, dirty); } +/** + * zone_dirtyable_memory - number of dirtyable pages in a zone + * @zone: the zone + * + * Returns the zone's number of pages potentially available for dirty + * page cache. This is the base value for the per-zone dirty limits. + */ +static unsigned long zone_dirtyable_memory(struct zone *zone) +{ + /* + * The effective global number of dirtyable pages may exclude + * highmem as a big-picture measure to keep the ratio between + * dirty memory and lowmem reasonable. + * + * But this function is purely about the individual zone and a + * highmem zone can hold its share of dirty pages, so we don't + * care about vm_highmem_is_dirtyable here. + */ + return zone_page_state(zone, NR_FREE_PAGES) + + zone_reclaimable_pages(zone) - + zone->dirty_balance_reserve; +} + +/** + * zone_dirty_limit - maximum number of dirty pages allowed in a zone + * @zone: the zone + * + * Returns the maximum number of dirty pages allowed in a zone, based + * on the zone's dirtyable memory. 
+ */ +static unsigned long zone_dirty_limit(struct zone *zone) +{ + unsigned long zone_memory = zone_dirtyable_memory(zone); + struct task_struct *tsk = current; + unsigned long dirty; + + if (vm_dirty_bytes) + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * + zone_memory / global_dirtyable_memory(); + else + dirty = vm_dirty_ratio * zone_memory / 100; + + if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) + dirty += dirty / 4; + + return dirty; +} + +/** + * zone_dirty_ok - tells whether a zone is within its dirty limits + * @zone: the zone to check + * + * Returns %true when the dirty pages in @zone are within the zone's + * dirty limit, %false if the limit is exceeded. + */ +bool zone_dirty_ok(struct zone *zone) +{ + unsigned long limit = zone_dirty_limit(zone); + + return zone_page_state(zone, NR_FILE_DIRTY) + + zone_page_state(zone, NR_UNSTABLE_NFS) + + zone_page_state(zone, NR_WRITEBACK) <= limit; +} + /* * couple the period to the dirty_ratio: * diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2cb9eb71e28..4f95bcf0f2b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1735,6 +1735,35 @@ zonelist_scan: if ((alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) continue; + /* + * When allocating a page cache page for writing, we + * want to get it from a zone that is within its dirty + * limit, such that no single zone holds more than its + * proportional share of globally allowed dirty pages. + * The dirty limits take into account the zone's + * lowmem reserves and high watermark so that kswapd + * should be able to balance it without having to + * write pages from its LRU list. + * + * This may look like it could increase pressure on + * lower zones by failing allocations in higher zones + * before they are full. But the pages that do spill + * over are limited as the lower zones are protected + * by this very same mechanism. It should not become + * a practical burden to them. + * + * XXX: For now, allow allocations to potentially + * exceed the per-zone dirty limit in the slowpath + * (ALLOC_WMARK_LOW unset) before going into reclaim, + * which is important when on a NUMA setup the allowed + * zones are together not big enough to reach the + * global limit. The proper fix for these situations + * will require awareness of zones in the + * dirty-throttling and the flusher threads. + */ + if ((alloc_flags & ALLOC_WMARK_LOW) && + (gfp_mask & __GFP_WRITE) && !zone_dirty_ok(zone)) + goto this_zone_full; BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { -- cgit v1.2.3-70-g09d2 From 948f017b093a9baac23855fcd920d3a970b71bb6 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 10 Jan 2012 15:08:05 -0800 Subject: mremap: enforce rmap src/dst vma ordering in case of vma_merge() succeeding in copy_vma() migrate was doing an rmap_walk with speculative lock-less access on pagetables. That could lead it to not serializing properly against mremap PT locks. But a second problem remains in the order of vmas in the same_anon_vma list used by the rmap_walk. If vma_merge succeeds in copy_vma, the src vma could be placed after the dst vma in the same_anon_vma list. That could still lead to migrate missing some pte. This patch adds an anon_vma_moveto_tail() function to force the dst vma at the end of the list before mremap starts to solve the problem. 
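In sketch form (paraphrasing the mm/mremap.c hunk further down, not introducing new mechanics), the ordering rule is applied before page tables are copied back in the error path:

	/* move_vma(): if the forward copy fell short, re-queue the old vma
	 * after the new one on the same_anon_vma list before copying page
	 * tables back, so a concurrent rmap_walk cannot miss any ptes. */
	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len);
	if (moved_len < old_len) {
		anon_vma_moveto_tail(vma);
		/* ... then move entries back from the new area to the old */
	}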
If the mremap is very large and there are a lots of parents or childs sharing the anon_vma root lock, this should still scale better than taking the anon_vma root lock around every pte copy practically for the whole duration of mremap. Update: Hugh noticed special care is needed in the error path where move_page_tables goes in the reverse direction, a second anon_vma_moveto_tail() call is needed in the error path. This program exercises the anon_vma_moveto_tail: === int main() { static struct timeval oldstamp, newstamp; long diffsec; char *p, *p2, *p3, *p4; if (posix_memalign((void **)&p, 2*1024*1024, SIZE)) perror("memalign"), exit(1); if (posix_memalign((void **)&p2, 2*1024*1024, SIZE)) perror("memalign"), exit(1); if (posix_memalign((void **)&p3, 2*1024*1024, SIZE)) perror("memalign"), exit(1); memset(p, 0xff, SIZE); printf("%p\n", p); memset(p2, 0xff, SIZE); memset(p3, 0x77, 4096); if (memcmp(p, p2, SIZE)) printf("error\n"); p4 = mremap(p+SIZE/2, SIZE/2, SIZE/2, MREMAP_FIXED|MREMAP_MAYMOVE, p3); if (p4 != p3) perror("mremap"), exit(1); p4 = mremap(p4, SIZE/2, SIZE/2, MREMAP_FIXED|MREMAP_MAYMOVE, p+SIZE/2); if (p4 != p+SIZE/2) perror("mremap"), exit(1); if (memcmp(p, p2, SIZE)) printf("error\n"); printf("ok\n"); return 0; } === $ perf probe -a anon_vma_moveto_tail Add new event: probe:anon_vma_moveto_tail (on anon_vma_moveto_tail) You can now use it on all perf tools, such as: perf record -e probe:anon_vma_moveto_tail -aR sleep 1 $ perf record -e probe:anon_vma_moveto_tail -aR ./anon_vma_moveto_tail 0x7f2ca2800000 ok [ perf record: Woken up 1 times to write data ] [ perf record: Captured and wrote 0.043 MB perf.data (~1860 samples) ] $ perf report --stdio 100.00% anon_vma_moveto [kernel.kallsyms] [k] anon_vma_moveto_tail Signed-off-by: Andrea Arcangeli Reported-by: Nai Xia Acked-by: Mel Gorman Cc: Hugh Dickins Cc: Pawel Sikora Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/rmap.h | 1 + mm/mmap.c | 24 +++++++++++++++++++++--- mm/mremap.c | 9 +++++++++ mm/rmap.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 76 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 2148b122779..1afb9954bbf 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -120,6 +120,7 @@ void anon_vma_init(void); /* create anon_vma_cachep */ int anon_vma_prepare(struct vm_area_struct *); void unlink_anon_vmas(struct vm_area_struct *); int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *); +void anon_vma_moveto_tail(struct vm_area_struct *); int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *); void __anon_vma_link(struct vm_area_struct *); diff --git a/mm/mmap.c b/mm/mmap.c index eae90af60ea..adea3b8880e 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2322,13 +2322,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, struct vm_area_struct *new_vma, *prev; struct rb_node **rb_link, *rb_parent; struct mempolicy *pol; + bool faulted_in_anon_vma = true; /* * If anonymous vma has not yet been faulted, update new pgoff * to match new location, to increase its chance of merging. 
*/ - if (!vma->vm_file && !vma->anon_vma) + if (unlikely(!vma->vm_file && !vma->anon_vma)) { pgoff = addr >> PAGE_SHIFT; + faulted_in_anon_vma = false; + } find_vma_prepare(mm, addr, &prev, &rb_link, &rb_parent); new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, @@ -2337,9 +2340,24 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, /* * Source vma may have been merged into new_vma */ - if (vma_start >= new_vma->vm_start && - vma_start < new_vma->vm_end) + if (unlikely(vma_start >= new_vma->vm_start && + vma_start < new_vma->vm_end)) { + /* + * The only way we can get a vma_merge with + * self during an mremap is if the vma hasn't + * been faulted in yet and we were allowed to + * reset the dst vma->vm_pgoff to the + * destination address of the mremap to allow + * the merge to happen. mremap must change the + * vm_pgoff linearity between src and dst vmas + * (in turn preventing a vma_merge) to be + * safe. It is only safe to keep the vm_pgoff + * linear if there are no pages mapped yet. + */ + VM_BUG_ON(faulted_in_anon_vma); *vmap = new_vma; + } else + anon_vma_moveto_tail(new_vma); } else { new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (new_vma) { diff --git a/mm/mremap.c b/mm/mremap.c index d6959cb4df5..87bb8393e7d 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -220,6 +220,15 @@ static unsigned long move_vma(struct vm_area_struct *vma, moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len); if (moved_len < old_len) { + /* + * Before moving the page tables from the new vma to + * the old vma, we need to be sure the old vma is + * queued after new vma in the same_anon_vma list to + * prevent SMP races with rmap_walk (that could lead + * rmap_walk to miss some page table). + */ + anon_vma_moveto_tail(vma); + /* * On error, move entries back from new area to old, * which will succeed since page tables still there, diff --git a/mm/rmap.c b/mm/rmap.c index a4fd3680038..a2e5ce1fa08 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -271,6 +271,51 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src) return -ENOMEM; } +/* + * Some rmap walk that needs to find all ptes/hugepmds without false + * negatives (like migrate and split_huge_page) running concurrent + * with operations that copy or move pagetables (like mremap() and + * fork()) to be safe. They depend on the anon_vma "same_anon_vma" + * list to be in a certain order: the dst_vma must be placed after the + * src_vma in the list. This is always guaranteed by fork() but + * mremap() needs to call this function to enforce it in case the + * dst_vma isn't newly allocated and chained with the anon_vma_clone() + * function but just an extension of a pre-existing vma through + * vma_merge. + * + * NOTE: the same_anon_vma list can still be changed by other + * processes while mremap runs because mremap doesn't hold the + * anon_vma mutex to prevent modifications to the list while it + * runs. All we need to enforce is that the relative order of this + * process vmas isn't changing (we don't care about other vmas + * order). Each vma corresponds to an anon_vma_chain structure so + * there's no risk that other processes calling anon_vma_moveto_tail() + * and changing the same_anon_vma list under mremap() will screw with + * the relative order of this process vmas in the list, because + * they can't alter the order of any vma that belongs to this + * process. 
And there can't be another anon_vma_moveto_tail() running + * concurrently with mremap() coming from this process because we hold + * the mmap_sem for the whole mremap(). fork() ordering dependency + * also shouldn't be affected because fork() only cares that the + * parent vmas are placed in the list before the child vmas and + * anon_vma_moveto_tail() won't reorder vmas from either the fork() + * parent or child. + */ +void anon_vma_moveto_tail(struct vm_area_struct *dst) +{ + struct anon_vma_chain *pavc; + struct anon_vma *root = NULL; + + list_for_each_entry_reverse(pavc, &dst->anon_vma_chain, same_vma) { + struct anon_vma *anon_vma = pavc->anon_vma; + VM_BUG_ON(pavc->vma != dst); + root = lock_anon_vma_root(root, anon_vma); + list_del(&pavc->same_anon_vma); + list_add_tail(&pavc->same_anon_vma, &anon_vma->head); + } + unlock_anon_vma_root(root); +} + /* * Attach vma to its own anon_vma, as well as to the anon_vmas that * the corresponding VMA in the parent process is attached to. -- cgit v1.2.3-70-g09d2 From fcfb4dcc9698f932836aa63ba0d82e7dbd300fb3 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 10 Jan 2012 15:08:21 -0800 Subject: mm/mempolicy.c: mpol_equal(): use bool mpol_equal() logically returns a boolean. Use a bool type to slightly improve readability. Signed-off-by: KOSAKI Motohiro Cc: Stephen Wilson Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 10 +++++----- mm/mempolicy.c | 14 +++++++------- 2 files changed, 12 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 7978eec1b7d..7c727a90d70 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -164,11 +164,11 @@ static inline void mpol_get(struct mempolicy *pol) atomic_inc(&pol->refcnt); } -extern int __mpol_equal(struct mempolicy *a, struct mempolicy *b); -static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b) +extern bool __mpol_equal(struct mempolicy *a, struct mempolicy *b); +static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (a == b) - return 1; + return true; return __mpol_equal(a, b); } @@ -257,9 +257,9 @@ static inline int vma_migratable(struct vm_area_struct *vma) struct mempolicy {}; -static inline int mpol_equal(struct mempolicy *a, struct mempolicy *b) +static inline bool mpol_equal(struct mempolicy *a, struct mempolicy *b) { - return 1; + return true; } static inline void mpol_put(struct mempolicy *p) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index c3fdbcb1765..e3d58f08846 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1983,28 +1983,28 @@ struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol, } /* Slow path of a mempolicy comparison */ -int __mpol_equal(struct mempolicy *a, struct mempolicy *b) +bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) { if (!a || !b) - return 0; + return false; if (a->mode != b->mode) - return 0; + return false; if (a->flags != b->flags) - return 0; + return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) - return 0; + return false; switch (a->mode) { case MPOL_BIND: /* Fall through */ case MPOL_INTERLEAVE: - return nodes_equal(a->v.nodes, b->v.nodes); + return !!nodes_equal(a->v.nodes, b->v.nodes); case MPOL_PREFERRED: return a->v.preferred_node == b->v.preferred_node; default: BUG(); - return 0; + return false; } } -- cgit v1.2.3-70-g09d2 From a6d511e5155406cd214d3af3ff9cffc69548b006 Mon Sep 17 
00:00:00 2001 From: NeilBrown Date: Tue, 10 Jan 2012 15:09:40 -0800 Subject: leds: add driver for TCA6507 LED controller TI's TCA6507 is the LED driver in the GTA04 Openmoko motherboard. The driver provides full support for brightness levels and hardware blinking. This driver can drive each of 7 outputs as an LED or a GPIO output, and provides hardware-assist blinking. [akpm@linux-foundation.org: fix __mod_i2c_device_table alias] [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: NeilBrown Cc: Richard Purdie Cc: Randy Dunlap Cc: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/leds/Kconfig | 8 + drivers/leds/Makefile | 1 + drivers/leds/leds-tca6507.c | 779 +++++++++++++++++++++++++++++++++++++++++++ include/linux/leds-tca6507.h | 34 ++ 4 files changed, 822 insertions(+) create mode 100644 drivers/leds/leds-tca6507.c create mode 100644 include/linux/leds-tca6507.h (limited to 'include/linux') diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index 1b75a56ebd0..897a77dfa9d 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -388,6 +388,14 @@ config LEDS_RENESAS_TPU pin function. The latter to support brightness control. Brightness control is supported but hardware blinking is not. +config LEDS_TCA6507 + tristate "LED Support for TCA6507 I2C chip" + depends on LEDS_CLASS && I2C + help + This option enables support for LEDs connected to TC6507 + LED driver chips accessed via the I2C bus. + Driver support brightness control and hardware-assisted blinking. + config LEDS_TRIGGERS bool "LED Trigger support" depends on LEDS_CLASS diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile index e4f6bf56888..5c9dc4b000d 100644 --- a/drivers/leds/Makefile +++ b/drivers/leds/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_LEDS_GPIO) += leds-gpio.o obj-$(CONFIG_LEDS_LP3944) += leds-lp3944.o obj-$(CONFIG_LEDS_LP5521) += leds-lp5521.o obj-$(CONFIG_LEDS_LP5523) += leds-lp5523.o +obj-$(CONFIG_LEDS_TCA6507) += leds-tca6507.o obj-$(CONFIG_LEDS_CLEVO_MAIL) += leds-clevo-mail.o obj-$(CONFIG_LEDS_HP6XX) += leds-hp6xx.o obj-$(CONFIG_LEDS_FSG) += leds-fsg.o diff --git a/drivers/leds/leds-tca6507.c b/drivers/leds/leds-tca6507.c new file mode 100644 index 00000000000..133f89fb707 --- /dev/null +++ b/drivers/leds/leds-tca6507.c @@ -0,0 +1,779 @@ +/* + * leds-tca6507 + * + * The TCA6507 is a programmable LED controller that can drive 7 + * separate lines either by holding them low, or by pulsing them + * with modulated width. + * The modulation can be varied in a simple pattern to produce a blink or + * double-blink. + * + * This driver can configure each line either as a 'GPIO' which is out-only + * (no pull-up) or as an LED with variable brightness and hardware-assisted + * blinking. + * + * Apart from OFF and ON there are three programmable brightness levels which + * can be programmed from 0 to 15 and indicate how many 500usec intervals in + * each 8msec that the led is 'on'. The levels are named MASTER, BANK0 and + * BANK1. + * + * There are two different blink rates that can be programmed, each with + * separate time for rise, on, fall, off and second-off. Thus if 3 or more + * different non-trivial rates are required, software must be used for the extra + * rates. The two different blink rates must align with the two levels BANK0 and + * BANK1. + * This driver does not support double-blink so 'second-off' always matches + * 'off'. + * + * Only 16 different times can be programmed in a roughly logarithmic scale from + * 64ms to 16320ms. 
To be precise the possible times are: + * 0, 64, 128, 192, 256, 384, 512, 768, + * 1024, 1536, 2048, 3072, 4096, 5760, 8128, 16320 + * + * Times that cannot be closely matched with these must be + * handled in software. This driver allows 12.5% error in matching. + * + * This driver does not allow rise/fall rates to be set explicitly. When trying + * to match a given 'on' or 'off' period, an appropriate pair of 'change' and + * 'hold' times are chosen to get a close match. If the target delay is even, + * the 'change' number will be the smaller; if odd, the 'hold' number will be + * the smaller. + + * Choosing pairs of delays with 12.5% errors allows us to match delays in the + * ranges: 56-72, 112-144, 168-216, 224-27504, 28560-36720. + * 26% of the achievable sums can be matched by multiple pairings. For example + * 1536 == 1536+0, 1024+512, or 768+768. This driver will always choose the + * pairing with the least maximum - 768+768 in this case. Other pairings are + * not available. + * + * Access to the 3 levels and 2 blinks are on a first-come, first-served basis. + * Access can be shared by multiple leds if they have the same level and + * either same blink rates, or some don't blink. + * When a led changes, it relinquishes access and tries again, so it might + * lose access to hardware blink. + * If a blink engine cannot be allocated, software blink is used. + * If the desired brightness cannot be allocated, the closest available non-zero + * brightness is used. As 'full' is always available, the worst case would be + * to have two different blink rates at '1', with Max at '2', then other leds + * will have to choose between '2' and '16'. Hopefully this is not likely. + * + * Each bank (BANK0 and BANK1) has two usage counts - LEDs using the brightness + * and LEDs using the blink. It can only be reprogrammed when the appropriate + * counter is zero. The MASTER level has a single usage count. + * + * Each Led has programmable 'on' and 'off' time as milliseconds. With each + * there is a flag saying if it was explicitly requested or defaulted. + * Similarly the banks know if each time was explicit or a default. Defaults + * are permitted to be changed freely - they are not recognised when matching. + * + * + * An led-tca6507 device must be provided with platform data. This data + * lists for each output: the name, default trigger, and whether the signal + * is being used as a GPiO rather than an led. 'struct led_plaform_data' + * is used for this. If 'name' is NULL, the output isn't used. If 'flags' + * is TCA6507_MAKE_CPIO, the output is a GPO. + * The "struct led_platform_data" can be embedded in a + * "struct tca6507_platform_data" which adds a 'gpio_base' for the GPiOs, + * and a 'setup' callback which is called once the GPiOs are available. 
+ +#include <linux/module.h> +#include <linux/slab.h> +#include <linux/leds.h> +#include <linux/err.h> +#include <linux/i2c.h> +#include <linux/gpio.h> +#include <linux/workqueue.h> +#include <linux/leds-tca6507.h> + +/* LED select registers determine the source that drives LED outputs */ +#define TCA6507_LS_LED_OFF 0x0 /* Output HI-Z (off) */ +#define TCA6507_LS_LED_OFF1 0x1 /* Output HI-Z (off) - not used */ +#define TCA6507_LS_LED_PWM0 0x2 /* Output LOW with Bank0 rate */ +#define TCA6507_LS_LED_PWM1 0x3 /* Output LOW with Bank1 rate */ +#define TCA6507_LS_LED_ON 0x4 /* Output LOW (on) */ +#define TCA6507_LS_LED_MIR 0x5 /* Output LOW with Master Intensity */ +#define TCA6507_LS_BLINK0 0x6 /* Blink at Bank0 rate */ +#define TCA6507_LS_BLINK1 0x7 /* Blink at Bank1 rate */ + +enum { + BANK0, + BANK1, + MASTER, +}; +static int bank_source[3] = { + TCA6507_LS_LED_PWM0, + TCA6507_LS_LED_PWM1, + TCA6507_LS_LED_MIR, +}; +static int blink_source[2] = { + TCA6507_LS_BLINK0, + TCA6507_LS_BLINK1, +}; + +/* PWM registers */ +#define TCA6507_REG_CNT 11 + +/* + * 0x00, 0x01, 0x02 encode the TCA6507_LS_* values, each output + * owns one bit in each register + */ +#define TCA6507_FADE_ON 0x03 +#define TCA6507_FULL_ON 0x04 +#define TCA6507_FADE_OFF 0x05 +#define TCA6507_FIRST_OFF 0x06 +#define TCA6507_SECOND_OFF 0x07 +#define TCA6507_MAX_INTENSITY 0x08 +#define TCA6507_MASTER_INTENSITY 0x09 +#define TCA6507_INITIALIZE 0x0A + +#define INIT_CODE 0x8 + +#define TIMECODES 16 +static int time_codes[TIMECODES] = { + 0, 64, 128, 192, 256, 384, 512, 768, + 1024, 1536, 2048, 3072, 4096, 5760, 8128, 16320 +}; + +/* Convert an led.brightness level (0..255) to a TCA6507 level (0..15) */ +static inline int TO_LEVEL(int brightness) +{ + return brightness >> 4; +} + +/* ...and convert back */ +static inline int TO_BRIGHT(int level) +{ + if (level) + return (level << 4) | 0xf; + return 0; +} + +#define NUM_LEDS 7 +struct tca6507_chip { + int reg_set; /* One bit per register where + * a '1' means the register + * should be written */ + u8 reg_file[TCA6507_REG_CNT]; + /* Bank 2 is Master Intensity and doesn't use times */ + struct bank { + int level; + int ontime, offtime; + int on_dflt, off_dflt; + int time_use, level_use; + } bank[3]; + struct i2c_client *client; + struct work_struct work; + spinlock_t lock; + + struct tca6507_led { + struct tca6507_chip *chip; + struct led_classdev led_cdev; + int num; + int ontime, offtime; + int on_dflt, off_dflt; + int bank; /* Bank used, or -1 */ + int blink; /* Set if hardware-blinking */ + } leds[NUM_LEDS]; +#ifdef CONFIG_GPIOLIB + struct gpio_chip gpio; + const char *gpio_name[NUM_LEDS]; + int gpio_map[NUM_LEDS]; +#endif +}; + +static const struct i2c_device_id tca6507_id[] = { + { "tca6507" }, + { } +}; +MODULE_DEVICE_TABLE(i2c, tca6507_id);
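To make the matching rule concrete before reading choose_times() below: a request for 600ms gives a window of 525..675ms, and the best achievable pair is 384+192 = 576ms. A stand-alone user-space sketch of the same search (simplified, not part of the driver) confirms it:

#include <stdio.h>
#include <stdlib.h>

/* Same table as the driver; the pair search is simplified for illustration. */
static const int time_codes[16] = {
	0, 64, 128, 192, 256, 384, 512, 768,
	1024, 1536, 2048, 3072, 4096, 5760, 8128, 16320
};

int main(void)
{
	int msec = 600, tmin = msec * 7 / 8, tmax = msec * 9 / 8;
	int best = -1, c1, c2;

	for (c1 = 1; c1 < 16; c1++)
		for (c2 = 0; c2 <= c1; c2++) {
			int tt = time_codes[c1] + time_codes[c2];

			if (tt < tmin || tt > tmax)
				continue;
			if (best < 0 || abs(msec - tt) < abs(msec - best))
				best = tt;
		}
	printf("target %dms -> achievable %dms\n", msec, best);	/* 576 */
	return 0;
}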
+static int choose_times(int msec, int *c1p, int *c2p) +{ + /* + * Choose two timecodes which add to 'msec' as near as possible. + * The first returned is the 'on' or 'off' time. The second is to be + * used as a 'fade-on' or 'fade-off' time. If 'msec' is even, + * the first will not be smaller than the second. If 'msec' is odd, + * the first will not be larger than the second. + * If we cannot get a sum within 1/8 of 'msec' fail with -EINVAL, + * otherwise return the sum that was achieved, plus 1 if the first is + * smaller. + * If two possibilities are equally good (e.g. 512+0, 256+256), choose + * the first pair so there is more change-time visible (i.e. it is + * softer). + */ + int c1, c2; + int tmax = msec * 9 / 8; + int tmin = msec * 7 / 8; + int diff = 65536; + + /* We start at '1' to ensure we never even think of choosing a + * total time of '0'. + */ + for (c1 = 1; c1 < TIMECODES; c1++) { + int t = time_codes[c1]; + if (t*2 < tmin) + continue; + if (t > tmax) + break; + for (c2 = 0; c2 <= c1; c2++) { + int tt = t + time_codes[c2]; + int d; + if (tt < tmin) + continue; + if (tt > tmax) + break; + /* This works! */ + d = abs(msec - tt); + if (d >= diff) + continue; + /* Best yet */ + *c1p = c1; + *c2p = c2; + diff = d; + if (d == 0) + return msec; + } + } + if (diff < 65536) { + int actual; + if (msec & 1) { + c1 = *c2p; + *c2p = *c1p; + *c1p = c1; + } + actual = time_codes[*c1p] + time_codes[*c2p]; + if (*c1p < *c2p) + return actual + 1; + else + return actual; + } + /* No close match */ + return -EINVAL; +} + +/* + * Update the register file with the appropriate 3-bit state for + * the given led. + */ +static void set_select(struct tca6507_chip *tca, int led, int val) +{ + int mask = (1 << led); + int bit; + + for (bit = 0; bit < 3; bit++) { + int n = tca->reg_file[bit] & ~mask; + if (val & (1 << bit)) + n |= mask; + if (tca->reg_file[bit] != n) { + tca->reg_file[bit] = n; + tca->reg_set |= (1 << bit); + } + } +} + +/* Update the register file with the appropriate 4-bit code for + * one bank or the other. This can be used for timers, for levels, or + * for initialisation. + */ +static void set_code(struct tca6507_chip *tca, int reg, int bank, int new) +{ + int mask = 0xF; + int n; + if (bank) { + mask <<= 4; + new <<= 4; + } + n = tca->reg_file[reg] & ~mask; + n |= new; + if (tca->reg_file[reg] != n) { + tca->reg_file[reg] = n; + tca->reg_set |= 1 << reg; + } +} + +/* Update brightness level. */ +static void set_level(struct tca6507_chip *tca, int bank, int level) +{ + switch (bank) { + case BANK0: + case BANK1: + set_code(tca, TCA6507_MAX_INTENSITY, bank, level); + break; + case MASTER: + set_code(tca, TCA6507_MASTER_INTENSITY, 0, level); + break; + } + tca->bank[bank].level = level; +} + +/* Record all relevant time codes for a given bank */ +static void set_times(struct tca6507_chip *tca, int bank) +{ + int c1, c2; + int result; + + result = choose_times(tca->bank[bank].ontime, &c1, &c2); + dev_dbg(&tca->client->dev, + "Chose on times %d(%d) %d(%d) for %dms\n", c1, time_codes[c1], + c2, time_codes[c2], tca->bank[bank].ontime); + set_code(tca, TCA6507_FADE_ON, bank, c2); + set_code(tca, TCA6507_FULL_ON, bank, c1); + tca->bank[bank].ontime = result; + + result = choose_times(tca->bank[bank].offtime, &c1, &c2); + dev_dbg(&tca->client->dev, + "Chose off times %d(%d) %d(%d) for %dms\n", c1, time_codes[c1], + c2, time_codes[c2], tca->bank[bank].offtime); + set_code(tca, TCA6507_FADE_OFF, bank, c2); + set_code(tca, TCA6507_FIRST_OFF, bank, c1); + set_code(tca, TCA6507_SECOND_OFF, bank, c1); + tca->bank[bank].offtime = result; + + set_code(tca, TCA6507_INITIALIZE, bank, INIT_CODE); +} + +/* Write out all registers of the tca6507 that need updating */ + +static void tca6507_work(struct work_struct *work) +{ + struct tca6507_chip *tca = container_of(work, struct tca6507_chip, + work); + struct i2c_client *cl = tca->client; + int set; + u8 file[TCA6507_REG_CNT]; + int r; + + spin_lock_irq(&tca->lock); + set = tca->reg_set; + memcpy(file, tca->reg_file, TCA6507_REG_CNT); + tca->reg_set = 0; + spin_unlock_irq(&tca->lock); + + for (r = 0; r < TCA6507_REG_CNT; r++) + if (set & (1<<r)) + i2c_smbus_write_byte_data(cl, r, file[r]); +} + +static void led_release(struct tca6507_led *led) +{ + /* Release the bank/blink resources this LED holds. */ + struct tca6507_chip *tca = led->chip; + if (led->bank >= 0) { + struct bank *b = tca->bank + led->bank; + if (led->blink) + b->time_use--; + b->level_use--; + } + led->blink = 0; + led->bank = -1; +} + +static int led_prepare(struct tca6507_led *led) +{ + /* Assign this led to a bank, configuring that bank if necessary.
*/ + int level = TO_LEVEL(led->led_cdev.brightness); + struct tca6507_chip *tca = led->chip; + int c1, c2; + int i; + struct bank *b; + int need_init = 0; + + led->led_cdev.brightness = TO_BRIGHT(level); + if (level == 0) { + set_select(tca, led->num, TCA6507_LS_LED_OFF); + return 0; + } + + if (led->ontime == 0 || led->offtime == 0) { + /* + * Just set the brightness, choosing first usable bank. + * If none perfect, choose best. + * Count backwards so we check MASTER bank first + * to avoid wasting a timer. + */ + int best = -1;/* full-on */ + int diff = 15-level; + + if (level == 15) { + set_select(tca, led->num, TCA6507_LS_LED_ON); + return 0; + } + + for (i = MASTER; i >= BANK0; i--) { + int d; + if (tca->bank[i].level == level || + tca->bank[i].level_use == 0) { + best = i; + break; + } + d = abs(level - tca->bank[i].level); + if (d < diff) { + diff = d; + best = i; + } + } + if (best == -1) { + /* Best brightness is full-on */ + set_select(tca, led->num, TCA6507_LS_LED_ON); + led->led_cdev.brightness = LED_FULL; + return 0; + } + + if (!tca->bank[best].level_use) + set_level(tca, best, level); + + tca->bank[best].level_use++; + led->bank = best; + set_select(tca, led->num, bank_source[best]); + led->led_cdev.brightness = TO_BRIGHT(tca->bank[best].level); + return 0; + } + + /* + * We have on/off time so we need to try to allocate a timing bank. + * First check if times are compatible with hardware and give up if + * not. + */ + if (choose_times(led->ontime, &c1, &c2) < 0) + return -EINVAL; + if (choose_times(led->offtime, &c1, &c2) < 0) + return -EINVAL; + + for (i = BANK0; i <= BANK1; i++) { + if (tca->bank[i].level_use == 0) + /* not in use - it is ours! */ + break; + if (tca->bank[i].level != level) + /* Incompatible level - skip */ + /* FIX: if timer matches we maybe should consider + * this anyway... + */ + continue; + + if (tca->bank[i].time_use == 0) + /* Timer not in use, and level matches - use it */ + break; + + if (!(tca->bank[i].on_dflt || + led->on_dflt || + tca->bank[i].ontime == led->ontime)) + /* on time is incompatible */ + continue; + + if (!(tca->bank[i].off_dflt || + led->off_dflt || + tca->bank[i].offtime == led->offtime)) + /* off time is incompatible */ + continue; + + /* looks like a suitable match */ + break; + } + + if (i > BANK1) + /* Nothing matches - how sad */ + return -EINVAL; + + b = &tca->bank[i]; + if (b->level_use == 0) + set_level(tca, i, level); + b->level_use++; + led->bank = i; + + if (b->on_dflt || + !led->on_dflt || + b->time_use == 0) { + b->ontime = led->ontime; + b->on_dflt = led->on_dflt; + need_init = 1; + } + + if (b->off_dflt || + !led->off_dflt || + b->time_use == 0) { + b->offtime = led->offtime; + b->off_dflt = led->off_dflt; + need_init = 1; + } + + if (need_init) + set_times(tca, i); + + led->ontime = b->ontime; + led->offtime = b->offtime; + + b->time_use++; + led->blink = 1; + led->led_cdev.brightness = TO_BRIGHT(b->level); + set_select(tca, led->num, blink_source[i]); + return 0; +} + +static int led_assign(struct tca6507_led *led) +{ + struct tca6507_chip *tca = led->chip; + int err; + unsigned long flags; + + spin_lock_irqsave(&tca->lock, flags); + led_release(led); + err = led_prepare(led); + if (err) { + /* + * Can only fail on timer setup. In that case we need to + * re-establish as steady level. 
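	 * A 100ms/100ms blink, for example, cannot be matched by any pair of
	 * time codes within 12.5%, so it falls back to a steady LED at the
	 * requested brightness.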
+ */ + led->ontime = 0; + led->offtime = 0; + led_prepare(led); + } + spin_unlock_irqrestore(&tca->lock, flags); + + if (tca->reg_set) + schedule_work(&tca->work); + return err; +} + +static void tca6507_brightness_set(struct led_classdev *led_cdev, + enum led_brightness brightness) +{ + struct tca6507_led *led = container_of(led_cdev, struct tca6507_led, + led_cdev); + led->led_cdev.brightness = brightness; + led->ontime = 0; + led->offtime = 0; + led_assign(led); +} + +static int tca6507_blink_set(struct led_classdev *led_cdev, + unsigned long *delay_on, + unsigned long *delay_off) +{ + struct tca6507_led *led = container_of(led_cdev, struct tca6507_led, + led_cdev); + + if (*delay_on == 0) + led->on_dflt = 1; + else if (delay_on != &led_cdev->blink_delay_on) + led->on_dflt = 0; + led->ontime = *delay_on; + + if (*delay_off == 0) + led->off_dflt = 1; + else if (delay_off != &led_cdev->blink_delay_off) + led->off_dflt = 0; + led->offtime = *delay_off; + + if (led->ontime == 0) + led->ontime = 512; + if (led->offtime == 0) + led->offtime = 512; + + if (led->led_cdev.brightness == LED_OFF) + led->led_cdev.brightness = LED_FULL; + if (led_assign(led) < 0) { + led->ontime = 0; + led->offtime = 0; + led->led_cdev.brightness = LED_OFF; + return -EINVAL; + } + *delay_on = led->ontime; + *delay_off = led->offtime; + return 0; +} + +#ifdef CONFIG_GPIOLIB +static void tca6507_gpio_set_value(struct gpio_chip *gc, + unsigned offset, int val) +{ + struct tca6507_chip *tca = container_of(gc, struct tca6507_chip, gpio); + unsigned long flags; + + spin_lock_irqsave(&tca->lock, flags); + /* + * 'OFF' is floating high, and 'ON' is pulled down, so it has the + * inverse sense of 'val'. + */ + set_select(tca, tca->gpio_map[offset], + val ? TCA6507_LS_LED_OFF : TCA6507_LS_LED_ON); + spin_unlock_irqrestore(&tca->lock, flags); + if (tca->reg_set) + schedule_work(&tca->work); +} + +static int tca6507_gpio_direction_output(struct gpio_chip *gc, + unsigned offset, int val) +{ + tca6507_gpio_set_value(gc, offset, val); + return 0; +} + +static int tca6507_probe_gpios(struct i2c_client *client, + struct tca6507_chip *tca, + struct tca6507_platform_data *pdata) +{ + int err; + int i = 0; + int gpios = 0; + + for (i = 0; i < NUM_LEDS; i++) + if (pdata->leds.leds[i].name && pdata->leds.leds[i].flags) { + /* Configure as a gpio */ + tca->gpio_name[gpios] = pdata->leds.leds[i].name; + tca->gpio_map[gpios] = i; + gpios++; + } + + if (!gpios) + return 0; + + tca->gpio.label = "gpio-tca6507"; + tca->gpio.names = tca->gpio_name; + tca->gpio.ngpio = gpios; + tca->gpio.base = pdata->gpio_base; + tca->gpio.owner = THIS_MODULE; + tca->gpio.direction_output = tca6507_gpio_direction_output; + tca->gpio.set = tca6507_gpio_set_value; + tca->gpio.dev = &client->dev; + err = gpiochip_add(&tca->gpio); + if (err) { + tca->gpio.ngpio = 0; + return err; + } + if (pdata->setup) + pdata->setup(tca->gpio.base, tca->gpio.ngpio); + return 0; +} + +static void tca6507_remove_gpio(struct tca6507_chip *tca) +{ + if (tca->gpio.ngpio) { + int err = gpiochip_remove(&tca->gpio); + if (err) + dev_err(&tca->client->dev, "%s failed, %d\n", + "gpiochip_remove()", err); + } +} +#else /* CONFIG_GPIOLIB */ +static int tca6507_probe_gpios(struct i2c_client *client, + struct tca6507_chip *tca, + struct tca6507_platform_data *pdata) +{ + return 0; +} +static void tca6507_remove_gpio(struct tca6507_chip *tca) +{ +} +#endif /* CONFIG_GPIOLIB */ + +static int __devinit tca6507_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct tca6507_chip *tca;
+ struct i2c_adapter *adapter; + struct tca6507_platform_data *pdata; + int err; + int i = 0; + + adapter = to_i2c_adapter(client->dev.parent); + pdata = client->dev.platform_data; + + if (!i2c_check_functionality(adapter, I2C_FUNC_I2C)) + return -EIO; + + if (!pdata || pdata->leds.num_leds != NUM_LEDS) { + dev_err(&client->dev, "Need %d entries in platform-data list\n", + NUM_LEDS); + return -ENODEV; + } + err = -ENOMEM; + tca = kzalloc(sizeof(*tca), GFP_KERNEL); + if (!tca) + goto exit; + + tca->client = client; + INIT_WORK(&tca->work, tca6507_work); + spin_lock_init(&tca->lock); + i2c_set_clientdata(client, tca); + + for (i = 0; i < NUM_LEDS; i++) { + struct tca6507_led *l = tca->leds + i; + + l->chip = tca; + l->num = i; + if (pdata->leds.leds[i].name && !pdata->leds.leds[i].flags) { + l->led_cdev.name = pdata->leds.leds[i].name; + l->led_cdev.default_trigger + = pdata->leds.leds[i].default_trigger; + l->led_cdev.brightness_set = tca6507_brightness_set; + l->led_cdev.blink_set = tca6507_blink_set; + l->bank = -1; + err = led_classdev_register(&client->dev, + &l->led_cdev); + if (err < 0) + goto exit; + } + } + err = tca6507_probe_gpios(client, tca, pdata); + if (err) + goto exit; + /* set all registers to known state - zero */ + tca->reg_set = 0x7f; + schedule_work(&tca->work); + + return 0; +exit: + while (i--) + if (tca->leds[i].led_cdev.name) + led_classdev_unregister(&tca->leds[i].led_cdev); + cancel_work_sync(&tca->work); + i2c_set_clientdata(client, NULL); + kfree(tca); + return err; +} + +static int __devexit tca6507_remove(struct i2c_client *client) +{ + int i; + struct tca6507_chip *tca = i2c_get_clientdata(client); + struct tca6507_led *tca_leds = tca->leds; + + for (i = 0; i < NUM_LEDS; i++) { + if (tca_leds[i].led_cdev.name) + led_classdev_unregister(&tca_leds[i].led_cdev); + } + tca6507_remove_gpio(tca); + cancel_work_sync(&tca->work); + i2c_set_clientdata(client, NULL); + kfree(tca); + + return 0; +} + +static struct i2c_driver tca6507_driver = { + .driver = { + .name = "leds-tca6507", + .owner = THIS_MODULE, + }, + .probe = tca6507_probe, + .remove = __devexit_p(tca6507_remove), + .id_table = tca6507_id, +}; + +static int __init tca6507_leds_init(void) +{ + return i2c_add_driver(&tca6507_driver); +} + +static void __exit tca6507_leds_exit(void) +{ + i2c_del_driver(&tca6507_driver); +} + +module_init(tca6507_leds_init); +module_exit(tca6507_leds_exit); + +MODULE_AUTHOR("NeilBrown "); +MODULE_DESCRIPTION("TCA6507 LED/GPO driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/leds-tca6507.h b/include/linux/leds-tca6507.h new file mode 100644 index 00000000000..dcabf4fa2ae --- /dev/null +++ b/include/linux/leds-tca6507.h @@ -0,0 +1,34 @@ +/* + * TCA6507 LED chip driver. + * + * Copyright (C) 2011 Neil Brown + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA + * 02110-1301 USA + */ + +#ifndef __LINUX_TCA6507_H +#define __LINUX_TCA6507_H +#include <linux/leds.h> + +struct tca6507_platform_data { + struct led_platform_data leds; +#ifdef CONFIG_GPIOLIB + int gpio_base; + void (*setup)(unsigned gpio_base, unsigned ngpio); +#endif +}; + +#define TCA6507_MAKE_GPIO 1 +#endif /* __LINUX_TCA6507_H */ -- cgit v1.2.3-70-g09d2
From 5e6292c0f28f03dfdb8ea3d685f0b838a23bfba4 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Tue, 10 Jan 2012 15:11:17 -0800 Subject: signal: add block_sigmask() for adding sigmask to current->blocked Abstract the code sequence for adding a signal handler's sa_mask to current->blocked because the sequence is identical for all architectures. Furthermore, in the past some architectures actually got this code wrong, so introduce a wrapper that all architectures can use. Signed-off-by: Matt Fleming Signed-off-by: Oleg Nesterov Cc: Thomas Gleixner Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Tejun Heo Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/signal.c | 6 +----- include/linux/signal.h | 1 + kernel/signal.c | 21 +++++++++++++++++++++ 3 files changed, 23 insertions(+), 5 deletions(-) (limited to 'include/linux')
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 54ddaeb221c..46a01bdc27e 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -682,7 +682,6 @@ static int handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, struct pt_regs *regs) { - sigset_t blocked; int ret; /* Are we from a system call? */ @@ -733,10 +732,7 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, */ regs->flags &= ~X86_EFLAGS_TF; - sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); - if (!(ka->sa.sa_flags & SA_NODEFER)) - sigaddset(&blocked, sig); - set_current_blocked(&blocked); + block_sigmask(ka, sig); tracehook_signal_handler(sig, info, ka, regs, test_thread_flag(TIF_SINGLESTEP));
diff --git a/include/linux/signal.h b/include/linux/signal.h index a822300a253..7987ce74874 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -254,6 +254,7 @@ extern void set_current_blocked(const sigset_t *); extern int show_unhandled_signals; extern int get_signal_to_deliver(siginfo_t *info, struct k_sigaction *return_ka, struct pt_regs *regs, void *cookie); +extern void block_sigmask(struct k_sigaction *ka, int signr); extern void exit_signals(struct task_struct *tsk); extern struct kmem_cache *sighand_cachep;
diff --git a/kernel/signal.c b/kernel/signal.c index bb0efa5705e..d532f1709fb 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2318,6 +2318,27 @@ relock: return signr; } +/** + * block_sigmask - add @ka's signal mask to current->blocked + * @ka: action for @signr + * @signr: signal that has been successfully delivered + * + * This function should be called when a signal has successfully been + * delivered. It adds the mask of signals for @ka to current->blocked + * so that they are blocked during the execution of the signal + * handler. In addition, @signr will be blocked unless %SA_NODEFER is + * set in @ka->sa.sa_flags. + */ +void block_sigmask(struct k_sigaction *ka, int signr) +{ + sigset_t blocked; + + sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask); + if (!(ka->sa.sa_flags & SA_NODEFER)) + sigaddset(&blocked, signr); + set_current_blocked(&blocked); +}
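As a usage sketch, an architecture's handle_signal() now collapses the old three-step mask update into one call (condensed and hypothetical - setup_rt_frame() stands in for the arch's real frame-writing routine):

static int handle_signal(unsigned long sig, siginfo_t *info,
			 struct k_sigaction *ka, struct pt_regs *regs)
{
	/* arch-specific: write the signal frame and redirect execution */
	if (setup_rt_frame(sig, ka, info, regs) < 0)
		return -EFAULT;

	/* blocks ka->sa.sa_mask, plus sig itself unless SA_NODEFER is set */
	block_sigmask(ka, sig);
	return 0;
}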
+ /* * It could be that complete_signal() picked us to notify about the * group-wide signal. Other threads should be notified now to take -- cgit v1.2.3-70-g09d2
From 7773fbc54182a90cd248656619c7d33859e5f91d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 10 Jan 2012 15:11:20 -0800 Subject: procfs: make proc_get_link use dentry instead of inode Prepare the ground for the next "map_files" patch, which needs the name of a link file to analyse. Signed-off-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Tejun Heo Cc: Vasiliy Kulikov Cc: "Kirill A. Shutemov" Cc: Alexey Dobriyan Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 20 ++++++++++---------- include/linux/proc_fs.h | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux')
diff --git a/fs/proc/base.c b/fs/proc/base.c index 1aab5fe05a1..e31d95055c6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -166,9 +166,9 @@ static int get_task_root(struct task_struct *task, struct path *root) return result; } -static int proc_cwd_link(struct inode *inode, struct path *path) +static int proc_cwd_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(inode); + struct task_struct *task = get_proc_task(dentry->d_inode); int result = -ENOENT; if (task) { @@ -183,9 +183,9 @@ static int proc_cwd_link(struct inode *inode, struct path *path) return result; } -static int proc_root_link(struct inode *inode, struct path *path) +static int proc_root_link(struct dentry *dentry, struct path *path) { - struct task_struct *task = get_proc_task(inode); + struct task_struct *task = get_proc_task(dentry->d_inode); int result = -ENOENT; if (task) { @@ -1456,13 +1456,13 @@ static const struct file_operations proc_pid_set_comm_operations = { .release = single_release, }; -static int proc_exe_link(struct inode *inode, struct path *exe_path) +static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; struct mm_struct *mm; struct file *exe_file; - task = get_proc_task(inode); + task = get_proc_task(dentry->d_inode); if (!task) return -ENOENT; mm = get_task_mm(task); @@ -1492,7 +1492,7 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd) if (!proc_fd_access_allowed(inode)) goto out; - error = PROC_I(inode)->op.proc_get_link(inode, &nd->path); + error = PROC_I(inode)->op.proc_get_link(dentry, &nd->path); out: return ERR_PTR(error); } @@ -1531,7 +1531,7 @@ static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int b if (!proc_fd_access_allowed(inode)) goto out; - error = PROC_I(inode)->op.proc_get_link(inode, &path); + error = PROC_I(inode)->op.proc_get_link(dentry, &path); if (error) goto out; @@ -1823,9 +1823,9 @@ static int proc_fd_info(struct inode *inode, struct path *path, char *info) return -ENOENT; } -static int proc_fd_link(struct inode *inode, struct path *path) +static int proc_fd_link(struct dentry *dentry, struct path *path) { - return proc_fd_info(inode, path, NULL); + return proc_fd_info(dentry->d_inode, path, NULL); } static int tid_fd_revalidate(struct dentry *dentry, struct nameidata *nd)
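The point of passing the dentry shows up in the map_files patch that follows: a link handler can now decode information out of its own name. A minimal hypothetical callback:

static int example_get_link(struct dentry *dentry, struct path *path)
{
	unsigned long start, end;

	/* the dentry name itself carries the data to decode */
	if (sscanf(dentry->d_name.name, "%lx-%lx", &start, &end) != 2)
		return -EINVAL;

	/* ... resolve [start, end) to a struct path, taking a reference ... */
	return -ENOENT;
}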
diff --git a/include/linux/proc_fs.h b/include/linux/proc_fs.h index 6d9e575519c..85c50730623 --- a/include/linux/proc_fs.h +++ b/include/linux/proc_fs.h @@ -253,7 +253,7 @@ extern const struct proc_ns_operations utsns_operations; extern const struct proc_ns_operations ipcns_operations; union proc_op { - int (*proc_get_link)(struct inode *, struct path *); + int (*proc_get_link)(struct dentry *, struct path *); int (*proc_read)(struct task_struct *task, char *page); int (*proc_show)(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, -- cgit v1.2.3-70-g09d2
From 640708a2cff7f81e246243b0073c66e6ece7e53e Mon Sep 17 00:00:00 2001 From: Pavel Emelyanov Date: Tue, 10 Jan 2012 15:11:23 -0800 Subject: procfs: introduce the /proc/<pid>/map_files/ directory This one behaves similarly to the /proc/<pid>/fd/ one - it contains symlinks, one for each mapping backed by a file; the name of a symlink is "vma->vm_start-vma->vm_end" and its target is the mapped file. Opening a symlink results in a file that points to exactly the same inode as the vma's file. For example, the ls -l of some arbitrary /proc/<pid>/map_files/ | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80403000-7f8f80404000 -> /lib64/libc-2.5.so | lr-x------ 1 root root 64 Aug 26 06:40 7f8f8061e000-7f8f80620000 -> /lib64/libselinux.so.1 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80826000-7f8f80827000 -> /lib64/libacl.so.1.1.0 | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a2f000-7f8f80a30000 -> /lib64/librt-2.5.so | lr-x------ 1 root root 64 Aug 26 06:40 7f8f80a30000-7f8f80a4c000 -> /lib64/ld-2.5.so This *helps* the checkpointing process in three ways: 1. When dumping a task's mappings we know exactly which file is mapped by a particular region. We do this by opening the /proc/$pid/map_files/$address symlink the way we do with file descriptors. 2. This also helps in determining which anonymous shared mappings are shared with each other, by comparing their inodes. 3. When restoring a set of processes where two of them share a mapping, we map the memory in the first one, then open its /proc/$pid/map_files/$address file and map it in the second task. Using /proc/$pid/maps for this is quite inconvenient since it requires repeatedly re-reading and re-parsing this text file, which slows down the restore procedure significantly. Also, as pointed out in (3), it is much easier to use the top-level shared mapping in children as /proc/$pid/map_files/$address when needed. [akpm@linux-foundation.org: coding-style fixes] [gorcunov@openvz.org: make map_files depend on CHECKPOINT_RESTORE] Signed-off-by: Pavel Emelyanov Signed-off-by: Cyrill Gorcunov Reviewed-by: Vasiliy Kulikov Reviewed-by: "Kirill A. Shutemov" Cc: Tejun Heo Cc: Alexey Dobriyan Cc: Al Viro Cc: Pavel Machek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 355 +++++++++++++++++++++++++++++++++++++++++++++++++++++ include/linux/mm.h | 12 ++ 2 files changed, 367 insertions(+) (limited to 'include/linux')
diff --git a/fs/proc/base.c b/fs/proc/base.c index e31d95055c6..4d755fed3ec 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -83,6 +83,7 @@ #include #include #include +#include <linux/flex_array.h> #ifdef CONFIG_HARDWALL #include #endif @@ -134,6 +135,8 @@ struct pid_entry { NULL, &proc_single_file_operations, \ { .proc_show = show } ) +static int proc_fd_permission(struct inode *inode, int mask); + /* * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links.
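From user space the new directory is used with plain readlink()/open(); a sketch (the entry name here is hypothetical - a real tool would take the address range from one pass over /proc/$pid/maps, and CAP_SYS_ADMIN is required):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	const char *entry = "/proc/self/map_files/400000-401000";
	char target[4096];
	ssize_t n = readlink(entry, target, sizeof(target) - 1);
	int fd;

	if (n >= 0) {
		target[n] = '\0';
		printf("mapping backed by %s\n", target);
	}

	/* Opening the symlink yields the very inode the vma maps. */
	fd = open(entry, O_RDONLY);
	if (fd >= 0)
		close(fd);
	return 0;
}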
@@ -2046,6 +2049,355 @@ static const struct file_operations proc_fd_operations = { .llseek = default_llseek, }; +#ifdef CONFIG_CHECKPOINT_RESTORE + +/* + * dname_to_vma_addr - maps a dentry name into two unsigned longs + * which represent vma start and end addresses. + */ +static int dname_to_vma_addr(struct dentry *dentry, + unsigned long *start, unsigned long *end) +{ + if (sscanf(dentry->d_name.name, "%lx-%lx", start, end) != 2) + return -EINVAL; + + return 0; +} + +static int map_files_d_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + unsigned long vm_start, vm_end; + bool exact_vma_exists = false; + struct mm_struct *mm = NULL; + struct task_struct *task; + const struct cred *cred; + struct inode *inode; + int status = 0; + + if (nd && nd->flags & LOOKUP_RCU) + return -ECHILD; + + if (!capable(CAP_SYS_ADMIN)) { + status = -EACCES; + goto out_notask; + } + + inode = dentry->d_inode; + task = get_proc_task(inode); + if (!task) + goto out_notask; + + if (!ptrace_may_access(task, PTRACE_MODE_READ)) + goto out; + + mm = get_task_mm(task); + if (!mm) + goto out; + + if (!dname_to_vma_addr(dentry, &vm_start, &vm_end)) { + down_read(&mm->mmap_sem); + exact_vma_exists = !!find_exact_vma(mm, vm_start, vm_end); + up_read(&mm->mmap_sem); + } + + mmput(mm); + + if (exact_vma_exists) { + if (task_dumpable(task)) { + rcu_read_lock(); + cred = __task_cred(task); + inode->i_uid = cred->euid; + inode->i_gid = cred->egid; + rcu_read_unlock(); + } else { + inode->i_uid = 0; + inode->i_gid = 0; + } + security_task_to_inode(task, inode); + status = 1; + } + +out: + put_task_struct(task); + +out_notask: + if (status <= 0) + d_drop(dentry); + + return status; +} + +static const struct dentry_operations tid_map_files_dentry_operations = { + .d_revalidate = map_files_d_revalidate, + .d_delete = pid_delete_dentry, +}; + +static int proc_map_files_get_link(struct dentry *dentry, struct path *path) +{ + unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + int rc; + + rc = -ENOENT; + task = get_proc_task(dentry->d_inode); + if (!task) + goto out; + + mm = get_task_mm(task); + put_task_struct(task); + if (!mm) + goto out; + + rc = dname_to_vma_addr(dentry, &vm_start, &vm_end); + if (rc) + goto out_mmput; + + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (vma && vma->vm_file) { + *path = vma->vm_file->f_path; + path_get(path); + rc = 0; + } + up_read(&mm->mmap_sem); + +out_mmput: + mmput(mm); +out: + return rc; +} + +struct map_files_info { + struct file *file; + unsigned long len; + unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ +}; + +static struct dentry * +proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, + struct task_struct *task, const void *ptr) +{ + const struct file *file = ptr; + struct proc_inode *ei; + struct inode *inode; + + if (!file) + return ERR_PTR(-ENOENT); + + inode = proc_pid_make_inode(dir->i_sb, task); + if (!inode) + return ERR_PTR(-ENOENT); + + ei = PROC_I(inode); + ei->op.proc_get_link = proc_map_files_get_link; + + inode->i_op = &proc_pid_link_inode_operations; + inode->i_size = 64; + inode->i_mode = S_IFLNK; + + if (file->f_mode & FMODE_READ) + inode->i_mode |= S_IRUSR; + if (file->f_mode & FMODE_WRITE) + inode->i_mode |= S_IWUSR; + + d_set_d_op(dentry, &tid_map_files_dentry_operations); + d_add(dentry, inode); + + return NULL; +} + +static struct dentry *proc_map_files_lookup(struct inode *dir, + struct dentry *dentry, struct nameidata *nd) +{ + 
unsigned long vm_start, vm_end; + struct vm_area_struct *vma; + struct task_struct *task; + struct dentry *result; + struct mm_struct *mm; + + result = ERR_PTR(-EACCES); + if (!capable(CAP_SYS_ADMIN)) + goto out; + + result = ERR_PTR(-ENOENT); + task = get_proc_task(dir); + if (!task) + goto out; + + result = ERR_PTR(-EACCES); + if (lock_trace(task)) + goto out_put_task; + + result = ERR_PTR(-ENOENT); + if (dname_to_vma_addr(dentry, &vm_start, &vm_end)) + goto out_unlock; + + mm = get_task_mm(task); + if (!mm) + goto out_unlock; + + down_read(&mm->mmap_sem); + vma = find_exact_vma(mm, vm_start, vm_end); + if (!vma) + goto out_no_vma; + + result = proc_map_files_instantiate(dir, dentry, task, vma->vm_file); + +out_no_vma: + up_read(&mm->mmap_sem); + mmput(mm); +out_unlock: + unlock_trace(task); +out_put_task: + put_task_struct(task); +out: + return result; +} + +static const struct inode_operations proc_map_files_inode_operations = { + .lookup = proc_map_files_lookup, + .permission = proc_fd_permission, + .setattr = proc_setattr, +}; + +static int +proc_map_files_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + struct dentry *dentry = filp->f_path.dentry; + struct inode *inode = dentry->d_inode; + struct vm_area_struct *vma; + struct task_struct *task; + struct mm_struct *mm; + ino_t ino; + int ret; + + ret = -EACCES; + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -ENOENT; + task = get_proc_task(inode); + if (!task) + goto out; + + ret = -EACCES; + if (lock_trace(task)) + goto out_put_task; + + ret = 0; + switch (filp->f_pos) { + case 0: + ino = inode->i_ino; + if (filldir(dirent, ".", 1, 0, ino, DT_DIR) < 0) + goto out_unlock; + filp->f_pos++; + case 1: + ino = parent_ino(dentry); + if (filldir(dirent, "..", 2, 1, ino, DT_DIR) < 0) + goto out_unlock; + filp->f_pos++; + default: + { + unsigned long nr_files, pos, i; + struct flex_array *fa = NULL; + struct map_files_info info; + struct map_files_info *p; + + mm = get_task_mm(task); + if (!mm) + goto out_unlock; + down_read(&mm->mmap_sem); + + nr_files = 0; + + /* + * We need two passes here: + * + * 1) Collect vmas of mapped files with mmap_sem taken + * 2) Release mmap_sem and instantiate entries + * + * otherwise we get lockdep complained, since filldir() + * routine might require mmap_sem taken in might_fault(). + */ + + for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { + if (vma->vm_file && ++pos > filp->f_pos) + nr_files++; + } + + if (nr_files) { + fa = flex_array_alloc(sizeof(info), nr_files, + GFP_KERNEL); + if (!fa || flex_array_prealloc(fa, 0, nr_files, + GFP_KERNEL)) { + ret = -ENOMEM; + if (fa) + flex_array_free(fa); + up_read(&mm->mmap_sem); + mmput(mm); + goto out_unlock; + } + for (i = 0, vma = mm->mmap, pos = 2; vma; + vma = vma->vm_next) { + if (!vma->vm_file) + continue; + if (++pos <= filp->f_pos) + continue; + + get_file(vma->vm_file); + info.file = vma->vm_file; + info.len = snprintf(info.name, + sizeof(info.name), "%lx-%lx", + vma->vm_start, vma->vm_end); + if (flex_array_put(fa, i++, &info, GFP_KERNEL)) + BUG(); + } + } + up_read(&mm->mmap_sem); + + for (i = 0; i < nr_files; i++) { + p = flex_array_get(fa, i); + ret = proc_fill_cache(filp, dirent, filldir, + p->name, p->len, + proc_map_files_instantiate, + task, p->file); + if (ret) + break; + filp->f_pos++; + fput(p->file); + } + for (; i < nr_files; i++) { + /* + * In case of error don't forget + * to put rest of file refs. 
+ */ + p = flex_array_get(fa, i); + fput(p->file); + } + if (fa) + flex_array_free(fa); + mmput(mm); + } + } + +out_unlock: + unlock_trace(task); +out_put_task: + put_task_struct(task); +out: + return ret; +} + +static const struct file_operations proc_map_files_operations = { + .read = generic_read_dir, + .readdir = proc_map_files_readdir, + .llseek = default_llseek, +}; + +#endif /* CONFIG_CHECKPOINT_RESTORE */ + /* * /proc/pid/fd needs a special permission handler so that a process can still * access /proc/self/fd after it has executed a setuid(). @@ -2661,6 +3013,9 @@ static const struct inode_operations proc_task_inode_operations; static const struct pid_entry tgid_base_stuff[] = { DIR("task", S_IRUGO|S_IXUGO, proc_task_inode_operations, proc_task_operations), DIR("fd", S_IRUSR|S_IXUSR, proc_fd_inode_operations, proc_fd_operations), +#ifdef CONFIG_CHECKPOINT_RESTORE + DIR("map_files", S_IRUSR|S_IXUSR, proc_map_files_inode_operations, proc_map_files_operations), +#endif DIR("fdinfo", S_IRUSR|S_IXUSR, proc_fdinfo_inode_operations, proc_fdinfo_operations), DIR("ns", S_IRUSR|S_IXUGO, proc_ns_dir_inode_operations, proc_ns_dir_operations), #ifdef CONFIG_NET
diff --git a/include/linux/mm.h b/include/linux/mm.h index 5568553a41f..6eba2cc016c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1482,6 +1482,18 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; } +/* Look up the first VMA which exactly matches the interval vm_start ... vm_end */ +static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, + unsigned long vm_start, unsigned long vm_end) +{ + struct vm_area_struct *vma = find_vma(mm, vm_start); + + if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) + vma = NULL; + + return vma; +} + #ifdef CONFIG_MMU pgprot_t vm_get_page_prot(unsigned long vm_flags); #else -- cgit v1.2.3-70-g09d2
From 0499680a42141d86417a8fbaa8c8db806bea1201 Mon Sep 17 00:00:00 2001 From: Vasiliy Kulikov Date: Tue, 10 Jan 2012 15:11:31 -0800 Subject: procfs: add hidepid= and gid= mount options Add support for mount options to restrict access to /proc/<pid>/ directories. The default backward-compatible "relaxed" behaviour is left untouched. The first mount option is called "hidepid" and its value defines how much info about processes we want to be available to non-owners: hidepid=0 (default) means the old behaviour - anybody may read all world-readable /proc/<pid>/* files. hidepid=1 means users may not access any /proc/<pid>/ directories but their own. Sensitive files like cmdline, sched*, status are now protected against other users. As permission checking is done in proc_pid_permission() and file permissions are left untouched, programs expecting specific file modes are not confused. hidepid=2 means hidepid=1 plus all /proc/<pid>/ directories will be invisible to other users. It doesn't hide the fact that a process exists (that can be learned by other means, e.g. by "kill -0 $PID"), but it hides the process's euid and egid. It complicates an intruder's task of gathering info about running processes: whether some daemon runs with elevated privileges, whether another user runs some sensitive program, whether other users run any program at all, etc.
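In C, enabling the stricter mode amounts to a remount of /proc with the new options (gid 1001 here is a placeholder for a local monitoring group):

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	if (mount("proc", "/proc", "proc", MS_REMOUNT,
		  "hidepid=2,gid=1001") != 0)
		perror("remount /proc");
	return 0;
}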
gid=XXX defines a group that will be able to gather all processes' info (as in hidepid=0 mode). This group should be used instead of putting a nonroot user in the sudoers file or similar. However, untrusted users (like daemons, etc.) which are not supposed to monitor the tasks in the whole system should not be added to the group. hidepid=1 or higher is designed to restrict access to procfs files, which might reveal some sensitive private information like precise keystroke timings: http://www.openwall.com/lists/oss-security/2011/11/05/3 hidepid=1/2 doesn't break monitoring userspace tools. ps, top, pgrep, and conky gracefully handle EPERM/ENOENT and behave as if the current user is the only user running processes. pstree shows the process subtree which contains the "pstree" process. Note: the patch doesn't deal with setuid/setgid issues of keeping preopened descriptors of procfs files (like https://lkml.org/lkml/2011/2/7/368). We rely on the fact that leaked information like the scheduling counters of setuid apps doesn't threaten anybody's privacy - only the user who started the setuid program may read the counters. Signed-off-by: Vasiliy Kulikov Cc: Alexey Dobriyan Cc: Al Viro Cc: Randy Dunlap Cc: "H. Peter Anvin" Cc: Greg KH Cc: Theodore Tso Cc: Alan Cox Cc: James Morris Cc: Oleg Nesterov Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 39 +++++++++++++++++++++ fs/proc/base.c | 69 +++++++++++++++++++++++++++++++++++++- fs/proc/inode.c | 8 +++++ fs/proc/root.c | 21 ++++++++++-- include/linux/pid_namespace.h | 2 ++ 5 files changed, 135 insertions(+), 4 deletions(-) (limited to 'include/linux')
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 0ec91f03422..12fee132fbe 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -41,6 +41,8 @@ Table of Contents 3.5 /proc/<pid>/mountinfo - Information about mounts 3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm + 4 Configuring procfs + 4.1 Mount options ------------------------------------------------------------------------------ Preface @@ -1542,3 +1544,40 @@ a task to set its own or one of its thread siblings comm value. The comm value is limited in size compared to the cmdline value, so writing anything longer than the kernel's TASK_COMM_LEN (currently 16 chars) will result in a truncated comm value. + + +------------------------------------------------------------------------------ +Configuring procfs +------------------------------------------------------------------------------ + +4.1 Mount options +--------------------- + +The following mount options are supported: + + hidepid= Set /proc/<pid>/ access mode. + gid= Set the group authorized to learn processes information. + +hidepid=0 means classic mode - everybody may access all /proc/<pid>/ directories +(default). + +hidepid=1 means users may not access any /proc/<pid>/ directories but their +own. Sensitive files like cmdline, sched*, status are now protected against +other users. This makes it impossible to learn whether any user runs +a specific program (given the program doesn't reveal itself by its behaviour). +As an additional bonus, as /proc/<pid>/cmdline is inaccessible to other users, +poorly written programs passing sensitive information via program arguments are +now protected against local eavesdroppers. + +hidepid=2 means hidepid=1 plus all /proc/<pid>/ will be fully invisible to other +users. It doesn't hide the fact that a process with a specific pid value exists +(it can be learned by other means, e.g. by "kill -0 $PID"), but it hides the +process's uid and gid, which could otherwise be learned by stat()'ing +/proc/<pid>/.
It greatly complicates an intruder's task of gathering +information about running processes: whether some daemon runs with elevated +privileges, whether another user runs some sensitive program, whether other users +run any program at all, etc. + +gid= defines a group authorized to learn processes information otherwise +prohibited by hidepid=. If you use some daemon like identd which needs to learn +information about processes, just add identd to this group.
diff --git a/fs/proc/base.c b/fs/proc/base.c index 4d755fed3ec..8173dfd89cb 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -631,6 +631,50 @@ int proc_setattr(struct dentry *dentry, struct iattr *attr) return 0; } +/* + * May the current process learn the task's sched/cmdline info (for hide_pid_min=1) + * or euid/egid (for hide_pid_min=2)? + */ +static bool has_pid_permissions(struct pid_namespace *pid, + struct task_struct *task, + int hide_pid_min) +{ + if (pid->hide_pid < hide_pid_min) + return true; + if (in_group_p(pid->pid_gid)) + return true; + return ptrace_may_access(task, PTRACE_MODE_READ); +} + + +static int proc_pid_permission(struct inode *inode, int mask) +{ + struct pid_namespace *pid = inode->i_sb->s_fs_info; + struct task_struct *task; + bool has_perms; + + task = get_proc_task(inode); + if (!task) + return -ESRCH; + has_perms = has_pid_permissions(pid, task, 1); + put_task_struct(task); + + if (!has_perms) { + if (pid->hide_pid == 2) { + /* + * Let's make getdents(), stat(), and open() + * consistent with each other. If a process + * may not stat() a file, it shouldn't be seen + * in procfs at all. + */ + return -ENOENT; + } + + return -EPERM; + } + return generic_permission(inode, mask); +} + + + static const struct inode_operations proc_def_inode_operations = { .setattr = proc_setattr, }; @@ -1615,6 +1659,7 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) struct inode *inode = dentry->d_inode; struct task_struct *task; const struct cred *cred; + struct pid_namespace *pid = dentry->d_sb->s_fs_info; generic_fillattr(inode, stat); @@ -1623,6 +1668,14 @@ int pid_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) stat->gid = 0; task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { + if (!has_pid_permissions(pid, task, 2)) { + rcu_read_unlock(); + /* + * This doesn't prevent learning whether PID exists, + * it only makes getattr() consistent with readdir().
+ */ + return -ENOENT; + } if ((inode->i_mode == (S_IFDIR|S_IRUGO|S_IXUGO)) || task_dumpable(task)) { cred = __task_cred(task); @@ -3119,6 +3172,7 @@ static const struct inode_operations proc_tgid_base_inode_operations = { .lookup = proc_tgid_base_lookup, .getattr = pid_getattr, .setattr = proc_setattr, + .permission = proc_pid_permission, }; static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) @@ -3322,6 +3376,12 @@ static int proc_pid_fill_cache(struct file *filp, void *dirent, filldir_t filldi proc_pid_instantiate, iter.task, NULL); } +static int fake_filldir(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned d_type) +{ + return 0; +} + /* for the /proc/ directory itself, after non-process stuff has been done */ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) { @@ -3329,6 +3389,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) struct task_struct *reaper; struct tgid_iter iter; struct pid_namespace *ns; + filldir_t __filldir; if (filp->f_pos >= PID_MAX_LIMIT + TGID_OFFSET) goto out_no_task; @@ -3350,8 +3411,13 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir) for (iter = next_tgid(ns, iter); iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { + if (has_pid_permissions(ns, iter.task, 2)) + __filldir = filldir; + else + __filldir = fake_filldir; + filp->f_pos = iter.tgid + TGID_OFFSET; - if (proc_pid_fill_cache(filp, dirent, filldir, iter) < 0) { + if (proc_pid_fill_cache(filp, dirent, __filldir, iter) < 0) { put_task_struct(iter.task); goto out; } @@ -3686,6 +3752,7 @@ static const struct inode_operations proc_task_inode_operations = { .lookup = proc_task_lookup, .getattr = proc_task_getattr, .setattr = proc_setattr, + .permission = proc_pid_permission, }; static const struct file_operations proc_task_operations = { diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 27c762f3487..84fd3235a59 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -106,6 +106,14 @@ void __init proc_init_inodecache(void) static int proc_show_options(struct seq_file *seq, struct dentry *root) { + struct super_block *sb = root->d_sb; + struct pid_namespace *pid = sb->s_fs_info; + + if (pid->pid_gid) + seq_printf(seq, ",gid=%lu", (unsigned long)pid->pid_gid); + if (pid->hide_pid != 0) + seq_printf(seq, ",hidepid=%u", pid->hide_pid); + return 0; } diff --git a/fs/proc/root.c b/fs/proc/root.c index 6a8ac1d361a..46a15d8a29c 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -38,10 +38,12 @@ static int proc_set_super(struct super_block *sb, void *data) } enum { - Opt_err, + Opt_gid, Opt_hidepid, Opt_err, }; static const match_table_t tokens = { + {Opt_hidepid, "hidepid=%u"}, + {Opt_gid, "gid=%u"}, {Opt_err, NULL}, }; @@ -49,8 +51,7 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) { char *p; substring_t args[MAX_OPT_ARGS]; - - pr_debug("proc: options = %s\n", options); + int option; if (!options) return 1; @@ -63,6 +64,20 @@ static int proc_parse_options(char *options, struct pid_namespace *pid) args[0].to = args[0].from = 0; token = match_token(p, tokens, args); switch (token) { + case Opt_gid: + if (match_int(&args[0], &option)) + return 0; + pid->pid_gid = option; + break; + case Opt_hidepid: + if (match_int(&args[0], &option)) + return 0; + if (option < 0 || option > 2) { + pr_err("proc: hidepid value must be between 0 and 2.\n"); + return 0; + } + pid->hide_pid = option; + break; default: pr_err("proc: unrecognized mount option \"%s\" " "or 
missing value\n", p);
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 38d10326246..e7cf6669ac3 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -30,6 +30,8 @@ struct pid_namespace { #ifdef CONFIG_BSD_PROCESS_ACCT struct bsd_acct_struct *bacct; #endif + gid_t pid_gid; + int hide_pid; }; extern struct pid_namespace init_pid_ns; -- cgit v1.2.3-70-g09d2
From b196be89cdc14a88cc637cdad845a75c5886c82d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 10 Jan 2012 15:11:35 -0800 Subject: workqueue: make alloc_workqueue() take printf fmt and args for name alloc_workqueue() currently expects the passed in @name pointer to remain accessible. This is inconvenient and a bit silly given that the whole wq is being dynamically allocated. This patch updates alloc_workqueue() and friends to take a printf format string instead of an opaque string, with matching varargs at the end. The name is allocated together with the wq and formatted. alloc_ordered_workqueue() is converted to a macro to unify varargs handling with alloc_workqueue(), and, while at it, a comment is added to alloc_workqueue(). None of the current in-kernel users pass in a string with '%' in the name, so this change shouldn't cause any problem. [akpm@linux-foundation.org: use __printf] Signed-off-by: Tejun Heo Suggested-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/workqueue.h | 47 +++++++++++++++++++++++++++++++---------------- kernel/workqueue.c | 32 ++++++++++++++++++++++---------- 2 files changed, 53 insertions(+), 26 deletions(-) (limited to 'include/linux')
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 0d556deb497..eb8b9f15f2e 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -297,32 +297,50 @@ extern struct workqueue_struct *system_unbound_wq; extern struct workqueue_struct *system_freezable_wq; extern struct workqueue_struct * -__alloc_workqueue_key(const char *name, unsigned int flags, int max_active, - struct lock_class_key *key, const char *lock_name); +__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, + struct lock_class_key *key, const char *lock_name, ...) __printf(1, 6); +/** + * alloc_workqueue - allocate a workqueue + * @fmt: printf format for the name of the workqueue + * @flags: WQ_* flags + * @max_active: max in-flight work items, 0 for default + * @args: args for @fmt + * + * Allocate a workqueue with the specified parameters. For detailed + * information on WQ_* flags, please refer to Documentation/workqueue.txt. + * + * The __lock_name macro dance is to guarantee that a single lock_class_key + * doesn't end up with different names, which isn't allowed by lockdep. + * + * RETURNS: + * Pointer to the allocated workqueue on success, %NULL on failure. + */ #ifdef CONFIG_LOCKDEP -#define alloc_workqueue(name, flags, max_active) \ +#define alloc_workqueue(fmt, flags, max_active, args...)
\ ({ \ static struct lock_class_key __key; \ const char *__lock_name; \ \ - if (__builtin_constant_p(name)) \ - __lock_name = (name); \ + if (__builtin_constant_p(fmt)) \ + __lock_name = (fmt); \ else \ - __lock_name = #name; \ + __lock_name = #fmt; \ \ - __alloc_workqueue_key((name), (flags), (max_active), \ - &__key, __lock_name); \ + __alloc_workqueue_key((fmt), (flags), (max_active), \ + &__key, __lock_name, ##args); \ }) #else -#define alloc_workqueue(name, flags, max_active) \ - __alloc_workqueue_key((name), (flags), (max_active), NULL, NULL) +#define alloc_workqueue(fmt, flags, max_active, args...) \ + __alloc_workqueue_key((fmt), (flags), (max_active), \ + NULL, NULL, ##args) #endif /** * alloc_ordered_workqueue - allocate an ordered workqueue - * @name: name of the workqueue + * @fmt: printf format for the name of the workqueue * @flags: WQ_* flags (only WQ_FREEZABLE and WQ_MEM_RECLAIM are meaningful) + * @args: args for @fmt * * Allocate an ordered workqueue. An ordered workqueue executes at * most one work item at any given time in the queued order. They are @@ -331,11 +349,8 @@ __alloc_workqueue_key(const char *name, unsigned int flags, int max_active, * RETURNS: * Pointer to the allocated workqueue on success, %NULL on failure. */ -static inline struct workqueue_struct * -alloc_ordered_workqueue(const char *name, unsigned int flags) -{ - return alloc_workqueue(name, WQ_UNBOUND | flags, 1); -} +#define alloc_ordered_workqueue(fmt, flags, args...) \ + alloc_workqueue(fmt, WQ_UNBOUND | (flags), 1, ##args) #define create_workqueue(name) \ alloc_workqueue((name), WQ_MEM_RECLAIM, 1) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 42fa9ad0a81..bec7b5b53e0 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -242,10 +242,10 @@ struct workqueue_struct { int nr_drainers; /* W: drain in progress */ int saved_max_active; /* W: saved cwq max_active */ - const char *name; /* I: workqueue name */ #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif + char name[]; /* I: workqueue name */ }; struct workqueue_struct *system_wq __read_mostly; @@ -2954,14 +2954,29 @@ static int wq_clamp_max_active(int max_active, unsigned int flags, return clamp_val(max_active, 1, lim); } -struct workqueue_struct *__alloc_workqueue_key(const char *name, +struct workqueue_struct *__alloc_workqueue_key(const char *fmt, unsigned int flags, int max_active, struct lock_class_key *key, - const char *lock_name) + const char *lock_name, ...) 
{ + va_list args, args1; struct workqueue_struct *wq; unsigned int cpu; + size_t namelen; + + /* determine namelen, allocate wq and format name */ + va_start(args, lock_name); + va_copy(args1, args); + namelen = vsnprintf(NULL, 0, fmt, args) + 1; + + wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); + if (!wq) + goto err; + + vsnprintf(wq->name, namelen, fmt, args1); + va_end(args); + va_end(args1); /* * Workqueues which may be used during memory reclaim should @@ -2978,12 +2993,9 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, flags |= WQ_HIGHPRI; max_active = max_active ?: WQ_DFL_ACTIVE; - max_active = wq_clamp_max_active(max_active, flags, name); - - wq = kzalloc(sizeof(*wq), GFP_KERNEL); - if (!wq) - goto err; + max_active = wq_clamp_max_active(max_active, flags, wq->name); + /* init wq */ wq->flags = flags; wq->saved_max_active = max_active; mutex_init(&wq->flush_mutex); @@ -2991,7 +3003,6 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); - wq->name = name; lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); @@ -3020,7 +3031,8 @@ struct workqueue_struct *__alloc_workqueue_key(const char *name, if (!rescuer) goto err; - rescuer->task = kthread_create(rescuer_thread, wq, "%s", name); + rescuer->task = kthread_create(rescuer_thread, wq, "%s", + wq->name); if (IS_ERR(rescuer->task)) goto err; -- cgit v1.2.3-70-g09d2
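After this change a caller can fold the name formatting into the allocation itself; a usage sketch (mydrv, md and the "mydrv/%s" name are hypothetical):

static int mydrv_init_queue(struct mydrv *md)
{
	/* one rescuer-backed, single-threaded queue per device instance */
	md->wq = alloc_workqueue("mydrv/%s", WQ_MEM_RECLAIM, 1,
				 dev_name(md->dev));
	if (!md->wq)
		return -ENOMEM;
	return 0;
}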