From 05ba1f0823004e947748523782e9c2f07f3bff0d Mon Sep 17 00:00:00 2001 From: Anatol Pomozov Date: Sun, 22 Apr 2012 18:45:24 -0700 Subject: fuse: add FALLOCATE operation fallocate filesystem operation preallocates media space for the given file. If fallocate returns success then any subsequent write to the given range never fails with 'not enough space' error. Signed-off-by: Anatol Pomozov Signed-off-by: Miklos Szeredi --- include/linux/fuse.h | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/fuse.h b/include/linux/fuse.h index 8f2ab8fef92..9303348965f 100644 --- a/include/linux/fuse.h +++ b/include/linux/fuse.h @@ -54,6 +54,9 @@ * 7.18 * - add FUSE_IOCTL_DIR flag * - add FUSE_NOTIFY_DELETE + * + * 7.19 + * - add FUSE_FALLOCATE */ #ifndef _LINUX_FUSE_H @@ -85,7 +88,7 @@ #define FUSE_KERNEL_VERSION 7 /** Minor version number of this interface */ -#define FUSE_KERNEL_MINOR_VERSION 18 +#define FUSE_KERNEL_MINOR_VERSION 19 /** The node ID of the root inode */ #define FUSE_ROOT_ID 1 @@ -278,6 +281,7 @@ enum fuse_opcode { FUSE_POLL = 40, FUSE_NOTIFY_REPLY = 41, FUSE_BATCH_FORGET = 42, + FUSE_FALLOCATE = 43, /* CUSE specific operations */ CUSE_INIT = 4096, @@ -571,6 +575,14 @@ struct fuse_notify_poll_wakeup_out { __u64 kh; }; +struct fuse_fallocate_in { + __u64 fh; + __u64 offset; + __u64 length; + __u32 mode; + __u32 padding; +}; + struct fuse_in_header { __u32 len; __u32 opcode; -- cgit v1.2.3-70-g09d2 From c3ba9698152b17fdc2c7cd0f7cbeb571e3367e9d Mon Sep 17 00:00:00 2001 From: Dan Magenheimer Date: Mon, 9 Apr 2012 17:06:54 -0600 Subject: mm: frontswap: add frontswap header file Frontswap is the alter ego of cleancache, the "yang" to cleancache's "yin"... and more precisely frontswap is the provider of anonymous pages to transcendent memory to nicely complement cleancache's providing of clean pagecache pages to transcendent memory. For optimal use of transcendent memory, both are necessary... because a kernel under memory pressure first reclaims clean pagecache pages and, when under more memory pressure, starts swapping anonymous pages. Frontswap and cleancache (which was merged at 3.0) are the "frontends" and the only necessary changes to the core kernel for transcendent memory; all other supporting code -- the "backends" -- is implemented as drivers. See the LWN.net article "Transcendent memory in a nutshell" for a detailed overview of frontswap and related kernel parts: https://lwn.net/Articles/454795/ Frontswap code was first posted publicly in January 2009 and on LKML in May 2009, and has remained functionally stable for nearly three years now. It is barely invasive, touching only the swap subsystem and adds less than 100 lines of code to existing swap subsystem code files. It has improved syntactically substantially between V1 and this posting of V14, thanks to the review of a few kernel developers, and has adapted easily to at least one major swap subsystem change. As of 3.4, there are three in-tree users of frontswap patiently waiting for this patchset and for CONFIG_FRONTSWAP to be enabled: zcache (staging driver merged at 2.6.39), Xen tmem (merged at 3.0 and 3.1) and RAMster (staging driver merged at 3.4). In addition, a RFC has been posted for a KVM backend. The frontswap patchset has been in linux-next since next-110603. Earlier versions of frontswap already ship in the Oracle Unbreakable Enterprise Kernel and SuSE SLES. This patch, 1of4, provides the header file for the core code for frontswap that interfaces between the hooks in the swap subsystem and a frontswap backend via frontswap_ops. --- New file added: include/linux/frontswap.h [v14: add support for writethrough, per suggestion by aarcange@redhat.com] [v14: rebase to 3.4-rc2] [v11: konrad.wilk@oracle.com: squashed s/flush/invalidate/ in] [v10: no change] [v9: akpm@linux-foundation.org: change "flush" to "invalidate", part 1] [v8: rebase to 3.0-rc4] [v7: rebase to 3.0-rc3] [v7: JBeulich@novell.com: new static inlines resolve to no-ops if not config'd] [v7: JBeulich@novell.com: avoid redundant shifts/divides for *_bit lib calls] [v6: rebase to 3.1-rc1] [v5: no change from v4] [v4: rebase to 2.6.39] Signed-off-by: Dan Magenheimer Reviewed-by: Andrew Morton Acked-by: Jan Beulich Acked-by: Seth Jennings Cc: Jeremy Fitzhardinge Cc: Hugh Dickins Cc: Johannes Weiner Cc: Nitin Gupta Cc: Matthew Wilcox Cc: Chris Mason Cc: Rik Riel [v15: int/bool on some functions] Signed-off-by: Konrad Wilk --- include/linux/frontswap.h | 127 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 127 insertions(+) create mode 100644 include/linux/frontswap.h (limited to 'include') diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h new file mode 100644 index 00000000000..68ff7af5c5f --- /dev/null +++ b/include/linux/frontswap.h @@ -0,0 +1,127 @@ +#ifndef _LINUX_FRONTSWAP_H +#define _LINUX_FRONTSWAP_H + +#include +#include +#include + +struct frontswap_ops { + void (*init)(unsigned); + int (*put_page)(unsigned, pgoff_t, struct page *); + int (*get_page)(unsigned, pgoff_t, struct page *); + void (*invalidate_page)(unsigned, pgoff_t); + void (*invalidate_area)(unsigned); +}; + +extern bool frontswap_enabled; +extern struct frontswap_ops + frontswap_register_ops(struct frontswap_ops *ops); +extern void frontswap_shrink(unsigned long); +extern unsigned long frontswap_curr_pages(void); +extern void frontswap_writethrough(bool); + +extern void __frontswap_init(unsigned type); +extern int __frontswap_put_page(struct page *page); +extern int __frontswap_get_page(struct page *page); +extern void __frontswap_invalidate_page(unsigned, pgoff_t); +extern void __frontswap_invalidate_area(unsigned); + +#ifdef CONFIG_FRONTSWAP + +static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) +{ + bool ret = false; + + if (frontswap_enabled && sis->frontswap_map) + ret = test_bit(offset, sis->frontswap_map); + return ret; +} + +static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset) +{ + if (frontswap_enabled && sis->frontswap_map) + set_bit(offset, sis->frontswap_map); +} + +static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset) +{ + if (frontswap_enabled && sis->frontswap_map) + clear_bit(offset, sis->frontswap_map); +} + +static inline void frontswap_map_set(struct swap_info_struct *p, + unsigned long *map) +{ + p->frontswap_map = map; +} + +static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) +{ + return p->frontswap_map; +} +#else +/* all inline routines become no-ops and all externs are ignored */ + +#define frontswap_enabled (0) + +static inline bool frontswap_test(struct swap_info_struct *sis, pgoff_t offset) +{ + return false; +} + +static inline void frontswap_set(struct swap_info_struct *sis, pgoff_t offset) +{ +} + +static inline void frontswap_clear(struct swap_info_struct *sis, pgoff_t offset) +{ +} + +static inline void frontswap_map_set(struct swap_info_struct *p, + unsigned long *map) +{ +} + +static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) +{ + return NULL; +} +#endif + +static inline int frontswap_put_page(struct page *page) +{ + int ret = -1; + + if (frontswap_enabled) + ret = __frontswap_put_page(page); + return ret; +} + +static inline int frontswap_get_page(struct page *page) +{ + int ret = -1; + + if (frontswap_enabled) + ret = __frontswap_get_page(page); + return ret; +} + +static inline void frontswap_invalidate_page(unsigned type, pgoff_t offset) +{ + if (frontswap_enabled) + __frontswap_invalidate_page(type, offset); +} + +static inline void frontswap_invalidate_area(unsigned type) +{ + if (frontswap_enabled) + __frontswap_invalidate_area(type); +} + +static inline void frontswap_init(unsigned type) +{ + if (frontswap_enabled) + __frontswap_init(type); +} + +#endif /* _LINUX_FRONTSWAP_H */ -- cgit v1.2.3-70-g09d2 From 38b5faf4b178d5279b1fca5d7dadc68881342660 Mon Sep 17 00:00:00 2001 From: Dan Magenheimer Date: Mon, 9 Apr 2012 17:08:06 -0600 Subject: mm: frontswap: core swap subsystem hooks and headers This patch, 2of4, contains the changes to the core swap subsystem. This includes: (1) makes available core swap data structures (swap_lock, swap_list and swap_info) that are needed by frontswap.c but we don't need to expose them to the dozens of files that include swap.h so we create a new swapfile.h just to extern-ify these and modify their declarations to non-static (2) adds frontswap-related elements to swap_info_struct. Frontswap_map points to vzalloc'ed one-bit-per-swap-page metadata that indicates whether the swap page is in frontswap or in the device and frontswap_pages counts how many pages are in frontswap. (3) adds hooks in the swap subsystem and extends try_to_unuse so that frontswap_shrink can do a "partial swapoff". Note that a failed frontswap_map allocation is safe... failure is noted by lack of "FS" in the subsequent printk. --- [v14: rebase to 3.4-rc2] [v10: no change] [v9: akpm@linux-foundation.org: mark some statics __read_mostly] [v9: akpm@linux-foundation.org: add clarifying comments] [v9: akpm@linux-foundation.org: no need to loop repeating try_to_unuse] [v9: error27@gmail.com: remove superfluous check for NULL] [v8: rebase to 3.0-rc4] [v8: kamezawa.hiroyu@jp.fujitsu.com: change counter to atomic_t to avoid races] [v8: kamezawa.hiroyu@jp.fujitsu.com: comment to clarify informational counters] [v7: rebase to 3.0-rc3] [v7: JBeulich@novell.com: add new swap struct elements only if config'd] [v6: rebase to 3.0-rc1] [v6: lliubbo@gmail.com: fix null pointer deref if vzalloc fails] [v6: konrad.wilk@oracl.com: various checks and code clarifications/comments] [v5: no change from v4] [v4: rebase to 2.6.39] Signed-off-by: Dan Magenheimer Reviewed-by: Kamezawa Hiroyuki Acked-by: Jan Beulich Acked-by: Seth Jennings Cc: Jeremy Fitzhardinge Cc: Hugh Dickins Cc: Johannes Weiner Cc: Nitin Gupta Cc: Matthew Wilcox Cc: Chris Mason Cc: Rik Riel Cc: Andrew Morton [v11: Rebased, fixed mm/swapfile.c context change] Signed-off-by: Konrad Rzeszutek Wilk --- include/linux/swap.h | 4 ++++ include/linux/swapfile.h | 13 ++++++++++++ mm/page_io.c | 12 +++++++++++ mm/swapfile.c | 54 ++++++++++++++++++++++++++++++++++++------------ 4 files changed, 70 insertions(+), 13 deletions(-) create mode 100644 include/linux/swapfile.h (limited to 'include') diff --git a/include/linux/swap.h b/include/linux/swap.h index b1fd5c7925f..50a55e2d58e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -197,6 +197,10 @@ struct swap_info_struct { struct block_device *bdev; /* swap device or bdev of swap file */ struct file *swap_file; /* seldom referenced */ unsigned int old_block_size; /* seldom referenced */ +#ifdef CONFIG_FRONTSWAP + unsigned long *frontswap_map; /* frontswap in-use, one bit per page */ + atomic_t frontswap_pages; /* frontswap pages in-use counter */ +#endif }; struct swap_list_t { diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h new file mode 100644 index 00000000000..e282624e8c1 --- /dev/null +++ b/include/linux/swapfile.h @@ -0,0 +1,13 @@ +#ifndef _LINUX_SWAPFILE_H +#define _LINUX_SWAPFILE_H + +/* + * these were static in swapfile.c but frontswap.c needs them and we don't + * want to expose them to the dozens of source files that include swap.h + */ +extern spinlock_t swap_lock; +extern struct swap_list_t swap_list; +extern struct swap_info_struct *swap_info[]; +extern int try_to_unuse(unsigned int, bool, unsigned long); + +#endif /* _LINUX_SWAPFILE_H */ diff --git a/mm/page_io.c b/mm/page_io.c index dc76b4d0611..651a9125931 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -18,6 +18,7 @@ #include #include #include +#include #include static struct bio *get_swap_bio(gfp_t gfp_flags, @@ -98,6 +99,12 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) unlock_page(page); goto out; } + if (frontswap_put_page(page) == 0) { + set_page_writeback(page); + unlock_page(page); + end_page_writeback(page); + goto out; + } bio = get_swap_bio(GFP_NOIO, page, end_swap_bio_write); if (bio == NULL) { set_page_dirty(page); @@ -122,6 +129,11 @@ int swap_readpage(struct page *page) VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageUptodate(page)); + if (frontswap_get_page(page) == 0) { + SetPageUptodate(page); + unlock_page(page); + goto out; + } bio = get_swap_bio(GFP_KERNEL, page, end_swap_bio_read); if (bio == NULL) { unlock_page(page); diff --git a/mm/swapfile.c b/mm/swapfile.c index fafc26d1b1d..9c7be87175c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -31,6 +31,8 @@ #include #include #include +#include +#include #include #include @@ -42,7 +44,7 @@ static bool swap_count_continued(struct swap_info_struct *, pgoff_t, static void free_swap_count_continuations(struct swap_info_struct *); static sector_t map_swap_entry(swp_entry_t, struct block_device**); -static DEFINE_SPINLOCK(swap_lock); +DEFINE_SPINLOCK(swap_lock); static unsigned int nr_swapfiles; long nr_swap_pages; long total_swap_pages; @@ -53,9 +55,9 @@ static const char Unused_file[] = "Unused swap file entry "; static const char Bad_offset[] = "Bad swap offset entry "; static const char Unused_offset[] = "Unused swap offset entry "; -static struct swap_list_t swap_list = {-1, -1}; +struct swap_list_t swap_list = {-1, -1}; -static struct swap_info_struct *swap_info[MAX_SWAPFILES]; +struct swap_info_struct *swap_info[MAX_SWAPFILES]; static DEFINE_MUTEX(swapon_mutex); @@ -556,6 +558,7 @@ static unsigned char swap_entry_free(struct swap_info_struct *p, swap_list.next = p->type; nr_swap_pages++; p->inuse_pages--; + frontswap_invalidate_page(p->type, offset); if ((p->flags & SWP_BLKDEV) && disk->fops->swap_slot_free_notify) disk->fops->swap_slot_free_notify(p->bdev, offset); @@ -1016,11 +1019,12 @@ static int unuse_mm(struct mm_struct *mm, } /* - * Scan swap_map from current position to next entry still in use. + * Scan swap_map (or frontswap_map if frontswap parameter is true) + * from current position to next entry still in use. * Recycle to start on reaching the end, returning 0 when empty. */ static unsigned int find_next_to_unuse(struct swap_info_struct *si, - unsigned int prev) + unsigned int prev, bool frontswap) { unsigned int max = si->max; unsigned int i = prev; @@ -1046,6 +1050,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, prev = 0; i = 1; } + if (frontswap) { + if (frontswap_test(si, i)) + break; + else + continue; + } count = si->swap_map[i]; if (count && swap_count(count) != SWAP_MAP_BAD) break; @@ -1057,8 +1067,12 @@ static unsigned int find_next_to_unuse(struct swap_info_struct *si, * We completely avoid races by reading each swap page in advance, * and then search for the process using it. All the necessary * page table adjustments can then be made atomically. + * + * if the boolean frontswap is true, only unuse pages_to_unuse pages; + * pages_to_unuse==0 means all pages; ignored if frontswap is false */ -static int try_to_unuse(unsigned int type) +int try_to_unuse(unsigned int type, bool frontswap, + unsigned long pages_to_unuse) { struct swap_info_struct *si = swap_info[type]; struct mm_struct *start_mm; @@ -1091,7 +1105,7 @@ static int try_to_unuse(unsigned int type) * one pass through swap_map is enough, but not necessarily: * there are races when an instance of an entry might be missed. */ - while ((i = find_next_to_unuse(si, i)) != 0) { + while ((i = find_next_to_unuse(si, i, frontswap)) != 0) { if (signal_pending(current)) { retval = -EINTR; break; @@ -1258,6 +1272,10 @@ static int try_to_unuse(unsigned int type) * interactive performance. */ cond_resched(); + if (frontswap && pages_to_unuse > 0) { + if (!--pages_to_unuse) + break; + } } mmput(start_mm); @@ -1517,7 +1535,8 @@ bad_bmap: } static void enable_swap_info(struct swap_info_struct *p, int prio, - unsigned char *swap_map) + unsigned char *swap_map, + unsigned long *frontswap_map) { int i, prev; @@ -1527,6 +1546,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, else p->prio = --least_priority; p->swap_map = swap_map; + frontswap_map_set(p, frontswap_map); p->flags |= SWP_WRITEOK; nr_swap_pages += p->pages; total_swap_pages += p->pages; @@ -1543,6 +1563,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, swap_list.head = swap_list.next = p->type; else swap_info[prev]->next = p->type; + frontswap_init(p->type); spin_unlock(&swap_lock); } @@ -1616,7 +1637,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); - err = try_to_unuse(type); + err = try_to_unuse(type, false, 0); /* force all pages to be unused */ compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); if (err) { @@ -1627,7 +1648,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) * sys_swapoff for this swap_info_struct at this point. */ /* re-insert swap space back into swap_list */ - enable_swap_info(p, p->prio, p->swap_map); + enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); goto out_dput; } @@ -1653,9 +1674,11 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) swap_map = p->swap_map; p->swap_map = NULL; p->flags = 0; + frontswap_invalidate_area(type); spin_unlock(&swap_lock); mutex_unlock(&swapon_mutex); vfree(swap_map); + vfree(frontswap_map_get(p)); /* Destroy swap account informatin */ swap_cgroup_swapoff(type); @@ -2019,6 +2042,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) sector_t span; unsigned long maxpages; unsigned char *swap_map = NULL; + unsigned long *frontswap_map = NULL; struct page *page = NULL; struct inode *inode = NULL; @@ -2102,6 +2126,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = nr_extents; goto bad_swap; } + /* frontswap enabled? set up bit-per-page map for frontswap */ + if (frontswap_enabled) + frontswap_map = vzalloc(maxpages / sizeof(long)); if (p->bdev) { if (blk_queue_nonrot(bdev_get_queue(p->bdev))) { @@ -2117,14 +2144,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (swap_flags & SWAP_FLAG_PREFER) prio = (swap_flags & SWAP_FLAG_PRIO_MASK) >> SWAP_FLAG_PRIO_SHIFT; - enable_swap_info(p, prio, swap_map); + enable_swap_info(p, prio, swap_map, frontswap_map); printk(KERN_INFO "Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk %s%s\n", + "Priority:%d extents:%d across:%lluk %s%s%s\n", p->pages<<(PAGE_SHIFT-10), name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", - (p->flags & SWP_DISCARDABLE) ? "D" : ""); + (p->flags & SWP_DISCARDABLE) ? "D" : "", + (frontswap_map) ? "FS" : ""); mutex_unlock(&swapon_mutex); atomic_inc(&proc_poll_event); -- cgit v1.2.3-70-g09d2 From 165c8aed5bbc6bdddbccae0ba9db451732558ff9 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Tue, 15 May 2012 11:32:15 -0400 Subject: frontswap: s/put_page/store/g s/get_page/load Sounds so much more natural. Suggested-by: Andrea Arcangeli Signed-off-by: Konrad Rzeszutek Wilk --- Documentation/vm/frontswap.txt | 50 +++++++++++++++---------------- drivers/staging/ramster/zcache-main.c | 8 ++--- drivers/staging/zcache/zcache-main.c | 10 +++---- drivers/xen/tmem.c | 8 ++--- include/linux/frontswap.h | 16 +++++----- mm/frontswap.c | 56 +++++++++++++++++------------------ mm/page_io.c | 4 +-- 7 files changed, 76 insertions(+), 76 deletions(-) (limited to 'include') diff --git a/Documentation/vm/frontswap.txt b/Documentation/vm/frontswap.txt index a9f731af0fa..37067cf455f 100644 --- a/Documentation/vm/frontswap.txt +++ b/Documentation/vm/frontswap.txt @@ -21,21 +21,21 @@ frontswap_ops funcs appropriately and the functions it provides must conform to certain policies as follows: An "init" prepares the device to receive frontswap pages associated -with the specified swap device number (aka "type"). A "put_page" will +with the specified swap device number (aka "type"). A "store" will copy the page to transcendent memory and associate it with the type and -offset associated with the page. A "get_page" will copy the page, if found, +offset associated with the page. A "load" will copy the page, if found, from transcendent memory into kernel memory, but will NOT remove the page from from transcendent memory. An "invalidate_page" will remove the page from transcendent memory and an "invalidate_area" will remove ALL pages associated with the swap type (e.g., like swapoff) and notify the "device" -to refuse further puts with that swap type. +to refuse further stores with that swap type. -Once a page is successfully put, a matching get on the page will normally +Once a page is successfully stored, a matching load on the page will normally succeed. So when the kernel finds itself in a situation where it needs -to swap out a page, it first attempts to use frontswap. If the put returns +to swap out a page, it first attempts to use frontswap. If the store returns success, the data has been successfully saved to transcendent memory and a disk write and, if the data is later read back, a disk read are avoided. -If a put returns failure, transcendent memory has rejected the data, and the +If a store returns failure, transcendent memory has rejected the data, and the page can be written to swap as usual. If a backend chooses, frontswap can be configured as a "writethrough @@ -44,18 +44,18 @@ in swap device writes is lost (and also a non-trivial performance advantage) in order to allow the backend to arbitrarily "reclaim" space used to store frontswap pages to more completely manage its memory usage. -Note that if a page is put and the page already exists in transcendent memory -(a "duplicate" put), either the put succeeds and the data is overwritten, -or the put fails AND the page is invalidated. This ensures stale data may +Note that if a page is stored and the page already exists in transcendent memory +(a "duplicate" store), either the store succeeds and the data is overwritten, +or the store fails AND the page is invalidated. This ensures stale data may never be obtained from frontswap. If properly configured, monitoring of frontswap is done via debugfs in the /sys/kernel/debug/frontswap directory. The effectiveness of frontswap can be measured (across all swap devices) with: -failed_puts - how many put attempts have failed -gets - how many gets were attempted (all should succeed) -succ_puts - how many put attempts have succeeded +failed_stores - how many store attempts have failed +loads - how many loads were attempted (all should succeed) +succ_stores - how many store attempts have succeeded invalidates - how many invalidates were attempted A backend implementation may provide additional metrics. @@ -125,7 +125,7 @@ nothingness and the only overhead is a few extra bytes per swapon'ed swap device. If CONFIG_FRONTSWAP is enabled but no frontswap "backend" registers, there is one extra global variable compared to zero for every swap page read or written. If CONFIG_FRONTSWAP is enabled -AND a frontswap backend registers AND the backend fails every "put" +AND a frontswap backend registers AND the backend fails every "store" request (i.e. provides no memory despite claiming it might), CPU overhead is still negligible -- and since every frontswap fail precedes a swap page write-to-disk, the system is highly likely @@ -159,13 +159,13 @@ entirely dynamic and random. Whenever a swap-device is swapon'd frontswap_init() is called, passing the swap device number (aka "type") as a parameter. -This notifies frontswap to expect attempts to "put" swap pages +This notifies frontswap to expect attempts to "store" swap pages associated with that number. Whenever the swap subsystem is readying a page to write to a swap -device (c.f swap_writepage()), frontswap_put_page is called. Frontswap +device (c.f swap_writepage()), frontswap_store is called. Frontswap consults with the frontswap backend and if the backend says it does NOT -have room, frontswap_put_page returns -1 and the kernel swaps the page +have room, frontswap_store returns -1 and the kernel swaps the page to the swap device as normal. Note that the response from the frontswap backend is unpredictable to the kernel; it may choose to never accept a page, it could accept every ninth page, or it might accept every @@ -177,7 +177,7 @@ corresponding to the page offset on the swap device to which it would otherwise have written the data. When the swap subsystem needs to swap-in a page (swap_readpage()), -it first calls frontswap_get_page() which checks the frontswap_map to +it first calls frontswap_load() which checks the frontswap_map to see if the page was earlier accepted by the frontswap backend. If it was, the page of data is filled from the frontswap backend and the swap-in is complete. If not, the normal swap-in code is @@ -185,7 +185,7 @@ executed to obtain the page of data from the real swap device. So every time the frontswap backend accepts a page, a swap device read and (potentially) a swap device write are replaced by a "frontswap backend -put" and (possibly) a "frontswap backend get", which are presumably much +store" and (possibly) a "frontswap backend loads", which are presumably much faster. 4) Can't frontswap be configured as a "special" swap device that is @@ -215,8 +215,8 @@ that are inappropriate for a RAM-oriented device including delaying the write of some pages for a significant amount of time. Synchrony is required to ensure the dynamicity of the backend and to avoid thorny race conditions that would unnecessarily and greatly complicate frontswap -and/or the block I/O subsystem. That said, only the initial "put" -and "get" operations need be synchronous. A separate asynchronous thread +and/or the block I/O subsystem. That said, only the initial "store" +and "load" operations need be synchronous. A separate asynchronous thread is free to manipulate the pages stored by frontswap. For example, the "remotification" thread in RAMster uses standard asynchronous kernel sockets to move compressed frontswap pages to a remote machine. @@ -229,7 +229,7 @@ choose to accept pages only until host-swapping might be imminent, then force guests to do their own swapping. There is a downside to the transcendent memory specifications for -frontswap: Since any "put" might fail, there must always be a real +frontswap: Since any "store" might fail, there must always be a real slot on a real swap device to swap the page. Thus frontswap must be implemented as a "shadow" to every swapon'd device with the potential capability of holding every page that the swap device might have held @@ -240,16 +240,16 @@ installation, frontswap is useless. Swapless portable devices can still use frontswap but a backend for such devices must configure some kind of "ghost" swap device and ensure that it is never used. -5) Why this weird definition about "duplicate puts"? If a page - has been previously successfully put, can't it always be +5) Why this weird definition about "duplicate stores"? If a page + has been previously successfully stored, can't it always be successfully overwritten? Nearly always it can, but no, sometimes it cannot. Consider an example where data is compressed and the original 4K page has been compressed to 1K. Now an attempt is made to overwrite the page with data that is non-compressible and so would take the entire 4K. But the backend -has no more space. In this case, the put must be rejected. Whenever -frontswap rejects a put that would overwrite, it also must invalidate +has no more space. In this case, the store must be rejected. Whenever +frontswap rejects a store that would overwrite, it also must invalidate the old data and ensure that it is no longer accessible. Since the swap subsystem then writes the new data to the read swap device, this is the correct course of action to ensure coherency. diff --git a/drivers/staging/ramster/zcache-main.c b/drivers/staging/ramster/zcache-main.c index 68b2e053a0e..2627b3de0d2 100644 --- a/drivers/staging/ramster/zcache-main.c +++ b/drivers/staging/ramster/zcache-main.c @@ -3002,7 +3002,7 @@ static inline struct tmem_oid oswiz(unsigned type, u32 ind) return oid; } -static int zcache_frontswap_put_page(unsigned type, pgoff_t offset, +static int zcache_frontswap_store(unsigned type, pgoff_t offset, struct page *page) { u64 ind64 = (u64)offset; @@ -3025,7 +3025,7 @@ static int zcache_frontswap_put_page(unsigned type, pgoff_t offset, /* returns 0 if the page was successfully gotten from frontswap, -1 if * was not present (should never happen!) */ -static int zcache_frontswap_get_page(unsigned type, pgoff_t offset, +static int zcache_frontswap_load(unsigned type, pgoff_t offset, struct page *page) { u64 ind64 = (u64)offset; @@ -3080,8 +3080,8 @@ static void zcache_frontswap_init(unsigned ignored) } static struct frontswap_ops zcache_frontswap_ops = { - .put_page = zcache_frontswap_put_page, - .get_page = zcache_frontswap_get_page, + .store = zcache_frontswap_store, + .load = zcache_frontswap_load, .invalidate_page = zcache_frontswap_flush_page, .invalidate_area = zcache_frontswap_flush_area, .init = zcache_frontswap_init diff --git a/drivers/staging/zcache/zcache-main.c b/drivers/staging/zcache/zcache-main.c index 2734dacacba..784c796b984 100644 --- a/drivers/staging/zcache/zcache-main.c +++ b/drivers/staging/zcache/zcache-main.c @@ -1835,7 +1835,7 @@ static int zcache_frontswap_poolid = -1; * Swizzling increases objects per swaptype, increasing tmem concurrency * for heavy swaploads. Later, larger nr_cpus -> larger SWIZ_BITS * Setting SWIZ_BITS to 27 basically reconstructs the swap entry from - * frontswap_get_page(), but has side-effects. Hence using 8. + * frontswap_load(), but has side-effects. Hence using 8. */ #define SWIZ_BITS 8 #define SWIZ_MASK ((1 << SWIZ_BITS) - 1) @@ -1849,7 +1849,7 @@ static inline struct tmem_oid oswiz(unsigned type, u32 ind) return oid; } -static int zcache_frontswap_put_page(unsigned type, pgoff_t offset, +static int zcache_frontswap_store(unsigned type, pgoff_t offset, struct page *page) { u64 ind64 = (u64)offset; @@ -1870,7 +1870,7 @@ static int zcache_frontswap_put_page(unsigned type, pgoff_t offset, /* returns 0 if the page was successfully gotten from frontswap, -1 if * was not present (should never happen!) */ -static int zcache_frontswap_get_page(unsigned type, pgoff_t offset, +static int zcache_frontswap_load(unsigned type, pgoff_t offset, struct page *page) { u64 ind64 = (u64)offset; @@ -1919,8 +1919,8 @@ static void zcache_frontswap_init(unsigned ignored) } static struct frontswap_ops zcache_frontswap_ops = { - .put_page = zcache_frontswap_put_page, - .get_page = zcache_frontswap_get_page, + .store = zcache_frontswap_store, + .load = zcache_frontswap_load, .invalidate_page = zcache_frontswap_flush_page, .invalidate_area = zcache_frontswap_flush_area, .init = zcache_frontswap_init diff --git a/drivers/xen/tmem.c b/drivers/xen/tmem.c index dcb79521e6c..89f264c6742 100644 --- a/drivers/xen/tmem.c +++ b/drivers/xen/tmem.c @@ -269,7 +269,7 @@ static inline struct tmem_oid oswiz(unsigned type, u32 ind) } /* returns 0 if the page was successfully put into frontswap, -1 if not */ -static int tmem_frontswap_put_page(unsigned type, pgoff_t offset, +static int tmem_frontswap_store(unsigned type, pgoff_t offset, struct page *page) { u64 ind64 = (u64)offset; @@ -295,7 +295,7 @@ static int tmem_frontswap_put_page(unsigned type, pgoff_t offset, * returns 0 if the page was successfully gotten from frontswap, -1 if * was not present (should never happen!) */ -static int tmem_frontswap_get_page(unsigned type, pgoff_t offset, +static int tmem_frontswap_load(unsigned type, pgoff_t offset, struct page *page) { u64 ind64 = (u64)offset; @@ -362,8 +362,8 @@ static int __init no_frontswap(char *s) __setup("nofrontswap", no_frontswap); static struct frontswap_ops __initdata tmem_frontswap_ops = { - .put_page = tmem_frontswap_put_page, - .get_page = tmem_frontswap_get_page, + .store = tmem_frontswap_store, + .load = tmem_frontswap_load, .invalidate_page = tmem_frontswap_flush_page, .invalidate_area = tmem_frontswap_flush_area, .init = tmem_frontswap_init diff --git a/include/linux/frontswap.h b/include/linux/frontswap.h index 68ff7af5c5f..0e4e2eec5c1 100644 --- a/include/linux/frontswap.h +++ b/include/linux/frontswap.h @@ -7,8 +7,8 @@ struct frontswap_ops { void (*init)(unsigned); - int (*put_page)(unsigned, pgoff_t, struct page *); - int (*get_page)(unsigned, pgoff_t, struct page *); + int (*store)(unsigned, pgoff_t, struct page *); + int (*load)(unsigned, pgoff_t, struct page *); void (*invalidate_page)(unsigned, pgoff_t); void (*invalidate_area)(unsigned); }; @@ -21,8 +21,8 @@ extern unsigned long frontswap_curr_pages(void); extern void frontswap_writethrough(bool); extern void __frontswap_init(unsigned type); -extern int __frontswap_put_page(struct page *page); -extern int __frontswap_get_page(struct page *page); +extern int __frontswap_store(struct page *page); +extern int __frontswap_load(struct page *page); extern void __frontswap_invalidate_page(unsigned, pgoff_t); extern void __frontswap_invalidate_area(unsigned); @@ -88,21 +88,21 @@ static inline unsigned long *frontswap_map_get(struct swap_info_struct *p) } #endif -static inline int frontswap_put_page(struct page *page) +static inline int frontswap_store(struct page *page) { int ret = -1; if (frontswap_enabled) - ret = __frontswap_put_page(page); + ret = __frontswap_store(page); return ret; } -static inline int frontswap_get_page(struct page *page) +static inline int frontswap_load(struct page *page) { int ret = -1; if (frontswap_enabled) - ret = __frontswap_get_page(page); + ret = __frontswap_load(page); return ret; } diff --git a/mm/frontswap.c b/mm/frontswap.c index 8c0a5f8683f..e25025574a0 100644 --- a/mm/frontswap.c +++ b/mm/frontswap.c @@ -39,7 +39,7 @@ bool frontswap_enabled __read_mostly; EXPORT_SYMBOL(frontswap_enabled); /* - * If enabled, frontswap_put will return failure even on success. As + * If enabled, frontswap_store will return failure even on success. As * a result, the swap subsystem will always write the page to swap, in * effect converting frontswap into a writethrough cache. In this mode, * there is no direct reduction in swap writes, but a frontswap backend @@ -54,27 +54,27 @@ static bool frontswap_writethrough_enabled __read_mostly; * properly configured). These are for information only so are not protected * against increment races. */ -static u64 frontswap_gets; -static u64 frontswap_succ_puts; -static u64 frontswap_failed_puts; +static u64 frontswap_loads; +static u64 frontswap_succ_stores; +static u64 frontswap_failed_stores; static u64 frontswap_invalidates; -static inline void inc_frontswap_gets(void) { - frontswap_gets++; +static inline void inc_frontswap_loads(void) { + frontswap_loads++; } -static inline void inc_frontswap_succ_puts(void) { - frontswap_succ_puts++; +static inline void inc_frontswap_succ_stores(void) { + frontswap_succ_stores++; } -static inline void inc_frontswap_failed_puts(void) { - frontswap_failed_puts++; +static inline void inc_frontswap_failed_stores(void) { + frontswap_failed_stores++; } static inline void inc_frontswap_invalidates(void) { frontswap_invalidates++; } #else -static inline void inc_frontswap_gets(void) { } -static inline void inc_frontswap_succ_puts(void) { } -static inline void inc_frontswap_failed_puts(void) { } +static inline void inc_frontswap_loads(void) { } +static inline void inc_frontswap_succ_stores(void) { } +static inline void inc_frontswap_failed_stores(void) { } static inline void inc_frontswap_invalidates(void) { } #endif /* @@ -116,13 +116,13 @@ void __frontswap_init(unsigned type) EXPORT_SYMBOL(__frontswap_init); /* - * "Put" data from a page to frontswap and associate it with the page's + * "Store" data from a page to frontswap and associate it with the page's * swaptype and offset. Page must be locked and in the swap cache. * If frontswap already contains a page with matching swaptype and * offset, the frontswap implmentation may either overwrite the data and * return success or invalidate the page from frontswap and return failure. */ -int __frontswap_put_page(struct page *page) +int __frontswap_store(struct page *page) { int ret = -1, dup = 0; swp_entry_t entry = { .val = page_private(page), }; @@ -134,10 +134,10 @@ int __frontswap_put_page(struct page *page) BUG_ON(sis == NULL); if (frontswap_test(sis, offset)) dup = 1; - ret = (*frontswap_ops.put_page)(type, offset, page); + ret = (*frontswap_ops.store)(type, offset, page); if (ret == 0) { frontswap_set(sis, offset); - inc_frontswap_succ_puts(); + inc_frontswap_succ_stores(); if (!dup) atomic_inc(&sis->frontswap_pages); } else if (dup) { @@ -147,22 +147,22 @@ int __frontswap_put_page(struct page *page) */ frontswap_clear(sis, offset); atomic_dec(&sis->frontswap_pages); - inc_frontswap_failed_puts(); + inc_frontswap_failed_stores(); } else - inc_frontswap_failed_puts(); + inc_frontswap_failed_stores(); if (frontswap_writethrough_enabled) /* report failure so swap also writes to swap device */ ret = -1; return ret; } -EXPORT_SYMBOL(__frontswap_put_page); +EXPORT_SYMBOL(__frontswap_store); /* * "Get" data from frontswap associated with swaptype and offset that were * specified when the data was put to frontswap and use it to fill the * specified page with data. Page must be locked and in the swap cache. */ -int __frontswap_get_page(struct page *page) +int __frontswap_load(struct page *page) { int ret = -1; swp_entry_t entry = { .val = page_private(page), }; @@ -173,12 +173,12 @@ int __frontswap_get_page(struct page *page) BUG_ON(!PageLocked(page)); BUG_ON(sis == NULL); if (frontswap_test(sis, offset)) - ret = (*frontswap_ops.get_page)(type, offset, page); + ret = (*frontswap_ops.load)(type, offset, page); if (ret == 0) - inc_frontswap_gets(); + inc_frontswap_loads(); return ret; } -EXPORT_SYMBOL(__frontswap_get_page); +EXPORT_SYMBOL(__frontswap_load); /* * Invalidate any data from frontswap associated with the specified swaptype @@ -301,10 +301,10 @@ static int __init init_frontswap(void) struct dentry *root = debugfs_create_dir("frontswap", NULL); if (root == NULL) return -ENXIO; - debugfs_create_u64("gets", S_IRUGO, root, &frontswap_gets); - debugfs_create_u64("succ_puts", S_IRUGO, root, &frontswap_succ_puts); - debugfs_create_u64("failed_puts", S_IRUGO, root, - &frontswap_failed_puts); + debugfs_create_u64("loads", S_IRUGO, root, &frontswap_loads); + debugfs_create_u64("succ_stores", S_IRUGO, root, &frontswap_succ_stores); + debugfs_create_u64("failed_stores", S_IRUGO, root, + &frontswap_failed_stores); debugfs_create_u64("invalidates", S_IRUGO, root, &frontswap_invalidates); #endif diff --git a/mm/page_io.c b/mm/page_io.c index 651a9125931..34f02923744 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -99,7 +99,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) unlock_page(page); goto out; } - if (frontswap_put_page(page) == 0) { + if (frontswap_store(page) == 0) { set_page_writeback(page); unlock_page(page); end_page_writeback(page); @@ -129,7 +129,7 @@ int swap_readpage(struct page *page) VM_BUG_ON(!PageLocked(page)); VM_BUG_ON(PageUptodate(page)); - if (frontswap_get_page(page) == 0) { + if (frontswap_load(page) == 0) { SetPageUptodate(page); unlock_page(page); goto out; -- cgit v1.2.3-70-g09d2 From e5400321a6f15ce0fe77c8455954f213ef7dcc54 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 9 May 2012 23:39:34 +0900 Subject: clockevents: Make clockevents_config() a global symbol Make clockevents_config() into a global symbol to allow it to be used by compiled-in clockevent drivers. This is needed by drivers that want to update the timer frequency after registration time. Signed-off-by: Magnus Damm Tested-by: Simon Horman Cc: arnd@arndb.de Cc: johnstul@us.ibm.com Cc: rjw@sisk.pl Cc: lethal@linux-sh.org Cc: gregkh@linuxfoundation.org Cc: olof@lixom.net Cc: Magnus Damm Link: http://lkml.kernel.org/r/20120509143934.27521.46553.sendpatchset@w520 Signed-off-by: Thomas Gleixner --- include/linux/clockchips.h | 1 + kernel/time/clockevents.c | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 81e803e90aa..acba894374a 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -132,6 +132,7 @@ extern u64 clockevent_delta2ns(unsigned long latch, struct clock_event_device *evt); extern void clockevents_register_device(struct clock_event_device *dev); +extern void clockevents_config(struct clock_event_device *dev, u32 freq); extern void clockevents_config_and_register(struct clock_event_device *dev, u32 freq, unsigned long min_delta, unsigned long max_delta); diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 9cd928f7a7c..7e1ce012a85 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -297,8 +297,7 @@ void clockevents_register_device(struct clock_event_device *dev) } EXPORT_SYMBOL_GPL(clockevents_register_device); -static void clockevents_config(struct clock_event_device *dev, - u32 freq) +void clockevents_config(struct clock_event_device *dev, u32 freq) { u64 sec; -- cgit v1.2.3-70-g09d2 From 5aaa0b7a2ed5b12692c9ffb5222182bd558d3146 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 17 May 2012 17:15:29 +0200 Subject: sched/nohz: Fix rq->cpu_load calculations some more Follow up on commit 556061b00 ("sched/nohz: Fix rq->cpu_load[] calculations") since while that fixed the busy case it regressed the mostly idle case. Add a callback from the nohz exit to also age the rq->cpu_load[] array. This closes the hole where either there was no nohz load balance pass during the nohz, or there was a 'significant' amount of idle time between the last nohz balance and the nohz exit. So we'll update unconditionally from the tick to not insert any accidental 0 load periods while busy, and we try and catch up from nohz idle balance and nohz exit. Both these are still prone to missing a jiffy, but that has always been the case. Signed-off-by: Peter Zijlstra Cc: pjt@google.com Cc: Venkatesh Pallipadi Link: http://lkml.kernel.org/n/tip-kt0trz0apodbf84ucjfdbr1a@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 + kernel/sched/core.c | 53 +++++++++++++++++++++++++++++++++++++++--------- kernel/time/tick-sched.c | 1 + 3 files changed, 45 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index f45c0b280b5..d61e5977e51 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -145,6 +145,7 @@ extern unsigned long this_cpu_load(void); extern void calc_global_load(unsigned long ticks); +extern void update_cpu_load_nohz(void); extern unsigned long get_parent_ip(unsigned long addr); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 39eb6011bc3..75844a8f9ae 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2517,25 +2517,32 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, sched_avg_update(this_rq); } +#ifdef CONFIG_NO_HZ +/* + * There is no sane way to deal with nohz on smp when using jiffies because the + * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading + * causing off-by-one errors in observed deltas; {0,2} instead of {1,1}. + * + * Therefore we cannot use the delta approach from the regular tick since that + * would seriously skew the load calculation. However we'll make do for those + * updates happening while idle (nohz_idle_balance) or coming out of idle + * (tick_nohz_idle_exit). + * + * This means we might still be one tick off for nohz periods. + */ + /* * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ void update_idle_cpu_load(struct rq *this_rq) { - unsigned long curr_jiffies = jiffies; + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); unsigned long load = this_rq->load.weight; unsigned long pending_updates; /* - * Bloody broken means of dealing with nohz, but better than nothing.. - * jiffies is updated by one cpu, another cpu can drift wrt the jiffy - * update and see 0 difference the one time and 2 the next, even though - * we ticked at roughtly the same rate. - * - * Hence we only use this from nohz_idle_balance() and skip this - * nonsense when called from the scheduler_tick() since that's - * guaranteed a stable rate. + * bail if there's load or we're actually up-to-date. */ if (load || curr_jiffies == this_rq->last_load_update_tick) return; @@ -2546,13 +2553,39 @@ void update_idle_cpu_load(struct rq *this_rq) __update_cpu_load(this_rq, load, pending_updates); } +/* + * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. + */ +void update_cpu_load_nohz(void) +{ + struct rq *this_rq = this_rq(); + unsigned long curr_jiffies = ACCESS_ONCE(jiffies); + unsigned long pending_updates; + + if (curr_jiffies == this_rq->last_load_update_tick) + return; + + raw_spin_lock(&this_rq->lock); + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * We were idle, this means load 0, the current load might be + * !0 due to remote wakeups and the sort. + */ + __update_cpu_load(this_rq, 0, pending_updates); + } + raw_spin_unlock(&this_rq->lock); +} +#endif /* CONFIG_NO_HZ */ + /* * Called from scheduler_tick() */ static void update_cpu_load_active(struct rq *this_rq) { /* - * See the mess in update_idle_cpu_load(). + * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; __update_cpu_load(this_rq, this_rq->load.weight, 1); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff56..0c927cd8534 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -576,6 +576,7 @@ void tick_nohz_idle_exit(void) /* Update jiffies first */ select_nohz_load_balancer(0); tick_do_update_jiffies64(now); + update_cpu_load_nohz(); #ifndef CONFIG_VIRT_CPU_ACCOUNTING /* -- cgit v1.2.3-70-g09d2 From 29baa7478ba47d746e3625c91d3b2afbf46b4312 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Apr 2012 12:11:21 +0200 Subject: sched: Move nr_cpus_allowed out of 'struct sched_rt_entity' Since nr_cpus_allowed is used outside of sched/rt.c and wants to be used outside of there more, move it to a more natural site. Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-kr61f02y9brwzkh6x53pdptm@git.kernel.org Signed-off-by: Ingo Molnar --- arch/blackfin/kernel/process.c | 2 +- include/linux/init_task.h | 2 +- include/linux/sched.h | 2 +- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 2 +- kernel/sched/rt.c | 36 +++++++++++++++++++++--------------- 6 files changed, 26 insertions(+), 20 deletions(-) (limited to 'include') diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c index 2e3994b2016..62bcea7dcc6 100644 --- a/arch/blackfin/kernel/process.c +++ b/arch/blackfin/kernel/process.c @@ -173,7 +173,7 @@ asmlinkage int bfin_clone(struct pt_regs *regs) unsigned long newsp; #ifdef __ARCH_SYNC_CORE_DCACHE - if (current->rt.nr_cpus_allowed == num_possible_cpus()) + if (current->nr_cpus_allowed == num_possible_cpus()) set_cpus_allowed_ptr(current, cpumask_of(smp_processor_id())); #endif diff --git a/include/linux/init_task.h b/include/linux/init_task.h index e4baff5f7ff..9e65eff6af3 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -149,6 +149,7 @@ extern struct cred init_cred; .normal_prio = MAX_PRIO-20, \ .policy = SCHED_NORMAL, \ .cpus_allowed = CPU_MASK_ALL, \ + .nr_cpus_allowed= NR_CPUS, \ .mm = NULL, \ .active_mm = &init_mm, \ .se = { \ @@ -157,7 +158,6 @@ extern struct cred init_cred; .rt = { \ .run_list = LIST_HEAD_INIT(tsk.rt.run_list), \ .time_slice = RR_TIMESLICE, \ - .nr_cpus_allowed = NR_CPUS, \ }, \ .tasks = LIST_HEAD_INIT(tsk.tasks), \ INIT_PUSHABLE_TASKS(tsk) \ diff --git a/include/linux/sched.h b/include/linux/sched.h index d61e5977e51..0f50e78f7f4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1188,7 +1188,6 @@ struct sched_rt_entity { struct list_head run_list; unsigned long timeout; unsigned int time_slice; - int nr_cpus_allowed; struct sched_rt_entity *back; #ifdef CONFIG_RT_GROUP_SCHED @@ -1253,6 +1252,7 @@ struct task_struct { #endif unsigned int policy; + int nr_cpus_allowed; cpumask_t cpus_allowed; #ifdef CONFIG_PREEMPT_RCU diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3a69374fb42..70cc36a6073 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5015,7 +5015,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) p->sched_class->set_cpus_allowed(p, new_mask); cpumask_copy(&p->cpus_allowed, new_mask); - p->rt.nr_cpus_allowed = cpumask_weight(new_mask); + p->nr_cpus_allowed = cpumask_weight(new_mask); } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 2b449a76207..b2a2d236f27 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2703,7 +2703,7 @@ select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags) int want_sd = 1; int sync = wake_flags & WF_SYNC; - if (p->rt.nr_cpus_allowed == 1) + if (p->nr_cpus_allowed == 1) return prev_cpu; if (sd_flag & SD_BALANCE_WAKE) { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c5565c3c515..295da737b6f 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -274,13 +274,16 @@ static void update_rt_migration(struct rt_rq *rt_rq) static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + struct task_struct *p; + if (!rt_entity_is_task(rt_se)) return; + p = rt_task_of(rt_se); rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total++; - if (rt_se->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory++; update_rt_migration(rt_rq); @@ -288,13 +291,16 @@ static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { + struct task_struct *p; + if (!rt_entity_is_task(rt_se)) return; + p = rt_task_of(rt_se); rt_rq = &rq_of_rt_rq(rt_rq)->rt; rt_rq->rt_nr_total--; - if (rt_se->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1) rt_rq->rt_nr_migratory--; update_rt_migration(rt_rq); @@ -1161,7 +1167,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); - if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1) + if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); inc_nr_running(rq); @@ -1225,7 +1231,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) cpu = task_cpu(p); - if (p->rt.nr_cpus_allowed == 1) + if (p->nr_cpus_allowed == 1) goto out; /* For anything but wake ups, just return the task_cpu */ @@ -1260,9 +1266,9 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags) * will have to sort it out. */ if (curr && unlikely(rt_task(curr)) && - (curr->rt.nr_cpus_allowed < 2 || + (curr->nr_cpus_allowed < 2 || curr->prio <= p->prio) && - (p->rt.nr_cpus_allowed > 1)) { + (p->nr_cpus_allowed > 1)) { int target = find_lowest_rq(p); if (target != -1) @@ -1276,10 +1282,10 @@ out: static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) { - if (rq->curr->rt.nr_cpus_allowed == 1) + if (rq->curr->nr_cpus_allowed == 1) return; - if (p->rt.nr_cpus_allowed != 1 + if (p->nr_cpus_allowed != 1 && cpupri_find(&rq->rd->cpupri, p, NULL)) return; @@ -1395,7 +1401,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) * The previous task needs to be made eligible for pushing * if it is still active */ - if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1) + if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); } @@ -1408,7 +1414,7 @@ static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && (cpu < 0 || cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) && - (p->rt.nr_cpus_allowed > 1)) + (p->nr_cpus_allowed > 1)) return 1; return 0; } @@ -1464,7 +1470,7 @@ static int find_lowest_rq(struct task_struct *task) if (unlikely(!lowest_mask)) return -1; - if (task->rt.nr_cpus_allowed == 1) + if (task->nr_cpus_allowed == 1) return -1; /* No other targets possible */ if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask)) @@ -1586,7 +1592,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq) BUG_ON(rq->cpu != task_cpu(p)); BUG_ON(task_current(rq, p)); - BUG_ON(p->rt.nr_cpus_allowed <= 1); + BUG_ON(p->nr_cpus_allowed <= 1); BUG_ON(!p->on_rq); BUG_ON(!rt_task(p)); @@ -1793,9 +1799,9 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) if (!task_running(rq, p) && !test_tsk_need_resched(rq->curr) && has_pushable_tasks(rq) && - p->rt.nr_cpus_allowed > 1 && + p->nr_cpus_allowed > 1 && rt_task(rq->curr) && - (rq->curr->rt.nr_cpus_allowed < 2 || + (rq->curr->nr_cpus_allowed < 2 || rq->curr->prio <= p->prio)) push_rt_tasks(rq); } @@ -1817,7 +1823,7 @@ static void set_cpus_allowed_rt(struct task_struct *p, * Only update if the process changes its state from whether it * can migrate or not. */ - if ((p->rt.nr_cpus_allowed > 1) == (weight > 1)) + if ((p->nr_cpus_allowed > 1) == (weight > 1)) return; rq = task_rq(p); -- cgit v1.2.3-70-g09d2 From 114067b69e7b2c691faace0e33db2f04096f668d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Thu, 31 May 2012 14:43:27 +0900 Subject: perf tools: Check if callchain is corrupted We faced segmentation fault on perf top -G at very high sampling rate due to a corrupted callchain. While the root cause was not revealed (I failed to figure it out), this patch tries to protect us from the segfault on such cases. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Namhyung Kim Cc: Namhyung Kim Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Sunjin Yang Link: http://lkml.kernel.org/r/1338443007-24857-2-git-send-email-namhyung.kim@lge.com Signed-off-by: Arnaldo Carvalho de Melo --- include/linux/perf_event.h | 4 ++-- tools/perf/util/session.c | 14 +++++++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f32578634d9..1817d4015e5 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -555,6 +555,8 @@ enum perf_event_type { PERF_RECORD_MAX, /* non-ABI */ }; +#define PERF_MAX_STACK_DEPTH 255 + enum perf_callchain_context { PERF_CONTEXT_HV = (__u64)-32, PERF_CONTEXT_KERNEL = (__u64)-128, @@ -609,8 +611,6 @@ struct perf_guest_info_callbacks { #include #include -#define PERF_MAX_STACK_DEPTH 255 - struct perf_callchain_entry { __u64 nr; __u64 ip[PERF_MAX_STACK_DEPTH]; diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 3b6f8e460a3..04d1e33f459 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -300,6 +300,11 @@ int machine__resolve_callchain(struct machine *self, callchain_cursor_reset(&callchain_cursor); + if (chain->nr > PERF_MAX_STACK_DEPTH) { + pr_warning("corrupted callchain. skipping...\n"); + return 0; + } + for (i = 0; i < chain->nr; i++) { u64 ip; struct addr_location al; @@ -318,7 +323,14 @@ int machine__resolve_callchain(struct machine *self, case PERF_CONTEXT_USER: cpumode = PERF_RECORD_MISC_USER; break; default: - break; + pr_debug("invalid callchain context: " + "%"PRId64"\n", (s64) ip); + /* + * It seems the callchain is corrupted. + * Discard all. + */ + callchain_cursor_reset(&callchain_cursor); + return 0; } continue; } -- cgit v1.2.3-70-g09d2 From 2f9d3df8aa1cc3c6db5cfa0bad3f0745e04cc27d Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 3 Jun 2012 14:50:19 -0700 Subject: vfs: move inode stat information closer together The comment above it says "Stat data, not accessed from path walking", but in fact some of inode fields we use for the common stat data was way down at the end of the inode, causing unnecessary cache misses for the common stat operations. The inode structure is pretty big, and this can change padding depending on field width, but at least on the common 64-bit configurations this doesn't change the size. Some of our inode layout has historically been to tro to avoid unnecessary padding fields, but cache locality is at least as important for layout, if not more. Noticed by looking at kernel profiles, and noticing that the "i_blkbits" access stood out like a sore thumb. Signed-off-by: Linus Torvalds --- include/linux/fs.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 51978ed43e9..17fd887c798 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -802,13 +802,14 @@ struct inode { unsigned int __i_nlink; }; dev_t i_rdev; + loff_t i_size; struct timespec i_atime; struct timespec i_mtime; struct timespec i_ctime; spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ unsigned short i_bytes; + unsigned int i_blkbits; blkcnt_t i_blocks; - loff_t i_size; #ifdef __NEED_I_SIZE_ORDERED seqcount_t i_size_seqcount; @@ -828,9 +829,8 @@ struct inode { struct list_head i_dentry; struct rcu_head i_rcu; }; - atomic_t i_count; - unsigned int i_blkbits; u64 i_version; + atomic_t i_count; atomic_t i_dio_count; atomic_t i_writecount; const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ -- cgit v1.2.3-70-g09d2 From 68e3e92620c323703bc7db75c2ba15239ee85c39 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 3 Jun 2012 20:05:57 -0700 Subject: Revert "mm: compaction: handle incorrect MIGRATE_UNMOVABLE type pageblocks" This reverts commit 5ceb9ce6fe9462a298bb2cd5c9f1ca6cb80a0199. That commit seems to be the cause of the mm compation list corruption issues that Dave Jones reported. The locking (or rather, absense there-of) is dubious, as is the use of the 'page' variable once it has been found to be outside the pageblock range. So revert it for now, we can re-visit this for 3.6. If we even need to: as Minchan Kim says, "The patch wasn't a bug fix and even test workload was very theoretical". Reported-and-tested-by: Dave Jones Acked-by: Hugh Dickins Acked-by: KOSAKI Motohiro Acked-by: Minchan Kim Cc: Bartlomiej Zolnierkiewicz Cc: Kyungmin Park Cc: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 19 ------ mm/compaction.c | 142 ++++++++------------------------------------- mm/internal.h | 9 +-- mm/page_alloc.c | 8 +-- 4 files changed, 28 insertions(+), 150 deletions(-) (limited to 'include') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index e988037abd2..51a90b7f2d6 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -1,8 +1,6 @@ #ifndef _LINUX_COMPACTION_H #define _LINUX_COMPACTION_H -#include - /* Return values for compact_zone() and try_to_compact_pages() */ /* compaction didn't start as it was not possible or direct reclaim was more suitable */ #define COMPACT_SKIPPED 0 @@ -13,23 +11,6 @@ /* The full zone was compacted */ #define COMPACT_COMPLETE 3 -/* - * compaction supports three modes - * - * COMPACT_ASYNC_MOVABLE uses asynchronous migration and only scans - * MIGRATE_MOVABLE pageblocks as migration sources and targets. - * COMPACT_ASYNC_UNMOVABLE uses asynchronous migration and only scans - * MIGRATE_MOVABLE pageblocks as migration sources. - * MIGRATE_UNMOVABLE pageblocks are scanned as potential migration - * targets and convers them to MIGRATE_MOVABLE if possible - * COMPACT_SYNC uses synchronous migration and scans all pageblocks - */ -enum compact_mode { - COMPACT_ASYNC_MOVABLE, - COMPACT_ASYNC_UNMOVABLE, - COMPACT_SYNC, -}; - #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, diff --git a/mm/compaction.c b/mm/compaction.c index 4ac338af512..7ea259d82a9 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -236,7 +236,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, */ while (unlikely(too_many_isolated(zone))) { /* async migration should just abort */ - if (cc->mode != COMPACT_SYNC) + if (!cc->sync) return 0; congestion_wait(BLK_RW_ASYNC, HZ/10); @@ -304,8 +304,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, * satisfies the allocation */ pageblock_nr = low_pfn >> pageblock_order; - if (cc->mode != COMPACT_SYNC && - last_pageblock_nr != pageblock_nr && + if (!cc->sync && last_pageblock_nr != pageblock_nr && !migrate_async_suitable(get_pageblock_migratetype(page))) { low_pfn += pageblock_nr_pages; low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; @@ -326,7 +325,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, continue; } - if (cc->mode != COMPACT_SYNC) + if (!cc->sync) mode |= ISOLATE_ASYNC_MIGRATE; lruvec = mem_cgroup_page_lruvec(page, zone); @@ -361,90 +360,27 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, #endif /* CONFIG_COMPACTION || CONFIG_CMA */ #ifdef CONFIG_COMPACTION -/* - * Returns true if MIGRATE_UNMOVABLE pageblock was successfully - * converted to MIGRATE_MOVABLE type, false otherwise. - */ -static bool rescue_unmovable_pageblock(struct page *page) -{ - unsigned long pfn, start_pfn, end_pfn; - struct page *start_page, *end_page; - - pfn = page_to_pfn(page); - start_pfn = pfn & ~(pageblock_nr_pages - 1); - end_pfn = start_pfn + pageblock_nr_pages; - - start_page = pfn_to_page(start_pfn); - end_page = pfn_to_page(end_pfn); - - /* Do not deal with pageblocks that overlap zones */ - if (page_zone(start_page) != page_zone(end_page)) - return false; - - for (page = start_page, pfn = start_pfn; page < end_page; pfn++, - page++) { - if (!pfn_valid_within(pfn)) - continue; - - if (PageBuddy(page)) { - int order = page_order(page); - - pfn += (1 << order) - 1; - page += (1 << order) - 1; - - continue; - } else if (page_count(page) == 0 || PageLRU(page)) - continue; - - return false; - } - - set_pageblock_migratetype(page, MIGRATE_MOVABLE); - move_freepages_block(page_zone(page), page, MIGRATE_MOVABLE); - return true; -} -enum smt_result { - GOOD_AS_MIGRATION_TARGET, - FAIL_UNMOVABLE_TARGET, - FAIL_BAD_TARGET, -}; - -/* - * Returns GOOD_AS_MIGRATION_TARGET if the page is within a block - * suitable for migration to, FAIL_UNMOVABLE_TARGET if the page - * is within a MIGRATE_UNMOVABLE block, FAIL_BAD_TARGET otherwise. - */ -static enum smt_result suitable_migration_target(struct page *page, - struct compact_control *cc) +/* Returns true if the page is within a block suitable for migration to */ +static bool suitable_migration_target(struct page *page) { int migratetype = get_pageblock_migratetype(page); /* Don't interfere with memory hot-remove or the min_free_kbytes blocks */ if (migratetype == MIGRATE_ISOLATE || migratetype == MIGRATE_RESERVE) - return FAIL_BAD_TARGET; + return false; /* If the page is a large free page, then allow migration */ if (PageBuddy(page) && page_order(page) >= pageblock_order) - return GOOD_AS_MIGRATION_TARGET; + return true; /* If the block is MIGRATE_MOVABLE or MIGRATE_CMA, allow migration */ - if (cc->mode != COMPACT_ASYNC_UNMOVABLE && - migrate_async_suitable(migratetype)) - return GOOD_AS_MIGRATION_TARGET; - - if (cc->mode == COMPACT_ASYNC_MOVABLE && - migratetype == MIGRATE_UNMOVABLE) - return FAIL_UNMOVABLE_TARGET; - - if (cc->mode != COMPACT_ASYNC_MOVABLE && - migratetype == MIGRATE_UNMOVABLE && - rescue_unmovable_pageblock(page)) - return GOOD_AS_MIGRATION_TARGET; + if (migrate_async_suitable(migratetype)) + return true; /* Otherwise skip the block */ - return FAIL_BAD_TARGET; + return false; } /* @@ -477,13 +413,6 @@ static void isolate_freepages(struct zone *zone, zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; - /* - * isolate_freepages() may be called more than once during - * compact_zone_order() run and we want only the most recent - * count. - */ - cc->nr_pageblocks_skipped = 0; - /* * Isolate free pages until enough are available to migrate the * pages on cc->migratepages. We stop searching if the migrate @@ -492,7 +421,6 @@ static void isolate_freepages(struct zone *zone, for (; pfn > low_pfn && cc->nr_migratepages > nr_freepages; pfn -= pageblock_nr_pages) { unsigned long isolated; - enum smt_result ret; if (!pfn_valid(pfn)) continue; @@ -509,12 +437,9 @@ static void isolate_freepages(struct zone *zone, continue; /* Check the block is suitable for migration */ - ret = suitable_migration_target(page, cc); - if (ret != GOOD_AS_MIGRATION_TARGET) { - if (ret == FAIL_UNMOVABLE_TARGET) - cc->nr_pageblocks_skipped++; + if (!suitable_migration_target(page)) continue; - } + /* * Found a block suitable for isolating free pages from. Now * we disabled interrupts, double check things are ok and @@ -523,14 +448,12 @@ static void isolate_freepages(struct zone *zone, */ isolated = 0; spin_lock_irqsave(&zone->lock, flags); - ret = suitable_migration_target(page, cc); - if (ret == GOOD_AS_MIGRATION_TARGET) { + if (suitable_migration_target(page)) { end_pfn = min(pfn + pageblock_nr_pages, zone_end_pfn); isolated = isolate_freepages_block(pfn, end_pfn, freelist, false); nr_freepages += isolated; - } else if (ret == FAIL_UNMOVABLE_TARGET) - cc->nr_pageblocks_skipped++; + } spin_unlock_irqrestore(&zone->lock, flags); /* @@ -762,9 +685,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; err = migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)&cc->freepages, false, - (cc->mode == COMPACT_SYNC) ? MIGRATE_SYNC_LIGHT - : MIGRATE_ASYNC); + (unsigned long)cc, false, + cc->sync ? MIGRATE_SYNC_LIGHT : MIGRATE_ASYNC); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -793,8 +715,7 @@ out: static unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask, - enum compact_mode mode, - unsigned long *nr_pageblocks_skipped) + bool sync) { struct compact_control cc = { .nr_freepages = 0, @@ -802,17 +723,12 @@ static unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, - .mode = mode, + .sync = sync, }; - unsigned long rc; - INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); - rc = compact_zone(zone, &cc); - *nr_pageblocks_skipped = cc.nr_pageblocks_skipped; - - return rc; + return compact_zone(zone, &cc); } int sysctl_extfrag_threshold = 500; @@ -837,8 +753,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; - unsigned long nr_pageblocks_skipped; - enum compact_mode mode; /* * Check whether it is worth even starting compaction. The order check is @@ -855,22 +769,12 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - mode = sync ? COMPACT_SYNC : COMPACT_ASYNC_MOVABLE; -retry: - status = compact_zone_order(zone, order, gfp_mask, mode, - &nr_pageblocks_skipped); + status = compact_zone_order(zone, order, gfp_mask, sync); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) break; - - if (rc == COMPACT_COMPLETE && mode == COMPACT_ASYNC_MOVABLE) { - if (nr_pageblocks_skipped) { - mode = COMPACT_ASYNC_UNMOVABLE; - goto retry; - } - } } return rc; @@ -904,7 +808,7 @@ static int __compact_pgdat(pg_data_t *pgdat, struct compact_control *cc) if (ok && cc->order > zone->compact_order_failed) zone->compact_order_failed = cc->order + 1; /* Currently async compaction is never deferred. */ - else if (!ok && cc->mode == COMPACT_SYNC) + else if (!ok && cc->sync) defer_compaction(zone, cc->order); } @@ -919,7 +823,7 @@ int compact_pgdat(pg_data_t *pgdat, int order) { struct compact_control cc = { .order = order, - .mode = COMPACT_ASYNC_MOVABLE, + .sync = false, }; return __compact_pgdat(pgdat, &cc); @@ -929,7 +833,7 @@ static int compact_node(int nid) { struct compact_control cc = { .order = -1, - .mode = COMPACT_SYNC, + .sync = true, }; return __compact_pgdat(NODE_DATA(nid), &cc); diff --git a/mm/internal.h b/mm/internal.h index 5cbb7819004..2ba87fbfb75 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -94,9 +94,6 @@ extern void putback_lru_page(struct page *page); /* * in mm/page_alloc.c */ -extern void set_pageblock_migratetype(struct page *page, int migratetype); -extern int move_freepages_block(struct zone *zone, struct page *page, - int migratetype); extern void __free_pages_bootmem(struct page *page, unsigned int order); extern void prep_compound_page(struct page *page, unsigned long order); #ifdef CONFIG_MEMORY_FAILURE @@ -104,7 +101,6 @@ extern bool is_free_buddy_page(struct page *page); #endif #if defined CONFIG_COMPACTION || defined CONFIG_CMA -#include /* * in mm/compaction.c @@ -123,14 +119,11 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ - enum compact_mode mode; /* Compaction mode */ + bool sync; /* Synchronous migration */ int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; - - /* Number of UNMOVABLE destination pageblocks skipped during scan */ - unsigned long nr_pageblocks_skipped; }; unsigned long diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6092f331b32..44030096da6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -219,7 +219,7 @@ EXPORT_SYMBOL(nr_online_nodes); int page_group_by_mobility_disabled __read_mostly; -void set_pageblock_migratetype(struct page *page, int migratetype) +static void set_pageblock_migratetype(struct page *page, int migratetype) { if (unlikely(page_group_by_mobility_disabled)) @@ -954,8 +954,8 @@ static int move_freepages(struct zone *zone, return pages_moved; } -int move_freepages_block(struct zone *zone, struct page *page, - int migratetype) +static int move_freepages_block(struct zone *zone, struct page *page, + int migratetype) { unsigned long start_pfn, end_pfn; struct page *start_page, *end_page; @@ -5651,7 +5651,7 @@ static int __alloc_contig_migrate_range(unsigned long start, unsigned long end) .nr_migratepages = 0, .order = -1, .zone = page_zone(pfn_to_page(start)), - .mode = COMPACT_SYNC, + .sync = true, }; INIT_LIST_HEAD(&cc.migratepages); -- cgit v1.2.3-70-g09d2 From 7ae30986dc63d214cb075a40f2cf205f0a7806cd Mon Sep 17 00:00:00 2001 From: Len Brown Date: Mon, 4 Jun 2012 00:29:11 -0400 Subject: ACPI: fix acpi_bus.h build warnings when ACPI is not enabled introduced in Linux-3.5-rc1 by 66886d6f8c9bcdee3d7fce5796dcffd6b4bc0b48 (ACPI: Add stubs for (un)register_acpi_bus_type) Fix header file warnings when CONFIG_ACPI is not enabled: include/acpi/acpi_bus.h:443:42: warning: 'struct acpi_bus_type' declared inside parameter list include/acpi/acpi_bus.h:443:42: warning: its scope is only this definition or declaration, which is probably not include/acpi/acpi_bus.h:444:44: warning: 'struct acpi_bus_type' declared inside parameter list Signed-off-by: Len Brown --- include/acpi/acpi_bus.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/acpi/acpi_bus.h b/include/acpi/acpi_bus.h index b0d62820ada..9e6e1c6eb60 100644 --- a/include/acpi/acpi_bus.h +++ b/include/acpi/acpi_bus.h @@ -440,8 +440,8 @@ static inline int acpi_pm_device_sleep_wake(struct device *dev, bool enable) #else /* CONFIG_ACPI */ -static int register_acpi_bus_type(struct acpi_bus_type *bus) { return 0; } -static int unregister_acpi_bus_type(struct acpi_bus_type *bus) { return 0; } +static inline int register_acpi_bus_type(void *bus) { return 0; } +static inline int unregister_acpi_bus_type(void *bus) { return 0; } #endif /* CONFIG_ACPI */ -- cgit v1.2.3-70-g09d2 From ae58d1e406986f31d1e88b32f5ac601506c196d8 Mon Sep 17 00:00:00 2001 From: Stephen Warren Date: Fri, 18 May 2012 09:29:34 -0600 Subject: i2c: Add generic I2C multiplexer using pinctrl API This is useful for SoCs whose I2C module's signals can be routed to different sets of pins at run-time, using the pinctrl API. +-----+ +-----+ | dev | | dev | +------------------------+ +-----+ +-----+ | SoC | | | | /----|------+--------+ | +---+ +------+ | child bus A, on first set of pins | |I2C|---|Pinmux| | | +---+ +------+ | child bus B, on second set of pins | \----|------+--------+--------+ | | | | | +------------------------+ +-----+ +-----+ +-----+ | dev | | dev | | dev | +-----+ +-----+ +-----+ Signed-off-by: Stephen Warren Acked-by: Linus Walleij Acked-by: Rob Herring Signed-off-by: Wolfram Sang --- .../devicetree/bindings/i2c/i2c-mux-pinctrl.txt | 93 +++++++ drivers/i2c/muxes/Kconfig | 12 + drivers/i2c/muxes/Makefile | 1 + drivers/i2c/muxes/i2c-mux-pinctrl.c | 279 +++++++++++++++++++++ include/linux/i2c-mux-pinctrl.h | 41 +++ 5 files changed, 426 insertions(+) create mode 100644 Documentation/devicetree/bindings/i2c/i2c-mux-pinctrl.txt create mode 100644 drivers/i2c/muxes/i2c-mux-pinctrl.c create mode 100644 include/linux/i2c-mux-pinctrl.h (limited to 'include') diff --git a/Documentation/devicetree/bindings/i2c/i2c-mux-pinctrl.txt b/Documentation/devicetree/bindings/i2c/i2c-mux-pinctrl.txt new file mode 100644 index 00000000000..ae8af1694e9 --- /dev/null +++ b/Documentation/devicetree/bindings/i2c/i2c-mux-pinctrl.txt @@ -0,0 +1,93 @@ +Pinctrl-based I2C Bus Mux + +This binding describes an I2C bus multiplexer that uses pin multiplexing to +route the I2C signals, and represents the pin multiplexing configuration +using the pinctrl device tree bindings. + + +-----+ +-----+ + | dev | | dev | + +------------------------+ +-----+ +-----+ + | SoC | | | + | /----|------+--------+ + | +---+ +------+ | child bus A, on first set of pins + | |I2C|---|Pinmux| | + | +---+ +------+ | child bus B, on second set of pins + | \----|------+--------+--------+ + | | | | | + +------------------------+ +-----+ +-----+ +-----+ + | dev | | dev | | dev | + +-----+ +-----+ +-----+ + +Required properties: +- compatible: i2c-mux-pinctrl +- i2c-parent: The phandle of the I2C bus that this multiplexer's master-side + port is connected to. + +Also required are: + +* Standard pinctrl properties that specify the pin mux state for each child + bus. See ../pinctrl/pinctrl-bindings.txt. + +* Standard I2C mux properties. See mux.txt in this directory. + +* I2C child bus nodes. See mux.txt in this directory. + +For each named state defined in the pinctrl-names property, an I2C child bus +will be created. I2C child bus numbers are assigned based on the index into +the pinctrl-names property. + +The only exception is that no bus will be created for a state named "idle". If +such a state is defined, it must be the last entry in pinctrl-names. For +example: + + pinctrl-names = "ddc", "pta", "idle" -> ddc = bus 0, pta = bus 1 + pinctrl-names = "ddc", "idle", "pta" -> Invalid ("idle" not last) + pinctrl-names = "idle", "ddc", "pta" -> Invalid ("idle" not last) + +Whenever an access is made to a device on a child bus, the relevant pinctrl +state will be programmed into hardware. + +If an idle state is defined, whenever an access is not being made to a device +on a child bus, the idle pinctrl state will be programmed into hardware. + +If an idle state is not defined, the most recently used pinctrl state will be +left programmed into hardware whenever no access is being made of a device on +a child bus. + +Example: + + i2cmux { + compatible = "i2c-mux-pinctrl"; + #address-cells = <1>; + #size-cells = <0>; + + i2c-parent = <&i2c1>; + + pinctrl-names = "ddc", "pta", "idle"; + pinctrl-0 = <&state_i2cmux_ddc>; + pinctrl-1 = <&state_i2cmux_pta>; + pinctrl-2 = <&state_i2cmux_idle>; + + i2c@0 { + reg = <0>; + #address-cells = <1>; + #size-cells = <0>; + + eeprom { + compatible = "eeprom"; + reg = <0x50>; + }; + }; + + i2c@1 { + reg = <1>; + #address-cells = <1>; + #size-cells = <0>; + + eeprom { + compatible = "eeprom"; + reg = <0x50>; + }; + }; + }; + diff --git a/drivers/i2c/muxes/Kconfig b/drivers/i2c/muxes/Kconfig index beb2491db27..a0edd985421 100644 --- a/drivers/i2c/muxes/Kconfig +++ b/drivers/i2c/muxes/Kconfig @@ -37,4 +37,16 @@ config I2C_MUX_PCA954x This driver can also be built as a module. If so, the module will be called i2c-mux-pca954x. +config I2C_MUX_PINCTRL + tristate "pinctrl-based I2C multiplexer" + depends on PINCTRL + help + If you say yes to this option, support will be included for an I2C + multiplexer that uses the pinctrl subsystem, i.e. pin multiplexing. + This is useful for SoCs whose I2C module's signals can be routed to + different sets of pins at run-time. + + This driver can also be built as a module. If so, the module will be + called pinctrl-i2cmux. + endmenu diff --git a/drivers/i2c/muxes/Makefile b/drivers/i2c/muxes/Makefile index 5826249b29c..76da8692aff 100644 --- a/drivers/i2c/muxes/Makefile +++ b/drivers/i2c/muxes/Makefile @@ -4,5 +4,6 @@ obj-$(CONFIG_I2C_MUX_GPIO) += i2c-mux-gpio.o obj-$(CONFIG_I2C_MUX_PCA9541) += i2c-mux-pca9541.o obj-$(CONFIG_I2C_MUX_PCA954x) += i2c-mux-pca954x.o +obj-$(CONFIG_I2C_MUX_PINCTRL) += i2c-mux-pinctrl.o ccflags-$(CONFIG_I2C_DEBUG_BUS) := -DDEBUG diff --git a/drivers/i2c/muxes/i2c-mux-pinctrl.c b/drivers/i2c/muxes/i2c-mux-pinctrl.c new file mode 100644 index 00000000000..46a66976347 --- /dev/null +++ b/drivers/i2c/muxes/i2c-mux-pinctrl.c @@ -0,0 +1,279 @@ +/* + * I2C multiplexer using pinctrl API + * + * Copyright (c) 2012, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct i2c_mux_pinctrl { + struct device *dev; + struct i2c_mux_pinctrl_platform_data *pdata; + struct pinctrl *pinctrl; + struct pinctrl_state **states; + struct pinctrl_state *state_idle; + struct i2c_adapter *parent; + struct i2c_adapter **busses; +}; + +static int i2c_mux_pinctrl_select(struct i2c_adapter *adap, void *data, + u32 chan) +{ + struct i2c_mux_pinctrl *mux = data; + + return pinctrl_select_state(mux->pinctrl, mux->states[chan]); +} + +static int i2c_mux_pinctrl_deselect(struct i2c_adapter *adap, void *data, + u32 chan) +{ + struct i2c_mux_pinctrl *mux = data; + + return pinctrl_select_state(mux->pinctrl, mux->state_idle); +} + +#ifdef CONFIG_OF +static int i2c_mux_pinctrl_parse_dt(struct i2c_mux_pinctrl *mux, + struct platform_device *pdev) +{ + struct device_node *np = pdev->dev.of_node; + int num_names, i, ret; + struct device_node *adapter_np; + struct i2c_adapter *adapter; + + if (!np) + return 0; + + mux->pdata = devm_kzalloc(&pdev->dev, sizeof(*mux->pdata), GFP_KERNEL); + if (!mux->pdata) { + dev_err(mux->dev, + "Cannot allocate i2c_mux_pinctrl_platform_data\n"); + return -ENOMEM; + } + + num_names = of_property_count_strings(np, "pinctrl-names"); + if (num_names < 0) { + dev_err(mux->dev, "Cannot parse pinctrl-names: %d\n", + num_names); + return num_names; + } + + mux->pdata->pinctrl_states = devm_kzalloc(&pdev->dev, + sizeof(*mux->pdata->pinctrl_states) * num_names, + GFP_KERNEL); + if (!mux->pdata->pinctrl_states) { + dev_err(mux->dev, "Cannot allocate pinctrl_states\n"); + return -ENOMEM; + } + + for (i = 0; i < num_names; i++) { + ret = of_property_read_string_index(np, "pinctrl-names", i, + &mux->pdata->pinctrl_states[mux->pdata->bus_count]); + if (ret < 0) { + dev_err(mux->dev, "Cannot parse pinctrl-names: %d\n", + ret); + return ret; + } + if (!strcmp(mux->pdata->pinctrl_states[mux->pdata->bus_count], + "idle")) { + if (i != num_names - 1) { + dev_err(mux->dev, "idle state must be last\n"); + return -EINVAL; + } + mux->pdata->pinctrl_state_idle = "idle"; + } else { + mux->pdata->bus_count++; + } + } + + adapter_np = of_parse_phandle(np, "i2c-parent", 0); + if (!adapter_np) { + dev_err(mux->dev, "Cannot parse i2c-parent\n"); + return -ENODEV; + } + adapter = of_find_i2c_adapter_by_node(adapter_np); + if (!adapter) { + dev_err(mux->dev, "Cannot find parent bus\n"); + return -ENODEV; + } + mux->pdata->parent_bus_num = i2c_adapter_id(adapter); + put_device(&adapter->dev); + + return 0; +} +#else +static inline int i2c_mux_pinctrl_parse_dt(struct i2c_mux_pinctrl *mux, + struct platform_device *pdev) +{ + return 0; +} +#endif + +static int __devinit i2c_mux_pinctrl_probe(struct platform_device *pdev) +{ + struct i2c_mux_pinctrl *mux; + int (*deselect)(struct i2c_adapter *, void *, u32); + int i, ret; + + mux = devm_kzalloc(&pdev->dev, sizeof(*mux), GFP_KERNEL); + if (!mux) { + dev_err(&pdev->dev, "Cannot allocate i2c_mux_pinctrl\n"); + ret = -ENOMEM; + goto err; + } + platform_set_drvdata(pdev, mux); + + mux->dev = &pdev->dev; + + mux->pdata = pdev->dev.platform_data; + if (!mux->pdata) { + ret = i2c_mux_pinctrl_parse_dt(mux, pdev); + if (ret < 0) + goto err; + } + if (!mux->pdata) { + dev_err(&pdev->dev, "Missing platform data\n"); + ret = -ENODEV; + goto err; + } + + mux->states = devm_kzalloc(&pdev->dev, + sizeof(*mux->states) * mux->pdata->bus_count, + GFP_KERNEL); + if (!mux->states) { + dev_err(&pdev->dev, "Cannot allocate states\n"); + ret = -ENOMEM; + goto err; + } + + mux->busses = devm_kzalloc(&pdev->dev, + sizeof(mux->busses) * mux->pdata->bus_count, + GFP_KERNEL); + if (!mux->states) { + dev_err(&pdev->dev, "Cannot allocate busses\n"); + ret = -ENOMEM; + goto err; + } + + mux->pinctrl = devm_pinctrl_get(&pdev->dev); + if (IS_ERR(mux->pinctrl)) { + ret = PTR_ERR(mux->pinctrl); + dev_err(&pdev->dev, "Cannot get pinctrl: %d\n", ret); + goto err; + } + for (i = 0; i < mux->pdata->bus_count; i++) { + mux->states[i] = pinctrl_lookup_state(mux->pinctrl, + mux->pdata->pinctrl_states[i]); + if (IS_ERR(mux->states[i])) { + ret = PTR_ERR(mux->states[i]); + dev_err(&pdev->dev, + "Cannot look up pinctrl state %s: %d\n", + mux->pdata->pinctrl_states[i], ret); + goto err; + } + } + if (mux->pdata->pinctrl_state_idle) { + mux->state_idle = pinctrl_lookup_state(mux->pinctrl, + mux->pdata->pinctrl_state_idle); + if (IS_ERR(mux->state_idle)) { + ret = PTR_ERR(mux->state_idle); + dev_err(&pdev->dev, + "Cannot look up pinctrl state %s: %d\n", + mux->pdata->pinctrl_state_idle, ret); + goto err; + } + + deselect = i2c_mux_pinctrl_deselect; + } else { + deselect = NULL; + } + + mux->parent = i2c_get_adapter(mux->pdata->parent_bus_num); + if (!mux->parent) { + dev_err(&pdev->dev, "Parent adapter (%d) not found\n", + mux->pdata->parent_bus_num); + ret = -ENODEV; + goto err; + } + + for (i = 0; i < mux->pdata->bus_count; i++) { + u32 bus = mux->pdata->base_bus_num ? + (mux->pdata->base_bus_num + i) : 0; + + mux->busses[i] = i2c_add_mux_adapter(mux->parent, &pdev->dev, + mux, bus, i, + i2c_mux_pinctrl_select, + deselect); + if (!mux->busses[i]) { + ret = -ENODEV; + dev_err(&pdev->dev, "Failed to add adapter %d\n", i); + goto err_del_adapter; + } + } + + return 0; + +err_del_adapter: + for (; i > 0; i--) + i2c_del_mux_adapter(mux->busses[i - 1]); + i2c_put_adapter(mux->parent); +err: + return ret; +} + +static int __devexit i2c_mux_pinctrl_remove(struct platform_device *pdev) +{ + struct i2c_mux_pinctrl *mux = platform_get_drvdata(pdev); + int i; + + for (i = 0; i < mux->pdata->bus_count; i++) + i2c_del_mux_adapter(mux->busses[i]); + + i2c_put_adapter(mux->parent); + + return 0; +} + +#ifdef CONFIG_OF +static const struct of_device_id i2c_mux_pinctrl_of_match[] __devinitconst = { + { .compatible = "i2c-mux-pinctrl", }, + {}, +}; +MODULE_DEVICE_TABLE(of, i2c_mux_pinctrl_of_match); +#endif + +static struct platform_driver i2c_mux_pinctrl_driver = { + .driver = { + .name = "i2c-mux-pinctrl", + .owner = THIS_MODULE, + .of_match_table = of_match_ptr(i2c_mux_pinctrl_of_match), + }, + .probe = i2c_mux_pinctrl_probe, + .remove = __devexit_p(i2c_mux_pinctrl_remove), +}; +module_platform_driver(i2c_mux_pinctrl_driver); + +MODULE_DESCRIPTION("pinctrl-based I2C multiplexer driver"); +MODULE_AUTHOR("Stephen Warren "); +MODULE_LICENSE("GPL v2"); +MODULE_ALIAS("platform:i2c-mux-pinctrl"); diff --git a/include/linux/i2c-mux-pinctrl.h b/include/linux/i2c-mux-pinctrl.h new file mode 100644 index 00000000000..a65c86429e8 --- /dev/null +++ b/include/linux/i2c-mux-pinctrl.h @@ -0,0 +1,41 @@ +/* + * i2c-mux-pinctrl platform data + * + * Copyright (c) 2012, NVIDIA CORPORATION. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#ifndef _LINUX_I2C_MUX_PINCTRL_H +#define _LINUX_I2C_MUX_PINCTRL_H + +/** + * struct i2c_mux_pinctrl_platform_data - Platform data for i2c-mux-pinctrl + * @parent_bus_num: Parent I2C bus number + * @base_bus_num: Base I2C bus number for the child busses. 0 for dynamic. + * @bus_count: Number of child busses. Also the number of elements in + * @pinctrl_states + * @pinctrl_states: The names of the pinctrl state to select for each child bus + * @pinctrl_state_idle: The pinctrl state to select when no child bus is being + * accessed. If NULL, the most recently used pinctrl state will be left + * selected. + */ +struct i2c_mux_pinctrl_platform_data { + int parent_bus_num; + int base_bus_num; + int bus_count; + const char **pinctrl_states; + const char *pinctrl_state_idle; +}; + +#endif -- cgit v1.2.3-70-g09d2 From 13b87b27421e12f82ebbaac018cea30f82e5c33e Mon Sep 17 00:00:00 2001 From: Inki Dae Date: Mon, 14 May 2012 20:04:38 +0900 Subject: drm/exynos: fixed size type. size type of drm_exynos_gem_mmap struct is changed to uint64_t and it adds pad for the struct to be aligned as 64bit. Signed-off-by: Inki Dae Signed-off-by: Kyungmin Park --- include/drm/exynos_drm.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/drm/exynos_drm.h b/include/drm/exynos_drm.h index b6d7ce92ead..68733587e70 100644 --- a/include/drm/exynos_drm.h +++ b/include/drm/exynos_drm.h @@ -64,6 +64,7 @@ struct drm_exynos_gem_map_off { * A structure for mapping buffer. * * @handle: a handle to gem object created. + * @pad: just padding to be 64-bit aligned. * @size: memory size to be mapped. * @mapped: having user virtual address mmaped. * - this variable would be filled by exynos gem module @@ -72,7 +73,8 @@ struct drm_exynos_gem_map_off { */ struct drm_exynos_gem_mmap { unsigned int handle; - unsigned int size; + unsigned int pad; + uint64_t size; uint64_t mapped; }; -- cgit v1.2.3-70-g09d2 From a43fd50dc99a5f65505f174eca5a421707d73b4c Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Tue, 5 Jun 2012 14:34:03 +0100 Subject: regmap: Implement support for wake IRQs Allow chips to provide a bank of registers for controlling the wake state in a similar fashion to the masks and propagate the wake count to the parent interrupt controller. Signed-off-by: Mark Brown --- drivers/base/regmap/regmap-irq.c | 47 ++++++++++++++++++++++++++++++++++++++++ include/linux/regmap.h | 2 ++ 2 files changed, 49 insertions(+) (limited to 'include') diff --git a/drivers/base/regmap/regmap-irq.c b/drivers/base/regmap/regmap-irq.c index b74e14c4dff..b480b529f02 100644 --- a/drivers/base/regmap/regmap-irq.c +++ b/drivers/base/regmap/regmap-irq.c @@ -29,9 +29,13 @@ struct regmap_irq_chip_data { int irq_base; struct irq_domain *domain; + int irq; + int wake_count; + unsigned int *status_buf; unsigned int *mask_buf; unsigned int *mask_buf_def; + unsigned int *wake_buf; unsigned int irq_reg_stride; }; @@ -71,6 +75,16 @@ static void regmap_irq_sync_unlock(struct irq_data *data) d->chip->mask_base + (i * map->reg_stride)); } + /* If we've changed our wakeup count propagate it to the parent */ + if (d->wake_count < 0) + for (i = d->wake_count; i < 0; i++) + irq_set_irq_wake(d->irq, 0); + else if (d->wake_count > 0) + for (i = 0; i < d->wake_count; i++) + irq_set_irq_wake(d->irq, 1); + + d->wake_count = 0; + mutex_unlock(&d->lock); } @@ -92,12 +106,35 @@ static void regmap_irq_disable(struct irq_data *data) d->mask_buf[irq_data->reg_offset / map->reg_stride] |= irq_data->mask; } +static int regmap_irq_set_wake(struct irq_data *data, unsigned int on) +{ + struct regmap_irq_chip_data *d = irq_data_get_irq_chip_data(data); + struct regmap *map = d->map; + const struct regmap_irq *irq_data = irq_to_regmap_irq(d, data->hwirq); + + if (!d->chip->wake_base) + return -EINVAL; + + if (on) { + d->wake_buf[irq_data->reg_offset / map->reg_stride] + &= ~irq_data->mask; + d->wake_count++; + } else { + d->wake_buf[irq_data->reg_offset / map->reg_stride] + |= irq_data->mask; + d->wake_count--; + } + + return 0; +} + static struct irq_chip regmap_irq_chip = { .name = "regmap", .irq_bus_lock = regmap_irq_lock, .irq_bus_sync_unlock = regmap_irq_sync_unlock, .irq_disable = regmap_irq_disable, .irq_enable = regmap_irq_enable, + .irq_set_wake = regmap_irq_set_wake, }; static irqreturn_t regmap_irq_thread(int irq, void *d) @@ -240,6 +277,14 @@ int regmap_add_irq_chip(struct regmap *map, int irq, int irq_flags, if (!d->mask_buf_def) goto err_alloc; + if (chip->wake_base) { + d->wake_buf = kzalloc(sizeof(unsigned int) * chip->num_regs, + GFP_KERNEL); + if (!d->wake_buf) + goto err_alloc; + } + + d->irq = irq; d->map = map; d->chip = chip; d->irq_base = irq_base; @@ -294,6 +339,7 @@ int regmap_add_irq_chip(struct regmap *map, int irq, int irq_flags, err_domain: /* Should really dispose of the domain but... */ err_alloc: + kfree(d->wake_buf); kfree(d->mask_buf_def); kfree(d->mask_buf); kfree(d->status_buf); @@ -315,6 +361,7 @@ void regmap_del_irq_chip(int irq, struct regmap_irq_chip_data *d) free_irq(irq, d); /* We should unmap the domain but... */ + kfree(d->wake_buf); kfree(d->mask_buf_def); kfree(d->mask_buf); kfree(d->status_buf); diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 56af22ec9ab..58ec0cba0ae 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -219,6 +219,7 @@ struct regmap_irq { * @status_base: Base status register address. * @mask_base: Base mask register address. * @ack_base: Base ack address. If zero then the chip is clear on read. + * @wake_base: Base address for wake enables. If zero unsupported. * @irq_reg_stride: Stride to use for chips where registers are not contiguous. * * @num_regs: Number of registers in each control bank. @@ -232,6 +233,7 @@ struct regmap_irq_chip { unsigned int status_base; unsigned int mask_base; unsigned int ack_base; + unsigned int wake_base; unsigned int irq_reg_stride; int num_regs; -- cgit v1.2.3-70-g09d2 From 1549210fcc17e9ae20c09ac8cd4c48a8dfd431bd Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jun 2012 09:16:47 -0400 Subject: NFSv4: Fix an Oops in the open recovery code The open recovery code does not need to request a new value for the mdsthreshold, and so does not allocate a struct nfs4_threshold. The problem is that encode_getfattr_open() will still request an mdsthreshold, and so we end up Oopsing in decode_attr_mdsthreshold. This patch fixes encode_getfattr_open so that it doesn't request an mdsthreshold when the caller isn't asking for one. It also fixes decode_attr_mdsthreshold so that it errors if the server returns an mdsthreshold that we didn't ask for (instead of Oopsing). Signed-off-by: Trond Myklebust Cc: Andy Adamson --- fs/nfs/nfs4_fs.h | 2 +- fs/nfs/nfs4proc.c | 22 +++++++++++++++++++++- fs/nfs/nfs4xdr.c | 12 ++++++++---- include/linux/nfs_xdr.h | 1 + 4 files changed, 31 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index c6827f93ab5..cc5900ac61b 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -295,7 +295,7 @@ is_ds_client(struct nfs_client *clp) extern const struct nfs4_minor_version_ops *nfs_v4_minor_ops[]; -extern const u32 nfs4_fattr_bitmap[2]; +extern const u32 nfs4_fattr_bitmap[3]; extern const u32 nfs4_statfs_bitmap[2]; extern const u32 nfs4_pathconf_bitmap[2]; extern const u32 nfs4_fsinfo_bitmap[3]; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index d48dbefa0e7..ad1515521a6 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -116,7 +116,7 @@ static int nfs4_map_errors(int err) /* * This is our standard bitmap for GETATTR requests. */ -const u32 nfs4_fattr_bitmap[2] = { +const u32 nfs4_fattr_bitmap[3] = { FATTR4_WORD0_TYPE | FATTR4_WORD0_CHANGE | FATTR4_WORD0_SIZE @@ -133,6 +133,24 @@ const u32 nfs4_fattr_bitmap[2] = { | FATTR4_WORD1_TIME_MODIFY }; +static const u32 nfs4_pnfs_open_bitmap[3] = { + FATTR4_WORD0_TYPE + | FATTR4_WORD0_CHANGE + | FATTR4_WORD0_SIZE + | FATTR4_WORD0_FSID + | FATTR4_WORD0_FILEID, + FATTR4_WORD1_MODE + | FATTR4_WORD1_NUMLINKS + | FATTR4_WORD1_OWNER + | FATTR4_WORD1_OWNER_GROUP + | FATTR4_WORD1_RAWDEV + | FATTR4_WORD1_SPACE_USED + | FATTR4_WORD1_TIME_ACCESS + | FATTR4_WORD1_TIME_METADATA + | FATTR4_WORD1_TIME_MODIFY, + FATTR4_WORD2_MDSTHRESHOLD +}; + const u32 nfs4_statfs_bitmap[2] = { FATTR4_WORD0_FILES_AVAIL | FATTR4_WORD0_FILES_FREE @@ -844,6 +862,7 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.name = &dentry->d_name; p->o_arg.server = server; p->o_arg.bitmask = server->attr_bitmask; + p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; p->o_arg.claim = NFS4_OPEN_CLAIM_NULL; if (attrs != NULL && attrs->ia_valid != 0) { __be32 verf[2]; @@ -1820,6 +1839,7 @@ static int _nfs4_do_open(struct inode *dir, opendata->f_attr.mdsthreshold = pnfs_mdsthreshold_alloc(); if (!opendata->f_attr.mdsthreshold) goto err_opendata_put; + opendata->o_arg.open_bitmap = &nfs4_pnfs_open_bitmap[0]; } if (dentry->d_inode != NULL) opendata->state = nfs4_get_open_state(dentry->d_inode, sp); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index ee4a74db95d..9ca1428da9d 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -1198,12 +1198,13 @@ static void encode_getfattr(struct xdr_stream *xdr, const u32* bitmask, struct c } static void encode_getfattr_open(struct xdr_stream *xdr, const u32 *bitmask, + const u32 *open_bitmap, struct compound_hdr *hdr) { encode_getattr_three(xdr, - bitmask[0] & nfs4_fattr_bitmap[0], - bitmask[1] & nfs4_fattr_bitmap[1], - bitmask[2] & FATTR4_WORD2_MDSTHRESHOLD, + bitmask[0] & open_bitmap[0], + bitmask[1] & open_bitmap[1], + bitmask[2] & open_bitmap[2], hdr); } @@ -2221,7 +2222,7 @@ static void nfs4_xdr_enc_open(struct rpc_rqst *req, struct xdr_stream *xdr, encode_putfh(xdr, args->fh, &hdr); encode_open(xdr, args, &hdr); encode_getfh(xdr, &hdr); - encode_getfattr_open(xdr, args->bitmask, &hdr); + encode_getfattr_open(xdr, args->bitmask, args->open_bitmap, &hdr); encode_nops(&hdr); } @@ -4360,6 +4361,9 @@ static int decode_attr_mdsthreshold(struct xdr_stream *xdr, if (unlikely(bitmap[2] & (FATTR4_WORD2_MDSTHRESHOLD - 1U))) return -EIO; if (likely(bitmap[2] & FATTR4_WORD2_MDSTHRESHOLD)) { + /* Did the server return an unrequested attribute? */ + if (unlikely(res == NULL)) + return -EREMOTEIO; p = xdr_inline_decode(xdr, 4); if (unlikely(!p)) goto out_overflow; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index d1a7bf51c32..7519baef025 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -348,6 +348,7 @@ struct nfs_openargs { const struct qstr * name; const struct nfs_server *server; /* Needed for ID mapping */ const u32 * bitmask; + const u32 * open_bitmap; __u32 claim; struct nfs4_sequence_args seq_args; }; -- cgit v1.2.3-70-g09d2 From d430f7dbf7bd6aaaa40c0660b3204df8cf07b22b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 5 Jun 2012 09:50:28 -0400 Subject: drm/radeon/kms: add new Trinity PCI ids Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie --- drivers/gpu/drm/radeon/ni.c | 21 +++++++++++++++++---- include/drm/drm_pciids.h | 8 ++++++++ 2 files changed, 25 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/radeon/ni.c b/drivers/gpu/drm/radeon/ni.c index 3df4efa1194..3186522a445 100644 --- a/drivers/gpu/drm/radeon/ni.c +++ b/drivers/gpu/drm/radeon/ni.c @@ -460,15 +460,28 @@ static void cayman_gpu_init(struct radeon_device *rdev) rdev->config.cayman.max_pipes_per_simd = 4; rdev->config.cayman.max_tile_pipes = 2; if ((rdev->pdev->device == 0x9900) || - (rdev->pdev->device == 0x9901)) { + (rdev->pdev->device == 0x9901) || + (rdev->pdev->device == 0x9905) || + (rdev->pdev->device == 0x9906) || + (rdev->pdev->device == 0x9907) || + (rdev->pdev->device == 0x9908) || + (rdev->pdev->device == 0x9909) || + (rdev->pdev->device == 0x9910) || + (rdev->pdev->device == 0x9917)) { rdev->config.cayman.max_simds_per_se = 6; rdev->config.cayman.max_backends_per_se = 2; } else if ((rdev->pdev->device == 0x9903) || - (rdev->pdev->device == 0x9904)) { + (rdev->pdev->device == 0x9904) || + (rdev->pdev->device == 0x990A) || + (rdev->pdev->device == 0x9913) || + (rdev->pdev->device == 0x9918)) { rdev->config.cayman.max_simds_per_se = 4; rdev->config.cayman.max_backends_per_se = 2; - } else if ((rdev->pdev->device == 0x9990) || - (rdev->pdev->device == 0x9991)) { + } else if ((rdev->pdev->device == 0x9919) || + (rdev->pdev->device == 0x9990) || + (rdev->pdev->device == 0x9991) || + (rdev->pdev->device == 0x9994) || + (rdev->pdev->device == 0x99A0)) { rdev->config.cayman.max_simds_per_se = 3; rdev->config.cayman.max_backends_per_se = 1; } else { diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h index 58d0bdab68d..961dae0d26e 100644 --- a/include/drm/drm_pciids.h +++ b/include/drm/drm_pciids.h @@ -561,11 +561,19 @@ {0x1002, 0x9909, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x990A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x990F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9910, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9913, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9917, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9918, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x9919, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9990, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9991, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9992, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9993, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9994, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x99A0, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x99A2, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x99A4, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0, 0, 0} #define r128_PCI_IDS \ -- cgit v1.2.3-70-g09d2 From 4a6991cc1fad514745b79181df3ace72d561e7aa Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 5 Jun 2012 09:50:29 -0400 Subject: drm/radeon/kms: add new Palm, Sumo PCI ids Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie --- include/drm/drm_pciids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h index 961dae0d26e..c5b0d8cd056 100644 --- a/include/drm/drm_pciids.h +++ b/include/drm/drm_pciids.h @@ -531,6 +531,7 @@ {0x1002, 0x9645, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SUMO2|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9647, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SUMO|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP},\ {0x1002, 0x9648, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SUMO|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP},\ + {0x1002, 0x9649, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SUMO|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP},\ {0x1002, 0x964a, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SUMO|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x964b, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SUMO|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x964c, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_SUMO|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ @@ -550,6 +551,7 @@ {0x1002, 0x9807, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PALM|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9808, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PALM|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9809, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PALM|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ + {0x1002, 0x980A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_PALM|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9900, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9901, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ {0x1002, 0x9903, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_ARUBA|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP|RADEON_IS_IGP}, \ -- cgit v1.2.3-70-g09d2 From a2bef8ce826dd1e787fd8ad9b6e0566ba59dab43 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 5 Jun 2012 09:50:30 -0400 Subject: drm/radeon/kms: add new BTC PCI ids Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie --- include/drm/drm_pciids.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h index c5b0d8cd056..86c4cf91639 100644 --- a/include/drm/drm_pciids.h +++ b/include/drm/drm_pciids.h @@ -181,6 +181,7 @@ {0x1002, 0x6747, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6748, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6749, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x674A, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6750, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6751, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6758, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_TURKS|RADEON_NEW_MEMMAP}, \ @@ -198,6 +199,7 @@ {0x1002, 0x6767, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6768, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6770, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x6771, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6772, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6778, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6779, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_CAICOS|RADEON_NEW_MEMMAP}, \ -- cgit v1.2.3-70-g09d2 From 7aaa61b3476462b69f1ac7669fcca8d608ce3cb5 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 5 Jun 2012 09:50:31 -0400 Subject: drm/radeon/kms: add new SI PCI ids Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Signed-off-by: Dave Airlie --- include/drm/drm_pciids.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/drm/drm_pciids.h b/include/drm/drm_pciids.h index 86c4cf91639..81368ab6c61 100644 --- a/include/drm/drm_pciids.h +++ b/include/drm/drm_pciids.h @@ -231,10 +231,11 @@ {0x1002, 0x6827, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6828, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6829, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x682B, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x682D, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x682F, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ - {0x1002, 0x6830, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_NEW_MEMMAP}, \ - {0x1002, 0x6831, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x6830, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ + {0x1002, 0x6831, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_IS_MOBILITY|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6837, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6838, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_NEW_MEMMAP}, \ {0x1002, 0x6839, PCI_ANY_ID, PCI_ANY_ID, 0, 0, CHIP_VERDE|RADEON_NEW_MEMMAP}, \ -- cgit v1.2.3-70-g09d2 From fffaee365fded09f9ebf2db19066065fa54323c3 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 5 Jun 2012 21:36:33 +0400 Subject: radix-tree: fix contiguous iterator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes bug in macro radix_tree_for_each_contig(). If radix_tree_next_slot() sees NULL in next slot it returns NULL, but following radix_tree_next_chunk() switches iterating into next chunk. As result iterating becomes non-contiguous and breaks vfs "splice" and all its users. Signed-off-by: Konstantin Khlebnikov Reported-and-bisected-by: Hans de Bruin Reported-and-bisected-by: Ondrej Zary Reported-bisected-and-tested-by: Toralf Förster Link: https://lkml.org/lkml/2012/6/5/64 Cc: stable # 3.4.x Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 5 ++++- lib/radix-tree.c | 3 +++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 0d04cd69ab9..ffc444c38b0 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -368,8 +368,11 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) iter->index++; if (likely(*slot)) return slot; - if (flags & RADIX_TREE_ITER_CONTIG) + if (flags & RADIX_TREE_ITER_CONTIG) { + /* forbid switching to the next chunk */ + iter->next_index = 0; break; + } } } return NULL; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index d7c878cc006..e7964296fd5 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -686,6 +686,9 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, * during iterating; it can be zero only at the beginning. * And we cannot overflow iter->next_index in a single step, * because RADIX_TREE_MAP_SHIFT < BITS_PER_LONG. + * + * This condition also used by radix_tree_next_slot() to stop + * contiguous iterating, and forbid swithing to the next chunk. */ index = iter->next_index; if (!index && iter->index) -- cgit v1.2.3-70-g09d2 From 9bce008bae8b57bc7b007bcc2071d1247a527120 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 5 Jun 2012 18:32:03 -0400 Subject: NFS: Fix a commit bug The new commit code fails to copy the verifier into the wb_verf field of _all_ the nfs_page structures; it only copies it into the first entry. The consequence is that most requests end up failing to match in nfs_commit_release. Fix is to copy the verifier into the req->wb_verf field in nfs_write_completion. Signed-off-by: Trond Myklebust Cc: Fred Isaman --- fs/nfs/direct.c | 4 ++-- fs/nfs/write.c | 7 ++++--- include/linux/nfs_xdr.h | 2 ++ 3 files changed, 8 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 23d170bc44f..b5385a7efd5 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -710,12 +710,12 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) bit = NFS_IOHDR_NEED_RESCHED; else if (dreq->flags == 0) { - memcpy(&dreq->verf, &req->wb_verf, + memcpy(&dreq->verf, hdr->verf, sizeof(dreq->verf)); bit = NFS_IOHDR_NEED_COMMIT; dreq->flags = NFS_ODIRECT_DO_COMMIT; } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) { - if (memcmp(&dreq->verf, &req->wb_verf, sizeof(dreq->verf))) { + if (memcmp(&dreq->verf, hdr->verf, sizeof(dreq->verf))) { dreq->flags = NFS_ODIRECT_RESCHED_WRITES; bit = NFS_IOHDR_NEED_RESCHED; } else diff --git a/fs/nfs/write.c b/fs/nfs/write.c index e6fe3d69d14..4d6861c0dc1 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -80,6 +80,7 @@ struct nfs_write_header *nfs_writehdr_alloc(void) INIT_LIST_HEAD(&hdr->rpc_list); spin_lock_init(&hdr->lock); atomic_set(&hdr->refcnt, 0); + hdr->verf = &p->verf; } return p; } @@ -619,6 +620,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) goto next; } if (test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) { + memcpy(&req->wb_verf, hdr->verf, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo); goto next; } @@ -1255,15 +1257,14 @@ static void nfs_writeback_release_common(void *calldata) struct nfs_write_data *data = calldata; struct nfs_pgio_header *hdr = data->header; int status = data->task.tk_status; - struct nfs_page *req = hdr->req; if ((status >= 0) && nfs_write_need_commit(data)) { spin_lock(&hdr->lock); if (test_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags)) ; /* Do nothing */ else if (!test_and_set_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags)) - memcpy(&req->wb_verf, &data->verf, sizeof(req->wb_verf)); - else if (memcmp(&req->wb_verf, &data->verf, sizeof(req->wb_verf))) + memcpy(hdr->verf, &data->verf, sizeof(*hdr->verf)); + else if (memcmp(hdr->verf, &data->verf, sizeof(*hdr->verf))) set_bit(NFS_IOHDR_NEED_RESCHED, &hdr->flags); spin_unlock(&hdr->lock); } diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 7519baef025..8aadd90b808 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1237,6 +1237,7 @@ struct nfs_pgio_header { struct list_head rpc_list; atomic_t refcnt; struct nfs_page *req; + struct nfs_writeverf *verf; struct pnfs_layout_segment *lseg; loff_t io_start; const struct rpc_call_ops *mds_ops; @@ -1274,6 +1275,7 @@ struct nfs_write_data { struct nfs_write_header { struct nfs_pgio_header header; struct nfs_write_data rpc_data; + struct nfs_writeverf verf; }; struct nfs_mds_commit_info { -- cgit v1.2.3-70-g09d2 From c1174876874dcf8986806e4dad3d7d07af20b439 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 31 May 2012 14:47:33 +0200 Subject: sched: Fix domain iteration Weird topologies can lead to asymmetric domain setups. This needs further consideration since these setups are typically non-minimal too. For now, make it work by adding an extra mask selecting which CPUs are allowed to iterate up. The topology that triggered it is the one from David Rientjes: 10 20 20 30 20 10 20 20 20 20 10 20 30 20 20 10 resulting in boxes that wouldn't even boot. Reported-by: David Rientjes Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-3p86l9cuaqnxz7uxsojmz5rm@git.kernel.org Signed-off-by: Ingo Molnar --- include/linux/sched.h | 11 +++++++++ kernel/sched/core.c | 64 ++++++++++++++++++++++++++++++++++++++++++++------- kernel/sched/fair.c | 5 ++-- kernel/sched/sched.h | 2 ++ 4 files changed, 72 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6029d8c5447..ac321d75347 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -876,6 +876,8 @@ struct sched_group_power { * Number of busy cpus in this group. */ atomic_t nr_busy_cpus; + + unsigned long cpumask[0]; /* iteration mask */ }; struct sched_group { @@ -900,6 +902,15 @@ static inline struct cpumask *sched_group_cpus(struct sched_group *sg) return to_cpumask(sg->cpumask); } +/* + * cpumask masking which cpus in the group are allowed to iterate up the domain + * tree. + */ +static inline struct cpumask *sched_group_mask(struct sched_group *sg) +{ + return to_cpumask(sg->sgp->cpumask); +} + /** * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. * @group: The group whose first cpu is to be returned. diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6546083af3e..781acb91a50 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5994,6 +5994,44 @@ struct sched_domain_topology_level { struct sd_data data; }; +/* + * Build an iteration mask that can exclude certain CPUs from the upwards + * domain traversal. + * + * Asymmetric node setups can result in situations where the domain tree is of + * unequal depth, make sure to skip domains that already cover the entire + * range. + * + * In that case build_sched_domains() will have terminated the iteration early + * and our sibling sd spans will be empty. Domains should always include the + * cpu they're built on, so check that. + * + */ +static void build_group_mask(struct sched_domain *sd, struct sched_group *sg) +{ + const struct cpumask *span = sched_domain_span(sd); + struct sd_data *sdd = sd->private; + struct sched_domain *sibling; + int i; + + for_each_cpu(i, span) { + sibling = *per_cpu_ptr(sdd->sd, i); + if (!cpumask_test_cpu(i, sched_domain_span(sibling))) + continue; + + cpumask_set_cpu(i, sched_group_mask(sg)); + } +} + +/* + * Return the canonical balance cpu for this group, this is the first cpu + * of this group that's also in the iteration mask. + */ +int group_balance_cpu(struct sched_group *sg) +{ + return cpumask_first_and(sched_group_cpus(sg), sched_group_mask(sg)); +} + static int build_overlap_sched_groups(struct sched_domain *sd, int cpu) { @@ -6012,6 +6050,12 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) if (cpumask_test_cpu(i, covered)) continue; + child = *per_cpu_ptr(sdd->sd, i); + + /* See the comment near build_group_mask(). */ + if (!cpumask_test_cpu(i, sched_domain_span(child))) + continue; + sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(), GFP_KERNEL, cpu_to_node(cpu)); @@ -6019,8 +6063,6 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) goto fail; sg_span = sched_group_cpus(sg); - - child = *per_cpu_ptr(sdd->sd, i); if (child->child) { child = child->child; cpumask_copy(sg_span, sched_domain_span(child)); @@ -6030,13 +6072,18 @@ build_overlap_sched_groups(struct sched_domain *sd, int cpu) cpumask_or(covered, covered, sg_span); sg->sgp = *per_cpu_ptr(sdd->sgp, i); - atomic_inc(&sg->sgp->ref); + if (atomic_inc_return(&sg->sgp->ref) == 1) + build_group_mask(sd, sg); + + /* + * Make sure the first group of this domain contains the + * canonical balance cpu. Otherwise the sched_domain iteration + * breaks. See update_sg_lb_stats(). + */ if ((!groups && cpumask_test_cpu(cpu, sg_span)) || - cpumask_first(sg_span) == cpu) { - WARN_ON_ONCE(!cpumask_test_cpu(cpu, sg_span)); + group_balance_cpu(sg) == cpu) groups = sg; - } if (!first) first = sg; @@ -6109,6 +6156,7 @@ build_sched_groups(struct sched_domain *sd, int cpu) cpumask_clear(sched_group_cpus(sg)); sg->sgp->power = 0; + cpumask_setall(sched_group_mask(sg)); for_each_cpu(j, span) { if (get_group(j, sdd, NULL) != group) @@ -6150,7 +6198,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) sg = sg->next; } while (sg != sd->groups); - if (cpu != group_first_cpu(sg)) + if (cpu != group_balance_cpu(sg)) return; update_group_power(sd, cpu); @@ -6525,7 +6573,7 @@ static int __sdt_alloc(const struct cpumask *cpu_map) *per_cpu_ptr(sdd->sg, j) = sg; - sgp = kzalloc_node(sizeof(struct sched_group_power), + sgp = kzalloc_node(sizeof(struct sched_group_power) + cpumask_size(), GFP_KERNEL, cpu_to_node(j)); if (!sgp) return -ENOMEM; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b2a2d236f27..54cbaa4e7b3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3652,7 +3652,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, int i; if (local_group) - balance_cpu = group_first_cpu(group); + balance_cpu = group_balance_cpu(group); /* Tally up the load of all CPUs in the group */ max_cpu_load = 0; @@ -3667,7 +3667,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, /* Bias balancing toward cpus of our domain */ if (local_group) { - if (idle_cpu(i) && !first_idle_cpu) { + if (idle_cpu(i) && !first_idle_cpu && + cpumask_test_cpu(i, sched_group_mask(group))) { first_idle_cpu = 1; balance_cpu = i; } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ba9dccfd24c..6d52cea7f33 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -526,6 +526,8 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain *, sd_llc); DECLARE_PER_CPU(int, sd_llc_id); +extern int group_balance_cpu(struct sched_group *sg); + #endif /* CONFIG_SMP */ #include "stats.h" -- cgit v1.2.3-70-g09d2 From 0b0d9cf6ec7bab91977da2d71c09157f110f7c2e Mon Sep 17 00:00:00 2001 From: Arun Sharma Date: Fri, 20 Apr 2012 15:41:34 -0700 Subject: perf: Limit callchains to 127 Stack depth of 255 seems excessive, given that copy_from_user_nmi() could be slow. Signed-off-by: Arun Sharma Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1334961696-19580-3-git-send-email-asharma@fb.com Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 1817d4015e5..45db49f64bb 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -555,7 +555,7 @@ enum perf_event_type { PERF_RECORD_MAX, /* non-ABI */ }; -#define PERF_MAX_STACK_DEPTH 255 +#define PERF_MAX_STACK_DEPTH 127 enum perf_callchain_context { PERF_CONTEXT_HV = (__u64)-32, -- cgit v1.2.3-70-g09d2 From 55432d2b543a4b6dfae54f5c432a566877a85d90 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Jun 2012 03:00:18 +0000 Subject: inetpeer: fix a race in inetpeer_gc_worker() commit 5faa5df1fa2024 (inetpeer: Invalidate the inetpeer tree along with the routing cache) added a race : Before freeing an inetpeer, we must respect a RCU grace period, and make sure no user will attempt to increase refcnt. inetpeer_invalidate_tree() waits for a RCU grace period before inserting inetpeer tree into gc_list and waking the worker. At that time, no concurrent lookup can find a inetpeer in this tree. Signed-off-by: Eric Dumazet Cc: Steffen Klassert Acked-by: Steffen Klassert Signed-off-by: David S. Miller --- include/net/inetpeer.h | 5 ++++- net/ipv4/inetpeer.c | 16 ++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index b94765e38e8..2040bff945d 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -40,7 +40,10 @@ struct inet_peer { u32 pmtu_orig; u32 pmtu_learned; struct inetpeer_addr_base redirect_learned; - struct list_head gc_list; + union { + struct list_head gc_list; + struct rcu_head gc_rcu; + }; /* * Once inet_peer is queued for deletion (refcnt == -1), following fields * are not available: rid, ip_id_count, tcp_ts, tcp_ts_stamp diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index d4d61b694fa..dfba343b250 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -560,6 +560,17 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) } EXPORT_SYMBOL(inet_peer_xrlim_allow); +static void inetpeer_inval_rcu(struct rcu_head *head) +{ + struct inet_peer *p = container_of(head, struct inet_peer, gc_rcu); + + spin_lock_bh(&gc_lock); + list_add_tail(&p->gc_list, &gc_list); + spin_unlock_bh(&gc_lock); + + schedule_delayed_work(&gc_work, gc_delay); +} + void inetpeer_invalidate_tree(int family) { struct inet_peer *old, *new, *prev; @@ -576,10 +587,7 @@ void inetpeer_invalidate_tree(int family) prev = cmpxchg(&base->root, old, new); if (prev == old) { base->total = 0; - spin_lock(&gc_lock); - list_add_tail(&prev->gc_list, &gc_list); - spin_unlock(&gc_lock); - schedule_delayed_work(&gc_work, gc_delay); + call_rcu(&prev->gc_rcu, inetpeer_inval_rcu); } out: -- cgit v1.2.3-70-g09d2 From fd4b352687fd8604d49c190c4c9ea9e369fd42d5 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 5 May 2012 19:10:35 -0700 Subject: rcu: Update RCU_FAST_NO_HZ tracing for lazy callbacks In the current code, a short dyntick-idle interval (where there is at least one non-lazy callback on the CPU) and a long dyntick-idle interval (where there are only lazy callbacks on the CPU) are traced identically, which can be less than helpful. This commit therefore emits different event traces in these two cases. Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- include/trace/events/rcu.h | 1 + kernel/rcutree_plugin.h | 8 +++++--- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 1480900c511..d274734b2aa 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -289,6 +289,7 @@ TRACE_EVENT(rcu_dyntick, * "In holdoff": Nothing to do, holding off after unsuccessful attempt. * "Begin holdoff": Attempt failed, don't retry until next jiffy. * "Dyntick with callbacks": Entering dyntick-idle despite callbacks. + * "Dyntick with lazy callbacks": Entering dyntick-idle w/lazy callbacks. * "More callbacks": Still more callbacks, try again to clear them out. * "Callbacks drained": All callbacks processed, off to dyntick idle! * "Timer": Timer fired to cause CPU to continue processing callbacks. diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 2411000d986..5449f02c482 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -2165,15 +2165,17 @@ static void rcu_prepare_for_idle(int cpu) !rcu_pending(cpu) && !local_softirq_pending()) { /* Can we go dyntick-idle despite still having callbacks? */ - trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_dyntick_drain, cpu) = 0; per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; - if (rcu_cpu_has_nonlazy_callbacks(cpu)) + if (rcu_cpu_has_nonlazy_callbacks(cpu)) { + trace_rcu_prep_idle("Dyntick with callbacks"); per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies + RCU_IDLE_GP_DELAY; - else + } else { per_cpu(rcu_idle_gp_timer_expires, cpu) = jiffies + RCU_IDLE_LAZY_GP_DELAY; + trace_rcu_prep_idle("Dyntick with lazy callbacks"); + } tp = &per_cpu(rcu_idle_gp_timer, cpu); mod_timer_pinned(tp, per_cpu(rcu_idle_gp_timer_expires, cpu)); per_cpu(rcu_nonlazy_posted_snap, cpu) = -- cgit v1.2.3-70-g09d2 From aa9b16306e3243229580ff889cc59fd66bf77973 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 10 May 2012 16:41:44 -0700 Subject: rcu: Precompute RCU_FAST_NO_HZ timer offsets When a CPU is entering dyntick-idle mode, tick_nohz_stop_sched_tick() calls rcu_needs_cpu() see if RCU needs that CPU, and, if not, computes the next wakeup time based on the timer wheels. Only later, when actually entering the idle loop, rcu_prepare_for_idle() will be invoked. In some cases, rcu_prepare_for_idle() will post timers to wake the CPU back up. But all for naught: The next wakeup time for the CPU has already been computed, and posting a timer afterwards does not force that wakeup time to be recomputed. This means that rcu_prepare_for_idle()'s have no effect. This is not a problem on a busy system because something else will wake up the CPU soon enough. However, on lightly loaded systems, the CPU might stay asleep for a considerable length of time. If that CPU has a callback that the rest of the system is waiting on, the system might run very slowly or (in theory) even hang. This commit avoids this problem by having rcu_needs_cpu() give tick_nohz_stop_sched_tick() an estimate of when RCU will need the CPU to wake back up, which tick_nohz_stop_sched_tick() takes into account when programming the CPU's wakeup time. An alternative approach is for rcu_prepare_for_idle() to use hrtimers instead of normal timers, but timers are much more efficient than are hrtimers for frequently and repeatedly posting and cancelling a given timer, which is exactly what RCU_FAST_NO_HZ does. Reported-by: Pascal Chapperon Reported-by: Heiko Carstens Signed-off-by: Paul E. McKenney Signed-off-by: Paul E. McKenney Tested-by: Heiko Carstens Tested-by: Pascal Chapperon --- include/linux/rcutiny.h | 6 +++-- include/linux/rcutree.h | 2 +- kernel/rcutree_plugin.h | 66 +++++++++++++++++++++++++++++++----------------- kernel/time/tick-sched.c | 7 ++++- 4 files changed, 54 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index adb5e5a38ca..854dc4c5c27 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -87,8 +87,9 @@ static inline void kfree_call_rcu(struct rcu_head *head, #ifdef CONFIG_TINY_RCU -static inline int rcu_needs_cpu(int cpu) +static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { + *delta_jiffies = ULONG_MAX; return 0; } @@ -96,8 +97,9 @@ static inline int rcu_needs_cpu(int cpu) int rcu_preempt_needs_cpu(void); -static inline int rcu_needs_cpu(int cpu) +static inline int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { + *delta_jiffies = ULONG_MAX; return rcu_preempt_needs_cpu(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 3c6083cde4f..952b7933930 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -32,7 +32,7 @@ extern void rcu_init(void); extern void rcu_note_context_switch(int cpu); -extern int rcu_needs_cpu(int cpu); +extern int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies); extern void rcu_cpu_stall_reset(void); /* diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 6bd9637d5d8..5271a020887 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1886,8 +1886,9 @@ static void __cpuinit rcu_prepare_kthreads(int cpu) * Because we not have RCU_FAST_NO_HZ, just check whether this CPU needs * any flavor of RCU. */ -int rcu_needs_cpu(int cpu) +int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) { + *delta_jiffies = ULONG_MAX; return rcu_cpu_has_callbacks(cpu); } @@ -1962,28 +1963,6 @@ static void rcu_idle_count_callbacks_posted(void) #define RCU_IDLE_GP_DELAY 6 /* Roughly one grace period. */ #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ -/* - * Allow the CPU to enter dyntick-idle mode if either: (1) There are no - * callbacks on this CPU, (2) this CPU has not yet attempted to enter - * dyntick-idle mode, or (3) this CPU is in the process of attempting to - * enter dyntick-idle mode. Otherwise, if we have recently tried and failed - * to enter dyntick-idle mode, we refuse to try to enter it. After all, - * it is better to incur scheduling-clock interrupts than to spin - * continuously for the same time duration! - */ -int rcu_needs_cpu(int cpu) -{ - struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); - - /* Flag a new idle sojourn to the idle-entry state machine. */ - rdtp->idle_first_pass = 1; - /* If no callbacks, RCU doesn't need the CPU. */ - if (!rcu_cpu_has_callbacks(cpu)) - return 0; - /* Otherwise, RCU needs the CPU only if it recently tried and failed. */ - return rdtp->dyntick_holdoff == jiffies; -} - /* * Does the specified flavor of RCU have non-lazy callbacks pending on * the specified CPU? Both RCU flavor and CPU are specified by the @@ -2026,6 +2005,47 @@ static bool rcu_cpu_has_nonlazy_callbacks(int cpu) rcu_preempt_cpu_has_nonlazy_callbacks(cpu); } +/* + * Allow the CPU to enter dyntick-idle mode if either: (1) There are no + * callbacks on this CPU, (2) this CPU has not yet attempted to enter + * dyntick-idle mode, or (3) this CPU is in the process of attempting to + * enter dyntick-idle mode. Otherwise, if we have recently tried and failed + * to enter dyntick-idle mode, we refuse to try to enter it. After all, + * it is better to incur scheduling-clock interrupts than to spin + * continuously for the same time duration! + * + * The delta_jiffies argument is used to store the time when RCU is + * going to need the CPU again if it still has callbacks. The reason + * for this is that rcu_prepare_for_idle() might need to post a timer, + * but if so, it will do so after tick_nohz_stop_sched_tick() has set + * the wakeup time for this CPU. This means that RCU's timer can be + * delayed until the wakeup time, which defeats the purpose of posting + * a timer. + */ +int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) +{ + struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); + + /* Flag a new idle sojourn to the idle-entry state machine. */ + rdtp->idle_first_pass = 1; + /* If no callbacks, RCU doesn't need the CPU. */ + if (!rcu_cpu_has_callbacks(cpu)) { + *delta_jiffies = ULONG_MAX; + return 0; + } + if (rdtp->dyntick_holdoff == jiffies) { + /* RCU recently tried and failed, so don't try again. */ + *delta_jiffies = 1; + return 1; + } + /* Set up for the possibility that RCU will post a timer. */ + if (rcu_cpu_has_nonlazy_callbacks(cpu)) + *delta_jiffies = RCU_IDLE_GP_DELAY; + else + *delta_jiffies = RCU_IDLE_LAZY_GP_DELAY; + return 0; +} + /* * Handler for smp_call_function_single(). The only point of this * handler is to wake the CPU up, so the handler does only tracing. diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 6a3a5b9ff56..52f5ebbd443 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -274,6 +274,7 @@ EXPORT_SYMBOL_GPL(get_cpu_iowait_time_us); static void tick_nohz_stop_sched_tick(struct tick_sched *ts) { unsigned long seq, last_jiffies, next_jiffies, delta_jiffies; + unsigned long rcu_delta_jiffies; ktime_t last_update, expires, now; struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; u64 time_delta; @@ -322,7 +323,7 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) time_delta = timekeeping_max_deferment(); } while (read_seqretry(&xtime_lock, seq)); - if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || + if (rcu_needs_cpu(cpu, &rcu_delta_jiffies) || printk_needs_cpu(cpu) || arch_needs_cpu(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; @@ -330,6 +331,10 @@ static void tick_nohz_stop_sched_tick(struct tick_sched *ts) /* Get the next timer wheel timer */ next_jiffies = get_next_timer_interrupt(last_jiffies); delta_jiffies = next_jiffies - last_jiffies; + if (rcu_delta_jiffies < delta_jiffies) { + next_jiffies = last_jiffies + rcu_delta_jiffies; + delta_jiffies = rcu_delta_jiffies; + } } /* * Do not stop the tick, if we are only one off -- cgit v1.2.3-70-g09d2 From d1992b169d31f339dc5ea4e9f312567c8cf322a3 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Thu, 17 May 2012 22:35:46 +0000 Subject: netfilter: xt_HMARK: fix endianness and provide consistent hashing This patch addresses two issues: a) Fix usage of u32 and __be32 that causes endianess warnings via sparse. b) Ensure consistent hashing in a cluster that is composed of big and little endian systems. Thus, we obtain the same hash mark in an heterogeneous cluster. Reported-by: Dan Carpenter Signed-off-by: Hans Schillstrom Signed-off-by: Pablo Neira Ayuso --- include/linux/netfilter/xt_HMARK.h | 5 +++ net/netfilter/xt_HMARK.c | 72 ++++++++++++++++++++++---------------- 2 files changed, 46 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/netfilter/xt_HMARK.h b/include/linux/netfilter/xt_HMARK.h index abb1650940d..826fc580757 100644 --- a/include/linux/netfilter/xt_HMARK.h +++ b/include/linux/netfilter/xt_HMARK.h @@ -27,7 +27,12 @@ union hmark_ports { __u16 src; __u16 dst; } p16; + struct { + __be16 src; + __be16 dst; + } b16; __u32 v32; + __be32 b32; }; struct xt_hmark_info { diff --git a/net/netfilter/xt_HMARK.c b/net/netfilter/xt_HMARK.c index 0a96a43108e..1686ca1b53a 100644 --- a/net/netfilter/xt_HMARK.c +++ b/net/netfilter/xt_HMARK.c @@ -32,13 +32,13 @@ MODULE_ALIAS("ipt_HMARK"); MODULE_ALIAS("ip6t_HMARK"); struct hmark_tuple { - u32 src; - u32 dst; + __be32 src; + __be32 dst; union hmark_ports uports; - uint8_t proto; + u8 proto; }; -static inline u32 hmark_addr6_mask(const __u32 *addr32, const __u32 *mask) +static inline __be32 hmark_addr6_mask(const __be32 *addr32, const __be32 *mask) { return (addr32[0] & mask[0]) ^ (addr32[1] & mask[1]) ^ @@ -46,8 +46,8 @@ static inline u32 hmark_addr6_mask(const __u32 *addr32, const __u32 *mask) (addr32[3] & mask[3]); } -static inline u32 -hmark_addr_mask(int l3num, const __u32 *addr32, const __u32 *mask) +static inline __be32 +hmark_addr_mask(int l3num, const __be32 *addr32, const __be32 *mask) { switch (l3num) { case AF_INET: @@ -58,6 +58,22 @@ hmark_addr_mask(int l3num, const __u32 *addr32, const __u32 *mask) return 0; } +static inline void hmark_swap_ports(union hmark_ports *uports, + const struct xt_hmark_info *info) +{ + union hmark_ports hp; + u16 src, dst; + + hp.b32 = (uports->b32 & info->port_mask.b32) | info->port_set.b32; + src = ntohs(hp.b16.src); + dst = ntohs(hp.b16.dst); + + if (dst > src) + uports->v32 = (dst << 16) | src; + else + uports->v32 = (src << 16) | dst; +} + static int hmark_ct_set_htuple(const struct sk_buff *skb, struct hmark_tuple *t, const struct xt_hmark_info *info) @@ -74,22 +90,19 @@ hmark_ct_set_htuple(const struct sk_buff *skb, struct hmark_tuple *t, otuple = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; rtuple = &ct->tuplehash[IP_CT_DIR_REPLY].tuple; - t->src = hmark_addr_mask(otuple->src.l3num, otuple->src.u3.all, - info->src_mask.all); - t->dst = hmark_addr_mask(otuple->src.l3num, rtuple->src.u3.all, - info->dst_mask.all); + t->src = hmark_addr_mask(otuple->src.l3num, otuple->src.u3.ip6, + info->src_mask.ip6); + t->dst = hmark_addr_mask(otuple->src.l3num, rtuple->src.u3.ip6, + info->dst_mask.ip6); if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3)) return 0; t->proto = nf_ct_protonum(ct); if (t->proto != IPPROTO_ICMP) { - t->uports.p16.src = otuple->src.u.all; - t->uports.p16.dst = rtuple->src.u.all; - t->uports.v32 = (t->uports.v32 & info->port_mask.v32) | - info->port_set.v32; - if (t->uports.p16.dst < t->uports.p16.src) - swap(t->uports.p16.dst, t->uports.p16.src); + t->uports.b16.src = otuple->src.u.all; + t->uports.b16.dst = rtuple->src.u.all; + hmark_swap_ports(&t->uports, info); } return 0; @@ -98,15 +111,19 @@ hmark_ct_set_htuple(const struct sk_buff *skb, struct hmark_tuple *t, #endif } +/* This hash function is endian independent, to ensure consistent hashing if + * the cluster is composed of big and little endian systems. */ static inline u32 hmark_hash(struct hmark_tuple *t, const struct xt_hmark_info *info) { u32 hash; + u32 src = ntohl(t->src); + u32 dst = ntohl(t->dst); - if (t->dst < t->src) - swap(t->src, t->dst); + if (dst < src) + swap(src, dst); - hash = jhash_3words(t->src, t->dst, t->uports.v32, info->hashrnd); + hash = jhash_3words(src, dst, t->uports.v32, info->hashrnd); hash = hash ^ (t->proto & info->proto_mask); return (((u64)hash * info->hmodulus) >> 32) + info->hoffset; @@ -126,11 +143,7 @@ hmark_set_tuple_ports(const struct sk_buff *skb, unsigned int nhoff, if (skb_copy_bits(skb, nhoff, &t->uports, sizeof(t->uports)) < 0) return; - t->uports.v32 = (t->uports.v32 & info->port_mask.v32) | - info->port_set.v32; - - if (t->uports.p16.dst < t->uports.p16.src) - swap(t->uports.p16.dst, t->uports.p16.src); + hmark_swap_ports(&t->uports, info); } #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) @@ -178,8 +191,8 @@ hmark_pkt_set_htuple_ipv6(const struct sk_buff *skb, struct hmark_tuple *t, return -1; } noicmp: - t->src = hmark_addr6_mask(ip6->saddr.s6_addr32, info->src_mask.all); - t->dst = hmark_addr6_mask(ip6->daddr.s6_addr32, info->dst_mask.all); + t->src = hmark_addr6_mask(ip6->saddr.s6_addr32, info->src_mask.ip6); + t->dst = hmark_addr6_mask(ip6->daddr.s6_addr32, info->dst_mask.ip6); if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3)) return 0; @@ -255,11 +268,8 @@ hmark_pkt_set_htuple_ipv4(const struct sk_buff *skb, struct hmark_tuple *t, } } - t->src = (__force u32) ip->saddr; - t->dst = (__force u32) ip->daddr; - - t->src &= info->src_mask.ip; - t->dst &= info->dst_mask.ip; + t->src = ip->saddr & info->src_mask.ip; + t->dst = ip->daddr & info->dst_mask.ip; if (info->flags & XT_HMARK_FLAG(XT_HMARK_METHOD_L3)) return 0; -- cgit v1.2.3-70-g09d2 From bafb282df29c1524b1617019adebd6d0c3eb7a47 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 7 Jun 2012 14:21:11 -0700 Subject: c/r: prctl: update prctl_set_mm_exe_file() after mm->num_exe_file_vmas removal A fix for commit b32dfe377102 ("c/r: prctl: add ability to set new mm_struct::exe_file"). After removing mm->num_exe_file_vmas kernel keeps mm->exe_file until final mmput(), it never becomes NULL while task is alive. We can check for other mapped files in mm instead of checking mm->num_exe_file_vmas, and mark mm with flag MMF_EXE_FILE_CHANGED in order to forbid second changing of mm->exe_file. Signed-off-by: Konstantin Khlebnikov Reviewed-by: Cyrill Gorcunov Cc: Oleg Nesterov Cc: Matt Helsley Cc: Kees Cook Cc: KOSAKI Motohiro Cc: Tejun Heo Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + kernel/sys.c | 31 +++++++++++++++++++------------ 2 files changed, 20 insertions(+), 12 deletions(-) (limited to 'include') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6029d8c5447..c688d4cc2e4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -439,6 +439,7 @@ extern int get_dumpable(struct mm_struct *mm); /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ #define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ +#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) diff --git a/kernel/sys.c b/kernel/sys.c index 9ff89cb9657..54f20fdee93 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1796,17 +1796,11 @@ static bool vma_flags_mismatch(struct vm_area_struct *vma, static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) { + struct vm_area_struct *vma; struct file *exe_file; struct dentry *dentry; int err; - /* - * Setting new mm::exe_file is only allowed when no VM_EXECUTABLE vma's - * remain. So perform a quick test first. - */ - if (mm->num_exe_file_vmas) - return -EBUSY; - exe_file = fget(fd); if (!exe_file) return -EBADF; @@ -1827,17 +1821,30 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd) if (err) goto exit; + down_write(&mm->mmap_sem); + + /* + * Forbid mm->exe_file change if there are mapped other files. + */ + err = -EBUSY; + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (vma->vm_file && !path_equal(&vma->vm_file->f_path, + &exe_file->f_path)) + goto exit_unlock; + } + /* * The symlink can be changed only once, just to disallow arbitrary * transitions malicious software might bring in. This means one * could make a snapshot over all processes running and monitor * /proc/pid/exe changes to notice unusual activity if needed. */ - down_write(&mm->mmap_sem); - if (likely(!mm->exe_file)) - set_mm_exe_file(mm, exe_file); - else - err = -EBUSY; + err = -EPERM; + if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags)) + goto exit_unlock; + + set_mm_exe_file(mm, exe_file); +exit_unlock: up_write(&mm->mmap_sem); exit: -- cgit v1.2.3-70-g09d2 From 300f786b2683f8bb1ec0afb6e1851183a479c86d Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 7 Jun 2012 14:21:12 -0700 Subject: c/r: prctl: add ability to get clear_tid_address Zero is written at clear_tid_address when the process exits. This functionality is used by pthread_join(). We already have sys_set_tid_address() to change this address for the current task but there is no way to obtain it from user space. Without the ability to find this address and dump it we can't restore pthread'ed apps which call pthread_join() once they have been restored. This patch introduces the PR_GET_TID_ADDRESS prctl option which allows the current process to obtain own clear_tid_address. This feature is available iif CONFIG_CHECKPOINT_RESTORE is set. [akpm@linux-foundation.org: fix prctl numbering] Signed-off-by: Andrew Vagin Signed-off-by: Cyrill Gorcunov Cc: Pedro Alves Cc: Oleg Nesterov Cc: Pavel Emelyanov Cc: Tejun Heo Acked-by: Kees Cook Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/prctl.h | 10 ++++++---- kernel/sys.c | 13 +++++++++++++ 2 files changed, 19 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/prctl.h b/include/linux/prctl.h index 711e0a30aac..3988012255d 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -127,8 +127,8 @@ #define PR_SET_PTRACER 0x59616d61 # define PR_SET_PTRACER_ANY ((unsigned long)-1) -#define PR_SET_CHILD_SUBREAPER 36 -#define PR_GET_CHILD_SUBREAPER 37 +#define PR_SET_CHILD_SUBREAPER 36 +#define PR_GET_CHILD_SUBREAPER 37 /* * If no_new_privs is set, then operations that grant new privileges (i.e. @@ -142,7 +142,9 @@ * asking selinux for a specific new context (e.g. with runcon) will result * in execve returning -EPERM. */ -#define PR_SET_NO_NEW_PRIVS 38 -#define PR_GET_NO_NEW_PRIVS 39 +#define PR_SET_NO_NEW_PRIVS 38 +#define PR_GET_NO_NEW_PRIVS 39 + +#define PR_GET_TID_ADDRESS 40 #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/sys.c b/kernel/sys.c index 19a2c713996..0ec1942ba7e 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1988,12 +1988,22 @@ out: up_read(&mm->mmap_sem); return error; } + +static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +{ + return put_user(me->clear_child_tid, tid_addr); +} + #else /* CONFIG_CHECKPOINT_RESTORE */ static int prctl_set_mm(int opt, unsigned long addr, unsigned long arg4, unsigned long arg5) { return -EINVAL; } +static int prctl_get_tid_address(struct task_struct *me, int __user **tid_addr) +{ + return -EINVAL; +} #endif SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, @@ -2131,6 +2141,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, else return -EINVAL; break; + case PR_GET_TID_ADDRESS: + error = prctl_get_tid_address(me, (int __user **)arg2); + break; default: return -EINVAL; } -- cgit v1.2.3-70-g09d2 From ae82fdb1406ad41d68f07027fe31f2d35ba22a90 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Fri, 8 Jun 2012 14:58:13 +0930 Subject: module_param: stop double-calling parameters. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 026cee0086fe1df4cf74691cf273062cc769617d "params: _initcall-like kernel parameters" set old-style module parameters to level 0. And we call those level 0 calls where we used to, early in start_kernel(). We also loop through the initcall levels and call the levelled module_params before the corresponding initcall. Unfortunately level 0 is early_init(), so we call the standard module_param calls twice. (Turns out most things don't care, but at least ubi.mtd does). Change the level to -1 for standard module_param calls. Reported-by: Benoît Thébaudeau Signed-off-by: Rusty Russell Cc: stable@kernel.org --- include/linux/moduleparam.h | 10 +++++----- init/main.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h index 1b14d25162c..d6a58065c09 100644 --- a/include/linux/moduleparam.h +++ b/include/linux/moduleparam.h @@ -128,7 +128,7 @@ struct kparam_array * The ops can have NULL set or get functions. */ #define module_param_cb(name, ops, arg, perm) \ - __module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm, 0) + __module_param_call(MODULE_PARAM_PREFIX, name, ops, arg, perm, -1) /** * _param_cb - general callback for a module/cmdline parameter @@ -192,7 +192,7 @@ struct kparam_array { (void *)set, (void *)get }; \ __module_param_call(MODULE_PARAM_PREFIX, \ name, &__param_ops_##name, arg, \ - (perm) + sizeof(__check_old_set_param(set))*0, 0) + (perm) + sizeof(__check_old_set_param(set))*0, -1) /* We don't get oldget: it's often a new-style param_get_uint, etc. */ static inline int @@ -272,7 +272,7 @@ static inline void __kernel_param_unlock(void) */ #define core_param(name, var, type, perm) \ param_check_##type(name, &(var)); \ - __module_param_call("", name, ¶m_ops_##type, &var, perm, 0) + __module_param_call("", name, ¶m_ops_##type, &var, perm, -1) #endif /* !MODULE */ /** @@ -290,7 +290,7 @@ static inline void __kernel_param_unlock(void) = { len, string }; \ __module_param_call(MODULE_PARAM_PREFIX, name, \ ¶m_ops_string, \ - .str = &__param_string_##name, perm, 0); \ + .str = &__param_string_##name, perm, -1); \ __MODULE_PARM_TYPE(name, "string") /** @@ -432,7 +432,7 @@ extern int param_set_bint(const char *val, const struct kernel_param *kp); __module_param_call(MODULE_PARAM_PREFIX, name, \ ¶m_array_ops, \ .arr = &__param_arr_##name, \ - perm, 0); \ + perm, -1); \ __MODULE_PARM_TYPE(name, "array of " #type) extern struct kernel_param_ops param_array_ops; diff --git a/init/main.c b/init/main.c index 1ca6b32c482..37e12098eac 100644 --- a/init/main.c +++ b/init/main.c @@ -508,7 +508,7 @@ asmlinkage void __init start_kernel(void) parse_early_param(); parse_args("Booting kernel", static_command_line, __start___param, __stop___param - __start___param, - 0, 0, &unknown_bootoption); + -1, -1, &unknown_bootoption); jump_label_init(); -- cgit v1.2.3-70-g09d2 From c8e9cf7bb240049117d2fa64d1540476c289396d Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 7 Jun 2012 12:15:15 +0200 Subject: vga_switcheroo: Add a helper function to get the client state Add vga_switcheroo_get_client_state() to get the current state of the client. This is necessary to determine the proper initial state of audio clients in HD-audio driver. Acked-by: Dave Airlie Signed-off-by: Takashi Iwai --- drivers/gpu/vga/vga_switcheroo.c | 13 +++++++++++++ include/linux/vga_switcheroo.h | 7 +++++++ 2 files changed, 20 insertions(+) (limited to 'include') diff --git a/drivers/gpu/vga/vga_switcheroo.c b/drivers/gpu/vga/vga_switcheroo.c index 38f9534ac51..eb4f64f0a56 100644 --- a/drivers/gpu/vga/vga_switcheroo.c +++ b/drivers/gpu/vga/vga_switcheroo.c @@ -190,6 +190,19 @@ find_active_client(struct list_head *head) return NULL; } +int vga_switcheroo_get_client_state(struct pci_dev *pdev) +{ + struct vga_switcheroo_client *client; + + client = find_client_from_pci(&vgasr_priv.clients, pdev); + if (!client) + return VGA_SWITCHEROO_NOT_FOUND; + if (!vgasr_priv.active) + return VGA_SWITCHEROO_INIT; + return client->pwr_state; +} +EXPORT_SYMBOL(vga_switcheroo_get_client_state); + void vga_switcheroo_unregister_client(struct pci_dev *pdev) { struct vga_switcheroo_client *client; diff --git a/include/linux/vga_switcheroo.h b/include/linux/vga_switcheroo.h index b455c7c212e..b176342ca03 100644 --- a/include/linux/vga_switcheroo.h +++ b/include/linux/vga_switcheroo.h @@ -12,6 +12,9 @@ enum vga_switcheroo_state { VGA_SWITCHEROO_OFF, VGA_SWITCHEROO_ON, + /* below are referred only from vga_switcheroo_get_client_state() */ + VGA_SWITCHEROO_INIT, + VGA_SWITCHEROO_NOT_FOUND, }; enum vga_switcheroo_client_id { @@ -50,6 +53,8 @@ void vga_switcheroo_unregister_handler(void); int vga_switcheroo_process_delayed_switch(void); +int vga_switcheroo_get_client_state(struct pci_dev *dev); + #else static inline void vga_switcheroo_unregister_client(struct pci_dev *dev) {} @@ -62,5 +67,7 @@ static inline int vga_switcheroo_register_audio_client(struct pci_dev *pdev, int id, bool active) { return 0; } static inline void vga_switcheroo_unregister_handler(void) {} static inline int vga_switcheroo_process_delayed_switch(void) { return 0; } +static inline int vga_switcheroo_get_client_state(struct pci_dev *dev) { return VGA_SWITCHEROO_CLIENT_ON; } + #endif -- cgit v1.2.3-70-g09d2 From 505cff00de9c303b95c204eb4544066e3e707911 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 8 Jun 2012 12:49:17 +0200 Subject: vga_switcheroo: Fix error without CONFIG_VGA_SWITCHEROO Fix a typo that is built only when CONFIG_VGA_SWITCHEROO=n. Signed-off-by: Takashi Iwai --- include/linux/vga_switcheroo.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/vga_switcheroo.h b/include/linux/vga_switcheroo.h index b176342ca03..60da41fe9dc 100644 --- a/include/linux/vga_switcheroo.h +++ b/include/linux/vga_switcheroo.h @@ -67,7 +67,7 @@ static inline int vga_switcheroo_register_audio_client(struct pci_dev *pdev, int id, bool active) { return 0; } static inline void vga_switcheroo_unregister_handler(void) {} static inline int vga_switcheroo_process_delayed_switch(void) { return 0; } -static inline int vga_switcheroo_get_client_state(struct pci_dev *dev) { return VGA_SWITCHEROO_CLIENT_ON; } +static inline int vga_switcheroo_get_client_state(struct pci_dev *dev) { return VGA_SWITCHEROO_ON; } #endif -- cgit v1.2.3-70-g09d2 From 8876d6b5f81f4e242a6660da22bbd92f17a8d058 Mon Sep 17 00:00:00 2001 From: Paul Pluzhnikov Date: Sat, 9 Jun 2012 07:53:03 -0700 Subject: net: Make linux/tcp.h C++ friendly (trivial) I originally sent this patch to , but Jiri Kosina did not feel that this is fully appropriate for the trivial tree. Using linux/tcp.h from C++ results in: cat t.cc #include int main() { } g++ -c t.cc In file included from t.cc:1: /usr/include/linux/tcp.h:72: error: '__u32 __fswab32(__u32)' cannot appear in a constant-expression /usr/include/linux/tcp.h:72: error: a function call cannot appear in a constant-expression ... Attached trivial patch fixes this problem. Tested: - the t.cc above compiles with g++ and - the following program generates the same output before/after the patch: #include #include int main () { #define P(a) printf("%s: %08x\n", #a, (int)a) P(TCP_FLAG_CWR); P(TCP_FLAG_ECE); P(TCP_FLAG_URG); P(TCP_FLAG_ACK); P(TCP_FLAG_PSH); P(TCP_FLAG_RST); P(TCP_FLAG_SYN); P(TCP_FLAG_FIN); P(TCP_RESERVED_BITS); P(TCP_DATA_OFFSET); #undef P return 0; } Signed-off-by: Paul Pluzhnikov Signed-off-by: David S. Miller --- include/linux/tcp.h | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4c5b6328337..5f359dbfcdc 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -69,16 +69,16 @@ union tcp_word_hdr { #define tcp_flag_word(tp) ( ((union tcp_word_hdr *)(tp))->words [3]) enum { - TCP_FLAG_CWR = __cpu_to_be32(0x00800000), - TCP_FLAG_ECE = __cpu_to_be32(0x00400000), - TCP_FLAG_URG = __cpu_to_be32(0x00200000), - TCP_FLAG_ACK = __cpu_to_be32(0x00100000), - TCP_FLAG_PSH = __cpu_to_be32(0x00080000), - TCP_FLAG_RST = __cpu_to_be32(0x00040000), - TCP_FLAG_SYN = __cpu_to_be32(0x00020000), - TCP_FLAG_FIN = __cpu_to_be32(0x00010000), - TCP_RESERVED_BITS = __cpu_to_be32(0x0F000000), - TCP_DATA_OFFSET = __cpu_to_be32(0xF0000000) + TCP_FLAG_CWR = __constant_cpu_to_be32(0x00800000), + TCP_FLAG_ECE = __constant_cpu_to_be32(0x00400000), + TCP_FLAG_URG = __constant_cpu_to_be32(0x00200000), + TCP_FLAG_ACK = __constant_cpu_to_be32(0x00100000), + TCP_FLAG_PSH = __constant_cpu_to_be32(0x00080000), + TCP_FLAG_RST = __constant_cpu_to_be32(0x00040000), + TCP_FLAG_SYN = __constant_cpu_to_be32(0x00020000), + TCP_FLAG_FIN = __constant_cpu_to_be32(0x00010000), + TCP_RESERVED_BITS = __constant_cpu_to_be32(0x0F000000), + TCP_DATA_OFFSET = __constant_cpu_to_be32(0xF0000000) }; /* -- cgit v1.2.3-70-g09d2 From 3777808873b0c49c5cf27e44c948dfb02675d578 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 11 Jun 2012 14:29:58 +0900 Subject: bug.h: need linux/kernel.h for TAINT_WARN. asm-generic/bug.h uses taint flags that are only defined in linux/kernel.h, resulting in build failures on platforms that don't include linux/kernel.h some other way: arch/sh/include/asm/thread_info.h:172:2: error: 'TAINT_WARN' undeclared (first use in this function) Caused by commit edd63a2763bd ("set_restore_sigmask() is never called without SIGPENDING (and never should be)"). Reported-by: Stephen Rothwell Cc: Al Viro Signed-off-by: Paul Mundt --- include/asm-generic/bug.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/asm-generic/bug.h b/include/asm-generic/bug.h index 2520a6e241d..9f02005f217 100644 --- a/include/asm-generic/bug.h +++ b/include/asm-generic/bug.h @@ -2,6 +2,7 @@ #define _ASM_GENERIC_BUG_H #include +#include #ifdef CONFIG_BUG -- cgit v1.2.3-70-g09d2 From c5d21c4b2a7765ab0600c8426374b50eb9f4a36f Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Sun, 10 Jun 2012 20:05:24 +0000 Subject: net: Reorder initialization in ip_route_output to fix gcc warning If I build with W=1, for every file that includes , I get the warning include/net/route.h: In function 'ip_route_output': include/net/route.h:135:3: warning: initialized field overwritten [-Woverride-init] include/net/route.h:135:3: warning: (near initialization for 'fl4') [-Woverride-init] (This is with "gcc (Debian 4.6.3-1) 4.6.3") A fix seems pretty trivial: move the initialization of .flowi4_tos earlier. As far as I can tell, this has no effect on code generation. Signed-off-by: Roland Dreier Signed-off-by: David S. Miller --- include/net/route.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/net/route.h b/include/net/route.h index ed2b78e2375..98705468ac0 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -130,9 +130,9 @@ static inline struct rtable *ip_route_output(struct net *net, __be32 daddr, { struct flowi4 fl4 = { .flowi4_oif = oif, + .flowi4_tos = tos, .daddr = daddr, .saddr = saddr, - .flowi4_tos = tos, }; return ip_route_output_key(net, &fl4); } -- cgit v1.2.3-70-g09d2 From fe4561680519019cc15d660862dce513ded2f3a7 Mon Sep 17 00:00:00 2001 From: Paulo Zanoni Date: Tue, 12 Jun 2012 11:27:01 -0300 Subject: drm: increase DRM_OBJECT_MAX_PROPERTY to 24 Before Kernel 3.5, no one was checking for the return value of drm_connector_attach_property, so we never noticed that we were unable to create some properties. Commit "drm: WARN() when drm_connector_attach_property fails" added a WARN when we fail to create a property, and the transition from "connector properties" to "object properties" changed the warning message a little bit. On i915 machines with many TV connectors we hit the maximum number of properties (since each TV connector uses a lot of properties), so we get a few backtraces in our logs. This commit increases the maximum number of properties to 24 hoping we'll have enough room for everybody. Chris suggested that we convert this code to "lists", but I believe this conversion can come after we make sure people's dmesgs are not spammed by our driver. Signed-off-by: Paulo Zanoni Reported-by: Dave Jones Tested-by: Daniel Vetter Signed-off-by: Dave Airlie --- include/drm/drm_crtc.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h index 73e45600f95..bac55c21511 100644 --- a/include/drm/drm_crtc.h +++ b/include/drm/drm_crtc.h @@ -54,7 +54,7 @@ struct drm_mode_object { struct drm_object_properties *properties; }; -#define DRM_OBJECT_MAX_PROPERTY 16 +#define DRM_OBJECT_MAX_PROPERTY 24 struct drm_object_properties { int count; uint32_t ids[DRM_OBJECT_MAX_PROPERTY]; -- cgit v1.2.3-70-g09d2 From 5ee31c6898ea5537fcea160999d60dc63bc0c305 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 12 Jun 2012 06:03:51 +0000 Subject: bonding: Fix corrupted queue_mapping In the transmit path of the bonding driver, skb->cb is used to stash the skb->queue_mapping so that the bonding device can set its own queue mapping. This value becomes corrupted since the skb->cb is also used in __dev_xmit_skb. When transmitting through bonding driver, bond_select_queue is called from dev_queue_xmit. In bond_select_queue the original skb->queue_mapping is copied into skb->cb (via bond_queue_mapping) and skb->queue_mapping is overwritten with the bond driver queue. Subsequently in dev_queue_xmit, __dev_xmit_skb is called which writes the packet length into skb->cb, thereby overwriting the stashed queue mappping. In bond_dev_queue_xmit (called from hard_start_xmit), the queue mapping for the skb is set to the stashed value which is now the skb length and hence is an invalid queue for the slave device. If we want to save skb->queue_mapping into skb->cb[], best place is to add a field in struct qdisc_skb_cb, to make sure it wont conflict with other layers (eg : Qdiscc, Infiniband...) This patchs also makes sure (struct qdisc_skb_cb)->data is aligned on 8 bytes : netem qdisc for example assumes it can store an u64 in it, without misalignment penalty. Note : we only have 20 bytes left in (struct qdisc_skb_cb)->data[]. The largest user is CHOKe and it fills it. Based on a previous patch from Tom Herbert. Signed-off-by: Eric Dumazet Reported-by: Tom Herbert Cc: John Fastabend Cc: Roland Dreier Acked-by: Neil Horman Signed-off-by: David S. Miller --- drivers/net/bonding/bond_main.c | 9 +++++---- include/net/sch_generic.h | 7 +++++-- 2 files changed, 10 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 2ee8cf9e8a3..b9c2ae62166 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -76,6 +76,7 @@ #include #include #include +#include #include "bonding.h" #include "bond_3ad.h" #include "bond_alb.h" @@ -381,8 +382,6 @@ struct vlan_entry *bond_next_vlan(struct bonding *bond, struct vlan_entry *curr) return next; } -#define bond_queue_mapping(skb) (*(u16 *)((skb)->cb)) - /** * bond_dev_queue_xmit - Prepare skb for xmit. * @@ -395,7 +394,9 @@ int bond_dev_queue_xmit(struct bonding *bond, struct sk_buff *skb, { skb->dev = slave_dev; - skb->queue_mapping = bond_queue_mapping(skb); + BUILD_BUG_ON(sizeof(skb->queue_mapping) != + sizeof(qdisc_skb_cb(skb)->bond_queue_mapping)); + skb->queue_mapping = qdisc_skb_cb(skb)->bond_queue_mapping; if (unlikely(netpoll_tx_running(slave_dev))) bond_netpoll_send_skb(bond_get_slave_by_dev(bond, slave_dev), skb); @@ -4171,7 +4172,7 @@ static u16 bond_select_queue(struct net_device *dev, struct sk_buff *skb) /* * Save the original txq to restore before passing to the driver */ - bond_queue_mapping(skb) = skb->queue_mapping; + qdisc_skb_cb(skb)->bond_queue_mapping = skb->queue_mapping; if (unlikely(txq >= dev->real_num_tx_queues)) { do { diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 55ce96b53b0..9d7d54a00e6 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -220,13 +220,16 @@ struct tcf_proto { struct qdisc_skb_cb { unsigned int pkt_len; - unsigned char data[24]; + u16 bond_queue_mapping; + u16 _pad; + unsigned char data[20]; }; static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz) { struct qdisc_skb_cb *qcb; - BUILD_BUG_ON(sizeof(skb->cb) < sizeof(unsigned int) + sz); + + BUILD_BUG_ON(sizeof(skb->cb) < offsetof(struct qdisc_skb_cb, data) + sz); BUILD_BUG_ON(sizeof(qcb->data) < sz); } -- cgit v1.2.3-70-g09d2 From 4149268e7816d719b0fde8e89aaa6db8c168fc43 Mon Sep 17 00:00:00 2001 From: Joern Engel Date: Fri, 18 May 2012 13:57:19 -0700 Subject: target: Add TFO->put_session() caller for HW fabric session shutdown This patch adds an optional target_core_fabric_ops->put_session() caller within the existing target_put_session() code path. This is required by tcm_qla2xxx code in order to invoke it's own fabric specific session shutdown handler using se_session->sess_kref. Signed-off-by: Joern Engel Cc: Roland Dreier Cc: Arun Easi Signed-off-by: Nicholas Bellinger --- drivers/target/target_core_transport.c | 8 +++++++- include/target/target_core_fabric.h | 1 + 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index b05fdc0c05d..634d0f31a28 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -315,7 +315,7 @@ void transport_register_session( } EXPORT_SYMBOL(transport_register_session); -static void target_release_session(struct kref *kref) +void target_release_session(struct kref *kref) { struct se_session *se_sess = container_of(kref, struct se_session, sess_kref); @@ -332,6 +332,12 @@ EXPORT_SYMBOL(target_get_session); void target_put_session(struct se_session *se_sess) { + struct se_portal_group *tpg = se_sess->se_tpg; + + if (tpg->se_tpg_tfo->put_session != NULL) { + tpg->se_tpg_tfo->put_session(se_sess); + return; + } kref_put(&se_sess->sess_kref, target_release_session); } EXPORT_SYMBOL(target_put_session); diff --git a/include/target/target_core_fabric.h b/include/target/target_core_fabric.h index 116959933f4..c78a23333c4 100644 --- a/include/target/target_core_fabric.h +++ b/include/target/target_core_fabric.h @@ -47,6 +47,7 @@ struct target_core_fabric_ops { */ int (*check_stop_free)(struct se_cmd *); void (*release_cmd)(struct se_cmd *); + void (*put_session)(struct se_session *); /* * Called with spin_lock_bh(struct se_portal_group->session_lock held. */ -- cgit v1.2.3-70-g09d2 From c2fb8a3fa25513de8fedb38509b1f15a5bbee47b Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Wed, 13 Jun 2012 11:20:19 -0400 Subject: USB: add NO_D3_DURING_SLEEP flag and revert 151b61284776be2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch (as1558) fixes a problem affecting several ASUS computers: The machine crashes or corrupts memory when going into suspend if the ehci-hcd driver is bound to any controllers. Users have been forced to unbind or unload ehci-hcd before putting their systems to sleep. After extensive testing, it was determined that the machines don't like going into suspend when any EHCI controllers are in the PCI D3 power state. Presumably this is a firmware bug, but there's nothing we can do about it except to avoid putting the controllers in D3 during system sleep. The patch adds a new flag to indicate whether the problem is present, and avoids changing the controller's power state if the flag is set. Runtime suspend is unaffected; this matters only for system suspend. However as a side effect, the controller will not respond to remote wakeup requests while the system is asleep. Hence USB wakeup is not functional -- but of course, this is already true in the current state of affairs. A similar patch has already been applied as commit 151b61284776be2d6f02d48c23c3625678960b97 (USB: EHCI: fix crash during suspend on ASUS computers). The patch supersedes that one and reverts it. There are two differences: The old patch added the flag at the USB level; this patch adds it at the PCI level. The old patch applied to all chipsets with the same vendor, subsystem vendor, and product IDs; this patch makes an exception for a known-good system (based on DMI information). Signed-off-by: Alan Stern Tested-by: Dâniel Fraga Tested-by: Andrey Rahmatullin Tested-by: Steven Rostedt Cc: stable Reviewed-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pci.c | 5 +++++ drivers/pci/quirks.c | 26 ++++++++++++++++++++++++++ drivers/usb/core/hcd-pci.c | 9 --------- drivers/usb/host/ehci-pci.c | 8 -------- include/linux/pci.h | 2 ++ include/linux/usb/hcd.h | 2 -- 6 files changed, 33 insertions(+), 19 deletions(-) (limited to 'include') diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 447e83472c0..77cb54a65cd 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -1744,6 +1744,11 @@ int pci_prepare_to_sleep(struct pci_dev *dev) if (target_state == PCI_POWER_ERROR) return -EIO; + /* Some devices mustn't be in D3 during system sleep */ + if (target_state == PCI_D3hot && + (dev->dev_flags & PCI_DEV_FLAGS_NO_D3_DURING_SLEEP)) + return 0; + pci_enable_wake(dev, target_state, device_may_wakeup(&dev->dev)); error = pci_set_power_state(dev, target_state); diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 2a752167754..194b243a281 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -2929,6 +2929,32 @@ static void __devinit disable_igfx_irq(struct pci_dev *dev) DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x0102, disable_igfx_irq); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x010a, disable_igfx_irq); +/* + * The Intel 6 Series/C200 Series chipset's EHCI controllers on many + * ASUS motherboards will cause memory corruption or a system crash + * if they are in D3 while the system is put into S3 sleep. + */ +static void __devinit asus_ehci_no_d3(struct pci_dev *dev) +{ + const char *sys_info; + static const char good_Asus_board[] = "P8Z68-V"; + + if (dev->dev_flags & PCI_DEV_FLAGS_NO_D3_DURING_SLEEP) + return; + if (dev->subsystem_vendor != PCI_VENDOR_ID_ASUSTEK) + return; + sys_info = dmi_get_system_info(DMI_BOARD_NAME); + if (sys_info && memcmp(sys_info, good_Asus_board, + sizeof(good_Asus_board) - 1) == 0) + return; + + dev_info(&dev->dev, "broken D3 during system sleep on ASUS\n"); + dev->dev_flags |= PCI_DEV_FLAGS_NO_D3_DURING_SLEEP; + device_set_wakeup_capable(&dev->dev, false); +} +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1c26, asus_ehci_no_d3); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, 0x1c2d, asus_ehci_no_d3); + static void pci_do_fixups(struct pci_dev *dev, struct pci_fixup *f, struct pci_fixup *end) { diff --git a/drivers/usb/core/hcd-pci.c b/drivers/usb/core/hcd-pci.c index 57ed9e400c0..622b4a48e73 100644 --- a/drivers/usb/core/hcd-pci.c +++ b/drivers/usb/core/hcd-pci.c @@ -493,15 +493,6 @@ static int hcd_pci_suspend_noirq(struct device *dev) pci_save_state(pci_dev); - /* - * Some systems crash if an EHCI controller is in D3 during - * a sleep transition. We have to leave such controllers in D0. - */ - if (hcd->broken_pci_sleep) { - dev_dbg(dev, "Staying in PCI D0\n"); - return retval; - } - /* If the root hub is dead rather than suspended, disallow remote * wakeup. usb_hc_died() should ensure that both hosts are marked as * dying, so we only need to check the primary roothub. diff --git a/drivers/usb/host/ehci-pci.c b/drivers/usb/host/ehci-pci.c index bc94d7bf072..123481793a4 100644 --- a/drivers/usb/host/ehci-pci.c +++ b/drivers/usb/host/ehci-pci.c @@ -144,14 +144,6 @@ static int ehci_pci_setup(struct usb_hcd *hcd) hcd->has_tt = 1; tdi_reset(ehci); } - if (pdev->subsystem_vendor == PCI_VENDOR_ID_ASUSTEK) { - /* EHCI #1 or #2 on 6 Series/C200 Series chipset */ - if (pdev->device == 0x1c26 || pdev->device == 0x1c2d) { - ehci_info(ehci, "broken D3 during system sleep on ASUS\n"); - hcd->broken_pci_sleep = 1; - device_set_wakeup_capable(&pdev->dev, false); - } - } break; case PCI_VENDOR_ID_TDI: if (pdev->device == PCI_DEVICE_ID_TDI_EHCI) { diff --git a/include/linux/pci.h b/include/linux/pci.h index d8c379dba6a..fefb4e19bf6 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -176,6 +176,8 @@ enum pci_dev_flags { PCI_DEV_FLAGS_NO_D3 = (__force pci_dev_flags_t) 2, /* Provide indication device is assigned by a Virtual Machine Manager */ PCI_DEV_FLAGS_ASSIGNED = (__force pci_dev_flags_t) 4, + /* Device causes system crash if in D3 during S3 sleep */ + PCI_DEV_FLAGS_NO_D3_DURING_SLEEP = (__force pci_dev_flags_t) 8, }; enum pci_irq_reroute_variant { diff --git a/include/linux/usb/hcd.h b/include/linux/usb/hcd.h index 7f855d50cdf..49b3ac29726 100644 --- a/include/linux/usb/hcd.h +++ b/include/linux/usb/hcd.h @@ -126,8 +126,6 @@ struct usb_hcd { unsigned wireless:1; /* Wireless USB HCD */ unsigned authorized_default:1; unsigned has_tt:1; /* Integrated TT in root hub */ - unsigned broken_pci_sleep:1; /* Don't put the - controller in PCI-D3 for system sleep */ unsigned int irq; /* irq allocated */ void __iomem *regs; /* device memory/io */ -- cgit v1.2.3-70-g09d2 From 9b15b817f3d62409290fd56fe3cbb076a931bb0a Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Fri, 15 Jun 2012 17:55:50 -0700 Subject: swap: fix shmem swapping when more than 8 areas Minchan Kim reports that when a system has many swap areas, and tmpfs swaps out to the ninth or more, shmem_getpage_gfp()'s attempts to read back the page cannot locate it, and the read fails with -ENOMEM. Whoops. Yes, I blindly followed read_swap_header()'s pte_to_swp_entry( swp_entry_to_pte()) technique for determining maximum usable swap offset, without stopping to realize that that actually depends upon the pte swap encoding shifting swap offset to the higher bits and truncating it there. Whereas our radix_tree swap encoding leaves offset in the lower bits: it's swap "type" (that is, index of swap area) that was truncated. Fix it by reducing the SWP_TYPE_SHIFT() in swapops.h, and removing the broken radix_to_swp_entry(swp_to_radix_entry()) from read_swap_header(). This does not reduce the usable size of a swap area any further, it leaves it as claimed when making the original commit: no change from 3.0 on x86_64, nor on i386 without PAE; but 3.0's 512GB is reduced to 128GB per swapfile on i386 with PAE. It's not a change I would have risked five years ago, but with x86_64 supported for ten years, I believe it's appropriate now. Hmm, and what if some architecture implements its swap pte with offset encoded below type? That would equally break the maximum usable swap offset check. Happily, they all follow the same tradition of encoding offset above type, but I'll prepare a check on that for next. Reported-and-Reviewed-and-Tested-by: Minchan Kim Signed-off-by: Hugh Dickins Cc: stable@vger.kernel.org [3.1, 3.2, 3.3, 3.4] Signed-off-by: Linus Torvalds --- include/linux/swapops.h | 8 +++++--- mm/swapfile.c | 12 ++++-------- 2 files changed, 9 insertions(+), 11 deletions(-) (limited to 'include') diff --git a/include/linux/swapops.h b/include/linux/swapops.h index 792d16d9cbc..47ead515c81 100644 --- a/include/linux/swapops.h +++ b/include/linux/swapops.h @@ -9,13 +9,15 @@ * get good packing density in that tree, so the index should be dense in * the low-order bits. * - * We arrange the `type' and `offset' fields so that `type' is at the five + * We arrange the `type' and `offset' fields so that `type' is at the seven * high-order bits of the swp_entry_t and `offset' is right-aligned in the - * remaining bits. + * remaining bits. Although `type' itself needs only five bits, we allow for + * shmem/tmpfs to shift it all up a further two bits: see swp_to_radix_entry(). * * swp_entry_t's are *never* stored anywhere in their arch-dependent format. */ -#define SWP_TYPE_SHIFT(e) (sizeof(e.val) * 8 - MAX_SWAPFILES_SHIFT) +#define SWP_TYPE_SHIFT(e) ((sizeof(e.val) * 8) - \ + (MAX_SWAPFILES_SHIFT + RADIX_TREE_EXCEPTIONAL_SHIFT)) #define SWP_OFFSET_MASK(e) ((1UL << SWP_TYPE_SHIFT(e)) - 1) /* diff --git a/mm/swapfile.c b/mm/swapfile.c index de5bc51c4a6..71373d03fce 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1916,24 +1916,20 @@ static unsigned long read_swap_header(struct swap_info_struct *p, /* * Find out how many pages are allowed for a single swap - * device. There are three limiting factors: 1) the number + * device. There are two limiting factors: 1) the number * of bits for the swap offset in the swp_entry_t type, and * 2) the number of bits in the swap pte as defined by the - * the different architectures, and 3) the number of free bits - * in an exceptional radix_tree entry. In order to find the + * different architectures. In order to find the * largest possible bit mask, a swap entry with swap type 0 * and swap offset ~0UL is created, encoded to a swap pte, * decoded to a swp_entry_t again, and finally the swap * offset is extracted. This will mask all the bits from * the initial ~0UL mask that can't be encoded in either * the swp_entry_t or the architecture definition of a - * swap pte. Then the same is done for a radix_tree entry. + * swap pte. */ maxpages = swp_offset(pte_to_swp_entry( - swp_entry_to_pte(swp_entry(0, ~0UL)))); - maxpages = swp_offset(radix_to_swp_entry( - swp_to_radix_entry(swp_entry(0, maxpages)))) + 1; - + swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1; if (maxpages > swap_header->info.last_page) { maxpages = swap_header->info.last_page + 1; /* p->max is an unsigned int: don't overflow it */ -- cgit v1.2.3-70-g09d2 From 6863ca6227598d15c372f1e03449bbb4cfbcca7f Mon Sep 17 00:00:00 2001 From: Krystian Garbaciak Date: Fri, 15 Jun 2012 11:23:56 +0100 Subject: regmap: Add support for register indirect addressing. Devices with register paging or indirectly accessed registers can configure register mapping to map those on virtual address range. During access to virtually mapped register range, indirect addressing is processed automatically, in following steps: 1. selector for page or indirect register is updated (when needed); 2. register in data window is accessed. Configuration should provide minimum and maximum register for virtual range, details of selector field for page selection, minimum and maximum register of data window for indirect access. Virtual range registers are managed by cache as well as direct access registers. In order to make indirect access more efficient, selector register should be declared as non-volatile, if possible. struct regmap_config is extended with the following: struct regmap_range_cfg *ranges; unsigned int n_ranges; [Also reordered debugfs init to later on since the cleanup code was conflicting with the new cleanup code for ranges anyway -- broonie] Signed-off-by: Krystian Garbaciak Signed-off-by: Mark Brown --- drivers/base/regmap/internal.h | 17 ++++ drivers/base/regmap/regmap.c | 201 ++++++++++++++++++++++++++++++++++++++++- include/linux/regmap.h | 39 ++++++++ 3 files changed, 252 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/drivers/base/regmap/internal.h b/drivers/base/regmap/internal.h index b986b8660b0..80f9ab9c3aa 100644 --- a/drivers/base/regmap/internal.h +++ b/drivers/base/regmap/internal.h @@ -95,6 +95,9 @@ struct regmap { /* if set, converts bulk rw to single rw */ bool use_single_rw; + + struct rb_root range_tree; + void *selector_work_buf; /* Scratch buffer used for selector */ }; struct regcache_ops { @@ -115,6 +118,20 @@ bool regmap_precious(struct regmap *map, unsigned int reg); int _regmap_write(struct regmap *map, unsigned int reg, unsigned int val); +struct regmap_range_node { + struct rb_node node; + + unsigned int range_min; + unsigned int range_max; + + unsigned int selector_reg; + unsigned int selector_mask; + int selector_shift; + + unsigned int window_start; + unsigned int window_len; +}; + #ifdef CONFIG_DEBUG_FS extern void regmap_debugfs_initcall(void); extern void regmap_debugfs_init(struct regmap *map, const char *name); diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 652017991da..83a0166420a 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -15,12 +15,17 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include #include "internal.h" +static int _regmap_update_bits(struct regmap *map, unsigned int reg, + unsigned int mask, unsigned int val, + bool *change); + bool regmap_writeable(struct regmap *map, unsigned int reg) { if (map->max_register && reg > map->max_register) @@ -208,6 +213,67 @@ static void dev_get_regmap_release(struct device *dev, void *res) */ } +static bool _regmap_range_add(struct regmap *map, + struct regmap_range_node *data) +{ + struct rb_root *root = &map->range_tree; + struct rb_node **new = &(root->rb_node), *parent = NULL; + + while (*new) { + struct regmap_range_node *this = + container_of(*new, struct regmap_range_node, node); + + parent = *new; + if (data->range_max < this->range_min) + new = &((*new)->rb_left); + else if (data->range_min > this->range_max) + new = &((*new)->rb_right); + else + return false; + } + + rb_link_node(&data->node, parent, new); + rb_insert_color(&data->node, root); + + return true; +} + +static struct regmap_range_node *_regmap_range_lookup(struct regmap *map, + unsigned int reg) +{ + struct rb_node *node = map->range_tree.rb_node; + + while (node) { + struct regmap_range_node *this = + container_of(node, struct regmap_range_node, node); + + if (reg < this->range_min) + node = node->rb_left; + else if (reg > this->range_max) + node = node->rb_right; + else + return this; + } + + return NULL; +} + +static void regmap_range_exit(struct regmap *map) +{ + struct rb_node *next; + struct regmap_range_node *range_node; + + next = rb_first(&map->range_tree); + while (next) { + range_node = rb_entry(next, struct regmap_range_node, node); + next = rb_next(&range_node->node); + rb_erase(&range_node->node, &map->range_tree); + kfree(range_node); + } + + kfree(map->selector_work_buf); +} + /** * regmap_init(): Initialise register map * @@ -227,6 +293,7 @@ struct regmap *regmap_init(struct device *dev, { struct regmap *map, **m; int ret = -EINVAL; + int i, j; if (!bus || !config) goto err; @@ -364,27 +431,88 @@ struct regmap *regmap_init(struct device *dev, goto err_map; } - regmap_debugfs_init(map, config->name); + map->range_tree = RB_ROOT; + for (i = 0; i < config->n_ranges; i++) { + const struct regmap_range_cfg *range_cfg = &config->ranges[i]; + struct regmap_range_node *new; + + /* Sanity check */ + if (range_cfg->range_max < range_cfg->range_min || + range_cfg->range_max > map->max_register || + range_cfg->selector_reg > map->max_register || + range_cfg->window_len == 0) + goto err_range; + + /* Make sure, that this register range has no selector + or data window within its boundary */ + for (j = 0; j < config->n_ranges; j++) { + unsigned sel_reg = config->ranges[j].selector_reg; + unsigned win_min = config->ranges[j].window_start; + unsigned win_max = win_min + + config->ranges[j].window_len - 1; + + if (range_cfg->range_min <= sel_reg && + sel_reg <= range_cfg->range_max) { + goto err_range; + } + + if (!(win_max < range_cfg->range_min || + win_min > range_cfg->range_max)) { + goto err_range; + } + } + + new = kzalloc(sizeof(*new), GFP_KERNEL); + if (new == NULL) { + ret = -ENOMEM; + goto err_range; + } + + new->range_min = range_cfg->range_min; + new->range_max = range_cfg->range_max; + new->selector_reg = range_cfg->selector_reg; + new->selector_mask = range_cfg->selector_mask; + new->selector_shift = range_cfg->selector_shift; + new->window_start = range_cfg->window_start; + new->window_len = range_cfg->window_len; + + if (_regmap_range_add(map, new) == false) { + kfree(new); + goto err_range; + } + + if (map->selector_work_buf == NULL) { + map->selector_work_buf = + kzalloc(map->format.buf_size, GFP_KERNEL); + if (map->selector_work_buf == NULL) { + ret = -ENOMEM; + goto err_range; + } + } + } ret = regcache_init(map, config); if (ret < 0) - goto err_debugfs; + goto err_range; + + regmap_debugfs_init(map, config->name); /* Add a devres resource for dev_get_regmap() */ m = devres_alloc(dev_get_regmap_release, sizeof(*m), GFP_KERNEL); if (!m) { ret = -ENOMEM; - goto err_cache; + goto err_debugfs; } *m = map; devres_add(dev, m); return map; -err_cache: - regcache_exit(map); err_debugfs: regmap_debugfs_exit(map); + regcache_exit(map); +err_range: + regmap_range_exit(map); kfree(map->work_buf); err_map: kfree(map); @@ -481,6 +609,7 @@ void regmap_exit(struct regmap *map) { regcache_exit(map); regmap_debugfs_exit(map); + regmap_range_exit(map); if (map->bus->free_context) map->bus->free_context(map->bus_context); kfree(map->work_buf); @@ -526,6 +655,56 @@ struct regmap *dev_get_regmap(struct device *dev, const char *name) } EXPORT_SYMBOL_GPL(dev_get_regmap); +static int _regmap_select_page(struct regmap *map, unsigned int *reg, + unsigned int val_num) +{ + struct regmap_range_node *range; + void *orig_work_buf; + unsigned int win_offset; + unsigned int win_page; + bool page_chg; + int ret; + + range = _regmap_range_lookup(map, *reg); + if (range) { + win_offset = (*reg - range->range_min) % range->window_len; + win_page = (*reg - range->range_min) / range->window_len; + + if (val_num > 1) { + /* Bulk write shouldn't cross range boundary */ + if (*reg + val_num - 1 > range->range_max) + return -EINVAL; + + /* ... or single page boundary */ + if (val_num > range->window_len - win_offset) + return -EINVAL; + } + + /* It is possible to have selector register inside data window. + In that case, selector register is located on every page and + it needs no page switching, when accessed alone. */ + if (val_num > 1 || + range->window_start + win_offset != range->selector_reg) { + /* Use separate work_buf during page switching */ + orig_work_buf = map->work_buf; + map->work_buf = map->selector_work_buf; + + ret = _regmap_update_bits(map, range->selector_reg, + range->selector_mask, + win_page << range->selector_shift, + &page_chg); + if (ret < 0) + return ret; + + map->work_buf = orig_work_buf; + } + + *reg = range->window_start + win_offset; + } + + return 0; +} + static int _regmap_raw_write(struct regmap *map, unsigned int reg, const void *val, size_t val_len) { @@ -563,6 +742,10 @@ static int _regmap_raw_write(struct regmap *map, unsigned int reg, } } + ret = _regmap_select_page(map, ®, val_len / map->format.val_bytes); + if (ret < 0) + return ret; + map->format.format_reg(map->work_buf, reg, map->reg_shift); u8[0] |= map->write_flag_mask; @@ -626,6 +809,10 @@ int _regmap_write(struct regmap *map, unsigned int reg, trace_regmap_reg_write(map->dev, reg, val); if (map->format.format_write) { + ret = _regmap_select_page(map, ®, 1); + if (ret < 0) + return ret; + map->format.format_write(map, reg, val); trace_regmap_hw_write_start(map->dev, reg, 1); @@ -783,6 +970,10 @@ static int _regmap_raw_read(struct regmap *map, unsigned int reg, void *val, u8 *u8 = map->work_buf; int ret; + ret = _regmap_select_page(map, ®, val_len / map->format.val_bytes); + if (ret < 0) + return ret; + map->format.format_reg(map->work_buf, reg, map->reg_shift); /* diff --git a/include/linux/regmap.h b/include/linux/regmap.h index 56af22ec9ab..5f69d4ad3eb 100644 --- a/include/linux/regmap.h +++ b/include/linux/regmap.h @@ -14,12 +14,14 @@ */ #include +#include struct module; struct device; struct i2c_client; struct spi_device; struct regmap; +struct regmap_range_cfg; /* An enum of all the supported cache types */ enum regcache_type { @@ -84,6 +86,9 @@ struct reg_default { * @reg_defaults_raw: Power on reset values for registers (for use with * register cache support). * @num_reg_defaults_raw: Number of elements in reg_defaults_raw. + * + * @ranges: Array of configuration entries for virtual address ranges. + * @num_ranges: Number of range configuration entries. */ struct regmap_config { const char *name; @@ -109,6 +114,40 @@ struct regmap_config { u8 write_flag_mask; bool use_single_rw; + + const struct regmap_range_cfg *ranges; + unsigned int n_ranges; +}; + +/** + * Configuration for indirectly accessed or paged registers. + * Registers, mapped to this virtual range, are accessed in two steps: + * 1. page selector register update; + * 2. access through data window registers. + * + * @range_min: Address of the lowest register address in virtual range. + * @range_max: Address of the highest register in virtual range. + * + * @page_sel_reg: Register with selector field. + * @page_sel_mask: Bit shift for selector value. + * @page_sel_shift: Bit mask for selector value. + * + * @window_start: Address of first (lowest) register in data window. + * @window_len: Number of registers in data window. + */ +struct regmap_range_cfg { + /* Registers of virtual address range */ + unsigned int range_min; + unsigned int range_max; + + /* Page selector for indirect addressing */ + unsigned int selector_reg; + unsigned int selector_mask; + int selector_shift; + + /* Data window (per each page) */ + unsigned int window_start; + unsigned int window_len; }; typedef int (*regmap_hw_write)(void *context, const void *data, -- cgit v1.2.3-70-g09d2