From 925268a06dc2b1ff7bfcc37419a6827a0e739639 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 11 Jan 2011 16:44:01 +0900 Subject: memory hotplug: one more lock on memory hotplug Now, memory_hotplug_(un)lock() is used for add/remove/offline pages to avoid races with hibernation, but it should be held in online_pages(), too; the current arrangement is asymmetric. There are cases where one has to avoid a race between the memory hotplug notifier and one's own local code, as well as hotplug vs. hotplug races. This adds a generic solution for avoiding such races. Viewed another way, taking the lock here has little impact: onlining pages tends to be done by a udev script or similar against each memory section one by one, so it is better to take the lock here, too. Cc: # 2.6.37 Reviewed-by: Christoph Lameter Acked-by: David Rientjes Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Pekka Enberg --- mm/memory_hotplug.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2c6523af547..83163c096a7 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -407,6 +407,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) int ret; struct memory_notify arg; + lock_memory_hotplug(); arg.start_pfn = pfn; arg.nr_pages = nr_pages; arg.status_change_nid = -1; @@ -419,6 +420,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) ret = notifier_to_errno(ret); if (ret) { memory_notify(MEM_CANCEL_ONLINE, &arg); + unlock_memory_hotplug(); return ret; } /* @@ -443,6 +445,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) printk(KERN_DEBUG "online_pages %lx at %lx failed\n", nr_pages, pfn); memory_notify(MEM_CANCEL_ONLINE, &arg); + unlock_memory_hotplug(); return ret; } @@ -467,6 +470,7 @@ int online_pages(unsigned long pfn, unsigned long nr_pages) if (onlined_pages) memory_notify(MEM_ONLINE, &arg); + unlock_memory_hotplug(); return 0; } -- cgit v1.2.3-70-g09d2 From 04d94879c8a4973b5499dc26b9d38acee8928791 Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Mon, 10 Jan 2011 10:15:15 -0600 Subject: slub: Avoid use of slub_lock in show_slab_objects() The purpose of the locking is to prevent the removal and addition of nodes while statistics are gathered for a slab cache, so we need to avoid racing with memory hotplug functionality. It is enough to take the memory hotplug locks there instead of the slub_lock. online_pages() currently does not acquire the memory_hotplug lock. Another patch will be submitted by the memory hotplug authors to take the memory hotplug lock there and to describe how the lock protects non-hotplug data structures against the addition and removal of nodes.
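Taken together, the two patches converge on one discipline: any code that traverses per-node data brackets the walk with the memory hotplug lock, and the hotplug paths hold the same lock while nodes come and go. A minimal sketch of that pattern follows; walk_node_stats() is a hypothetical placeholder (not a kernel function), while the locking calls and iterator are the real kernel API:

	/*
	 * Sketch: bracket a traversal of per-node structures with the
	 * memory hotplug lock so online_pages()/offline_pages() cannot
	 * add or remove nodes underneath the walk.
	 */
	static void walk_node_stats(void)
	{
		int node;

		lock_memory_hotplug();
		for_each_node_state(node, N_NORMAL_MEMORY) {
			/* node stays valid for the whole locked section */
			pr_info("node %d has normal memory\n", node);
		}
		unlock_memory_hotplug();
	}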
Cc: # 2.6.37 Reported-and-tested-by: Bart Van Assche Acked-by: David Rientjes Signed-off-by: Christoph Lameter Signed-off-by: Pekka Enberg --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index bec0e355fba..96e69071782 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3821,7 +3821,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, } } - down_read(&slub_lock); + lock_memory_hotplug(); #ifdef CONFIG_SLUB_DEBUG if (flags & SO_ALL) { for_each_node_state(node, N_NORMAL_MEMORY) { @@ -3862,7 +3862,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s, x += sprintf(buf + x, " N%d=%lu", node, nodes[node]); #endif - up_read(&slub_lock); + unlock_memory_hotplug(); kfree(nodes); return x + sprintf(buf + x, "\n"); } -- cgit v1.2.3-70-g09d2 From 81e88fdc432a1552401d6e91a984dcccce72b8dc Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 12 Jan 2011 14:44:55 +0800 Subject: ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support Generic Hardware Error Source provides a way to report platform hardware errors (such as those from the chipset). It works in the so-called "Firmware First" mode: hardware errors are reported to the firmware first, and the firmware then reports them to Linux. This way, non-standard hardware error registers or a non-standard hardware link can be checked by the firmware to produce more valuable hardware error information for Linux. This patch adds support for the POLL/IRQ/NMI notification types. Because the memory area used to transfer hardware error information from the BIOS to Linux can be determined only in the NMI, IRQ or timer handler, and the general ioremap can not be used in atomic context, a special atomic version of ioremap is implemented for that. Known issue: - Error information can not be printed for recoverable errors notified via NMI, because printk is not NMI-safe. This will be fixed by delaying the printing to IRQ context via irq_work, or by making printk NMI-safe. v2: - adjust printk format per comments. Signed-off-by: Huang Ying Reviewed-by: Andi Kleen Signed-off-by: Len Brown --- arch/x86/kernel/acpi/boot.c | 1 + arch/x86/kernel/dumpstack.c | 1 + drivers/acpi/apei/ghes.c | 406 +++++++++++++++++++++++++++++++++++--------- kernel/panic.c | 1 + lib/ioremap.c | 2 + mm/vmalloc.c | 1 + 6 files changed, 328 insertions(+), 84 deletions(-) (limited to 'mm') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 71232b941b6..c2d0baa48d8 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -504,6 +504,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) return 0; } +EXPORT_SYMBOL_GPL(acpi_gsi_to_irq); int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) { diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6e8752c1bd5..d34cf80ec40 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -240,6 +240,7 @@ unsigned __kprobes long oops_begin(void) bust_spinlocks(1); return flags; } +EXPORT_SYMBOL_GPL(oops_begin); void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) { diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index 51905d07a4d..d1d484d4a06 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -12,10 +12,6 @@ * For more information about Generic Hardware Error Source, please * refer to ACPI Specification version 4.0, section 17.3.2.6 * - * Now, only SCI notification type and memory errors are - * supported.
More notification type and hardware error type will be - * added later. - * * Copyright 2010 Intel Corp. * Author: Huang Ying * @@ -39,15 +35,18 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include #include #include +#include #include "apei-internal.h" @@ -56,42 +55,131 @@ #define GHES_ESTATUS_MAX_SIZE 65536 /* - * One struct ghes is created for each generic hardware error - * source. - * + * One struct ghes is created for each generic hardware error source. * It provides the context for APEI hardware error timer/IRQ/SCI/NMI - * handler. Handler for one generic hardware error source is only - * triggered after the previous one is done. So handler can uses - * struct ghes without locking. + * handler. * * estatus: memory buffer for error status block, allocated during * HEST parsing. */ #define GHES_TO_CLEAR 0x0001 +#define GHES_EXITING 0x0002 struct ghes { struct acpi_hest_generic *generic; struct acpi_hest_generic_status *estatus; - struct list_head list; u64 buffer_paddr; unsigned long flags; + union { + struct list_head list; + struct timer_list timer; + unsigned int irq; + }; }; +static int ghes_panic_timeout __read_mostly = 30; + /* - * Error source lists, one list for each notification method. The - * members in lists are struct ghes. + * All error sources notified with SCI shares one notifier function, + * so they need to be linked and checked one by one. This is applied + * to NMI too. * - * The list members are only added in HEST parsing and deleted during - * module_exit, that is, single-threaded. So no lock is needed for - * that. - * - * But the mutual exclusion is needed between members adding/deleting - * and timer/IRQ/SCI/NMI handler, which may traverse the list. RCU is - * used for that. + * RCU is used for these lists, so ghes_list_mutex is only used for + * list changing, not for traversing. */ static LIST_HEAD(ghes_sci); +static LIST_HEAD(ghes_nmi); static DEFINE_MUTEX(ghes_list_mutex); +/* + * NMI may be triggered on any CPU, so ghes_nmi_lock is used for + * mutual exclusion. + */ +static DEFINE_RAW_SPINLOCK(ghes_nmi_lock); + +/* + * Because the memory area used to transfer hardware error information + * from BIOS to Linux can be determined only in NMI, IRQ or timer + * handler, but general ioremap can not be used in atomic context, so + * a special version of atomic ioremap is implemented for that. + */ + +/* + * Two virtual pages are used, one for NMI context, the other for + * IRQ/PROCESS context + */ +#define GHES_IOREMAP_PAGES 2 +#define GHES_IOREMAP_NMI_PAGE(base) (base) +#define GHES_IOREMAP_IRQ_PAGE(base) ((base) + PAGE_SIZE) + +/* virtual memory area for atomic ioremap */ +static struct vm_struct *ghes_ioremap_area; +/* + * These 2 spinlock is used to prevent atomic ioremap virtual memory + * area from being mapped simultaneously. 
+ */ +static DEFINE_RAW_SPINLOCK(ghes_ioremap_lock_nmi); +static DEFINE_SPINLOCK(ghes_ioremap_lock_irq); + +static int ghes_ioremap_init(void) +{ + ghes_ioremap_area = __get_vm_area(PAGE_SIZE * GHES_IOREMAP_PAGES, + VM_IOREMAP, VMALLOC_START, VMALLOC_END); + if (!ghes_ioremap_area) { + pr_err(GHES_PFX "Failed to allocate virtual memory area for atomic ioremap.\n"); + return -ENOMEM; + } + + return 0; +} + +static void ghes_ioremap_exit(void) +{ + free_vm_area(ghes_ioremap_area); +} + +static void __iomem *ghes_ioremap_pfn_nmi(u64 pfn) +{ + unsigned long vaddr; + + vaddr = (unsigned long)GHES_IOREMAP_NMI_PAGE(ghes_ioremap_area->addr); + ioremap_page_range(vaddr, vaddr + PAGE_SIZE, + pfn << PAGE_SHIFT, PAGE_KERNEL); + + return (void __iomem *)vaddr; +} + +static void __iomem *ghes_ioremap_pfn_irq(u64 pfn) +{ + unsigned long vaddr; + + vaddr = (unsigned long)GHES_IOREMAP_IRQ_PAGE(ghes_ioremap_area->addr); + ioremap_page_range(vaddr, vaddr + PAGE_SIZE, + pfn << PAGE_SHIFT, PAGE_KERNEL); + + return (void __iomem *)vaddr; +} + +static void ghes_iounmap_nmi(void __iomem *vaddr_ptr) +{ + unsigned long vaddr = (unsigned long __force)vaddr_ptr; + void *base = ghes_ioremap_area->addr; + + BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_NMI_PAGE(base)); + unmap_kernel_range_noflush(vaddr, PAGE_SIZE); + __flush_tlb_one(vaddr); +} + +static void ghes_iounmap_irq(void __iomem *vaddr_ptr) +{ + unsigned long vaddr = (unsigned long __force)vaddr_ptr; + void *base = ghes_ioremap_area->addr; + + BUG_ON(vaddr != (unsigned long)GHES_IOREMAP_IRQ_PAGE(base)); + unmap_kernel_range_noflush(vaddr, PAGE_SIZE); + __flush_tlb_one(vaddr); +} + static struct ghes *ghes_new(struct acpi_hest_generic *generic) { struct ghes *ghes; @@ -102,7 +190,6 @@ static struct ghes *ghes_new(struct acpi_hest_generic *generic) if (!ghes) return ERR_PTR(-ENOMEM); ghes->generic = generic; - INIT_LIST_HEAD(&ghes->list); rc = acpi_pre_map_gar(&generic->error_status_address); if (rc) goto err_free; @@ -159,22 +246,41 @@ static inline int ghes_severity(int severity) } } -/* SCI handler run in work queue, so ioremap can be used here */ -static int ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len, - int from_phys) +static void ghes_copy_tofrom_phys(void *buffer, u64 paddr, u32 len, + int from_phys) { - void *vaddr; - - vaddr = ioremap_cache(paddr, len); - if (!vaddr) - return -ENOMEM; - if (from_phys) - memcpy(buffer, vaddr, len); - else - memcpy(vaddr, buffer, len); - iounmap(vaddr); - - return 0; + void __iomem *vaddr; + unsigned long flags = 0; + int in_nmi = in_nmi(); + u64 offset; + u32 trunk; + + while (len > 0) { + offset = paddr - (paddr & PAGE_MASK); + if (in_nmi) { + raw_spin_lock(&ghes_ioremap_lock_nmi); + vaddr = ghes_ioremap_pfn_nmi(paddr >> PAGE_SHIFT); + } else { + spin_lock_irqsave(&ghes_ioremap_lock_irq, flags); + vaddr = ghes_ioremap_pfn_irq(paddr >> PAGE_SHIFT); + } + trunk = PAGE_SIZE - offset; + trunk = min(trunk, len); + if (from_phys) + memcpy_fromio(buffer, vaddr + offset, trunk); + else + memcpy_toio(vaddr + offset, buffer, trunk); + len -= trunk; + paddr += trunk; + buffer += trunk; + if (in_nmi) { + ghes_iounmap_nmi(vaddr); + raw_spin_unlock(&ghes_ioremap_lock_nmi); + } else { + ghes_iounmap_irq(vaddr); + spin_unlock_irqrestore(&ghes_ioremap_lock_irq, flags); + } + } } static int ghes_read_estatus(struct ghes *ghes, int silent) @@ -195,10 +301,8 @@ static int ghes_read_estatus(struct ghes *ghes, int silent) if (!buf_paddr) return -ENOENT; - rc = ghes_copy_tofrom_phys(ghes->estatus, buf_paddr, - 
sizeof(*ghes->estatus), 1); - if (rc) - return rc; + ghes_copy_tofrom_phys(ghes->estatus, buf_paddr, + sizeof(*ghes->estatus), 1); if (!ghes->estatus->block_status) return -ENOENT; @@ -213,17 +317,15 @@ static int ghes_read_estatus(struct ghes *ghes, int silent) goto err_read_block; if (apei_estatus_check_header(ghes->estatus)) goto err_read_block; - rc = ghes_copy_tofrom_phys(ghes->estatus + 1, - buf_paddr + sizeof(*ghes->estatus), - len - sizeof(*ghes->estatus), 1); - if (rc) - return rc; + ghes_copy_tofrom_phys(ghes->estatus + 1, + buf_paddr + sizeof(*ghes->estatus), + len - sizeof(*ghes->estatus), 1); if (apei_estatus_check(ghes->estatus)) goto err_read_block; rc = 0; err_read_block: - if (rc && !silent) + if (rc && !silent && printk_ratelimit()) pr_warning(FW_WARN GHES_PFX "Failed to read error status block!\n"); return rc; @@ -293,6 +395,42 @@ out: return 0; } +static void ghes_add_timer(struct ghes *ghes) +{ + struct acpi_hest_generic *g = ghes->generic; + unsigned long expire; + + if (!g->notify.poll_interval) { + pr_warning(FW_WARN GHES_PFX "Poll interval is 0 for generic hardware error source: %d, disabled.\n", + g->header.source_id); + return; + } + expire = jiffies + msecs_to_jiffies(g->notify.poll_interval); + ghes->timer.expires = round_jiffies_relative(expire); + add_timer(&ghes->timer); +} + +static void ghes_poll_func(unsigned long data) +{ + struct ghes *ghes = (void *)data; + + ghes_proc(ghes); + if (!(ghes->flags & GHES_EXITING)) + ghes_add_timer(ghes); +} + +static irqreturn_t ghes_irq_func(int irq, void *data) +{ + struct ghes *ghes = data; + int rc; + + rc = ghes_proc(ghes); + if (rc) + return IRQ_NONE; + + return IRQ_HANDLED; +} + static int ghes_notify_sci(struct notifier_block *this, unsigned long event, void *data) { @@ -309,10 +447,63 @@ static int ghes_notify_sci(struct notifier_block *this, return ret; } +static int ghes_notify_nmi(struct notifier_block *this, + unsigned long cmd, void *data) +{ + struct ghes *ghes, *ghes_global = NULL; + int sev, sev_global = -1; + int ret = NOTIFY_DONE; + + if (cmd != DIE_NMI) + return ret; + + raw_spin_lock(&ghes_nmi_lock); + list_for_each_entry_rcu(ghes, &ghes_nmi, list) { + if (ghes_read_estatus(ghes, 1)) { + ghes_clear_estatus(ghes); + continue; + } + sev = ghes_severity(ghes->estatus->error_severity); + if (sev > sev_global) { + sev_global = sev; + ghes_global = ghes; + } + ret = NOTIFY_STOP; + } + + if (ret == NOTIFY_DONE) + goto out; + + if (sev_global >= GHES_SEV_PANIC) { + oops_begin(); + ghes_print_estatus(KERN_EMERG HW_ERR, ghes_global); + /* reboot to log the error! 
*/ + if (panic_timeout == 0) + panic_timeout = ghes_panic_timeout; + panic("Fatal hardware error!"); + } + + list_for_each_entry_rcu(ghes, &ghes_nmi, list) { + if (!(ghes->flags & GHES_TO_CLEAR)) + continue; + /* Do not print estatus because printk is not NMI safe */ + ghes_do_proc(ghes); + ghes_clear_estatus(ghes); + } + +out: + raw_spin_unlock(&ghes_nmi_lock); + return ret; +} + static struct notifier_block ghes_notifier_sci = { .notifier_call = ghes_notify_sci, }; +static struct notifier_block ghes_notifier_nmi = { + .notifier_call = ghes_notify_nmi, +}; + static int __devinit ghes_probe(struct platform_device *ghes_dev) { struct acpi_hest_generic *generic; @@ -323,18 +514,27 @@ static int __devinit ghes_probe(struct platform_device *ghes_dev) if (!generic->enabled) return -ENODEV; - if (generic->error_block_length < - sizeof(struct acpi_hest_generic_status)) { - pr_warning(FW_BUG GHES_PFX -"Invalid error block length: %u for generic hardware error source: %d\n", - generic->error_block_length, + switch (generic->notify.type) { + case ACPI_HEST_NOTIFY_POLLED: + case ACPI_HEST_NOTIFY_EXTERNAL: + case ACPI_HEST_NOTIFY_SCI: + case ACPI_HEST_NOTIFY_NMI: + break; + case ACPI_HEST_NOTIFY_LOCAL: + pr_warning(GHES_PFX "Generic hardware error source: %d notified via local interrupt is not supported!\n", generic->header.source_id); goto err; + default: + pr_warning(FW_WARN GHES_PFX "Unknown notification type: %u for generic hardware error source: %d\n", + generic->notify.type, generic->header.source_id); + goto err; } - if (generic->records_to_preallocate == 0) { - pr_warning(FW_BUG GHES_PFX -"Invalid records to preallocate: %u for generic hardware error source: %d\n", - generic->records_to_preallocate, + + rc = -EIO; + if (generic->error_block_length < + sizeof(struct acpi_hest_generic_status)) { + pr_warning(FW_BUG GHES_PFX "Invalid error block length: %u for generic hardware error source: %d\n", + generic->error_block_length, generic->header.source_id); goto err; } @@ -344,38 +544,43 @@ static int __devinit ghes_probe(struct platform_device *ghes_dev) ghes = NULL; goto err; } - if (generic->notify.type == ACPI_HEST_NOTIFY_SCI) { + switch (generic->notify.type) { + case ACPI_HEST_NOTIFY_POLLED: + ghes->timer.function = ghes_poll_func; + ghes->timer.data = (unsigned long)ghes; + init_timer_deferrable(&ghes->timer); + ghes_add_timer(ghes); + break; + case ACPI_HEST_NOTIFY_EXTERNAL: + /* External interrupt vector is GSI */ + if (acpi_gsi_to_irq(generic->notify.vector, &ghes->irq)) { + pr_err(GHES_PFX "Failed to map GSI to IRQ for generic hardware error source: %d\n", + generic->header.source_id); + goto err; + } + if (request_irq(ghes->irq, ghes_irq_func, + 0, "GHES IRQ", ghes)) { + pr_err(GHES_PFX "Failed to register IRQ for generic hardware error source: %d\n", + generic->header.source_id); + goto err; + } + break; + case ACPI_HEST_NOTIFY_SCI: mutex_lock(&ghes_list_mutex); if (list_empty(&ghes_sci)) register_acpi_hed_notifier(&ghes_notifier_sci); list_add_rcu(&ghes->list, &ghes_sci); mutex_unlock(&ghes_list_mutex); - } else { - unsigned char *notify = NULL; - - switch (generic->notify.type) { - case ACPI_HEST_NOTIFY_POLLED: - notify = "POLL"; - break; - case ACPI_HEST_NOTIFY_EXTERNAL: - case ACPI_HEST_NOTIFY_LOCAL: - notify = "IRQ"; - break; - case ACPI_HEST_NOTIFY_NMI: - notify = "NMI"; - break; - } - if (notify) { - pr_warning(GHES_PFX -"Generic hardware error source: %d notified via %s is not supported!\n", - generic->header.source_id, notify); - } else { - pr_warning(FW_WARN GHES_PFX 
-"Unknown notification type: %u for generic hardware error source: %d\n", - generic->notify.type, generic->header.source_id); - } - rc = -ENODEV; - goto err; + break; + case ACPI_HEST_NOTIFY_NMI: + mutex_lock(&ghes_list_mutex); + if (list_empty(&ghes_nmi)) + register_die_notifier(&ghes_notifier_nmi); + list_add_rcu(&ghes->list, &ghes_nmi); + mutex_unlock(&ghes_list_mutex); + break; + default: + BUG(); } platform_set_drvdata(ghes_dev, ghes); @@ -396,7 +601,14 @@ static int __devexit ghes_remove(struct platform_device *ghes_dev) ghes = platform_get_drvdata(ghes_dev); generic = ghes->generic; + ghes->flags |= GHES_EXITING; switch (generic->notify.type) { + case ACPI_HEST_NOTIFY_POLLED: + del_timer_sync(&ghes->timer); + break; + case ACPI_HEST_NOTIFY_EXTERNAL: + free_irq(ghes->irq, ghes); + break; case ACPI_HEST_NOTIFY_SCI: mutex_lock(&ghes_list_mutex); list_del_rcu(&ghes->list); @@ -404,12 +616,23 @@ static int __devexit ghes_remove(struct platform_device *ghes_dev) unregister_acpi_hed_notifier(&ghes_notifier_sci); mutex_unlock(&ghes_list_mutex); break; + case ACPI_HEST_NOTIFY_NMI: + mutex_lock(&ghes_list_mutex); + list_del_rcu(&ghes->list); + if (list_empty(&ghes_nmi)) + unregister_die_notifier(&ghes_notifier_nmi); + mutex_unlock(&ghes_list_mutex); + /* + * To synchronize with NMI handler, ghes can only be + * freed after NMI handler finishes. + */ + synchronize_rcu(); + break; default: BUG(); break; } - synchronize_rcu(); ghes_fini(ghes); kfree(ghes); @@ -429,6 +652,8 @@ static struct platform_driver ghes_platform_driver = { static int __init ghes_init(void) { + int rc; + if (acpi_disabled) return -ENODEV; @@ -437,12 +662,25 @@ static int __init ghes_init(void) return -EINVAL; } - return platform_driver_register(&ghes_platform_driver); + rc = ghes_ioremap_init(); + if (rc) + goto err; + + rc = platform_driver_register(&ghes_platform_driver); + if (rc) + goto err_ioremap_exit; + + return 0; +err_ioremap_exit: + ghes_ioremap_exit(); +err: + return rc; } static void __exit ghes_exit(void) { platform_driver_unregister(&ghes_platform_driver); + ghes_ioremap_exit(); } module_init(ghes_init); diff --git a/kernel/panic.c b/kernel/panic.c index 4c13b1a88eb..991bb87a170 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -34,6 +34,7 @@ static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); int panic_timeout; +EXPORT_SYMBOL_GPL(panic_timeout); ATOMIC_NOTIFIER_HEAD(panic_notifier_list); diff --git a/lib/ioremap.c b/lib/ioremap.c index 5730ecd3eb6..da4e2ad74b6 100644 --- a/lib/ioremap.c +++ b/lib/ioremap.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include @@ -90,3 +91,4 @@ int ioremap_page_range(unsigned long addr, return err; } +EXPORT_SYMBOL_GPL(ioremap_page_range); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index eb5cc7d00c5..816f074fb4e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1175,6 +1175,7 @@ void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) { vunmap_page_range(addr, addr + size); } +EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush); /** * unmap_kernel_range - unmap kernel VM area and flush cache and TLB -- cgit v1.2.3-70-g09d2 From 88f5acf88ae6a9778f6d25d0d5d7ec2d57764a97 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:41 -0800 Subject: mm: page allocator: adjust the per-cpu counter threshold when memory is low Commit aa45484 ("calculate a better estimate of NR_FREE_PAGES when memory is low") noted that watermarks were based on the vmstat NR_FREE_PAGES. 
To avoid synchronization overhead, these counters are maintained on a per-cpu basis and drained both periodically and when the per-cpu delta rises above a threshold. On large CPU systems, the difference between the estimated and real values of NR_FREE_PAGES can be very high. The system can get into a state where pages are allocated far below the min watermark, potentially causing livelock issues. The commit solved the problem by taking a better reading of NR_FREE_PAGES when memory was low. Unfortunately, as reported by Shaohua Li, this accurate reading can consume a large amount of CPU time on systems with many sockets due to cache line bouncing. This patch takes a different approach. For large machines where counter drift might be unsafe and while kswapd is awake, the per-cpu thresholds for the target pgdat are reduced to limit the level of drift to what should be a safe level. This incurs a performance penalty under heavy memory pressure by a factor that depends on the workload and the machine, but the machine should function correctly without accidentally exhausting all memory on a node. There is an additional cost when kswapd wakes and sleeps, but the event is not expected to be frequent - in Shaohua's test case, at least, there was one recorded sleep and wake event. To ensure that kswapd wakes up, a safe version of zone_watermark_ok() is introduced that takes a more accurate reading of NR_FREE_PAGES when called from wakeup_kswapd(), when deciding whether it is really safe to go back to sleep in sleeping_prematurely(), and when deciding if a zone is really balanced or not in balance_pgdat(). We are still using an expensive function, but we limit how often it is called. When the test case is reproduced, the time spent in the watermark functions is reduced. The following report is on the percentage of time cumulatively spent in the functions zone_nr_free_pages(), zone_watermark_ok(), __zone_watermark_ok(), zone_watermark_ok_safe(), zone_page_state_snapshot(), and zone_page_state(). vanilla 11.6615% disable-threshold 0.2584% David said: : We had to pull aa454840 "mm: page allocator: calculate a better estimate : of NR_FREE_PAGES when memory is low and kswapd is awake" from 2.6.36 : internally because tests showed that it would cause the machine to stall : as the result of heavy kswapd activity. I merged it back with this fix as : it is pending in the -mm tree and it solves the issue we were seeing, so I : definitely think this should be pushed to -stable (and I would seriously : consider it for 2.6.37 inclusion even at this late date).
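To make the scale of the drift concrete, a worked example (the CPU count is illustrative; the 125-page cap on the per-cpu stat threshold appears in the mm/vmstat.c hunk later in this series): with a threshold of 125 pages per CPU, a 64-CPU machine can have its NR_FREE_PAGES estimate deviate from the true value by up to 64 * 125 = 8000 pages, about 31 MB with 4 KB pages. That is easily enough slack to breach a min watermark while the estimate still looks healthy, which is why zone_watermark_ok_safe() falls back to the expensive zone_page_state_snapshot() only once free pages drop below the zone's percpu_drift_mark.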
Signed-off-by: Mel Gorman Reported-by: Shaohua Li Reviewed-by: Christoph Lameter Tested-by: Nicolas Bareil Cc: David Rientjes Cc: Kyle McMartin Cc: [2.6.37.1, 2.6.36.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 10 +++----- include/linux/vmstat.h | 5 ++++ mm/mmzone.c | 21 ---------------- mm/page_alloc.c | 35 ++++++++++++++++++++------ mm/vmscan.c | 23 +++++++++-------- mm/vmstat.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++- 6 files changed, 115 insertions(+), 47 deletions(-) (limited to 'mm') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 39c24ebe9cf..48906629335 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -458,12 +458,6 @@ static inline int zone_is_oom_locked(const struct zone *zone) return test_bit(ZONE_OOM_LOCKED, &zone->flags); } -#ifdef CONFIG_SMP -unsigned long zone_nr_free_pages(struct zone *zone); -#else -#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES) -#endif /* CONFIG_SMP */ - /* * The "priority" of VM scanning is how much of the queues we will scan in one * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the @@ -661,7 +655,9 @@ typedef struct pglist_data { extern struct mutex zonelists_mutex; void build_all_zonelists(void *data); void wakeup_kswapd(struct zone *zone, int order); -int zone_watermark_ok(struct zone *z, int order, unsigned long mark, +bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags); +bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); enum memmap_context { MEMMAP_EARLY, diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index eaaea37b3b7..e4cc21cf587 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -254,6 +254,8 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); void refresh_cpu_vm_stats(int); +void reduce_pgdat_percpu_threshold(pg_data_t *pgdat); +void restore_pgdat_percpu_threshold(pg_data_t *pgdat); #else /* CONFIG_SMP */ /* @@ -298,6 +300,9 @@ static inline void __dec_zone_page_state(struct page *page, #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state +static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { } +static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { } + static inline void refresh_cpu_vm_stats(int cpu) { } #endif diff --git a/mm/mmzone.c b/mm/mmzone.c index e35bfb82c85..f5b7d176021 100644 --- a/mm/mmzone.c +++ b/mm/mmzone.c @@ -87,24 +87,3 @@ int memmap_valid_within(unsigned long pfn, return 1; } #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */ - -#ifdef CONFIG_SMP -/* Called when a more accurate view of NR_FREE_PAGES is needed */ -unsigned long zone_nr_free_pages(struct zone *zone) -{ - unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES); - - /* - * While kswapd is awake, it is considered the zone is under some - * memory pressure. Under pressure, there is a risk that - * per-cpu-counter-drift will allow the min watermark to be breached - * potentially causing a live-lock. 
While kswapd is awake and - * free pages are low, get a better estimate for free pages - */ - if (nr_free_pages < zone->percpu_drift_mark && - !waitqueue_active(&zone->zone_pgdat->kswapd_wait)) - return zone_page_state_snapshot(zone, NR_FREE_PAGES); - - return nr_free_pages; -} -#endif /* CONFIG_SMP */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 826ba6922e8..22a1bb7723e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1460,24 +1460,24 @@ static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order) #endif /* CONFIG_FAIL_PAGE_ALLOC */ /* - * Return 1 if free pages are above 'mark'. This takes into account the order + * Return true if free pages are above 'mark'. This takes into account the order * of the allocation. */ -int zone_watermark_ok(struct zone *z, int order, unsigned long mark, - int classzone_idx, int alloc_flags) +static bool __zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags, long free_pages) { /* free_pages my go negative - that's OK */ long min = mark; - long free_pages = zone_nr_free_pages(z) - (1 << order) + 1; int o; + free_pages -= (1 << order) + 1; if (alloc_flags & ALLOC_HIGH) min -= min / 2; if (alloc_flags & ALLOC_HARDER) min -= min / 4; if (free_pages <= min + z->lowmem_reserve[classzone_idx]) - return 0; + return false; for (o = 0; o < order; o++) { /* At the next order, this order's pages become unavailable */ free_pages -= z->free_area[o].nr_free << o; @@ -1486,9 +1486,28 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark, min >>= 1; if (free_pages <= min) - return 0; + return false; } - return 1; + return true; +} + +bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + zone_page_state(z, NR_FREE_PAGES)); +} + +bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, + int classzone_idx, int alloc_flags) +{ + long free_pages = zone_page_state(z, NR_FREE_PAGES); + + if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark) + free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES); + + return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags, + free_pages); } #ifdef CONFIG_NUMA @@ -2442,7 +2461,7 @@ void show_free_areas(void) " all_unreclaimable? %s" "\n", zone->name, - K(zone_nr_free_pages(zone)), + K(zone_page_state(zone, NR_FREE_PAGES)), K(min_wmark_pages(zone)), K(low_wmark_pages(zone)), K(high_wmark_pages(zone)), diff --git a/mm/vmscan.c b/mm/vmscan.c index 9ca587c6927..5da4295e7d6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2143,7 +2143,7 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) if (zone->all_unreclaimable) continue; - if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) return 1; } @@ -2230,7 +2230,7 @@ loop_again: shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { end_zone = i; break; @@ -2276,7 +2276,7 @@ loop_again: * We put equal pressure on every zone, unless one * zone has way too many pages free already. 
*/ - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, 8*high_wmark_pages(zone), end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; @@ -2297,7 +2297,7 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; /* @@ -2305,7 +2305,7 @@ loop_again: * means that we have a GFP_ATOMIC allocation * failure risk. Hurry up! */ - if (!zone_watermark_ok(zone, order, + if (!zone_watermark_ok_safe(zone, order, min_wmark_pages(zone), end_zone, 0)) has_under_min_watermark_zone = 1; } else { @@ -2448,7 +2448,9 @@ static int kswapd(void *p) */ if (!sleeping_prematurely(pgdat, order, remaining)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); + restore_pgdat_percpu_threshold(pgdat); schedule(); + reduce_pgdat_percpu_threshold(pgdat); } else { if (remaining) count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); @@ -2487,16 +2489,17 @@ void wakeup_kswapd(struct zone *zone, int order) if (!populated_zone(zone)) return; - pgdat = zone->zone_pgdat; - if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; + pgdat = zone->zone_pgdat; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; - trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - return; if (!waitqueue_active(&pgdat->kswapd_wait)) return; + if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) + return; + + trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, zone_idx(zone), order); wake_up_interruptible(&pgdat->kswapd_wait); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 312d728976f..bc0f095791b 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -83,6 +83,30 @@ EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP +static int calculate_pressure_threshold(struct zone *zone) +{ + int threshold; + int watermark_distance; + + /* + * As vmstats are not up to date, there is drift between the estimated + * and real values. For high thresholds and a high number of CPUs, it + * is possible for the min watermark to be breached while the estimated + * value looks fine. 
The pressure threshold is a reduced value such + * that even the maximum amount of drift will not accidentally breach + * the min watermark + */ + watermark_distance = low_wmark_pages(zone) - min_wmark_pages(zone); + threshold = max(1, (int)(watermark_distance / num_online_cpus())); + + /* + * Maximum threshold is 125 + */ + threshold = min(125, threshold); + + return threshold; +} + static int calculate_threshold(struct zone *zone) { int threshold; @@ -161,6 +185,48 @@ static void refresh_zone_stat_thresholds(void) } } +void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) +{ + struct zone *zone; + int cpu; + int threshold; + int i; + + get_online_cpus(); + for (i = 0; i < pgdat->nr_zones; i++) { + zone = &pgdat->node_zones[i]; + if (!zone->percpu_drift_mark) + continue; + + threshold = calculate_pressure_threshold(zone); + for_each_online_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; + } + put_online_cpus(); +} + +void restore_pgdat_percpu_threshold(pg_data_t *pgdat) +{ + struct zone *zone; + int cpu; + int threshold; + int i; + + get_online_cpus(); + for (i = 0; i < pgdat->nr_zones; i++) { + zone = &pgdat->node_zones[i]; + if (!zone->percpu_drift_mark) + continue; + + threshold = calculate_threshold(zone); + for_each_online_cpu(cpu) + per_cpu_ptr(zone->pageset, cpu)->stat_threshold + = threshold; + } + put_online_cpus(); +} + /* * For use when we know that interrupts are disabled. */ @@ -911,7 +977,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n scanned %lu" "\n spanned %lu" "\n present %lu", - zone_nr_free_pages(zone), + zone_page_state(zone, NR_FREE_PAGES), min_wmark_pages(zone), low_wmark_pages(zone), high_wmark_pages(zone), -- cgit v1.2.3-70-g09d2 From b44129b30652c8771db2265939bb8b463724043d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:43 -0800 Subject: mm: vmstat: use a single setter function and callback for adjusting percpu thresholds reduce_pgdat_percpu_threshold() and restore_pgdat_percpu_threshold() exist to adjust the per-cpu vmstat thresholds while kswapd is awake to avoid errors due to counter drift. The functions duplicate some code so this patch replaces them with a single set_pgdat_percpu_threshold() that takes a callback function to calculate the desired threshold as a parameter. 
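With the callback parameter, the two call sites in kswapd() differ only in which calculator they pass, which is the entire point of the consolidation. Mirroring the mm/vmscan.c hunk below, the sleep path reduces to:

	/* going to sleep: restore the cheaper, larger thresholds */
	set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold);
	schedule();
	/* woken under pressure: reduce thresholds to limit counter drift */
	set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold);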
[akpm@linux-foundation.org: readability tweak] [kosaki.motohiro@jp.fujitsu.com: set_pgdat_percpu_threshold(): don't use for_each_online_cpu] Signed-off-by: Mel Gorman Reviewed-by: Christoph Lameter Reviewed-by: KAMEZAWA Hiroyuki Signed-off-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmstat.h | 10 ++++++---- mm/vmscan.c | 19 +++++++++++++++++-- mm/vmstat.c | 36 +++++++----------------------------- 3 files changed, 30 insertions(+), 35 deletions(-) (limited to 'mm') diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index e4cc21cf587..833e676d6d9 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -254,8 +254,11 @@ extern void dec_zone_state(struct zone *, enum zone_stat_item); extern void __dec_zone_state(struct zone *, enum zone_stat_item); void refresh_cpu_vm_stats(int); -void reduce_pgdat_percpu_threshold(pg_data_t *pgdat); -void restore_pgdat_percpu_threshold(pg_data_t *pgdat); + +int calculate_pressure_threshold(struct zone *zone); +int calculate_normal_threshold(struct zone *zone); +void set_pgdat_percpu_threshold(pg_data_t *pgdat, + int (*calculate_pressure)(struct zone *)); #else /* CONFIG_SMP */ /* @@ -300,8 +303,7 @@ static inline void __dec_zone_page_state(struct page *page, #define dec_zone_page_state __dec_zone_page_state #define mod_zone_page_state __mod_zone_page_state -static inline void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) { } -static inline void restore_pgdat_percpu_threshold(pg_data_t *pgdat) { } +#define set_pgdat_percpu_threshold(pgdat, callback) { } static inline void refresh_cpu_vm_stats(int cpu) { } #endif diff --git a/mm/vmscan.c b/mm/vmscan.c index 5da4295e7d6..86f8c341879 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2448,9 +2448,24 @@ static int kswapd(void *p) */ if (!sleeping_prematurely(pgdat, order, remaining)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); - restore_pgdat_percpu_threshold(pgdat); + + /* + * vmstat counters are not perfectly + * accurate and the estimated value + * for counters such as NR_FREE_PAGES + * can deviate from the true value by + * nr_online_cpus * threshold. To + * avoid the zone watermarks being + * breached while under pressure, we + * reduce the per-cpu vmstat threshold + * while kswapd is awake and restore + * them before going back to sleep. 
+ */ + set_pgdat_percpu_threshold(pgdat, + calculate_normal_threshold); schedule(); - reduce_pgdat_percpu_threshold(pgdat); + set_pgdat_percpu_threshold(pgdat, + calculate_pressure_threshold); } else { if (remaining) count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); diff --git a/mm/vmstat.c b/mm/vmstat.c index bc0f095791b..751a65e00aa 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -83,7 +83,7 @@ EXPORT_SYMBOL(vm_stat); #ifdef CONFIG_SMP -static int calculate_pressure_threshold(struct zone *zone) +int calculate_pressure_threshold(struct zone *zone) { int threshold; int watermark_distance; @@ -107,7 +107,7 @@ static int calculate_pressure_threshold(struct zone *zone) return threshold; } -static int calculate_threshold(struct zone *zone) +int calculate_normal_threshold(struct zone *zone) { int threshold; int mem; /* memory in 128 MB units */ @@ -166,7 +166,7 @@ static void refresh_zone_stat_thresholds(void) for_each_populated_zone(zone) { unsigned long max_drift, tolerate_drift; - threshold = calculate_threshold(zone); + threshold = calculate_normal_threshold(zone); for_each_online_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold @@ -185,46 +185,24 @@ static void refresh_zone_stat_thresholds(void) } } -void reduce_pgdat_percpu_threshold(pg_data_t *pgdat) +void set_pgdat_percpu_threshold(pg_data_t *pgdat, + int (*calculate_pressure)(struct zone *)) { struct zone *zone; int cpu; int threshold; int i; - get_online_cpus(); - for (i = 0; i < pgdat->nr_zones; i++) { - zone = &pgdat->node_zones[i]; - if (!zone->percpu_drift_mark) - continue; - - threshold = calculate_pressure_threshold(zone); - for_each_online_cpu(cpu) - per_cpu_ptr(zone->pageset, cpu)->stat_threshold - = threshold; - } - put_online_cpus(); -} - -void restore_pgdat_percpu_threshold(pg_data_t *pgdat) -{ - struct zone *zone; - int cpu; - int threshold; - int i; - - get_online_cpus(); for (i = 0; i < pgdat->nr_zones; i++) { zone = &pgdat->node_zones[i]; if (!zone->percpu_drift_mark) continue; - threshold = calculate_threshold(zone); - for_each_online_cpu(cpu) + threshold = (*calculate_pressure)(zone); + for_each_possible_cpu(cpu) per_cpu_ptr(zone->pageset, cpu)->stat_threshold = threshold; } - put_online_cpus(); } /* -- cgit v1.2.3-70-g09d2 From c3f0da631539b3b8e17f6dda567af9958d49d14f Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Thu, 13 Jan 2011 15:45:49 -0800 Subject: mm/page-writeback.c: fix __set_page_dirty_no_writeback() return value __set_page_dirty_no_writeback() should return true if it actually transitioned the page from a clean to dirty state although it seems nobody uses its return value at present. Signed-off-by: Bob Liu Acked-by: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index b5d8a1f820a..28763b8bdbd 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1103,7 +1103,7 @@ EXPORT_SYMBOL(write_one_page); int __set_page_dirty_no_writeback(struct page *page) { if (!PageDirty(page)) - SetPageDirty(page); + return !TestSetPageDirty(page); return 0; } -- cgit v1.2.3-70-g09d2 From f0bc0a60b13f209df16062f94e9fb4b90dc08708 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 13 Jan 2011 15:45:50 -0800 Subject: vmscan: factor out kswapd sleeping logic from kswapd() Currently, kswapd() has deep nesting and is slightly hard to read. Clean this up. 
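With the sleep logic moved into the new kswapd_try_to_sleep() helper, the main loop flattens to roughly the following shape; this is a condensed sketch of the result of the diff below, with the freezer and reclaim details elided:

	for ( ; ; ) {
		unsigned long new_order = pgdat->kswapd_max_order;

		pgdat->kswapd_max_order = 0;
		if (order < new_order) {
			/* a larger-order wakeup arrived; keep working, don't sleep */
			order = new_order;
		} else {
			kswapd_try_to_sleep(pgdat, order);
			order = pgdat->kswapd_max_order;
		}
		/* ... try_to_freeze(), kthread_should_stop(), balance_pgdat() ... */
	}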
Signed-off-by: KOSAKI Motohiro Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 92 ++++++++++++++++++++++++++++++------------------------------- 1 file changed, 46 insertions(+), 46 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 86f8c341879..cacdf668497 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2371,6 +2371,50 @@ out: return sc.nr_reclaimed; } +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order) +{ + long remaining = 0; + DEFINE_WAIT(wait); + + if (freezing(current) || kthread_should_stop()) + return; + + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + + /* Try to sleep for a short interval */ + if (!sleeping_prematurely(pgdat, order, remaining)) { + remaining = schedule_timeout(HZ/10); + finish_wait(&pgdat->kswapd_wait, &wait); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + } + + /* + * After a short sleep, check if it was a premature sleep. If not, then + * go fully to sleep until explicitly woken up. + */ + if (!sleeping_prematurely(pgdat, order, remaining)) { + trace_mm_vmscan_kswapd_sleep(pgdat->node_id); + + /* + * vmstat counters are not perfectly accurate and the estimated + * value for counters such as NR_FREE_PAGES can deviate from the + * true value by nr_online_cpus * threshold. To avoid the zone + * watermarks being breached while under pressure, we reduce the + * per-cpu vmstat threshold while kswapd is awake and restore + * them before going back to sleep. + */ + set_pgdat_percpu_threshold(pgdat, calculate_normal_threshold); + schedule(); + set_pgdat_percpu_threshold(pgdat, calculate_pressure_threshold); + } else { + if (remaining) + count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); + else + count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); + } + finish_wait(&pgdat->kswapd_wait, &wait); +} + /* * The background pageout daemon, started as a kernel thread * from the init process. @@ -2389,7 +2433,7 @@ static int kswapd(void *p) unsigned long order; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; - DEFINE_WAIT(wait); + struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; @@ -2421,7 +2465,6 @@ static int kswapd(void *p) unsigned long new_order; int ret; - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); new_order = pgdat->kswapd_max_order; pgdat->kswapd_max_order = 0; if (order < new_order) { @@ -2431,52 +2474,9 @@ static int kswapd(void *p) */ order = new_order; } else { - if (!freezing(current) && !kthread_should_stop()) { - long remaining = 0; - - /* Try to sleep for a short interval */ - if (!sleeping_prematurely(pgdat, order, remaining)) { - remaining = schedule_timeout(HZ/10); - finish_wait(&pgdat->kswapd_wait, &wait); - prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); - } - - /* - * After a short sleep, check if it was a - * premature sleep. If not, then go fully - * to sleep until explicitly woken up - */ - if (!sleeping_prematurely(pgdat, order, remaining)) { - trace_mm_vmscan_kswapd_sleep(pgdat->node_id); - - /* - * vmstat counters are not perfectly - * accurate and the estimated value - * for counters such as NR_FREE_PAGES - * can deviate from the true value by - * nr_online_cpus * threshold. To - * avoid the zone watermarks being - * breached while under pressure, we - * reduce the per-cpu vmstat threshold - * while kswapd is awake and restore - * them before going back to sleep. 
- */ - set_pgdat_percpu_threshold(pgdat, - calculate_normal_threshold); - schedule(); - set_pgdat_percpu_threshold(pgdat, - calculate_pressure_threshold); - } else { - if (remaining) - count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); - else - count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); - } - } - + kswapd_try_to_sleep(pgdat, order); order = pgdat->kswapd_max_order; } - finish_wait(&pgdat->kswapd_wait, &wait); ret = try_to_freeze(); if (kthread_should_stop()) -- cgit v1.2.3-70-g09d2 From 9cbb4cb21b19fff46cf1174d0ed699ef710e641c Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 13 Jan 2011 15:45:51 -0800 Subject: mm: find_get_pages_contig fixlet Testing ->mapping and ->index without a ref is not stable as the page may have been reused at this point. Signed-off-by: Nick Piggin Reviewed-by: Wu Fengguang Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index ca389394fa2..1a3dd591472 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -837,9 +837,6 @@ repeat: if (radix_tree_deref_retry(page)) goto restart; - if (page->mapping == NULL || page->index != index) - break; - if (!page_cache_get_speculative(page)) goto repeat; @@ -849,6 +846,16 @@ repeat: goto repeat; } + /* + * must check mapping and index after taking the ref. + * otherwise we can get both false positives and false + * negatives, which is just confusing to the caller. + */ + if (page->mapping == NULL || page->index != index) { + page_cache_release(page); + break; + } + pages[ret] = page; ret++; index++; -- cgit v1.2.3-70-g09d2 From 62c70bce8ac236514c610020bb1ae5b8bde965cb Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 13 Jan 2011 15:45:52 -0800 Subject: mm: convert sprintf_symbol to %pS Signed-off-by: Joe Perches Acked-by: Pekka Enberg Cc: Jiri Kosina Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 11 ++++------- mm/vmalloc.c | 9 ++------- 2 files changed, 6 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 008cd743a36..c7ef0070dd8 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3636,7 +3636,7 @@ static int list_locations(struct kmem_cache *s, char *buf, len += sprintf(buf + len, "%7ld ", l->count); if (l->addr) - len += sprint_symbol(buf + len, (unsigned long)l->addr); + len += sprintf(buf + len, "%pS", (void *)l->addr); else len += sprintf(buf + len, ""); @@ -3946,12 +3946,9 @@ SLAB_ATTR(min_partial); static ssize_t ctor_show(struct kmem_cache *s, char *buf) { - if (s->ctor) { - int n = sprint_symbol(buf, (unsigned long)s->ctor); - - return n + sprintf(buf + n, "\n"); - } - return 0; + if (!s->ctor) + return 0; + return sprintf(buf, "%pS\n", s->ctor); } SLAB_ATTR_RO(ctor); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index eb5cc7d00c5..48245af07b1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2456,13 +2456,8 @@ static int s_show(struct seq_file *m, void *p) seq_printf(m, "0x%p-0x%p %7ld", v->addr, v->addr + v->size, v->size); - if (v->caller) { - char buff[KSYM_SYMBOL_LEN]; - - seq_putc(m, ' '); - sprint_symbol(buff, (unsigned long)v->caller); - seq_puts(m, buff); - } + if (v->caller) + seq_printf(m, " %pS", v->caller); if (v->nr_pages) seq_printf(m, " pages=%d", v->nr_pages); -- cgit v1.2.3-70-g09d2 From b7aba6984dc048503b69c2a885098cdd430832bf Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:54 -0800 Subject: mm: compaction: add trace events for memory compaction 
activity In preparation for patches promoting the use of memory compaction over lumpy reclaim, this patch adds trace points for memory compaction activity. Using them, we can monitor the scanning activity of the migration and free page scanners as well as the number and success rates of pages passed to page migration. Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/compaction.h | 74 +++++++++++++++++++++++++++++++++++++++ mm/compaction.c | 14 +++++++- 2 files changed, 87 insertions(+), 1 deletion(-) create mode 100644 include/trace/events/compaction.h (limited to 'mm') diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h new file mode 100644 index 00000000000..388bcdd26d4 --- /dev/null +++ b/include/trace/events/compaction.h @@ -0,0 +1,74 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM compaction + +#if !defined(_TRACE_COMPACTION_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_COMPACTION_H + +#include +#include +#include "gfpflags.h" + +DECLARE_EVENT_CLASS(mm_compaction_isolate_template, + + TP_PROTO(unsigned long nr_scanned, + unsigned long nr_taken), + + TP_ARGS(nr_scanned, nr_taken), + + TP_STRUCT__entry( + __field(unsigned long, nr_scanned) + __field(unsigned long, nr_taken) + ), + + TP_fast_assign( + __entry->nr_scanned = nr_scanned; + __entry->nr_taken = nr_taken; + ), + + TP_printk("nr_scanned=%lu nr_taken=%lu", + __entry->nr_scanned, + __entry->nr_taken) +); + +DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_migratepages, + + TP_PROTO(unsigned long nr_scanned, + unsigned long nr_taken), + + TP_ARGS(nr_scanned, nr_taken) +); + +DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, + TP_PROTO(unsigned long nr_scanned, + unsigned long nr_taken), + + TP_ARGS(nr_scanned, nr_taken) +); + +TRACE_EVENT(mm_compaction_migratepages, + + TP_PROTO(unsigned long nr_migrated, + unsigned long nr_failed), + + TP_ARGS(nr_migrated, nr_failed), + + TP_STRUCT__entry( + __field(unsigned long, nr_migrated) + __field(unsigned long, nr_failed) + ), + + TP_fast_assign( + __entry->nr_migrated = nr_migrated; + __entry->nr_failed = nr_failed; + ), + + TP_printk("nr_migrated=%lu nr_failed=%lu", + __entry->nr_migrated, + __entry->nr_failed) +); + + +#endif /* _TRACE_COMPACTION_H */ + +/* This part must be outside protection */ +#include diff --git a/mm/compaction.c b/mm/compaction.c index 1a8894eadf7..20011a850fe 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -16,6 +16,9 @@ #include #include "internal.h" +#define CREATE_TRACE_POINTS +#include + /* * compact_control is used to track pages being migrated and the free pages * they are being migrated to during memory compaction.
The free_pfn starts @@ -60,7 +63,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, struct list_head *freelist) { unsigned long zone_end_pfn, end_pfn; - int total_isolated = 0; + int nr_scanned = 0, total_isolated = 0; struct page *cursor; /* Get the last PFN we should scan for free pages at */ @@ -81,6 +84,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, if (!pfn_valid_within(blockpfn)) continue; + nr_scanned++; if (!PageBuddy(page)) continue; @@ -100,6 +104,7 @@ static unsigned long isolate_freepages_block(struct zone *zone, } } + trace_mm_compaction_isolate_freepages(nr_scanned, total_isolated); return total_isolated; } @@ -234,6 +239,7 @@ static unsigned long isolate_migratepages(struct zone *zone, struct compact_control *cc) { unsigned long low_pfn, end_pfn; + unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; /* Do not scan outside zone boundaries */ @@ -266,6 +272,7 @@ static unsigned long isolate_migratepages(struct zone *zone, struct page *page; if (!pfn_valid_within(low_pfn)) continue; + nr_scanned++; /* Get the page and skip if free */ page = pfn_to_page(low_pfn); @@ -280,6 +287,7 @@ static unsigned long isolate_migratepages(struct zone *zone, del_page_from_lru_list(zone, page, page_lru(page)); list_add(&page->lru, migratelist); cc->nr_migratepages++; + nr_isolated++; /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) @@ -291,6 +299,8 @@ static unsigned long isolate_migratepages(struct zone *zone, spin_unlock_irq(&zone->lru_lock); cc->migrate_pfn = low_pfn; + trace_mm_compaction_isolate_migratepages(nr_scanned, nr_isolated); + return cc->nr_migratepages; } @@ -401,6 +411,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) count_vm_events(COMPACTPAGES, nr_migrate - nr_remaining); if (nr_remaining) count_vm_events(COMPACTPAGEFAILED, nr_remaining); + trace_mm_compaction_migratepages(nr_migrate - nr_remaining, + nr_remaining); /* Release LRU pages not migrated */ if (!list_empty(&cc->migratepages)) { -- cgit v1.2.3-70-g09d2 From ee64fc9354e515a79c7232cfde65c88ec627308b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:55 -0800 Subject: mm: vmscan: convert lumpy_mode into a bitmask Currently lumpy_mode is an enum and determines if lumpy reclaim is off, synchronous or asynchronous. In preparation for using compaction instead of lumpy reclaim, this patch converts the flags into a bitmask. Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 6 +++--- mm/vmscan.c | 46 ++++++++++++++++++++++++++----------------- 2 files changed, 31 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index c255fcc587b..be76429962c 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -25,13 +25,13 @@ #define trace_reclaim_flags(page, sync) ( \ (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ - (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync & LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) #define trace_shrink_flags(file, sync) ( \ - (sync == LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \ + (sync & LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ - (sync == LUMPY_MODE_SYNC ?
RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync & LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) TRACE_EVENT(mm_vmscan_kswapd_sleep, diff --git a/mm/vmscan.c b/mm/vmscan.c index cacdf668497..3464312bde0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -51,11 +51,20 @@ #define CREATE_TRACE_POINTS #include -enum lumpy_mode { - LUMPY_MODE_NONE, - LUMPY_MODE_ASYNC, - LUMPY_MODE_SYNC, -}; +/* + * lumpy_mode determines how the inactive list is shrunk + * LUMPY_MODE_SINGLE: Reclaim only order-0 pages + * LUMPY_MODE_ASYNC: Do not block + * LUMPY_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback + * LUMPY_MODE_CONTIGRECLAIM: For high-order allocations, take a reference + * page from the LRU and reclaim all pages within a + * naturally aligned range + */ +typedef unsigned __bitwise__ lumpy_mode; +#define LUMPY_MODE_SINGLE ((__force lumpy_mode)0x01u) +#define LUMPY_MODE_ASYNC ((__force lumpy_mode)0x02u) +#define LUMPY_MODE_SYNC ((__force lumpy_mode)0x04u) +#define LUMPY_MODE_CONTIGRECLAIM ((__force lumpy_mode)0x08u) struct scan_control { /* Incremented by the number of inactive pages that were scanned */ @@ -88,7 +97,7 @@ struct scan_control { * Intend to reclaim enough continuous memory rather than reclaim * enough amount of memory. i.e, mode for high order allocation. */ - enum lumpy_mode lumpy_reclaim_mode; + lumpy_mode lumpy_reclaim_mode; /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -274,13 +283,13 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, bool sync) { - enum lumpy_mode mode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; + lumpy_mode syncmode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; /* * Some reclaim have alredy been failed. No worth to try synchronous * lumpy reclaim. */ - if (sync && sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) + if (sync && sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE) return; /* @@ -288,17 +297,18 @@ static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, * trouble getting a small set of contiguous pages, we * will reclaim both active and inactive pages. */ + sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM; if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - sc->lumpy_reclaim_mode = mode; + sc->lumpy_reclaim_mode |= syncmode; else if (sc->order && priority < DEF_PRIORITY - 2) - sc->lumpy_reclaim_mode = mode; + sc->lumpy_reclaim_mode |= syncmode; else - sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; + sc->lumpy_reclaim_mode = LUMPY_MODE_SINGLE | LUMPY_MODE_ASYNC; } static void disable_lumpy_reclaim_mode(struct scan_control *sc) { - sc->lumpy_reclaim_mode = LUMPY_MODE_NONE; + sc->lumpy_reclaim_mode = LUMPY_MODE_SINGLE | LUMPY_MODE_ASYNC; } static inline int is_page_cache_freeable(struct page *page) @@ -429,7 +439,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * first attempt to free a range of pages fails. 
*/ if (PageWriteback(page) && - sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC) + (sc->lumpy_reclaim_mode & LUMPY_MODE_SYNC)) wait_on_page_writeback(page); if (!PageWriteback(page)) { @@ -622,7 +632,7 @@ static enum page_references page_check_references(struct page *page, referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ - if (sc->lumpy_reclaim_mode != LUMPY_MODE_NONE) + if (sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM) return PAGEREF_RECLAIM; /* @@ -739,7 +749,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * for any page for which writeback has already * started. */ - if (sc->lumpy_reclaim_mode == LUMPY_MODE_SYNC && + if ((sc->lumpy_reclaim_mode & LUMPY_MODE_SYNC) && may_enter_fs) wait_on_page_writeback(page); else { @@ -1324,7 +1334,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, return false; /* Only stall on lumpy reclaim */ - if (sc->lumpy_reclaim_mode == LUMPY_MODE_NONE) + if (sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE) return false; /* If we have relaimed everything on the isolated list, no stall */ @@ -1375,7 +1385,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? + sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ? ISOLATE_INACTIVE : ISOLATE_BOTH, zone, 0, file); zone->pages_scanned += nr_scanned; @@ -1388,7 +1398,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } else { nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode == LUMPY_MODE_NONE ? + sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ? ISOLATE_INACTIVE : ISOLATE_BOTH, zone, sc->mem_cgroup, 0, file); -- cgit v1.2.3-70-g09d2 From 3e7d344970673c5334cf7b5bb27c8c0942b06126 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:56 -0800 Subject: mm: vmscan: reclaim order-0 and use compaction instead of lumpy reclaim Lumpy reclaim is disruptive. It reclaims a large number of pages and ignores the age of the pages it reclaims. This can incur significant stalls and potentially increase the number of major faults. Compaction has reached the point where it is considered reasonably stable (meaning it has passed a lot of testing) and is a potential candidate for displacing lumpy reclaim. This patch introduces an alternative to lumpy reclaim when compaction is available called reclaim/compaction. The basic operation is very simple - instead of selecting a contiguous range of pages to reclaim, a number of order-0 pages are reclaimed and then compaction is done later by either kswapd (compact_zone_order()) or direct compaction (__alloc_pages_direct_compact()).
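To make the new flow concrete, here is a minimal standalone C sketch of the reclaim-then-compact loop described above. Every identifier is an illustrative stand-in rather than a kernel symbol; only the 2UL << order budget mirrors the pages_for_compaction calculation the patch introduces in should_continue_reclaim():

    #include <stdio.h>

    /* Illustrative stubs standing in for the real reclaim and compaction paths */
    static unsigned long reclaim_order0_batch(void) { return 32; /* pages freed */ }
    static int compact_zone_model(int order) { (void)order; return 0; /* 0 = ran */ }

    int main(void)
    {
            int order = 4;                       /* a high-order allocation */
            unsigned long reclaimed = 0;
            unsigned long budget = 2UL << order; /* free pages needed before compacting */

            /* Reclaim order-0 pages until compaction has enough free memory... */
            while (reclaimed < budget)
                    reclaimed += reclaim_order0_batch();

            /* ...then defragment, instead of lumpy-reclaiming a contiguous range */
            printf("reclaimed %lu order-0 pages, compaction %s\n", reclaimed,
                   compact_zone_model(order) == 0 ? "ran" : "deferred");
            return 0;
    }

The point of the split is that the pages reclaimed are the oldest ones, and contiguity is recovered by moving pages rather than by evicting whatever happens to be physically adjacent.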
[akpm@linux-foundation.org: fix build] [akpm@linux-foundation.org: use conventional task_struct naming] Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 14 +++++++ include/linux/kernel.h | 7 ++++ mm/compaction.c | 89 ++++++++++++++++++++++++--------------- mm/migrate.c | 17 ++++++++ mm/page_alloc.c | 16 +++++++ mm/vmscan.c | 102 ++++++++++++++++++++++++++++++++++++++------- 6 files changed, 196 insertions(+), 49 deletions(-) (limited to 'mm') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 5ac51552d90..2592883d862 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -22,6 +22,9 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); +extern unsigned long compaction_suitable(struct zone *zone, int order); +extern unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -59,6 +62,17 @@ static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, return COMPACT_CONTINUE; } +static inline unsigned long compaction_suitable(struct zone *zone, int order) +{ + return COMPACT_SKIPPED; +} + +static inline unsigned long compact_zone_order(struct zone *zone, int order, + gfp_t gfp_mask) +{ + return 0; +} + static inline void defer_compaction(struct zone *zone) { } diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 57dac7022b6..5a9d9059520 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -600,6 +600,13 @@ struct sysinfo { #define NUMA_BUILD 0 #endif +/* This helps us avoid #ifdef CONFIG_COMPACTION */ +#ifdef CONFIG_COMPACTION +#define COMPACTION_BUILD 1 +#else +#define COMPACTION_BUILD 0 +#endif + /* Rebuild everything on CONFIG_FTRACE_MCOUNT_RECORD */ #ifdef CONFIG_FTRACE_MCOUNT_RECORD # define REBUILD_DUE_TO_FTRACE_MCOUNT_RECORD diff --git a/mm/compaction.c b/mm/compaction.c index 20011a850fe..8fe917ec7c1 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -384,10 +384,62 @@ static int compact_finished(struct zone *zone, return COMPACT_CONTINUE; } +/* + * compaction_suitable: Is this suitable to run compaction on this zone now? + * Returns + * COMPACT_SKIPPED - If there are too few free pages for compaction + * COMPACT_PARTIAL - If the allocation would succeed without compaction + * COMPACT_CONTINUE - If compaction should run now + */ +unsigned long compaction_suitable(struct zone *zone, int order) +{ + int fragindex; + unsigned long watermark; + + /* + * Watermarks for order-0 must be met for compaction. Note the 2UL. 
+ * This is because during migration, copies of pages need to be + * allocated and for a short time, the footprint is higher + */ + watermark = low_wmark_pages(zone) + (2UL << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + return COMPACT_SKIPPED; + + /* + * fragmentation index determines if allocation failures are due to + * low memory or external fragmentation + * + * index of -1 implies allocations might succeed depending on watermarks + * index towards 0 implies failure is due to lack of memory + * index towards 1000 implies failure is due to fragmentation + * + * Only compact if a failure would be due to fragmentation. + */ + fragindex = fragmentation_index(zone, order); + if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) + return COMPACT_SKIPPED; + + if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) + return COMPACT_PARTIAL; + + return COMPACT_CONTINUE; +} + static int compact_zone(struct zone *zone, struct compact_control *cc) { int ret; + ret = compaction_suitable(zone, cc->order); + switch (ret) { + case COMPACT_PARTIAL: + case COMPACT_SKIPPED: + /* Compaction is likely to fail */ + return ret; + case COMPACT_CONTINUE: + /* Fall through to compaction */ + ; + } + /* Setup to move all movable pages to the end of the zone */ cc->migrate_pfn = zone->zone_start_pfn; cc->free_pfn = cc->migrate_pfn + zone->spanned_pages; @@ -429,7 +481,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) return ret; } -static unsigned long compact_zone_order(struct zone *zone, +unsigned long compact_zone_order(struct zone *zone, int order, gfp_t gfp_mask) { struct compact_control cc = { @@ -462,7 +514,6 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; int may_perform_io = gfp_mask & __GFP_IO; - unsigned long watermark; struct zoneref *z; struct zone *zone; int rc = COMPACT_SKIPPED; @@ -480,43 +531,13 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, /* Compact each zone in the list */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { - int fragindex; int status; - /* - * Watermarks for order-0 must be met for compaction. Note - * the 2UL. This is because during migration, copies of - * pages need to be allocated and for a short time, the - * footprint is higher - */ - watermark = low_wmark_pages(zone) + (2UL << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) - continue; - - /* - * fragmentation index determines if allocation failures are - * due to low memory or external fragmentation - * - * index of -1 implies allocations might succeed depending - * on watermarks - * index towards 0 implies failure is due to lack of memory - * index towards 1000 implies failure is due to fragmentation - * - * Only compact if a failure would be due to fragmentation.
- */ - fragindex = fragmentation_index(zone, order); - if (fragindex >= 0 && fragindex <= sysctl_extfrag_threshold) - continue; - - if (fragindex == -1 && zone_watermark_ok(zone, order, watermark, 0, 0)) { - rc = COMPACT_PARTIAL; - break; - } - status = compact_zone_order(zone, order, gfp_mask); rc = max(status, rc); - if (zone_watermark_ok(zone, order, watermark, 0, 0)) + /* If a normal allocation would succeed, stop compacting */ + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) break; } diff --git a/mm/migrate.c b/mm/migrate.c index 6ae8a66a704..94875b26592 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -639,6 +639,23 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!trylock_page(page)) { if (!force) goto move_newpage; + + /* + * It's not safe for direct compaction to call lock_page. + * For example, during page readahead pages are added locked + * to the LRU. Later, when the IO completes the pages are + * marked uptodate and unlocked. However, the queueing + * could be merging multiple pages for one bio (e.g. + * mpage_readpages). If an allocation happens for the + * second or third page, the process can end up locking + * the same page twice and deadlocking. Rather than + * trying to be clever about what pages can be locked, + * avoid the use of lock_page for direct compaction + * altogether. + */ + if (current->flags & PF_MEMALLOC) + goto move_newpage; + lock_page(page); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 22a1bb7723e..03a66a31bfc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1815,12 +1815,15 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, int migratetype, unsigned long *did_some_progress) { struct page *page; + struct task_struct *tsk = current; if (!order || compaction_deferred(preferred_zone)) return NULL; + tsk->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask); + tsk->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { /* Page migration frees to the PCP lists but we want merging */ @@ -2121,6 +2124,19 @@ rebalance: /* Wait for some write requests to complete then retry */ wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/50); goto rebalance; + } else { + /* + * High-order allocations do not necessarily loop after + * direct reclaim and reclaim/compaction depends on compaction + * being called after reclaim so call directly if necessary + */ + page = __alloc_pages_direct_compact(gfp_mask, order, + zonelist, high_zoneidx, + nodemask, + alloc_flags, preferred_zone, + migratetype, &did_some_progress); + if (page) + goto got_pg; } nopage: diff --git a/mm/vmscan.c b/mm/vmscan.c index 3464312bde0..10ebd74a423 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -59,12 +60,15 @@ * LUMPY_MODE_CONTIGRECLAIM: For high-order allocations, take a reference * page from the LRU and reclaim all pages within a * naturally aligned range + * LUMPY_MODE_COMPACTION: For high-order allocations, reclaim a number of + * order-0 pages and then compact the zone */ typedef unsigned __bitwise__ lumpy_mode; #define LUMPY_MODE_SINGLE ((__force lumpy_mode)0x01u) #define LUMPY_MODE_ASYNC ((__force lumpy_mode)0x02u) #define LUMPY_MODE_SYNC ((__force lumpy_mode)0x04u) #define LUMPY_MODE_CONTIGRECLAIM ((__force lumpy_mode)0x08u) +#define LUMPY_MODE_COMPACTION ((__force lumpy_mode)0x10u) struct scan_control { /* Incremented by the number of inactive pages that were scanned */ 
@@ -286,18 +290,20 @@ static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, lumpy_mode syncmode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; /* - * Some reclaim have alredy been failed. No worth to try synchronous - * lumpy reclaim. + * Initially assume we are entering either lumpy reclaim or + * reclaim/compaction. Depending on the order, we will either set the + * sync mode or just reclaim order-0 pages later. */ - if (sync && sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE) - return; + if (COMPACTION_BUILD) + sc->lumpy_reclaim_mode = LUMPY_MODE_COMPACTION; + else + sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM; /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. + * Avoid using lumpy reclaim or reclaim/compaction if possible by + * restricting when its set to either costly allocations or when + * under memory pressure */ - sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM; if (sc->order > PAGE_ALLOC_COSTLY_ORDER) sc->lumpy_reclaim_mode |= syncmode; else if (sc->order && priority < DEF_PRIORITY - 2) @@ -1385,8 +1391,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ? - ISOLATE_INACTIVE : ISOLATE_BOTH, + sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ? + ISOLATE_BOTH : ISOLATE_INACTIVE, zone, 0, file); zone->pages_scanned += nr_scanned; if (current_is_kswapd()) @@ -1398,8 +1404,8 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } else { nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE ? - ISOLATE_INACTIVE : ISOLATE_BOTH, + sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ? + ISOLATE_BOTH : ISOLATE_INACTIVE, zone, sc->mem_cgroup, 0, file); /* @@ -1814,6 +1820,57 @@ out: } } +/* + * Reclaim/compaction depends on a number of pages being freed. To avoid + * disruption to the system, a small number of order-0 pages continue to be + * rotated and reclaimed in the normal fashion. However, by the time we get + * back to the allocator and call try_to_compact_zone(), we ensure that + * there are enough free pages for it to be likely successful + */ +static inline bool should_continue_reclaim(struct zone *zone, + unsigned long nr_reclaimed, + unsigned long nr_scanned, + struct scan_control *sc) +{ + unsigned long pages_for_compaction; + unsigned long inactive_lru_pages; + + /* If not in reclaim/compaction mode, stop */ + if (!(sc->lumpy_reclaim_mode & LUMPY_MODE_COMPACTION)) + return false; + + /* + * If we failed to reclaim and have scanned the full list, stop. + * NOTE: Checking just nr_reclaimed would exit reclaim/compaction far + * faster but obviously would be less likely to succeed + * allocation.
If this is desirable, use GFP_REPEAT to decide + * if both reclaimed and scanned should be checked or just + * reclaimed + */ + if (!nr_reclaimed && !nr_scanned) + return false; + + /* + * If we have not reclaimed enough pages for compaction and the + * inactive lists are large enough, continue reclaiming + */ + pages_for_compaction = (2UL << sc->order); + inactive_lru_pages = zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + if (sc->nr_reclaimed < pages_for_compaction && + inactive_lru_pages > pages_for_compaction) + return true; + + /* If compaction would go ahead or the allocation would succeed, stop */ + switch (compaction_suitable(zone, sc->order)) { + case COMPACT_PARTIAL: + case COMPACT_CONTINUE: + return false; + default: + return true; + } +} + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ @@ -1823,9 +1880,12 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list l; - unsigned long nr_reclaimed = sc->nr_reclaimed; + unsigned long nr_reclaimed; unsigned long nr_to_reclaim = sc->nr_to_reclaim; + unsigned long nr_scanned = sc->nr_scanned; +restart: + nr_reclaimed = 0; get_scan_count(zone, sc, nr, priority); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -1851,8 +1911,7 @@ static void shrink_zone(int priority, struct zone *zone, if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) break; } - - sc->nr_reclaimed = nr_reclaimed; + sc->nr_reclaimed += nr_reclaimed; /* * Even if we did not try to evict anon pages at all, we want to @@ -1861,6 +1920,11 @@ static void shrink_zone(int priority, struct zone *zone, if (inactive_anon_is_low(zone, sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + /* reclaim/compaction might need reclaim to continue */ + if (should_continue_reclaim(zone, nr_reclaimed, + sc->nr_scanned - nr_scanned, sc)) + goto restart; + throttle_vm_writeout(sc->gfp_mask); } @@ -2307,6 +2371,14 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; + /* + * Compact the zone for higher orders to reduce + * latencies for higher-order allocations that + * would ordinarily call try_to_compact_pages() + */ + if (sc.order > PAGE_ALLOC_COSTLY_ORDER) + compact_zone_order(zone, sc.order, sc.gfp_mask); + if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; -- cgit v1.2.3-70-g09d2 From 77f1fe6b08b13a87391549c8a820ddc817b6f50e Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:57 -0800 Subject: mm: migration: allow migration to operate asynchronously and avoid synchronous compaction in the faster path Migration synchronously waits for writeback if the initial pass fails. Callers of memory compaction do not necessarily want this behaviour if the caller is latency sensitive or expects that synchronous migration is not going to have a significantly better success rate. This patch adds a sync parameter to migrate_pages() allowing the caller to indicate if wait_on_page_writeback() is allowed within migration or not. For reclaim/compaction, try_to_compact_pages() is first called asynchronously, direct reclaim runs and then try_to_compact_pages() is called synchronously as there is a greater expectation that it'll succeed.
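The effect of the new parameter can be sketched in a few lines of standalone C. The names below are hypothetical stand-ins for unmap_and_move() and its state, shown only to illustrate the skip-instead-of-block pattern the flag enables:

    #include <stdbool.h>
    #include <stdio.h>

    static bool page_under_writeback = true;      /* stand-in page state */

    static void wait_for_writeback(void) { /* a sync caller would block here */ }

    /* Mirrors the shape of unmap_and_move(..., int force, bool sync) */
    static int try_migrate_one(int force, bool sync)
    {
            if (page_under_writeback) {
                    if (!force || !sync)
                            return -1;       /* like -EAGAIN: skip, do not stall */
                    wait_for_writeback();    /* only the synchronous path blocks */
            }
            return 0;                        /* page migrated */
    }

    int main(void)
    {
            printf("async attempt: %d\n", try_migrate_one(1, false)); /* skips */
            printf("sync attempt:  %d\n", try_migrate_one(1, true));  /* waits */
            return 0;
    }

The asynchronous caller trades a lower per-page success rate for bounded latency; the synchronous retry after direct reclaim picks up the pages the first pass skipped.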
[akpm@linux-foundation.org: build/merge fix] Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 10 ++++++---- include/linux/migrate.h | 12 ++++++++---- mm/compaction.c | 14 ++++++++++---- mm/memory-failure.c | 8 +++++--- mm/memory_hotplug.c | 3 ++- mm/mempolicy.c | 4 ++-- mm/migrate.c | 22 +++++++++++++--------- mm/page_alloc.c | 21 +++++++++++++++------ mm/vmscan.c | 3 ++- 9 files changed, 63 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 2592883d862..72cba403478 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -21,10 +21,11 @@ extern int sysctl_extfrag_handler(struct ctl_table *table, int write, extern int fragmentation_index(struct zone *zone, unsigned int order); extern unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *mask); + int order, gfp_t gfp_mask, nodemask_t *mask, + bool sync); extern unsigned long compaction_suitable(struct zone *zone, int order); extern unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask); + gfp_t gfp_mask, bool sync); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -57,7 +58,8 @@ static inline bool compaction_deferred(struct zone *zone) #else static inline unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask) + int order, gfp_t gfp_mask, nodemask_t *nodemask, + bool sync) { return COMPACT_CONTINUE; } @@ -68,7 +70,7 @@ static inline unsigned long compaction_suitable(struct zone *zone, int order) } static inline unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask) + gfp_t gfp_mask, bool sync) { return 0; } diff --git a/include/linux/migrate.h b/include/linux/migrate.h index 085527fb826..fa31902803f 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -13,9 +13,11 @@ extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); extern int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining); + unsigned long private, int offlining, + bool sync); extern int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining); + unsigned long private, int offlining, + bool sync); extern int fail_migrate_page(struct address_space *, struct page *, struct page *); @@ -33,9 +35,11 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining) { return -ENOSYS; } + unsigned long private, int offlining, + bool sync) { return -ENOSYS; } static inline int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining) { return -ENOSYS; } + unsigned long private, int offlining, + bool sync) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } static inline int migrate_prep_local(void) { return -ENOSYS; } diff --git a/mm/compaction.c b/mm/compaction.c index 8fe917ec7c1..47fca106934 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -33,6 +33,7 @@ struct compact_control { unsigned long nr_migratepages; /* Number of pages to 
migrate */ unsigned long free_pfn; /* isolate_freepages search base */ unsigned long migrate_pfn; /* isolate_migratepages search base */ + bool sync; /* Synchronous migration */ /* Account for isolated anon and file pages */ unsigned long nr_anon; @@ -455,7 +456,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, 0); + (unsigned long)cc, 0, + cc->sync); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; @@ -482,7 +484,8 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask) + int order, gfp_t gfp_mask, + bool sync) { struct compact_control cc = { .nr_freepages = 0, @@ -490,6 +493,7 @@ unsigned long compact_zone_order(struct zone *zone, .order = order, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, + .sync = sync, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -505,11 +509,13 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @gfp_mask: The GFP mask of the current allocation * @nodemask: The allowed nodes to allocate from + * @sync: Whether migration is synchronous or not * * This is the main entry point for direct page compaction. */ unsigned long try_to_compact_pages(struct zonelist *zonelist, - int order, gfp_t gfp_mask, nodemask_t *nodemask) + int order, gfp_t gfp_mask, nodemask_t *nodemask, + bool sync) { enum zone_type high_zoneidx = gfp_zone(gfp_mask); int may_enter_fs = gfp_mask & __GFP_FS; @@ -533,7 +539,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask); + status = compact_zone_order(zone, order, gfp_mask, sync); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 46ab2c044b0..2323a8039a9 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1290,9 +1290,10 @@ static int soft_offline_huge_page(struct page *page, int flags) /* Keep page count to indicate a given hugepage is isolated. 
*/ list_add(&hpage->lru, &pagelist); - ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, + true); if (ret) { - putback_lru_pages(&pagelist); + putback_lru_pages(&pagelist); pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) @@ -1413,7 +1414,8 @@ int soft_offline_page(struct page *page, int flags) LIST_HEAD(pagelist); list_add(&page->lru, &pagelist); - ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0); + ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, + 0, true); if (ret) { pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 2c6523af547..584fc5588fd 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -733,7 +733,8 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) goto out; } /* this function returns # of failed pages */ - ret = migrate_pages(&source, hotremove_migrate_alloc, 0, 1); + ret = migrate_pages(&source, hotremove_migrate_alloc, 0, + 1, true); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 11ff260fb28..9db27459308 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -935,7 +935,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, return PTR_ERR(vma); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, 0); + err = migrate_pages(&pagelist, new_node_page, dest, 0, true); if (err) putback_lru_pages(&pagelist); } @@ -1155,7 +1155,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, 0); + (unsigned long)vma, 0, true); if (nr_failed) putback_lru_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index 94875b26592..dc47f6c4035 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -614,7 +614,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, int offlining) + struct page *page, int force, int offlining, bool sync) { int rc = 0; int *result = NULL; @@ -682,7 +682,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, BUG_ON(charge); if (PageWriteback(page)) { - if (!force) + if (!force || !sync) goto uncharge; wait_on_page_writeback(page); } @@ -827,7 +827,7 @@ move_newpage: */ static int unmap_and_move_huge_page(new_page_t get_new_page, unsigned long private, struct page *hpage, - int force, int offlining) + int force, int offlining, bool sync) { int rc = 0; int *result = NULL; @@ -841,7 +841,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, rc = -EAGAIN; if (!trylock_page(hpage)) { - if (!force) + if (!force || !sync) goto out; lock_page(hpage); } @@ -909,7 +909,8 @@ out: * Return: Number of pages not migrated or error code. 
*/ int migrate_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining) + new_page_t get_new_page, unsigned long private, int offlining, + bool sync) { int retry = 1; int nr_failed = 0; @@ -929,7 +930,8 @@ int migrate_pages(struct list_head *from, cond_resched(); rc = unmap_and_move(get_new_page, private, - page, pass > 2, offlining); + page, pass > 2, offlining, + sync); switch(rc) { case -ENOMEM: @@ -958,7 +960,8 @@ out: } int migrate_huge_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining) + new_page_t get_new_page, unsigned long private, int offlining, + bool sync) { int retry = 1; int nr_failed = 0; @@ -974,7 +977,8 @@ int migrate_huge_pages(struct list_head *from, cond_resched(); rc = unmap_and_move_huge_page(get_new_page, - private, page, pass > 2, offlining); + private, page, pass > 2, offlining, + sync); switch(rc) { case -ENOMEM: @@ -1107,7 +1111,7 @@ set_status: err = 0; if (!list_empty(&pagelist)) { err = migrate_pages(&pagelist, new_page_node, - (unsigned long)pm, 0); + (unsigned long)pm, 0, true); if (err) putback_lru_pages(&pagelist); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 03a66a31bfc..0fd486467b4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1812,7 +1812,8 @@ static struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int migratetype, unsigned long *did_some_progress, + bool sync_migration) { struct page *page; struct task_struct *tsk = current; @@ -1822,7 +1823,7 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, tsk->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, - nodemask); + nodemask, sync_migration); tsk->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { @@ -1859,7 +1860,8 @@ static inline struct page * __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, struct zonelist *zonelist, enum zone_type high_zoneidx, nodemask_t *nodemask, int alloc_flags, struct zone *preferred_zone, - int migratetype, unsigned long *did_some_progress) + int migratetype, unsigned long *did_some_progress, + bool sync_migration) { return NULL; } @@ -2001,6 +2003,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, unsigned long pages_reclaimed = 0; unsigned long did_some_progress; struct task_struct *p = current; + bool sync_migration = false; /* * In the slowpath, we sanity check order to avoid ever trying to @@ -2063,14 +2066,19 @@ rebalance: if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) goto nopage; - /* Try direct compaction */ + /* + * Try direct compaction. The first pass is asynchronous. 
Subsequent + * attempts after direct reclaim are synchronous + */ page = __alloc_pages_direct_compact(gfp_mask, order, zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + migratetype, &did_some_progress, + sync_migration); if (page) goto got_pg; + sync_migration = true; /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, @@ -2134,7 +2142,8 @@ rebalance: zonelist, high_zoneidx, nodemask, alloc_flags, preferred_zone, - migratetype, &did_some_progress); + migratetype, &did_some_progress, + sync_migration); if (page) goto got_pg; } diff --git a/mm/vmscan.c b/mm/vmscan.c index 10ebd74a423..8320d115c85 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2377,7 +2377,8 @@ loop_again: * would ordinarily call try_to_compact_pages() */ if (sc.order > PAGE_ALLOC_COSTLY_ORDER) - compact_zone_order(zone, sc.order, sc.gfp_mask); + compact_zone_order(zone, sc.order, sc.gfp_mask, + false); if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { -- cgit v1.2.3-70-g09d2 From 7f0f24967b0349798803260b2e4bf347cffa1990 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:58 -0800 Subject: mm: migration: cleanup migrate_pages API by matching types for offlining and sync With the introduction of the boolean sync parameter, the API looks a little inconsistent as offlining is still an int. Convert offlining to a bool for the sake of being tidy. Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/migrate.h | 8 ++++---- mm/compaction.c | 2 +- mm/memory_hotplug.c | 2 +- mm/mempolicy.c | 6 ++++-- mm/migrate.c | 8 ++++---- 5 files changed, 14 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/include/linux/migrate.h b/include/linux/migrate.h index fa31902803f..e39aeecfe9a 100644 --- a/include/linux/migrate.h +++ b/include/linux/migrate.h @@ -13,10 +13,10 @@ extern void putback_lru_pages(struct list_head *l); extern int migrate_page(struct address_space *, struct page *, struct page *); extern int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync); extern int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync); extern int fail_migrate_page(struct address_space *, @@ -35,10 +35,10 @@ extern int migrate_huge_page_move_mapping(struct address_space *mapping, static inline void putback_lru_pages(struct list_head *l) {} static inline int migrate_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync) { return -ENOSYS; } static inline int migrate_huge_pages(struct list_head *l, new_page_t x, - unsigned long private, int offlining, + unsigned long private, bool offlining, bool sync) { return -ENOSYS; } static inline int migrate_prep(void) { return -ENOSYS; } diff --git a/mm/compaction.c b/mm/compaction.c index 47fca106934..e005a30e968 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -456,7 +456,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) nr_migrate = cc->nr_migratepages; migrate_pages(&cc->migratepages, compaction_alloc, - (unsigned long)cc, 0, + (unsigned long)cc, false, cc->sync); update_nr_listpages(cc); nr_remaining = cc->nr_migratepages; 
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 584fc5588fd..a2832c09250 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -734,7 +734,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) } /* this function returns # of failed pages */ ret = migrate_pages(&source, hotremove_migrate_alloc, 0, - 1, true); + true, true); if (ret) putback_lru_pages(&source); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9db27459308..7ee55af8d79 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -935,7 +935,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, return PTR_ERR(vma); if (!list_empty(&pagelist)) { - err = migrate_pages(&pagelist, new_node_page, dest, 0, true); + err = migrate_pages(&pagelist, new_node_page, dest, + false, true); if (err) putback_lru_pages(&pagelist); } @@ -1155,7 +1156,8 @@ static long do_mbind(unsigned long start, unsigned long len, if (!list_empty(&pagelist)) { nr_failed = migrate_pages(&pagelist, new_vma_page, - (unsigned long)vma, 0, true); + (unsigned long)vma, + false, true); if (nr_failed) putback_lru_pages(&pagelist); } diff --git a/mm/migrate.c b/mm/migrate.c index dc47f6c4035..690d0de993a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -614,7 +614,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, * to the newly allocated page in newpage. */ static int unmap_and_move(new_page_t get_new_page, unsigned long private, - struct page *page, int force, int offlining, bool sync) + struct page *page, int force, bool offlining, bool sync) { int rc = 0; int *result = NULL; @@ -827,7 +827,7 @@ move_newpage: */ static int unmap_and_move_huge_page(new_page_t get_new_page, unsigned long private, struct page *hpage, - int force, int offlining, bool sync) + int force, bool offlining, bool sync) { int rc = 0; int *result = NULL; @@ -909,7 +909,7 @@ out: * Return: Number of pages not migrated or error code. */ int migrate_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining, + new_page_t get_new_page, unsigned long private, bool offlining, bool sync) { int retry = 1; @@ -960,7 +960,7 @@ out: } int migrate_huge_pages(struct list_head *from, - new_page_t get_new_page, unsigned long private, int offlining, + new_page_t get_new_page, unsigned long private, bool offlining, bool sync) { int retry = 1; -- cgit v1.2.3-70-g09d2 From 9927af740b1b9b1e769310bd0b91425e8047b803 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:45:59 -0800 Subject: mm: compaction: perform a faster migration scan when migrating asynchronously try_to_compact_pages() is initially called to only migrate pages asynchronously and kswapd always compacts asynchronously. Both are being optimistic so it is important to complete the work as quickly as possible to minimise stalls. This patch alters the scanner when asynchronous to only consider MIGRATE_MOVABLE pageblocks as migration candidates. This reduces stalls when allocating huge pages while not impairing allocation success rates as a full scan will be performed if necessary after direct reclaim.
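The skip itself is plain pageblock arithmetic. Below is a compilable sketch of the ALIGN-based advance the patch uses; the pageblock size of 512 pages is an assumption (a common value for 4K pages with order-9 hugepages) and the PFN values are made up:

    #include <stdio.h>

    #define ALIGN(x, a) (((x) + (a) - 1) & ~((a) - 1))

    int main(void)
    {
            unsigned long pageblock_nr_pages = 512; /* assumed, arch-dependent */
            unsigned long low_pfn = 1536;           /* first pfn of an unsuitable block */

            /* As in the patch: jump past the block, landing on its last pfn so
             * that the scan loop's low_pfn++ resumes at the next block boundary */
            low_pfn += pageblock_nr_pages;
            low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1;

            printf("resume at pfn %lu, next block starts at %lu\n",
                   low_pfn, low_pfn + 1);           /* prints 2047, 2048 */
            return 0;
    }

Subtracting one after aligning compensates for the increment at the top of the scan loop, so the scanner touches each skipped pageblock's migratetype exactly once.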
Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index e005a30e968..b0fbfdfad29 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -240,6 +240,7 @@ static unsigned long isolate_migratepages(struct zone *zone, struct compact_control *cc) { unsigned long low_pfn, end_pfn; + unsigned long last_pageblock_nr = 0, pageblock_nr; unsigned long nr_scanned = 0, nr_isolated = 0; struct list_head *migratelist = &cc->migratepages; @@ -280,6 +281,20 @@ static unsigned long isolate_migratepages(struct zone *zone, if (PageBuddy(page)) continue; + /* + * For async migration, also only scan in MOVABLE blocks. Async + * migration is optimistic to see if the minimum amount of work + * satisfies the allocation + */ + pageblock_nr = low_pfn >> pageblock_order; + if (!cc->sync && last_pageblock_nr != pageblock_nr && + get_pageblock_migratetype(page) != MIGRATE_MOVABLE) { + low_pfn += pageblock_nr_pages; + low_pfn = ALIGN(low_pfn, pageblock_nr_pages) - 1; + last_pageblock_nr = pageblock_nr; + continue; + } + /* Try isolate the page */ if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) continue; -- cgit v1.2.3-70-g09d2 From f3a310bc4e5ce7e55e1c8e25c31e63af017f3e50 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:46:00 -0800 Subject: mm: vmscan: rename lumpy_mode to reclaim_mode With compaction being used instead of lumpy reclaim, the name lumpy_mode and associated variables are a bit misleading. Rename lumpy_mode to reclaim_mode which is a better fit. There is no functional change. Signed-off-by: Mel Gorman Cc: Andrea Arcangeli Cc: KOSAKI Motohiro Cc: Rik van Riel Acked-by: Johannes Weiner Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 6 ++-- mm/vmscan.c | 70 +++++++++++++++++++++---------------------- 2 files changed, 38 insertions(+), 38 deletions(-) (limited to 'mm') diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index be76429962c..ea422aaa23e 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -25,13 +25,13 @@ #define trace_reclaim_flags(page, sync) ( \ (page_is_file_cache(page) ? RECLAIM_WB_FILE : RECLAIM_WB_ANON) | \ - (sync & LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) #define trace_shrink_flags(file, sync) ( \ - (sync & LUMPY_MODE_SYNC ? RECLAIM_WB_MIXED : \ + (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_MIXED : \ (file ? RECLAIM_WB_FILE : RECLAIM_WB_ANON)) | \ - (sync & LUMPY_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ + (sync & RECLAIM_MODE_SYNC ? RECLAIM_WB_SYNC : RECLAIM_WB_ASYNC) \ ) TRACE_EVENT(mm_vmscan_kswapd_sleep, diff --git a/mm/vmscan.c b/mm/vmscan.c index 8320d115c85..7037cc8c60b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -53,22 +53,22 @@ #include /* - * lumpy_mode determines how the inactive list is shrunk - * LUMPY_MODE_SINGLE: Reclaim only order-0 pages - * LUMPY_MODE_ASYNC: Do not block - * LUMPY_MODE_SYNC: Allow blocking e.g.
call wait_on_page_writeback - * LUMPY_MODE_CONTIGRECLAIM: For high-order allocations, take a reference + * reclaim_mode determines how the inactive list is shrunk + * RECLAIM_MODE_SINGLE: Reclaim only order-0 pages + * RECLAIM_MODE_ASYNC: Do not block + * RECLAIM_MODE_SYNC: Allow blocking e.g. call wait_on_page_writeback + * RECLAIM_MODE_LUMPYRECLAIM: For high-order allocations, take a reference * page from the LRU and reclaim all pages within a * naturally aligned range - * LUMPY_MODE_COMPACTION: For high-order allocations, reclaim a number of + * RECLAIM_MODE_COMPACTION: For high-order allocations, reclaim a number of * order-0 pages and then compact the zone */ -typedef unsigned __bitwise__ lumpy_mode; -#define LUMPY_MODE_SINGLE ((__force lumpy_mode)0x01u) -#define LUMPY_MODE_ASYNC ((__force lumpy_mode)0x02u) -#define LUMPY_MODE_SYNC ((__force lumpy_mode)0x04u) -#define LUMPY_MODE_CONTIGRECLAIM ((__force lumpy_mode)0x08u) -#define LUMPY_MODE_COMPACTION ((__force lumpy_mode)0x10u) +typedef unsigned __bitwise__ reclaim_mode_t; +#define RECLAIM_MODE_SINGLE ((__force reclaim_mode_t)0x01u) +#define RECLAIM_MODE_ASYNC ((__force reclaim_mode_t)0x02u) +#define RECLAIM_MODE_SYNC ((__force reclaim_mode_t)0x04u) +#define RECLAIM_MODE_LUMPYRECLAIM ((__force reclaim_mode_t)0x08u) +#define RECLAIM_MODE_COMPACTION ((__force reclaim_mode_t)0x10u) struct scan_control { /* Incremented by the number of inactive pages that were scanned */ @@ -101,7 +101,7 @@ struct scan_control { * Intend to reclaim enough continuous memory rather than reclaim * enough amount of memory. i.e, mode for high order allocation. */ - lumpy_mode lumpy_reclaim_mode; + reclaim_mode_t reclaim_mode; /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; @@ -284,10 +284,10 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, return ret; } -static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, +static void set_reclaim_mode(int priority, struct scan_control *sc, bool sync) { - lumpy_mode syncmode = sync ? LUMPY_MODE_SYNC : LUMPY_MODE_ASYNC; + reclaim_mode_t syncmode = sync ? RECLAIM_MODE_SYNC : RECLAIM_MODE_ASYNC; /* * Initially assume we are entering either lumpy reclaim or @@ -295,9 +295,9 @@ static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, * sync mode or just reclaim order-0 pages later. 
*/ if (COMPACTION_BUILD) - sc->lumpy_reclaim_mode = LUMPY_MODE_COMPACTION; + sc->reclaim_mode = RECLAIM_MODE_COMPACTION; else - sc->lumpy_reclaim_mode = LUMPY_MODE_CONTIGRECLAIM; + sc->reclaim_mode = RECLAIM_MODE_LUMPYRECLAIM; /* * Avoid using lumpy reclaim or reclaim/compaction if possible by @@ -305,16 +305,16 @@ static void set_lumpy_reclaim_mode(int priority, struct scan_control *sc, * under memory pressure */ if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - sc->lumpy_reclaim_mode |= syncmode; + sc->reclaim_mode |= syncmode; else if (sc->order && priority < DEF_PRIORITY - 2) - sc->lumpy_reclaim_mode |= syncmode; + sc->reclaim_mode |= syncmode; else - sc->lumpy_reclaim_mode = LUMPY_MODE_SINGLE | LUMPY_MODE_ASYNC; + sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; } -static void disable_lumpy_reclaim_mode(struct scan_control *sc) +static void reset_reclaim_mode(struct scan_control *sc) { - sc->lumpy_reclaim_mode = LUMPY_MODE_SINGLE | LUMPY_MODE_ASYNC; + sc->reclaim_mode = RECLAIM_MODE_SINGLE | RECLAIM_MODE_ASYNC; } static inline int is_page_cache_freeable(struct page *page) @@ -445,7 +445,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * first attempt to free a range of pages fails. */ if (PageWriteback(page) && - (sc->lumpy_reclaim_mode & LUMPY_MODE_SYNC)) + (sc->reclaim_mode & RECLAIM_MODE_SYNC)) wait_on_page_writeback(page); if (!PageWriteback(page)) { @@ -453,7 +453,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, ClearPageReclaim(page); } trace_mm_vmscan_writepage(page, - trace_reclaim_flags(page, sc->lumpy_reclaim_mode)); + trace_reclaim_flags(page, sc->reclaim_mode)); inc_zone_page_state(page, NR_VMSCAN_WRITE); return PAGE_SUCCESS; } @@ -638,7 +638,7 @@ static enum page_references page_check_references(struct page *page, referenced_page = TestClearPageReferenced(page); /* Lumpy reclaim - ignore references */ - if (sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM) + if (sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM) return PAGEREF_RECLAIM; /* @@ -755,7 +755,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * for any page for which writeback has already * started. */ - if ((sc->lumpy_reclaim_mode & LUMPY_MODE_SYNC) && + if ((sc->reclaim_mode & RECLAIM_MODE_SYNC) && may_enter_fs) wait_on_page_writeback(page); else { @@ -911,7 +911,7 @@ cull_mlocked: try_to_free_swap(page); unlock_page(page); putback_lru_page(page); - disable_lumpy_reclaim_mode(sc); + reset_reclaim_mode(sc); continue; activate_locked: @@ -924,7 +924,7 @@ activate_locked: keep_locked: unlock_page(page); keep: - disable_lumpy_reclaim_mode(sc); + reset_reclaim_mode(sc); keep_lumpy: list_add(&page->lru, &ret_pages); VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); @@ -1340,7 +1340,7 @@ static inline bool should_reclaim_stall(unsigned long nr_taken, return false; /* Only stall on lumpy reclaim */ - if (sc->lumpy_reclaim_mode & LUMPY_MODE_SINGLE) + if (sc->reclaim_mode & RECLAIM_MODE_SINGLE) return false; /* If we have relaimed everything on the isolated list, no stall */ @@ -1384,14 +1384,14 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, return SWAP_CLUSTER_MAX; } - set_lumpy_reclaim_mode(priority, sc, false); + set_reclaim_mode(priority, sc, false); lru_add_drain(); spin_lock_irq(&zone->lru_lock); if (scanning_global_lru(sc)) { nr_taken = isolate_pages_global(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ? + sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? 
ISOLATE_BOTH : ISOLATE_INACTIVE, zone, 0, file); zone->pages_scanned += nr_scanned; @@ -1404,7 +1404,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, } else { nr_taken = mem_cgroup_isolate_pages(nr_to_scan, &page_list, &nr_scanned, sc->order, - sc->lumpy_reclaim_mode & LUMPY_MODE_CONTIGRECLAIM ? + sc->reclaim_mode & RECLAIM_MODE_LUMPYRECLAIM ? ISOLATE_BOTH : ISOLATE_INACTIVE, zone, sc->mem_cgroup, 0, file); @@ -1427,7 +1427,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, /* Check if we should syncronously wait for writeback */ if (should_reclaim_stall(nr_taken, nr_reclaimed, priority, sc)) { - set_lumpy_reclaim_mode(priority, sc, true); + set_reclaim_mode(priority, sc, true); nr_reclaimed += shrink_page_list(&page_list, zone, sc); } @@ -1442,7 +1442,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct zone *zone, zone_idx(zone), nr_scanned, nr_reclaimed, priority, - trace_shrink_flags(file, sc->lumpy_reclaim_mode)); + trace_shrink_flags(file, sc->reclaim_mode)); return nr_reclaimed; } @@ -1836,7 +1836,7 @@ static inline bool should_continue_reclaim(struct zone *zone, unsigned long inactive_lru_pages; /* If not in reclaim/compaction mode, stop */ - if (!(sc->lumpy_reclaim_mode & LUMPY_MODE_COMPACTION)) + if (!(sc->reclaim_mode & RECLAIM_MODE_COMPACTION)) return false; /* -- cgit v1.2.3-70-g09d2 From e5a5623b28198aa91ea71ee5d3846757fc76bc87 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 13 Jan 2011 15:46:00 -0800 Subject: mm: remove unused get_vm_area_node get_vm_area_node() is unused in the kernel and can thus be removed. Signed-off-by: David Rientjes Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 3 --- mm/vmalloc.c | 7 ------- 2 files changed, 10 deletions(-) (limited to 'mm') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 44b54f619ac..cb73c755fac 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -90,9 +90,6 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, void *caller); -extern struct vm_struct *get_vm_area_node(unsigned long size, - unsigned long flags, int node, - gfp_t gfp_mask); extern struct vm_struct *remove_vm_area(const void *addr); extern int map_vm_area(struct vm_struct *area, pgprot_t prot, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 48245af07b1..78ec9d8bc57 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1315,13 +1315,6 @@ struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, -1, GFP_KERNEL, caller); } -struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, - int node, gfp_t gfp_mask) -{ - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, - node, gfp_mask, __builtin_return_address(0)); -} - static struct vm_struct *find_vm_area(const void *addr) { struct vmap_area *va; -- cgit v1.2.3-70-g09d2 From ec3f64fc9c196a304c4b7db3e1ff56d640628509 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 13 Jan 2011 15:46:01 -0800 Subject: mm: remove gfp mask from pcpu_get_vm_areas pcpu_get_vm_areas() only uses GFP_KERNEL allocations, so remove the gfp_t formal and use the mask internally. 
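The shape of this cleanup is the familiar one of dropping a formal parameter that only ever receives a single value. A generic sketch of the before/after, with hypothetical names unrelated to the kernel API:

    #include <stdlib.h>

    /* before: a mode parameter that every caller passes as 0 (the default) */
    static void *grow_table_old(size_t n, int mode)
    {
            (void)mode;             /* only one value was ever used */
            return calloc(n, sizeof(long));
    }

    /* after: the constant moves inside, the signature shrinks, and callers
     * can no longer pass a mode the function would silently ignore */
    static void *grow_table(size_t n)
    {
            return calloc(n, sizeof(long));
    }

    int main(void)
    {
            void *a = grow_table_old(128, 0);
            void *b = grow_table(128);
            free(a);
            free(b);
            return 0;
    }

Beyond the smaller signature, the win is that the remaining implementation no longer has to mask or validate a caller-supplied value it never honours.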
Signed-off-by: David Rientjes Cc: Christoph Lameter Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/vmalloc.h | 2 +- mm/percpu-vm.c | 2 +- mm/vmalloc.c | 21 +++++++++------------ 3 files changed, 11 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index cb73c755fac..c7348b8d0a8 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -117,7 +117,7 @@ extern __init void vm_area_register_early(struct vm_struct *vm, size_t align); #ifdef CONFIG_SMP struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, - size_t align, gfp_t gfp_mask); + size_t align); void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms); #endif diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c index 7d9c1d0ebd3..ea534960a04 100644 --- a/mm/percpu-vm.c +++ b/mm/percpu-vm.c @@ -421,7 +421,7 @@ static struct pcpu_chunk *pcpu_create_chunk(void) return NULL; vms = pcpu_get_vm_areas(pcpu_group_offsets, pcpu_group_sizes, - pcpu_nr_groups, pcpu_atom_size, GFP_KERNEL); + pcpu_nr_groups, pcpu_atom_size); if (!vms) { pcpu_free_chunk(chunk); return NULL; diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 78ec9d8bc57..f6754663632 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2196,17 +2196,16 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, * @sizes: array containing size of each area * @nr_vms: the number of areas to allocate * @align: alignment, all entries in @offsets and @sizes must be aligned to this - * @gfp_mask: allocation mask * * Returns: kmalloc'd vm_struct pointer array pointing to allocated * vm_structs on success, %NULL on failure * * Percpu allocator wants to use congruent vm areas so that it can * maintain the offsets among percpu areas. This function allocates - * congruent vmalloc areas for it. These areas tend to be scattered - * pretty far, distance between two areas easily going up to - * gigabytes. To avoid interacting with regular vmallocs, these areas - * are allocated from top. + * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to + * be scattered pretty far, distance between two areas easily going up + * to gigabytes. To avoid interacting with regular vmallocs, these + * areas are allocated from top. * * Despite its complicated look, this allocator is rather simple. 
It * does everything top-down and scans areas from the end looking for @@ -2217,7 +2216,7 @@ static unsigned long pvm_determine_end(struct vmap_area **pnext, */ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, const size_t *sizes, int nr_vms, - size_t align, gfp_t gfp_mask) + size_t align) { const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align); const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1); @@ -2227,8 +2226,6 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, unsigned long base, start, end, last_end; bool purged = false; - gfp_mask &= GFP_RECLAIM_MASK; - /* verify parameters and allocate data structures */ BUG_ON(align & ~PAGE_MASK || !is_power_of_2(align)); for (last_area = 0, area = 0; area < nr_vms; area++) { @@ -2261,14 +2258,14 @@ struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets, return NULL; } - vms = kzalloc(sizeof(vms[0]) * nr_vms, gfp_mask); - vas = kzalloc(sizeof(vas[0]) * nr_vms, gfp_mask); + vms = kzalloc(sizeof(vms[0]) * nr_vms, GFP_KERNEL); + vas = kzalloc(sizeof(vas[0]) * nr_vms, GFP_KERNEL); if (!vas || !vms) goto err_free; for (area = 0; area < nr_vms; area++) { - vas[area] = kzalloc(sizeof(struct vmap_area), gfp_mask); - vms[area] = kzalloc(sizeof(struct vm_struct), gfp_mask); + vas[area] = kzalloc(sizeof(struct vmap_area), GFP_KERNEL); + vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL); if (!vas[area] || !vms[area]) goto err_free; } -- cgit v1.2.3-70-g09d2 From d0a21265dfb5fa8ae54e90d0fb6d1c215b10a28a Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 13 Jan 2011 15:46:02 -0800 Subject: mm: unify module_alloc code for vmalloc Four architectures (arm, mips, sparc, x86) use __vmalloc_area() for module_init(). Much of the code is duplicated and can be generalized in a globally accessible function, __vmalloc_node_range(). __vmalloc_node() now calls into __vmalloc_node_range() with a range of [VMALLOC_START, VMALLOC_END) for functionally equivalent behavior. Each architecture may then use __vmalloc_node_range() directly to remove the duplication of code. Signed-off-by: David Rientjes Cc: Christoph Lameter Cc: Russell King Cc: Ralf Baechle Cc: "David S. Miller" Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/module.c | 14 +++---------- arch/mips/kernel/module.c | 14 +++---------- arch/sparc/kernel/module.c | 14 ++++--------- arch/x86/kernel/module.c | 17 ++++------------ include/linux/vmalloc.h | 5 +++-- mm/vmalloc.c | 50 +++++++++++++++++++++++++++------------------- 6 files changed, 46 insertions(+), 68 deletions(-) (limited to 'mm') diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c index 0c1bb68ff4a..2cfe8161b47 100644 --- a/arch/arm/kernel/module.c +++ b/arch/arm/kernel/module.c @@ -38,17 +38,9 @@ #ifdef CONFIG_MMU void *module_alloc(unsigned long size) { - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL_EXEC); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL, PAGE_KERNEL_EXEC, -1, + __builtin_return_address(0)); } #else /* CONFIG_MMU */ void *module_alloc(unsigned long size) diff --git a/arch/mips/kernel/module.c b/arch/mips/kernel/module.c index 6f51dda87fc..d87a72e9fac 100644 --- a/arch/mips/kernel/module.c +++ b/arch/mips/kernel/module.c @@ -46,17 +46,9 @@ static DEFINE_SPINLOCK(dbe_lock); void *module_alloc(unsigned long size) { #ifdef MODULE_START - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULE_START, MODULE_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END, + GFP_KERNEL, PAGE_KERNEL, -1, + __builtin_return_address(0)); #else if (size == 0) return NULL; diff --git a/arch/sparc/kernel/module.c b/arch/sparc/kernel/module.c index ee3c7dde8d9..8d348c474a2 100644 --- a/arch/sparc/kernel/module.c +++ b/arch/sparc/kernel/module.c @@ -23,17 +23,11 @@ static void *module_map(unsigned long size) { - struct vm_struct *area; - - size = PAGE_ALIGN(size); - if (!size || size > MODULES_LEN) - return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) + if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - - return __vmalloc_area(area, GFP_KERNEL, PAGE_KERNEL); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL, PAGE_KERNEL, -1, + __builtin_return_address(0)); } static char *dot2underscore(char *name) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 8f295609173..ab23f1ad4bf 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -37,20 +37,11 @@ void *module_alloc(unsigned long size) { - struct vm_struct *area; - - if (!size) - return NULL; - size = PAGE_ALIGN(size); - if (size > MODULES_LEN) + if (PAGE_ALIGN(size) > MODULES_LEN) return NULL; - - area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END); - if (!area) - return NULL; - - return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, - PAGE_KERNEL_EXEC); + return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END, + GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC, + -1, __builtin_return_address(0)); } /* Free memory returned from module_alloc */ diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index c7348b8d0a8..4ed6fcd6b72 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -59,8 +59,9 @@ extern void *vmalloc_exec(unsigned long size); extern void *vmalloc_32(unsigned long 
size); extern void *vmalloc_32_user(unsigned long size); extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); -extern void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, - pgprot_t prot); +extern void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, int node, void *caller); extern void vfree(const void *addr); extern void *vmap(struct page **pages, unsigned int count, diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f6754663632..284346ee0e9 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1530,25 +1530,12 @@ fail: return NULL; } -void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) -{ - void *addr = __vmalloc_area_node(area, gfp_mask, prot, -1, - __builtin_return_address(0)); - - /* - * A ref_count = 3 is needed because the vm_struct and vmap_area - * structures allocated in the __get_vm_area_node() function contain - * references to the virtual address of the vmalloc'ed block. - */ - kmemleak_alloc(addr, area->size - PAGE_SIZE, 3, gfp_mask); - - return addr; -} - /** - * __vmalloc_node - allocate virtually contiguous memory + * __vmalloc_node_range - allocate virtually contiguous memory * @size: allocation size * @align: desired alignment + * @start: vm area range start + * @end: vm area range end * @gfp_mask: flags for the page level allocator * @prot: protection mask for the allocated pages * @node: node to use for allocation or -1 @@ -1558,9 +1545,9 @@ void *__vmalloc_area(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot) * allocator with @gfp_mask flags. Map them into contiguous * kernel virtual space, using a pagetable protection of @prot. */ -static void *__vmalloc_node(unsigned long size, unsigned long align, - gfp_t gfp_mask, pgprot_t prot, - int node, void *caller) +void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, int node, void *caller) { struct vm_struct *area; void *addr; @@ -1570,8 +1557,8 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) return NULL; - area = __get_vm_area_node(size, align, VM_ALLOC, VMALLOC_START, - VMALLOC_END, node, gfp_mask, caller); + area = __get_vm_area_node(size, align, VM_ALLOC, start, end, node, + gfp_mask, caller); if (!area) return NULL; @@ -1588,6 +1575,27 @@ static void *__vmalloc_node(unsigned long size, unsigned long align, return addr; } +/** + * __vmalloc_node - allocate virtually contiguous memory + * @size: allocation size + * @align: desired alignment + * @gfp_mask: flags for the page level allocator + * @prot: protection mask for the allocated pages + * @node: node to use for allocation or -1 + * @caller: caller's return address + * + * Allocate enough pages to cover @size from the page level + * allocator with @gfp_mask flags. Map them into contiguous + * kernel virtual space, using a pagetable protection of @prot. 
+ */ +static void *__vmalloc_node(unsigned long size, unsigned long align, + gfp_t gfp_mask, pgprot_t prot, + int node, void *caller) +{ + return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END, + gfp_mask, prot, node, caller); +} + void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) { return __vmalloc_node(size, 1, gfp_mask, prot, -1, -- cgit v1.2.3-70-g09d2 From 212260aa07135b327752dc02625c68cf4ce04caf Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Jan 2011 15:46:06 -0800 Subject: mm: clear PageError bit in msync & fsync Temporary IO failures, e.g. due to loss of both multipath paths, can permanently leave the PageError bit set on a page, resulting in msync or fsync returning -EIO over and over again, even if IO is now getting to the disk correctly. We already clear the AS_ENOSPC and AS_EIO bits in mapping->flags in the filemap_fdatawait_range function. Also clearing the PageError bit on the page allows subsequent msync or fsync calls on this file to return without an error, if the subsequent IO succeeds. Unfortunately data written out in the msync or fsync call that returned -EIO can still get lost, because the page dirty bit appears to not get restored on IO error. However, the alternative could be potentially all of memory filling up with uncleanable dirty pages, hanging the system, so there is no nice choice here... Signed-off-by: Rik van Riel Acked-by: Valerie Aurora Acked-by: Jeff Layton Cc: Theodore Ts'o Acked-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 2 +- mm/filemap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5f38c460367..4805fdec8d6 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -198,7 +198,7 @@ static inline int __TestClearPage##uname(struct page *page) { return 0; } struct page; /* forward declaration */ TESTPAGEFLAG(Locked, locked) TESTSETFLAG(Locked, locked) -PAGEFLAG(Error, error) +PAGEFLAG(Error, error) TESTCLEARFLAG(Error, error) PAGEFLAG(Referenced, referenced) TESTCLEARFLAG(Referenced, referenced) PAGEFLAG(Dirty, dirty) TESTSCFLAG(Dirty, dirty) __CLEARPAGEFLAG(Dirty, dirty) PAGEFLAG(LRU, lru) __CLEARPAGEFLAG(LRU, lru) diff --git a/mm/filemap.c b/mm/filemap.c index 1a3dd591472..b4ad8e36c81 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -298,7 +298,7 @@ int filemap_fdatawait_range(struct address_space *mapping, loff_t start_byte, continue; wait_on_page_writeback(page); - if (PageError(page)) + if (TestClearPageError(page)) ret = -EIO; } pagevec_release(&pvec); -- cgit v1.2.3-70-g09d2 From b009c024ff0059e293c1937516f2defe56263650 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 13 Jan 2011 15:46:07 -0800 Subject: do_wp_page: remove the 'reuse' flag mlocking a shared, writable vma currently causes the corresponding pages to be marked as dirty and queued for writeback. This seems rather unnecessary given that the pages are not actually being modified during mlock. It is understood that for non-shared mappings (file or anon) we want to use a write fault in order to break COW, but there is just no such need for shared mappings. The first two patches in this series do not introduce any behavior change. The intent there is to make it obvious that dirtying file pages is only done in the (writable, shared) case. I think this clarifies the code, but I wouldn't mind dropping these two patches if there is no consensus about them.
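To make the behaviour under discussion concrete, here is a minimal userspace sketch (illustrative only; the file path and length are made up, and the file is assumed to already exist) that triggers the unnecessary dirtying: before this series, mlocking the shared, writable mapping below write-faults every page, marking it dirty and queueing it for writeback even though nothing is ever stored through it.

	#include <fcntl.h>
	#include <sys/mman.h>
	#include <unistd.h>

	int main(void)
	{
		size_t len = 1 << 20;
		int fd = open("/tmp/data", O_RDWR);	/* hypothetical pre-existing 1MB file */
		char *p;

		if (fd < 0)
			return 1;
		p = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (p == MAP_FAILED)
			return 1;
		mlock(p, len);		/* no stores happen, yet the pages get dirtied */
		munlock(p, len);
		munmap(p, len);
		close(fd);
		return 0;
	}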
The last patch is where we actually avoid dirtying shared mappings during mlock. Note that as a side effect of this, we won't call page_mkwrite() for the mappings that define it, and won't be pre-allocating data blocks at the FS level if the mapped file was sparsely allocated. My understanding is that mlock does not need to provide such a guarantee, as evidenced by the fact that it never did for the filesystems that don't define page_mkwrite() - including some common ones like ext3. However, I would like to gather feedback on this from filesystem people as a precaution. If this turns out to be a showstopper, maybe block preallocation can be added back using a different interface. Large shared mlocks are getting significantly (>2x) faster in my tests, as the disk can be fully used for reading the file instead of having to share between this and writeback. This patch: Reorganize the code to remove the 'reuse' flag. No behavior changes. Signed-off-by: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Kosaki Motohiro Cc: Peter Zijlstra Cc: Nick Piggin Cc: Theodore Tso Cc: Michael Rubin Cc: Suleiman Souhlal Cc: Dave Chinner Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 02e48aa0ed1..d0cc1c134a6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2112,7 +2112,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, { struct page *old_page, *new_page; pte_t entry; - int reuse = 0, ret = 0; + int ret = 0; int page_mkwrite = 0; struct page *dirty_page = NULL; @@ -2149,14 +2149,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } page_cache_release(old_page); } - reuse = reuse_swap_page(old_page); - if (reuse) + if (reuse_swap_page(old_page)) { /* * The page is all ours. Move it to our anon_vma so * the rmap code will not search our parent or siblings. * Protected against the rmap code by the page lock. */ page_move_anon_rmap(old_page, vma, address); + unlock_page(old_page); + goto reuse; + } unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { @@ -2220,10 +2222,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } dirty_page = old_page; get_page(dirty_page); - reuse = 1; - } - if (reuse) { reuse: flush_cache_page(vma, address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); -- cgit v1.2.3-70-g09d2 From 72ddc8f72270758951ccefb7d190f364d20215ab Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 13 Jan 2011 15:46:08 -0800 Subject: do_wp_page: clarify dirty_page handling Reorganize the code so that dirty pages are handled closer to the place that makes them dirty (handling write fault into shared, writable VMAs). No behavior changes.
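For orientation, the shape of do_wp_page() after these two reorganizations is roughly the following (a condensed sketch distilled from the diffs, not the verbatim kernel code):

	if (PageAnon(old_page)) {
		if (reuse_swap_page(old_page))
			goto reuse;	/* sole owner: just make the pte writable */
	} else if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) ==
		   (VM_WRITE | VM_SHARED)) {
		/* writable shared mapping: call ->page_mkwrite() if the
		 * filesystem provides one, and remember dirty_page */
		goto reuse;
	}
	/* otherwise: allocate new_page, copy old_page into it (COW) */

reuse:
	/* make the pte writable; the dirty_page balancing and
	 * file_update_time() handling now sit right here */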
Signed-off-by: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Kosaki Motohiro Cc: Peter Zijlstra Cc: Nick Piggin Cc: Theodore Tso Cc: Michael Rubin Cc: Suleiman Souhlal Cc: Dave Chinner Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 72 ++++++++++++++++++++++++++++++++----------------------------- 1 file changed, 38 insertions(+), 34 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index d0cc1c134a6..9144fae9a68 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2229,8 +2229,45 @@ reuse: entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, address, page_table, entry,1)) update_mmu_cache(vma, address, page_table); + pte_unmap_unlock(page_table, ptl); ret |= VM_FAULT_WRITE; - goto unlock; + + if (!dirty_page) + return ret; + + /* + * Yes, Virginia, this is actually required to prevent a race + * with clear_page_dirty_for_io() from clearing the page dirty + * bit after it clear all dirty ptes, but before a racing + * do_wp_page installs a dirty pte. + * + * do_no_page is protected similarly. + */ + if (!page_mkwrite) { + wait_on_page_locked(dirty_page); + set_page_dirty_balance(dirty_page, page_mkwrite); + } + put_page(dirty_page); + if (page_mkwrite) { + struct address_space *mapping = dirty_page->mapping; + + set_page_dirty(dirty_page); + unlock_page(dirty_page); + page_cache_release(dirty_page); + if (mapping) { + /* + * Some device drivers do not set page.mapping + * but still dirty their pages + */ + balance_dirty_pages_ratelimited(mapping); + } + } + + /* file_update_time outside page_lock */ + if (vma->vm_file) + file_update_time(vma->vm_file); + + return ret; } /* @@ -2336,39 +2373,6 @@ gotten: page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); - if (dirty_page) { - /* - * Yes, Virginia, this is actually required to prevent a race - * with clear_page_dirty_for_io() from clearing the page dirty - * bit after it clear all dirty ptes, but before a racing - * do_wp_page installs a dirty pte. - * - * do_no_page is protected similarly. - */ - if (!page_mkwrite) { - wait_on_page_locked(dirty_page); - set_page_dirty_balance(dirty_page, page_mkwrite); - } - put_page(dirty_page); - if (page_mkwrite) { - struct address_space *mapping = dirty_page->mapping; - - set_page_dirty(dirty_page); - unlock_page(dirty_page); - page_cache_release(dirty_page); - if (mapping) { - /* - * Some device drivers do not set page.mapping - * but still dirty their pages - */ - balance_dirty_pages_ratelimited(mapping); - } - } - - /* file_update_time outside page_lock */ - if (vma->vm_file) - file_update_time(vma->vm_file); - } return ret; oom_free_new: page_cache_release(new_page); -- cgit v1.2.3-70-g09d2 From 5ecfda041e4b4bd858d25bbf5a16c2a6c06d7272 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 13 Jan 2011 15:46:09 -0800 Subject: mlock: avoid dirtying pages and triggering writeback When faulting in pages for mlock(), we want to break COW for anonymous or file pages within VM_WRITABLE, non-VM_SHARED vmas. However, there is no need to write-fault into VM_SHARED vmas since shared file pages can be mlocked first and dirtied later, when/if they actually get written to. Skipping the write fault is desirable, as we don't want to unnecessarily cause these pages to be dirtied and queued for writeback. 
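The resulting policy boils down to a single flag test; a minimal sketch of the intent (the helper name is invented for illustration - the real change open-codes this in __mlock_vma_pages_range() and make_pages_present()):

	static unsigned int mlock_gup_flags(struct vm_area_struct *vma)
	{
		unsigned int gup_flags = FOLL_TOUCH | FOLL_GET;

		/*
		 * Write-fault only where a write would COW: VM_WRITE set and
		 * VM_SHARED clear. Shared file pages can be mlocked read-only
		 * and dirtied later, when/if they are actually written to.
		 */
		if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE)
			gup_flags |= FOLL_WRITE;
		return gup_flags;
	}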
Signed-off-by: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Kosaki Motohiro Cc: Peter Zijlstra Cc: Nick Piggin Cc: Theodore Tso Cc: Michael Rubin Cc: Suleiman Souhlal Cc: Dave Chinner Cc: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 7 ++++++- mm/mlock.c | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 9144fae9a68..b8f97b8575b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3299,7 +3299,12 @@ int make_pages_present(unsigned long addr, unsigned long end) vma = find_vma(current->mm, addr); if (!vma) return -ENOMEM; - write = (vma->vm_flags & VM_WRITE) != 0; + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + write = (vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE; BUG_ON(addr >= end); BUG_ON(end > vma->vm_end); len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; diff --git a/mm/mlock.c b/mm/mlock.c index b70919ce4f7..4f318642fbb 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -171,7 +171,12 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); gup_flags = FOLL_TOUCH | FOLL_GET; - if (vma->vm_flags & VM_WRITE) + /* + * We want to touch writable mappings with a write fault in order + * to break COW, except for shared mappings because these don't COW + * and we would not want to dirty them for nothing. + */ + if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; /* We don't try to access the guard page of a stack vma */ -- cgit v1.2.3-70-g09d2 From fed067da46ad3b9acedaf794a5f05d0bc153280b Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 13 Jan 2011 15:46:10 -0800 Subject: mlock: only hold mmap_sem in shared mode when faulting in pages Currently mlock() holds mmap_sem in exclusive mode while the pages get faulted in. In the case of a large mlock, this can potentially take a very long time, during which various commands such as 'ps auxw' will block. This makes sysadmins unhappy:

real    14m36.232s
user    0m0.003s
sys     0m0.015s

(output from 'time ps auxw' while a 20GB file was being mlocked without being previously preloaded into page cache) I propose that mlock() could release mmap_sem after the VM_LOCKED bits have been set in all appropriate VMAs. Then a second pass could be done to actually mlock the pages, in small batches, releasing mmap_sem when we block on disk access or when we detect some contention. This patch: Before this change, mlock() holds mmap_sem in exclusive mode while the pages get faulted in. In the case of a large mlock, this can potentially take a very long time. Various things will block while mmap_sem is held, including 'ps auxw'. This can make sysadmins angry. I propose that mlock() could release mmap_sem after the VM_LOCKED bits have been set in all appropriate VMAs. Then a second pass could be done to actually mlock the pages with mmap_sem held for reads only. We need to recheck the vma flags after we re-acquire mmap_sem, but this is easy. In the case where a vma has been munlocked before mlock completes, pages that were already marked as PageMlocked() are handled by the munlock() call, and mlock() is careful to not mark new page batches as PageMlocked() after the munlock() call has cleared the VM_LOCKED vma flags.
So, the end result will be identical to what'd happen if munlock() had executed after the mlock() call. In a later change, I will allow the second pass to release mmap_sem when blocking on disk accesses or when it is otherwise contended, so that it won't be held for long periods of time even in shared mode. Signed-off-by: Michel Lespinasse Tested-by: Valdis Kletnieks Cc: Hugh Dickins Cc: Rik van Riel Cc: Peter Zijlstra Cc: Nick Piggin Cc: KOSAKI Motohiro Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 81 +++++++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 4f318642fbb..67b3dd8616d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -377,18 +377,10 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, int ret = 0; int lock = newflags & VM_LOCKED; - if (newflags == vma->vm_flags || - (vma->vm_flags & (VM_IO | VM_PFNMAP))) + if (newflags == vma->vm_flags || (vma->vm_flags & VM_SPECIAL) || + is_vm_hugetlb_page(vma) || vma == get_gate_vma(current)) goto out; /* don't set VM_LOCKED, don't count */ - if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || - is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current)) { - if (lock) - make_pages_present(start, end); - goto out; /* don't set VM_LOCKED, don't count */ - } - pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma)); @@ -424,14 +416,10 @@ success: * set VM_LOCKED, __mlock_vma_pages_range will bring it back. */ - if (lock) { + if (lock) vma->vm_flags = newflags; - ret = __mlock_vma_pages_range(vma, start, end); - if (ret < 0) - ret = __mlock_posix_error_return(ret); - } else { + else munlock_vma_pages_range(vma, start, end); - } out: *prev = vma; @@ -444,7 +432,8 @@ static int do_mlock(unsigned long start, size_t len, int on) struct vm_area_struct * vma, * prev; int error; - len = PAGE_ALIGN(len); + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; if (end < start) return -EINVAL; @@ -487,6 +476,58 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } +static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) +{ + struct mm_struct *mm = current->mm; + unsigned long end, nstart, nend; + struct vm_area_struct *vma = NULL; + int ret = 0; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(len != PAGE_ALIGN(len)); + end = start + len; + + down_read(&mm->mmap_sem); + for (nstart = start; nstart < end; nstart = nend) { + /* + * We want to fault in pages for [nstart; end) address range. + * Find first corresponding VMA. + */ + if (!vma) + vma = find_vma(mm, nstart); + else + vma = vma->vm_next; + if (!vma || vma->vm_start >= end) + break; + /* + * Set [nstart; nend) to intersection of desired address + * range with the first VMA. Also, skip undesirable VMA types. + */ + nend = min(end, vma->vm_end); + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + continue; + if (nstart < vma->vm_start) + nstart = vma->vm_start; + /* + * Now fault in a range of pages within the first VMA. 
+ */ + if (vma->vm_flags & VM_LOCKED) { + ret = __mlock_vma_pages_range(vma, nstart, nend); + if (ret < 0 && ignore_errors) { + ret = 0; + continue; /* continue at next VMA */ + } + if (ret) { + ret = __mlock_posix_error_return(ret); + break; + } + } else + make_pages_present(nstart, nend); + } + up_read(&mm->mmap_sem); + return ret; /* 0 or negative error code */ +} + SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { unsigned long locked; @@ -512,6 +553,8 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) if ((locked <= lock_limit) || capable(CAP_IPC_LOCK)) error = do_mlock(start, len, 1); up_write(&current->mm->mmap_sem); + if (!error) + error = do_mlock_pages(start, len, 0); return error; } @@ -576,6 +619,10 @@ SYSCALL_DEFINE1(mlockall, int, flags) capable(CAP_IPC_LOCK)) ret = do_mlockall(flags); up_write(&current->mm->mmap_sem); + if (!ret && (flags & MCL_CURRENT)) { + /* Ignore errors */ + do_mlock_pages(0, TASK_SIZE, 1); + } out: return ret; } -- cgit v1.2.3-70-g09d2 From 110d74a921f4d272b47ef6104fcf937df808f4c8 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 13 Jan 2011 15:46:11 -0800 Subject: mm: add FOLL_MLOCK follow_page flag. Move the code to mlock pages from __mlock_vma_pages_range() to follow_page(). This allows __mlock_vma_pages_range() to not have to break down work into 16-page batches. An additional motivation for doing this within the present patch series is that it'll make it easier for a later change to drop mmap_sem when blocking on disk (we'd like to be able to resume at the page that was read from disk instead of at the start of a 16-page batch). Signed-off-by: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Peter Zijlstra Cc: Nick Piggin Cc: KOSAKI Motohiro Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/memory.c | 22 ++++++++++++++++++ mm/mlock.c | 65 +++++------------------------------------------------ 3 files changed, 28 insertions(+), 60 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 721f451c302..9ade803bcc1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1415,6 +1415,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_GET 0x04 /* do get_page on page */ #define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ +#define FOLL_MLOCK 0x40 /* mark page as mlocked */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/mm/memory.c b/mm/memory.c index b8f97b8575b..15e1f19a3b1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1310,6 +1310,28 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, */ mark_page_accessed(page); } + if (flags & FOLL_MLOCK) { + /* + * The preliminary mapping check is mainly to avoid the + * pointless overhead of lock_page on the ZERO_PAGE + * which might bounce very badly if there is contention. + * + * If the page is already locked, we don't need to + * handle it now - vmscan will handle it later if and + * when it attempts to reclaim the page. + */ + if (page->mapping && trylock_page(page)) { + lru_add_drain(); /* push cached pages to LRU */ + /* + * Because we lock page here and migration is + * blocked by the pte's page reference, we need + * only check for file-cache page truncation.
+ */ + if (page->mapping) + mlock_vma_page(page); + unlock_page(page); + } + } unlock: pte_unmap_unlock(ptep, ptl); out: diff --git a/mm/mlock.c b/mm/mlock.c index 67b3dd8616d..25cc9e88c54 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -159,10 +159,9 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; - struct page *pages[16]; /* 16 gives a reasonable batch */ int nr_pages = (end - start) / PAGE_SIZE; - int ret = 0; int gup_flags; + int ret; VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(end & ~PAGE_MASK); @@ -170,7 +169,7 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(end > vma->vm_end); VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - gup_flags = FOLL_TOUCH | FOLL_GET; + gup_flags = FOLL_TOUCH | FOLL_MLOCK; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW @@ -185,63 +184,9 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, nr_pages--; } - while (nr_pages > 0) { - int i; - - cond_resched(); - - /* - * get_user_pages makes pages present if we are - * setting mlock. and this extra reference count will - * disable migration of this page. However, page may - * still be truncated out from under us. - */ - ret = __get_user_pages(current, mm, addr, - min_t(int, nr_pages, ARRAY_SIZE(pages)), - gup_flags, pages, NULL); - /* - * This can happen for, e.g., VM_NONLINEAR regions before - * a page has been allocated and mapped at a given offset, - * or for addresses that map beyond end of a file. - * We'll mlock the pages if/when they get faulted in. - */ - if (ret < 0) - break; - - lru_add_drain(); /* push cached pages to LRU */ - - for (i = 0; i < ret; i++) { - struct page *page = pages[i]; - - if (page->mapping) { - /* - * That preliminary check is mainly to avoid - * the pointless overhead of lock_page on the - * ZERO_PAGE: which might bounce very badly if - * there is contention. However, we're still - * dirtying its cacheline with get/put_page: - * we'll add another __get_user_pages flag to - * avoid it if that case turns out to matter. - */ - lock_page(page); - /* - * Because we lock page here and migration is - * blocked by the elevated reference, we need - * only check for file-cache page truncation. - */ - if (page->mapping) - mlock_vma_page(page); - unlock_page(page); - } - put_page(page); /* ref from get_user_pages() */ - } - - addr += ret * PAGE_SIZE; - nr_pages -= ret; - ret = 0; - } - - return ret; /* 0 or negative error code */ + ret = __get_user_pages(current, mm, addr, nr_pages, gup_flags, + NULL, NULL); + return max(ret, 0); /* 0 or negative error code */ } /* -- cgit v1.2.3-70-g09d2 From 5fdb2002131cd4e210b9638a4fc932ec7be491d1 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 13 Jan 2011 15:46:12 -0800 Subject: mm: move VM_LOCKED check to __mlock_vma_pages_range() Use a single code path for faulting in pages during mlock. The reason to have it in this patch series is that I did not want to update both code paths in a later change that releases mmap_sem when blocking on disk. Signed-off-by: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Peter Zijlstra Cc: Nick Piggin Cc: KOSAKI Motohiro Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mlock.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 25cc9e88c54..84da66b7bbf 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -169,7 +169,7 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, VM_BUG_ON(end > vma->vm_end); VM_BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); - gup_flags = FOLL_TOUCH | FOLL_MLOCK; + gup_flags = FOLL_TOUCH; /* * We want to touch writable mappings with a write fault in order * to break COW, except for shared mappings because these don't COW @@ -178,6 +178,9 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; + if (vma->vm_flags & VM_LOCKED) + gup_flags |= FOLL_MLOCK; + /* We don't try to access the guard page of a stack vma */ if (stack_guard_page(vma, start)) { addr += PAGE_SIZE; @@ -456,18 +459,15 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) /* * Now fault in a range of pages within the first VMA. */ - if (vma->vm_flags & VM_LOCKED) { - ret = __mlock_vma_pages_range(vma, nstart, nend); - if (ret < 0 && ignore_errors) { - ret = 0; - continue; /* continue at next VMA */ - } - if (ret) { - ret = __mlock_posix_error_return(ret); - break; - } - } else - make_pages_present(nstart, nend); + ret = __mlock_vma_pages_range(vma, nstart, nend); + if (ret < 0 && ignore_errors) { + ret = 0; + continue; /* continue at next VMA */ + } + if (ret) { + ret = __mlock_posix_error_return(ret); + break; + } } up_read(&mm->mmap_sem); return ret; /* 0 or negative error code */ -- cgit v1.2.3-70-g09d2 From 53a7706d5ed8f1a53ba062b318773160cc476dde Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 13 Jan 2011 15:46:14 -0800 Subject: mlock: do not hold mmap_sem for extended periods of time __get_user_pages gets a new 'nonblocking' parameter to signal that the caller is prepared to re-acquire mmap_sem and retry the operation if needed. This is used to split off long operations if they are going to block on a disk transfer, or when we detect contention on the mmap_sem. [akpm@linux-foundation.org: remove ref to rwsem_is_contended()] Signed-off-by: Michel Lespinasse Cc: Hugh Dickins Cc: Rik van Riel Cc: Peter Zijlstra Cc: Nick Piggin Cc: KOSAKI Motohiro Cc: Ingo Molnar Cc: "H. 
Peter Anvin" Cc: Thomas Gleixner Cc: David Howells Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 3 ++- mm/memory.c | 23 ++++++++++++++++++----- mm/mlock.c | 40 +++++++++++++++++++++++----------------- mm/nommu.c | 6 ++++-- 4 files changed, 47 insertions(+), 25 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index dedb0aff673..bd4f581f624 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -243,7 +243,8 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas); + struct page **pages, struct vm_area_struct **vmas, + int *nonblocking); #define ZONE_RECLAIM_NOSCAN -2 #define ZONE_RECLAIM_FULL -1 diff --git a/mm/memory.c b/mm/memory.c index 15e1f19a3b1..1bbe9a22429 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1363,7 +1363,8 @@ no_page_table: int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, unsigned int gup_flags, - struct page **pages, struct vm_area_struct **vmas) + struct page **pages, struct vm_area_struct **vmas, + int *nonblocking) { int i; unsigned long vm_flags; @@ -1463,10 +1464,15 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, cond_resched(); while (!(page = follow_page(vma, start, foll_flags))) { int ret; + unsigned int fault_flags = 0; + + if (foll_flags & FOLL_WRITE) + fault_flags |= FAULT_FLAG_WRITE; + if (nonblocking) + fault_flags |= FAULT_FLAG_ALLOW_RETRY; ret = handle_mm_fault(mm, vma, start, - (foll_flags & FOLL_WRITE) ? - FAULT_FLAG_WRITE : 0); + fault_flags); if (ret & VM_FAULT_ERROR) { if (ret & VM_FAULT_OOM) @@ -1482,6 +1488,11 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, else tsk->min_flt++; + if (ret & VM_FAULT_RETRY) { + *nonblocking = 0; + return i; + } + /* * The VM_FAULT_WRITE bit tells us that * do_wp_page has broken COW when necessary, @@ -1581,7 +1592,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, + NULL); } EXPORT_SYMBOL(get_user_pages); @@ -1606,7 +1618,8 @@ struct page *get_dump_page(unsigned long addr) struct page *page; if (__get_user_pages(current, current->mm, addr, 1, - FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma) < 1) + FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma, + NULL) < 1) return NULL; flush_cache_page(vma, addr, page_to_pfn(page)); return page; diff --git a/mm/mlock.c b/mm/mlock.c index 84da66b7bbf..13e81ee8be9 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -155,13 +155,13 @@ static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long add * vma->vm_mm->mmap_sem must be held for at least read. 
*/ static long __mlock_vma_pages_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + int *nonblocking) { struct mm_struct *mm = vma->vm_mm; unsigned long addr = start; int nr_pages = (end - start) / PAGE_SIZE; int gup_flags; - int ret; VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(end & ~PAGE_MASK); @@ -187,9 +187,8 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, nr_pages--; } - ret = __get_user_pages(current, mm, addr, nr_pages, gup_flags, - NULL, NULL); - return max(ret, 0); /* 0 or negative error code */ + return __get_user_pages(current, mm, addr, nr_pages, gup_flags, + NULL, NULL, nonblocking); } /* @@ -233,7 +232,7 @@ long mlock_vma_pages_range(struct vm_area_struct *vma, is_vm_hugetlb_page(vma) || vma == get_gate_vma(current))) { - __mlock_vma_pages_range(vma, start, end); + __mlock_vma_pages_range(vma, start, end, NULL); /* Hide errors from mmap() and other callers */ return 0; @@ -429,21 +428,23 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) struct mm_struct *mm = current->mm; unsigned long end, nstart, nend; struct vm_area_struct *vma = NULL; + int locked = 0; int ret = 0; VM_BUG_ON(start & ~PAGE_MASK); VM_BUG_ON(len != PAGE_ALIGN(len)); end = start + len; - down_read(&mm->mmap_sem); for (nstart = start; nstart < end; nstart = nend) { /* * We want to fault in pages for [nstart; end) address range. * Find first corresponding VMA. */ - if (!vma) + if (!locked) { + locked = 1; + down_read(&mm->mmap_sem); vma = find_vma(mm, nstart); - else + } else if (nstart >= vma->vm_end) vma = vma->vm_next; if (!vma || vma->vm_start >= end) break; @@ -457,19 +458,24 @@ static int do_mlock_pages(unsigned long start, size_t len, int ignore_errors) if (nstart < vma->vm_start) nstart = vma->vm_start; /* - * Now fault in a range of pages within the first VMA. + * Now fault in a range of pages. __mlock_vma_pages_range() + * double checks the vma flags, so that it won't mlock pages + * if the vma was already munlocked. 
*/ + ret = __mlock_vma_pages_range(vma, nstart, nend, &locked); + if (ret < 0) { + if (ignore_errors) { + ret = 0; + continue; /* continue at next VMA */ + } + ret = __mlock_posix_error_return(ret); + break; + } + nend = nstart + ret * PAGE_SIZE; + ret = 0; } - up_read(&mm->mmap_sem); + if (locked) + up_read(&mm->mmap_sem); return ret; /* 0 or negative error code */ } diff --git a/mm/nommu.c b/mm/nommu.c index ef4045d010d..f59e1424d3d 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -127,7 +127,8 @@ unsigned int kobjsize(const void *objp) int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int nr_pages, unsigned int foll_flags, - struct page **pages, struct vm_area_struct **vmas) + struct page **pages, struct vm_area_struct **vmas, + int *retry) { struct vm_area_struct *vma; unsigned long vm_flags; @@ -185,7 +186,8 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, if (force) flags |= FOLL_FORCE; - return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas); + return __get_user_pages(tsk, mm, start, nr_pages, flags, pages, vmas, + NULL); } EXPORT_SYMBOL(get_user_pages); -- cgit v1.2.3-70-g09d2 From 1e50df39f6e2c3a4a3394df62baa8a213df16c54 Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Thu, 13 Jan 2011 15:46:14 -0800 Subject: mempolicy: remove tasklist_lock from migrate_pages Today, tasklist_lock in migrate_pages doesn't protect anything. rcu_read_lock() provides enough protection for the pid hash walk. Signed-off-by: KOSAKI Motohiro Reported-by: Peter Zijlstra Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7ee55af8d79..e6d351265ae 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1310,16 +1310,13 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, /* Find the mm_struct */ rcu_read_lock(); - read_lock(&tasklist_lock); task = pid ? find_task_by_vpid(pid) : current; if (!task) { - read_unlock(&tasklist_lock); rcu_read_unlock(); err = -ESRCH; goto out; } mm = get_task_mm(task); - read_unlock(&tasklist_lock); rcu_read_unlock(); err = -EINVAL; -- cgit v1.2.3-70-g09d2 From ddf9c6d472825ceda66b3adff0f6437dbcd37f71 Mon Sep 17 00:00:00 2001 From: Tobias Klauser Date: Thu, 13 Jan 2011 15:46:15 -0800 Subject: vmalloc: remove redundant unlikely() IS_ERR() already implies unlikely(), so it can be omitted here.
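For reference, the err.h helpers of this era already carry the hint; a sketch of the relevant include/linux/err.h definitions (reproduced from memory - check the tree for the exact wording):

	#define MAX_ERRNO	4095
	#define IS_ERR_VALUE(x)	unlikely((x) >= (unsigned long)-MAX_ERRNO)

	static inline long __must_check IS_ERR(const void *ptr)
	{
		return IS_ERR_VALUE((unsigned long)ptr);
	}

so wrapping an IS_ERR() call in another unlikely() just applies the same branch hint twice.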
Signed-off-by: Tobias Klauser Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 284346ee0e9..cac13b41563 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -748,7 +748,7 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, VMALLOC_START, VMALLOC_END, node, gfp_mask); - if (unlikely(IS_ERR(va))) { + if (IS_ERR(va)) { kfree(vb); return ERR_CAST(va); } -- cgit v1.2.3-70-g09d2 From c585a2678d83ba8fb02fa6b197de0ac7d67377f1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 13 Jan 2011 15:46:18 -0800 Subject: mm: remove likely() from grab_cache_page_write_begin() Running the annotated branch profiler on a box doing average work (firefox, evolution, xchat, distcc farm), the likely() used in grab_cache_page_write_begin() was incorrect most of the time:

 correct  incorrect   %   Function                     File        Line
 -------  ---------   -   --------                     ----        ----
 1924262   71332401  97   grab_cache_page_write_begin  filemap.c   2206

Adding a trace_printk() and running the function tracer limited to just this function I can see:

gconfd-2-2696 [000] 4467.268935: grab_cache_page_write_begin: page= (null) mapping=ffff8800676a9460 index=7
gconfd-2-2696 [000] 4467.268946: grab_cache_page_write_begin <-ext3_write_begin
gconfd-2-2696 [000] 4467.268947: grab_cache_page_write_begin: page= (null) mapping=ffff8800676a9460 index=8
gconfd-2-2696 [000] 4467.268959: grab_cache_page_write_begin <-ext3_write_begin
gconfd-2-2696 [000] 4467.268960: grab_cache_page_write_begin: page= (null) mapping=ffff8800676a9460 index=9
gconfd-2-2696 [000] 4467.268972: grab_cache_page_write_begin <-ext3_write_begin
gconfd-2-2696 [000] 4467.268973: grab_cache_page_write_begin: page= (null) mapping=ffff8800676a9460 index=10
gconfd-2-2696 [000] 4467.268991: grab_cache_page_write_begin <-ext3_write_begin
gconfd-2-2696 [000] 4467.268992: grab_cache_page_write_begin: page= (null) mapping=ffff8800676a9460 index=11
gconfd-2-2696 [000] 4467.269005: grab_cache_page_write_begin <-ext3_write_begin

This shows that a lot of calls from ext3_write_begin result in the page returned by "find_lock_page" being NULL. Signed-off-by: Steven Rostedt Acked-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index b4ad8e36c81..83a45d35468 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2227,7 +2227,7 @@ struct page *grab_cache_page_write_begin(struct address_space *mapping, gfp_notmask = __GFP_FS; repeat: page = find_lock_page(mapping, index); - if (likely(page)) + if (page) return page; page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~gfp_notmask); -- cgit v1.2.3-70-g09d2 From 9950474883e027e6e728cbcff25f7f2bf0c96530 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:46:20 -0800 Subject: mm: kswapd: stop high-order balancing when any suitable zone is balanced Simon Kirby reported the following problem:

We're seeing cases on a number of servers where cache never fully grows to use all available memory. Sometimes we see servers with 4 GB of memory that never seem to have less than 1.5 GB free, even with a constantly-active VM. In some cases, these servers also swap out while this happens, even though they are constantly reading the working set into memory.
We have been seeing this happening for a long time; I don't think it's anything recent, and it still happens on 2.6.36. After some debugging work by Simon, Dave Hansen and others, the prevailing theory became that kswapd is being too aggressive about reclaiming the order-3 pages requested by SLUB. There are two apparent problems here. On the target machine, there is a small Normal zone in comparison to DMA32. As kswapd tries to balance all zones, it would continually try reclaiming for Normal even though DMA32 was balanced enough for callers. The second problem is that sleeping_prematurely() does not use the same logic as balance_pgdat() when deciding whether to sleep or not. This keeps kswapd artificially awake. A number of tests were run and the figures from previous postings will look very different for a few reasons. First, the old figures were forcing my network card to use GFP_ATOMIC in an attempt to replicate Simon's problem. Second, I previously specified slub_min_order=3 again in an attempt to reproduce Simon's problem. In this posting, I'm depending on Simon to say whether his problem is fixed or not, and these figures are to show the impact on the ordinary cases. Finally, the "vmscan" figures are taken from /proc/vmstat instead of the tracepoints. There is less information but recording is less disruptive. The first test of relevance was postmark with a process running in the background reading a large amount of anonymous memory in blocks. The objective was to vaguely simulate what was happening on Simon's machine and it's memory intensive enough to keep kswapd awake.

POSTMARK
                                         traceonly          kanyzone
Transactions per second:             156.00 ( 0.00%)   153.00 (-1.96%)
Data megabytes read per second:       21.51 ( 0.00%)    21.52 ( 0.05%)
Data megabytes written per second:    29.28 ( 0.00%)    29.11 (-0.58%)
Files created alone per second:      250.00 ( 0.00%)   416.00 (39.90%)
Files create/transact per second:     79.00 ( 0.00%)    76.00 (-3.95%)
Files deleted alone per second:      520.00 ( 0.00%)   420.00 (-23.81%)
Files delete/transact per second:     79.00 ( 0.00%)    76.00 (-3.95%)

MMTests Statistics: duration
User/Sys Time Running Test (seconds)      16.58      17.4
Total Elapsed Time (seconds)             218.48    222.47

VMstat Reclaim Statistics: vmscan
Direct reclaims                               0          4
Direct reclaim pages scanned                  0        203
Direct reclaim pages reclaimed                0        184
Kswapd pages scanned                     326631     322018
Kswapd pages reclaimed                   312632     309784
Kswapd low wmark quickly                      1          4
Kswapd high wmark quickly                   122        475
Kswapd skip congestion_wait                   1          0
Pages activated                          700040     705317
Pages deactivated                        212113     203922
Pages written                              9875       6363
Total pages scanned                      326631     322221
Total pages reclaimed                    312632     309968
%age total pages scanned/reclaimed       95.71%     96.20%
%age total pages scanned/written          3.02%      1.97%

proc vmstat: Faults
Major Faults                                300        254
Minor Faults                             645183     660284
Page ins                                 493588     486704
Page outs                               4960088    4986704
Swap ins                                   1230        661
Swap outs                                  9869       6355

Performance is mildly affected because kswapd is no longer doing as much work and the background memory consumer process is getting in the way. Note that kswapd scanned and reclaimed fewer pages as it's less aggressive, and overall fewer pages were scanned and reclaimed. Swap in/out is particularly reduced, again reflecting kswapd throwing out fewer pages. The slight performance impact is unfortunate here but it looks like a direct result of kswapd being less aggressive. As the bug report is about too many pages being freed by kswapd, it may have to be accepted for now.
The second test is a streaming IO benchmark that was previously used by Johannes to show regressions in page reclaim.

MICRO
                                      traceonly   kanyzone
User/Sys Time Running Test (seconds)      29.29      28.87
Total Elapsed Time (seconds)             492.18     488.79

VMstat Reclaim Statistics: vmscan
Direct reclaims                            2128       1460
Direct reclaim pages scanned            2284822    1496067
Direct reclaim pages reclaimed           148919     110937
Kswapd pages scanned                   15450014   16202876
Kswapd pages reclaimed                  8503697    8537897
Kswapd low wmark quickly                   3100       3397
Kswapd high wmark quickly                  1860       7243
Kswapd skip congestion_wait                 708        801
Pages activated                            9635       9573
Pages deactivated                          1432       1271
Pages written                               223       1130
Total pages scanned                    17734836   17698943
Total pages reclaimed                   8652616    8648834
%age total pages scanned/reclaimed       48.79%     48.87%
%age total pages scanned/written          0.00%      0.01%

proc vmstat: Faults
Major Faults                                165        221
Minor Faults                            9655785    9656506
Page ins                                   3880       7228
Page outs                              37692940   37480076
Swap ins                                      0         69
Swap outs                                    19         15

Again, fewer pages are scanned and reclaimed, as expected, and this time the test completed faster. Note that kswapd is hitting its watermarks faster (low and high wmark quickly), which I expect is due to kswapd reclaiming fewer pages. I also ran fs-mark, iozone and sysbench but there is nothing interesting to report in the figures. Performance is not significantly changed and the reclaim statistics look reasonable. This patch: When the allocator enters its slow path, kswapd is woken up to balance the node. It continues working until all zones within the node are balanced. For order-0 allocations, this makes perfect sense, but for higher orders it can have unintended side-effects. If the zone sizes are imbalanced, kswapd may reclaim heavily within a smaller zone, discarding an excessive number of pages. The user-visible behaviour is that kswapd is awake and reclaiming even though plenty of pages are free from a suitable zone. This patch alters the "balance" logic for high-order reclaim, allowing kswapd to stop if any suitable zone becomes balanced, to reduce the number of pages it reclaims from other zones. kswapd still tries to ensure that order-0 watermarks for all zones are met before sleeping.
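Schematically, the heart of the change is the exit condition of the balancing loop; a simplified sketch of what the diff below implements (names match the patch, surrounding details elided):

	/* while scanning zones, record what is balanced */
	if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
			       end_zone, 0)) {
		all_zones_ok = 0;
	} else if (i <= classzone_idx) {
		any_zone_ok = 1;	/* a balanced zone usable by the waker */
	}

	/* order-0 must balance every zone; high-order may stop early */
	if (all_zones_ok || (order && any_zone_ok))
		break;	/* kswapd: all done */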
Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Eric B Munson Cc: Simon Kirby Cc: KOSAKI Motohiro Cc: Shaohua Li Cc: Dave Hansen Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 3 ++- mm/page_alloc.c | 8 +++--- mm/vmscan.c | 68 +++++++++++++++++++++++++++++++++++++++++++------- 3 files changed, 66 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 48906629335..dad3612a7e3 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -639,6 +639,7 @@ typedef struct pglist_data { wait_queue_head_t kswapd_wait; struct task_struct *kswapd; int kswapd_max_order; + enum zone_type classzone_idx; } pg_data_t; #define node_present_pages(nid) (NODE_DATA(nid)->node_present_pages) @@ -654,7 +655,7 @@ typedef struct pglist_data { extern struct mutex zonelists_mutex; void build_all_zonelists(void *data); -void wakeup_kswapd(struct zone *zone, int order); +void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx); bool zone_watermark_ok(struct zone *z, int order, unsigned long mark, int classzone_idx, int alloc_flags); bool zone_watermark_ok_safe(struct zone *z, int order, unsigned long mark, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0fd486467b4..c3146091850 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1944,13 +1944,14 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, static inline void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, - enum zone_type high_zoneidx) + enum zone_type high_zoneidx, + enum zone_type classzone_idx) { struct zoneref *z; struct zone *zone; for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) - wakeup_kswapd(zone, order); + wakeup_kswapd(zone, order, classzone_idx); } static inline int @@ -2028,7 +2029,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - wake_all_kswapd(order, zonelist, high_zoneidx); + wake_all_kswapd(order, zonelist, high_zoneidx, + zone_idx(preferred_zone)); /* * OK, we're below the kswapd watermark and have kicked background diff --git a/mm/vmscan.c b/mm/vmscan.c index 7037cc8c60b..3584067800e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2246,11 +2246,14 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) * interoperates with the page allocator fallback scheme to ensure that aging * of pages is balanced across the zones. */ -static unsigned long balance_pgdat(pg_data_t *pgdat, int order) +static unsigned long balance_pgdat(pg_data_t *pgdat, int order, + int classzone_idx) { int all_zones_ok; + int any_zone_ok; int priority; int i; + int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long total_scanned; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { @@ -2273,7 +2276,6 @@ loop_again: count_vm_event(PAGEOUTRUN); for (priority = DEF_PRIORITY; priority >= 0; priority--) { - int end_zone = 0; /* Inclusive. 
0 = ZONE_DMA */ unsigned long lru_pages = 0; int has_under_min_watermark_zone = 0; @@ -2282,6 +2284,7 @@ loop_again: disable_swap_token(); all_zones_ok = 1; + any_zone_ok = 0; /* * Scan in the highmem->dma direction for the highest @@ -2400,10 +2403,12 @@ loop_again: * spectulatively avoid congestion waits */ zone_clear_flag(zone, ZONE_CONGESTED); + if (i <= classzone_idx) + any_zone_ok = 1; } } - if (all_zones_ok) + if (all_zones_ok || (order && any_zone_ok)) break; /* kswapd: all done */ /* * OK, kswapd is getting into trouble. Take a nap, then take @@ -2426,7 +2431,13 @@ loop_again: break; } out: - if (!all_zones_ok) { + + /* + * order-0: All zones must meet high watermark for a balanced node + * high-order: Any zone below pgdats classzone_idx must meet the high + * watermark for a balanced node + */ + if (!(all_zones_ok || (order && any_zone_ok))) { cond_resched(); try_to_freeze(); @@ -2451,6 +2462,36 @@ out: goto loop_again; } + /* + * If kswapd was reclaiming at a higher order, it has the option of + * sleeping without all zones being balanced. Before it does, it must + * ensure that the watermarks for order-0 on *all* zones are met and + * that the congestion flags are cleared. The congestion flag must + * be cleared as kswapd is the only mechanism that clears the flag + * and it is potentially going to sleep here. + */ + if (order) { + for (i = 0; i <= end_zone; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + if (zone->all_unreclaimable && priority != DEF_PRIORITY) + continue; + + /* Confirm the zone is balanced for order-0 */ + if (!zone_watermark_ok(zone, 0, + high_wmark_pages(zone), 0, 0)) { + order = sc.order = 0; + goto loop_again; + } + + /* If balanced, clear the congested flag */ + zone_clear_flag(zone, ZONE_CONGESTED); + } + } + return sc.nr_reclaimed; } @@ -2514,6 +2555,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order) static int kswapd(void *p) { unsigned long order; + int classzone_idx; pg_data_t *pgdat = (pg_data_t*)p; struct task_struct *tsk = current; @@ -2544,21 +2586,27 @@ static int kswapd(void *p) set_freezable(); order = 0; + classzone_idx = MAX_NR_ZONES - 1; for ( ; ; ) { unsigned long new_order; + int new_classzone_idx; int ret; new_order = pgdat->kswapd_max_order; + new_classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; - if (order < new_order) { + pgdat->classzone_idx = MAX_NR_ZONES - 1; + if (order < new_order || classzone_idx > new_classzone_idx) { /* * Don't sleep if someone wants a larger 'order' - * allocation + * allocation or has tigher zone constraints */ order = new_order; + classzone_idx = new_classzone_idx; } else { kswapd_try_to_sleep(pgdat, order); order = pgdat->kswapd_max_order; + classzone_idx = pgdat->classzone_idx; } ret = try_to_freeze(); @@ -2571,7 +2619,7 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balance_pgdat(pgdat, order); + balance_pgdat(pgdat, order, classzone_idx); } } return 0; @@ -2580,7 +2628,7 @@ static int kswapd(void *p) /* * A zone is low on free memory, so wake its kswapd task to service it. 
*/ -void wakeup_kswapd(struct zone *zone, int order) +void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) { pg_data_t *pgdat; @@ -2590,8 +2638,10 @@ void wakeup_kswapd(struct zone *zone, int order) if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) return; pgdat = zone->zone_pgdat; - if (pgdat->kswapd_max_order < order) + if (pgdat->kswapd_max_order < order) { pgdat->kswapd_max_order = order; + pgdat->classzone_idx = min(pgdat->classzone_idx, classzone_idx); + } if (!waitqueue_active(&pgdat->kswapd_wait)) return; if (zone_watermark_ok_safe(zone, order, low_wmark_pages(zone), 0, 0)) -- cgit v1.2.3-70-g09d2 From 1741c87757448cedd03224f01586504f9256415d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:46:21 -0800 Subject: mm: kswapd: keep kswapd awake for high-order allocations until a percentage of the node is balanced When reclaiming for high-orders, kswapd is responsible for balancing a node but it should not reclaim excessively. It avoids excessive reclaim by treating the node as balanced once any zone in it is balanced. In the cases where there are imbalanced zone sizes (e.g. ZONE_DMA with both ZONE_DMA32 and ZONE_NORMAL), kswapd can go to sleep prematurely as just one small zone was balanced. This alters the sleep logic of kswapd slightly. It counts the number of pages that make up the balanced zones. If the total number of balanced pages is more than a quarter of the node's pages, kswapd will go back to sleep. This should keep a node balanced without reclaiming an excessive number of pages. Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Eric B Munson Cc: Simon Kirby Cc: KOSAKI Motohiro Cc: Shaohua Li Cc: Dave Hansen Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 58 +++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 3584067800e..d3488828331 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2198,10 +2198,40 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, } #endif +/* + * pgdat_balanced is used when checking if a node is balanced for high-order + * allocations. Only zones that meet watermarks and are in a zone allowed + * by the callers classzone_idx are added to balanced_pages. The total of + * balanced pages must be at least 25% of the zones allowed by classzone_idx + * for the node to be considered balanced. Forcing all zones to be balanced + * for high orders can cause excessive reclaim when there are imbalanced zones. + * The choice of 25% is due to + * o a 16M DMA zone that is balanced will not balance a zone on any + * reasonable sized machine + * o On all other machines, the top zone must be at least a reasonable + * precentage of the middle zones. For example, on 32-bit x86, highmem + * would need to be at least 256M for it to be balance a whole node. + * Similarly, on x86-64 the Normal zone would need to be at least 1G + * to balance a node on its own. These seemed like reasonable ratios. + */ +static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, + int classzone_idx) +{ + unsigned long present_pages = 0; + int i; + + for (i = 0; i <= classzone_idx; i++) + present_pages += pgdat->node_zones[i].present_pages; + + return balanced_pages > (present_pages >> 2); +} + /* is kswapd sleeping prematurely?
*/ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) { int i; + unsigned long balanced = 0; + bool all_zones_ok = true; /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ if (remaining) @@ -2219,10 +2249,20 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) - return 1; + all_zones_ok = false; + else + balanced += zone->present_pages; } - return 0; + /* + * For high-order requests, the balanced zones must contain at least + * 25% of the nodes pages for kswapd to sleep. For order-0, all zones + * must be balanced + */ + if (order) + return pgdat_balanced(pgdat, balanced, 0); + else + return !all_zones_ok; } /* @@ -2250,7 +2290,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) { int all_zones_ok; - int any_zone_ok; + unsigned long balanced; int priority; int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ @@ -2284,7 +2324,7 @@ loop_again: disable_swap_token(); all_zones_ok = 1; - any_zone_ok = 0; + balanced = 0; /* * Scan in the highmem->dma direction for the highest @@ -2404,11 +2444,11 @@ loop_again: */ zone_clear_flag(zone, ZONE_CONGESTED); if (i <= classzone_idx) - any_zone_ok = 1; + balanced += zone->present_pages; } } - if (all_zones_ok || (order && any_zone_ok)) + if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, classzone_idx))) break; /* kswapd: all done */ /* * OK, kswapd is getting into trouble. Take a nap, then take @@ -2434,10 +2474,10 @@ out: /* * order-0: All zones must meet high watermark for a balanced node - * high-order: Any zone below pgdats classzone_idx must meet the high - * watermark for a balanced node + * high-order: Balanced zones must make up at least 25% of the node + * for the node to be balanced */ - if (!(all_zones_ok || (order && any_zone_ok))) { + if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, classzone_idx)))) { cond_resched(); try_to_freeze(); -- cgit v1.2.3-70-g09d2 From 0abdee2bd4118366c62349a304f81537be69af33 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:46:22 -0800 Subject: mm: kswapd: use the order that kswapd was reclaiming at for sleeping_prematurely() Before kswapd goes to sleep, it uses sleeping_prematurely() to check if there was a race pushing a zone below its watermark. If the race happened, it stays awake. However, balance_pgdat() can decide to reclaim at order-0 if it decides that high-order reclaim is not working as expected. This information is not passed back to sleeping_prematurely(). The impact is that kswapd remains awake reclaiming pages long after it should have gone to sleep. This patch passes the adjusted order to sleeping_prematurely and uses the same logic as balance_pgdat to decide if it's ok to go to sleep. Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Eric B Munson Cc: Simon Kirby Cc: KOSAKI Motohiro Cc: Shaohua Li Cc: Dave Hansen Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index d3488828331..46711f080f3 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2227,7 +2227,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, } /* is kswapd sleeping prematurely? 
*/ -static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) +static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) { int i; unsigned long balanced = 0; @@ -2237,7 +2237,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ if (remaining) return 1; - /* If after HZ/10, a zone is below the high mark, it's premature */ + /* Check the watermark levels */ for (i = 0; i < pgdat->nr_zones; i++) { struct zone *zone = pgdat->node_zones + i; @@ -2269,7 +2269,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). * - * Returns the number of pages which were actually freed. + * Returns the final order kswapd was reclaiming at * * There is special handling here for zones which are full of pinned pages. * This can happen if the pages are all mlocked, or if they are all used by @@ -2532,7 +2532,13 @@ out: } } - return sc.nr_reclaimed; + /* + * Return the order we were reclaiming at so sleeping_prematurely() + * makes a decision on the order we were last reclaiming at. However, + * if another caller entered the allocator slow path while kswapd + * was awake, order will remain at the higher level + */ + return order; } static void kswapd_try_to_sleep(pg_data_t *pgdat, int order) @@ -2659,7 +2665,7 @@ static int kswapd(void *p) */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - balance_pgdat(pgdat, order, classzone_idx); + order = balance_pgdat(pgdat, order, classzone_idx); } } return 0; -- cgit v1.2.3-70-g09d2 From 4d40502ea580c35414a1466d86f96484910ebaec Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:46:23 -0800 Subject: mm: kswapd: reset kswapd_max_order and classzone_idx after reading When kswapd wakes up, it reads its order and classzone from pgdat and calls balance_pgdat. While it's awake, it potentially reclaims at a high order and a low classzone index. This might have been a once-off that was not required by subsequent callers. However, because the pgdat values were not reset, they remain artificially high while balance_pgdat() is running and potentially kswapd enters a second unnecessary reclaim cycle. Reset the pgdat order and classzone index after reading. Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: KAMEZAWA Hiroyuki Reviewed-by: Eric B Munson Cc: Simon Kirby Cc: KOSAKI Motohiro Cc: Shaohua Li Cc: Dave Hansen Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 46711f080f3..dafb9d91b60 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2653,6 +2653,8 @@ static int kswapd(void *p) kswapd_try_to_sleep(pgdat, order); order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; + pgdat->kswapd_max_order = 0; + pgdat->classzone_idx = MAX_NR_ZONES - 1; } ret = try_to_freeze(); -- cgit v1.2.3-70-g09d2 From 355b09c47a0cbb73b3e65a57c03f157f2e7ddb0b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:46:24 -0800 Subject: mm: kswapd: treat zone->all_unreclaimable in sleeping_prematurely similar to balance_pgdat() After DEF_PRIORITY, balance_pgdat() considers all_unreclaimable zones to be balanced but sleeping_prematurely does not. This can force kswapd to stay awake longer than it should. This patch fixes it.
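Concretely, the sleep test these kswapd patches build up reduces to the arithmetic below. This is a minimal userspace sketch, not kernel code: the struct, the zone sizes, and node_balanced_for_sleep() are invented stand-ins, with unreclaimable zones credited as balanced (as this patch requires) and the 25% threshold taken from pgdat_balanced().

#include <stdbool.h>
#include <stdio.h>

struct fake_zone {
	unsigned long present_pages;	/* zone size in pages */
	bool all_unreclaimable;		/* skipped after DEF_PRIORITY */
	bool watermark_ok;		/* stand-in for zone_watermark_ok_safe() */
};

/* High-order sleep test: balanced zones must cover more than 25% of the node */
static bool node_balanced_for_sleep(struct fake_zone *zones, int nr_zones)
{
	unsigned long balanced = 0, present = 0;
	int i;

	for (i = 0; i < nr_zones; i++) {
		present += zones[i].present_pages;
		if (zones[i].all_unreclaimable || zones[i].watermark_ok)
			balanced += zones[i].present_pages;
	}
	return balanced > (present >> 2);
}

int main(void)
{
	/* hypothetical node: tiny balanced DMA zone, large unbalanced Normal zone */
	struct fake_zone zones[] = {
		{ 4096,   false, true  },	/* 16M DMA, watermark met */
		{ 262144, false, false },	/* 1G Normal, below watermark */
	};

	printf("kswapd may sleep: %d\n", node_balanced_for_sleep(zones, 2));
	return 0;
}

With these numbers the balanced 16M DMA zone covers well under a quarter of the node, so the test prints 0 and kswapd stays awake -- exactly the premature-sleep case the 25% rule is meant to close.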
Signed-off-by: Mel Gorman Reviewed-by: Eric B Munson Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: Simon Kirby Cc: KOSAKI Motohiro Cc: Shaohua Li Cc: Dave Hansen Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index dafb9d91b60..388a0447b8e 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2244,8 +2244,16 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) if (!populated_zone(zone)) continue; - if (zone->all_unreclaimable) + /* + * balance_pgdat() skips over all_unreclaimable after + * DEF_PRIORITY. Effectively, it considers them balanced so + * they must be considered balanced here as well if kswapd + * is to sleep + */ + if (zone->all_unreclaimable) { + balanced += zone->present_pages; continue; + } if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) -- cgit v1.2.3-70-g09d2 From dc83edd941f412e938841b4989be24aa288a1aa6 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:46:26 -0800 Subject: mm: kswapd: use the classzone idx that kswapd was using for sleeping_prematurely() When kswapd is woken up for a high-order allocation, it takes into account the highest zone usable by the caller (the classzone idx). During allocation, this index is used to select the lowmem_reserve[] that should be applied to the watermark calculation in zone_watermark_ok(). When balancing a node, kswapd considers the highest unbalanced zone to be the classzone index. This will always be at least the caller's classzone_idx and can be higher. However, sleeping_prematurely() always considers the lowest zone (e.g. ZONE_DMA) to be the classzone index. This means that sleeping_prematurely() can consider a zone to be balanced that is unusable by the allocation request that originally woke kswapd. This patch changes sleeping_prematurely() to use a classzone_idx matching the value it used in balance_pgdat(). Signed-off-by: Mel Gorman Reviewed-by: Minchan Kim Reviewed-by: Eric B Munson Cc: KAMEZAWA Hiroyuki Cc: Simon Kirby Cc: KOSAKI Motohiro Cc: Shaohua Li Cc: Dave Hansen Cc: Johannes Weiner Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 388a0447b8e..cfdef0bcc7a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2227,7 +2227,8 @@ static bool pgdat_balanced(pg_data_t *pgdat, unsigned long balanced_pages, } /* is kswapd sleeping prematurely? 
*/ -static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) +static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining, + int classzone_idx) { int i; unsigned long balanced = 0; @@ -2235,7 +2236,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ if (remaining) - return 1; + return true; /* Check the watermark levels */ for (i = 0; i < pgdat->nr_zones; i++) { @@ -2256,7 +2257,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) } if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), - 0, 0)) + classzone_idx, 0)) all_zones_ok = false; else balanced += zone->present_pages; @@ -2268,7 +2269,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) * must be balanced */ if (order) - return pgdat_balanced(pgdat, balanced, 0); + return pgdat_balanced(pgdat, balanced, classzone_idx); else return !all_zones_ok; } @@ -2295,7 +2296,7 @@ static bool sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) * of pages is balanced across the zones. */ static unsigned long balance_pgdat(pg_data_t *pgdat, int order, - int classzone_idx) + int *classzone_idx) { int all_zones_ok; unsigned long balanced; @@ -2358,6 +2359,7 @@ loop_again: if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), 0, 0)) { end_zone = i; + *classzone_idx = i; break; } } @@ -2451,12 +2453,12 @@ loop_again: * spectulatively avoid congestion waits */ zone_clear_flag(zone, ZONE_CONGESTED); - if (i <= classzone_idx) + if (i <= *classzone_idx) balanced += zone->present_pages; } } - if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, classzone_idx))) + if (all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx))) break; /* kswapd: all done */ /* * OK, kswapd is getting into trouble. Take a nap, then take @@ -2485,7 +2487,7 @@ out: * high-order: Balanced zones must make up at least 25% of the node * for the node to be balanced */ - if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, classzone_idx)))) { + if (!(all_zones_ok || (order && pgdat_balanced(pgdat, balanced, *classzone_idx)))) { cond_resched(); try_to_freeze(); @@ -2546,10 +2548,11 @@ out: * if another caller entered the allocator slow path while kswapd * was awake, order will remain at the higher level */ + *classzone_idx = end_zone; return order; } -static void kswapd_try_to_sleep(pg_data_t *pgdat, int order) +static void kswapd_try_to_sleep(pg_data_t *pgdat, int order, int classzone_idx) { long remaining = 0; DEFINE_WAIT(wait); @@ -2560,7 +2563,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order) prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); /* Try to sleep for a short interval */ - if (!sleeping_prematurely(pgdat, order, remaining)) { + if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { remaining = schedule_timeout(HZ/10); finish_wait(&pgdat->kswapd_wait, &wait); prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); @@ -2570,7 +2573,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int order) * After a short sleep, check if it was a premature sleep. If not, then * go fully to sleep until explicitly woken up. 
*/ - if (!sleeping_prematurely(pgdat, order, remaining)) { + if (!sleeping_prematurely(pgdat, order, remaining, classzone_idx)) { trace_mm_vmscan_kswapd_sleep(pgdat->node_id); /* @@ -2658,7 +2661,7 @@ static int kswapd(void *p) order = new_order; classzone_idx = new_classzone_idx; } else { - kswapd_try_to_sleep(pgdat, order); + kswapd_try_to_sleep(pgdat, order, classzone_idx); order = pgdat->kswapd_max_order; classzone_idx = pgdat->classzone_idx; pgdat->kswapd_max_order = 0; @@ -2675,7 +2678,7 @@ */ if (!ret) { trace_mm_vmscan_kswapd_wake(pgdat->node_id, order); - order = balance_pgdat(pgdat, order, classzone_idx); + order = balance_pgdat(pgdat, order, &classzone_idx); } } return 0; -- cgit v1.2.3-70-g09d2 From ecb256f815232b35ae8382cff36ca8ce0bbd077e Mon Sep 17 00:00:00 2001 From: "Volodymyr G. Lukiianyk" Date: Thu, 13 Jan 2011 15:46:26 -0800 Subject: mm: set correct numa_zonelist_order string when configured on the kernel command line When the numa_zonelist_order parameter is set to "node" or "zone" on the command line, it still shows as "default" in sysctl. That's because the early_param parsing function changes only the user_zonelist_order variable. Fix this by copying the user-provided string to numa_zonelist_order if it was successfully parsed. Signed-off-by: Volodymyr G Lukiianyk Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c3146091850..c18d1f7cff8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2631,9 +2631,16 @@ static int __parse_numa_zonelist_order(char *s) static __init int setup_numa_zonelist_order(char *s) { - if (s) - return __parse_numa_zonelist_order(s); - return 0; + int ret; + + if (!s) + return 0; + + ret = __parse_numa_zonelist_order(s); + if (ret == 0) + strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN); + + return ret; } early_param("numa_zonelist_order", setup_numa_zonelist_order); -- cgit v1.2.3-70-g09d2 From 240c879f20a605346705be24253bc9fc6fa8a106 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 13 Jan 2011 15:46:27 -0800 Subject: writeback: avoid unnecessary determine_dirtyable_memory call I think determine_dirtyable_memory() is a rather costly function since it needs many atomic reads for gathering zone/global page state. But when we use vm_dirty_bytes && dirty_background_bytes, we don't need that costly calculation. This patch eliminates such unnecessary overhead. NOTE: the newly added if condition might add overhead in the normal path. But it should be _really_ small because we need to access both vm_dirty_bytes and dirty_background_bytes anyway, so it is likely to hit the cache.
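The shape of the optimization is simply "compute the expensive value only on the branches that use it". Below is a hedged userspace sketch of that pattern; the knob names mirror the commit, while expensive_dirtyable_memory() is an invented stand-in for determine_dirtyable_memory() and the page size is hardcoded:

#include <stdio.h>

static unsigned long vm_dirty_bytes = 16 << 20;		/* both byte knobs set... */
static unsigned long dirty_background_bytes = 8 << 20;	/* ...so no global scan needed */
static unsigned long vm_dirty_ratio = 20, dirty_background_ratio = 10;

static unsigned long expensive_dirtyable_memory(void)
{
	puts("gathering global page state");	/* many atomic reads in the kernel */
	return 1024 * 1024;			/* pretend pages */
}

static void dirty_limits(unsigned long *background, unsigned long *dirty)
{
	unsigned long available_memory = 0;

	/* Pay for the scan only when a ratio-based limit actually needs it */
	if (!vm_dirty_bytes || !dirty_background_bytes)
		available_memory = expensive_dirtyable_memory();

	*dirty = vm_dirty_bytes ? vm_dirty_bytes / 4096
				: vm_dirty_ratio * available_memory / 100;
	*background = dirty_background_bytes ? dirty_background_bytes / 4096
				: dirty_background_ratio * available_memory / 100;
}

int main(void)
{
	unsigned long bg, d;

	dirty_limits(&bg, &d);
	printf("background=%lu dirty=%lu pages\n", bg, d);
	return 0;
}

Run as-is, nothing is scanned; zero either _bytes knob and the stand-in scan fires, matching the single added branch in global_dirty_limits() in the hunk below.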
[akpm@linux-foundation.org: fix used-uninitialised warning] Signed-off-by: Minchan Kim Cc: Wu Fengguang Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 28763b8bdbd..2cb01f6ec5d 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -410,9 +410,12 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) { unsigned long background; unsigned long dirty; - unsigned long available_memory = determine_dirtyable_memory(); + unsigned long uninitialized_var(available_memory); struct task_struct *tsk; + if (!vm_dirty_bytes || !dirty_background_bytes) + available_memory = determine_dirtyable_memory(); + if (vm_dirty_bytes) dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); else -- cgit v1.2.3-70-g09d2 From ae52a2adb5afa5ac5ec5fb5c7b24777f84b6c926 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 13 Jan 2011 15:46:28 -0800 Subject: thp: ksm: free swap when swapcache page is replaced When a swapcache page is replaced by a ksm page, it's best to free that swap immediately. Reported-by: Andrea Arcangeli Signed-off-by: Hugh Dickins Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 43bc893470b..b5b907cb0f9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -800,6 +800,8 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); page_remove_rmap(page); + if (!page_mapped(page)) + try_to_free_swap(page); put_page(page); pte_unmap_unlock(ptep, ptl); -- cgit v1.2.3-70-g09d2 From 4e9f64c42d0ba5eb0c78569435ada4c224332ce4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:29 -0800 Subject: thp: fix bad_page to show the real reason the page is bad page_count shows the count of the head page, but the actual check is done on the tail page, so show what is really being checked. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Reviewed-by: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c18d1f7cff8..2a67c3bd403 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5618,7 +5618,7 @@ void dump_page(struct page *page) { printk(KERN_ALERT "page:%p count:%d mapcount:%d mapping:%p index:%#lx\n", - page, page_count(page), page_mapcount(page), + page, atomic_read(&page->_count), page_mapcount(page), page->mapping, page->index); dump_page_flags(page->flags); } -- cgit v1.2.3-70-g09d2 From 9180706344487700b40da9eca5dedd3d11cb33b4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:32 -0800 Subject: thp: alter compound get_page/put_page Alter compound get_page/put_page to keep references on subpages too, in order to allow __split_huge_page_refcount to split an hugepage even while subpages have been pinned by one of the get_user_pages() variants. 
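Stripped of the races that the following patches then close one by one, the new refcounting rule is: a reference taken on a tail page pins the head page as well. A deliberately simplified model -- invented struct, no compound_lock, no concurrent split -- to show just that bookkeeping:

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

struct fake_page {
	atomic_int count;
	struct fake_page *first_page;	/* head of the compound page, if tail */
	bool tail;
};

static void get_page(struct fake_page *page)
{
	atomic_fetch_add(&page->count, 1);
	if (page->tail)			/* pinning a tail also pins the head */
		atomic_fetch_add(&page->first_page->count, 1);
}

static void put_page(struct fake_page *page)
{
	if (page->tail)
		atomic_fetch_sub(&page->first_page->count, 1);
	atomic_fetch_sub(&page->count, 1);
}

int main(void)
{
	struct fake_page head = { 1, NULL, false };	/* head starts pinned */
	struct fake_page tail = { 0, &head, true };

	get_page(&tail);	/* e.g. gup-fast pinning a subpage */
	assert(atomic_load(&head.count) == 2 && atomic_load(&tail.count) == 1);

	put_page(&tail);
	assert(atomic_load(&head.count) == 1 && atomic_load(&tail.count) == 0);
	return 0;
}

Because the per-tail counts stay accurate, a later __split_huge_page_refcount() can hand each subpage its own correct refcount instead of refusing to split while pins exist.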
Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/mm/gup.c | 12 +++++++ arch/x86/mm/gup.c | 12 +++++++ include/linux/mm.h | 24 ++++++++++++-- mm/swap.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++---- 4 files changed, 129 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/arch/powerpc/mm/gup.c b/arch/powerpc/mm/gup.c index d7efdbf640c..fec13200868 100644 --- a/arch/powerpc/mm/gup.c +++ b/arch/powerpc/mm/gup.c @@ -16,6 +16,16 @@ #ifdef __HAVE_ARCH_PTE_SPECIAL +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(atomic_read(&page->_count) < 0); + atomic_inc(&page->_count); +} + /* * The performance critical leaf functions are made noinline otherwise gcc * inlines everything into a single function which results in too much @@ -47,6 +57,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, put_page(page); return 0; } + if (PageTail(page)) + get_huge_page_tail(page); pages[*nr] = page; (*nr)++; diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 738e6593799..06f56fcf9a7 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -105,6 +105,16 @@ static inline void get_head_page_multiple(struct page *page, int nr) atomic_add(nr, &page->_count); } +static inline void get_huge_page_tail(struct page *page) +{ + /* + * __split_huge_page_refcount() cannot run + * from under us. + */ + VM_BUG_ON(atomic_read(&page->_count) < 0); + atomic_inc(&page->_count); +} + static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -128,6 +138,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; + if (PageTail(page)) + get_huge_page_tail(page); (*nr)++; page++; refs++; diff --git a/include/linux/mm.h b/include/linux/mm.h index 3b1754ad878..a2718e1ed58 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -353,9 +353,29 @@ static inline int page_count(struct page *page) static inline void get_page(struct page *page) { - page = compound_head(page); - VM_BUG_ON(atomic_read(&page->_count) == 0); + /* + * Getting a normal page or the head of a compound page + * requires to already have an elevated page->_count. Only if + * we're getting a tail page, the elevated page->_count is + * required only in the head page, so for tail pages the + * bugcheck only verifies that the page->_count isn't + * negative. + */ + VM_BUG_ON(atomic_read(&page->_count) < !PageTail(page)); atomic_inc(&page->_count); + /* + * Getting a tail page will elevate both the head and tail + * page->_count(s). + */ + if (unlikely(PageTail(page))) { + /* + * This is safe only because + * __split_huge_page_refcount can't run under + * get_page(). 
+ */ + VM_BUG_ON(atomic_read(&page->first_page->_count) <= 0); + atomic_inc(&page->first_page->_count); + } } static inline struct page *virt_to_head_page(const void *x) diff --git a/mm/swap.c b/mm/swap.c index 3f4854205b1..33f5292fe13 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -56,17 +56,93 @@ static void __page_cache_release(struct page *page) del_page_from_lru(zone, page); spin_unlock_irqrestore(&zone->lru_lock, flags); } +} + +static void __put_single_page(struct page *page) +{ + __page_cache_release(page); free_hot_cold_page(page, 0); } -static void put_compound_page(struct page *page) +static void __put_compound_page(struct page *page) { - page = compound_head(page); - if (put_page_testzero(page)) { - compound_page_dtor *dtor; + compound_page_dtor *dtor; + + __page_cache_release(page); + dtor = get_compound_page_dtor(page); + (*dtor)(page); +} - dtor = get_compound_page_dtor(page); - (*dtor)(page); +static void put_compound_page(struct page *page) +{ + if (unlikely(PageTail(page))) { + /* __split_huge_page_refcount can run under us */ + struct page *page_head = page->first_page; + smp_rmb(); + /* + * If PageTail is still set after smp_rmb() we can be sure + * that the page->first_page we read wasn't a dangling pointer. + * See __split_huge_page_refcount() smp_wmb(). + */ + if (likely(PageTail(page) && get_page_unless_zero(page_head))) { + unsigned long flags; + /* + * Verify that our page_head wasn't converted + * to a a regular page before we got a + * reference on it. + */ + if (unlikely(!PageHead(page_head))) { + /* PageHead is cleared after PageTail */ + smp_rmb(); + VM_BUG_ON(PageTail(page)); + goto out_put_head; + } + /* + * Only run compound_lock on a valid PageHead, + * after having it pinned with + * get_page_unless_zero() above. + */ + smp_mb(); + /* page_head wasn't a dangling pointer */ + flags = compound_lock_irqsave(page_head); + if (unlikely(!PageTail(page))) { + /* __split_huge_page_refcount run before us */ + compound_unlock_irqrestore(page_head, flags); + VM_BUG_ON(PageHead(page_head)); + out_put_head: + if (put_page_testzero(page_head)) + __put_single_page(page_head); + out_put_single: + if (put_page_testzero(page)) + __put_single_page(page); + return; + } + VM_BUG_ON(page_head != page->first_page); + /* + * We can release the refcount taken by + * get_page_unless_zero now that + * split_huge_page_refcount is blocked on the + * compound_lock. 
+ */ + if (put_page_testzero(page_head)) + VM_BUG_ON(1); + /* __split_huge_page_refcount will wait now */ + VM_BUG_ON(atomic_read(&page->_count) <= 0); + atomic_dec(&page->_count); + VM_BUG_ON(atomic_read(&page_head->_count) <= 0); + compound_unlock_irqrestore(page_head, flags); + if (put_page_testzero(page_head)) + __put_compound_page(page_head); + } else { + /* page_head is a dangling pointer */ + VM_BUG_ON(PageTail(page)); + goto out_put_single; + } + } else if (put_page_testzero(page)) { + if (PageHead(page)) + __put_compound_page(page); + else + __put_single_page(page); } } @@ -75,7 +151,7 @@ void put_page(struct page *page) if (unlikely(PageCompound(page))) put_compound_page(page); else if (put_page_testzero(page)) - __page_cache_release(page); + __put_single_page(page); } EXPORT_SYMBOL(put_page); -- cgit v1.2.3-70-g09d2 From a95a82e96c48270980dd248ccd5546f1b49e6f8a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:33 -0800 Subject: thp: put_page: recheck PageHead after releasing the compound_lock After releasing the compound_lock split_huge_page can still run and release the page before put_page_testzero runs. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 33f5292fe13..e0eeef94088 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -131,8 +131,12 @@ static void put_compound_page(struct page *page) atomic_dec(&page->_count); VM_BUG_ON(atomic_read(&page_head->_count) <= 0); compound_unlock_irqrestore(page_head, flags); - if (put_page_testzero(page_head)) - __put_compound_page(page_head); + if (put_page_testzero(page_head)) { + if (PageHead(page_head)) + __put_compound_page(page_head); + else + __put_single_page(page_head); + } } else { /* page_head is a dangling pointer */ VM_BUG_ON(PageTail(page)); -- cgit v1.2.3-70-g09d2 From 8dd60a3a65c1b057bf0031d28436d3447a3c545b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:34 -0800 Subject: thp: clear compound mapping Clear compound mapping for anonymous compound pages like it already happens for regular anonymous pages. But crash if mapping is set for any tail page, also the PageAnon check is meaningless for tail pages. This check only makes sense for the head page, for tail page it can only hide bugs and we definitely don't want to hide bugs. 
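The page_alloc.c hunk below leans on one invariant: only the head page may legitimately carry the anon mapping, so it is enough to clear the head's mapping and let the ordinary per-page checks trip on any tail that still has one. A small invented-type sketch of that invariant (free_pages_check() here is a stand-in that merely flags a non-NULL mapping):

#include <stdio.h>
#include <stddef.h>

struct fake_page {
	void *mapping;	/* anon_vma pointer for anonymous pages */
	int anon;
};

static int free_pages_check(struct fake_page *p)
{
	return p->mapping != NULL;	/* a still-mapped page reaching free is a bug */
}

static int free_compound(struct fake_page *pages, int nr)
{
	int i, bad = 0;

	if (pages[0].anon)		/* only the head owns the anon mapping */
		pages[0].mapping = NULL;
	for (i = 0; i < nr; i++)	/* any mapped tail is reported, not hidden */
		bad += free_pages_check(&pages[i]);
	return bad;
}

int main(void)
{
	struct fake_page huge[4] = { { (void *)0x1, 1 } };	/* head mapped, tails clean */

	printf("bad pages: %d\n", free_compound(huge, 4));	/* prints 0 */
	return 0;
}

Set a mapping on huge[1] and the count becomes 1: the bug is surfaced, which is the crash-on-tail behaviour the commit message asks for, rather than silently cleared.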
Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 2a67c3bd403..8be81422d4b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -651,13 +651,10 @@ static bool free_pages_prepare(struct page *page, unsigned int order) trace_mm_page_free_direct(page, order); kmemcheck_free_shadow(page, order); - for (i = 0; i < (1 << order); i++) { - struct page *pg = page + i; - - if (PageAnon(pg)) - pg->mapping = NULL; - bad += free_pages_check(pg); - } + if (PageAnon(page)) + page->mapping = NULL; + for (i = 0; i < (1 << order); i++) + bad += free_pages_check(page + i); if (bad) return false; -- cgit v1.2.3-70-g09d2 From 14fd403f2146f740942d78af4e0ee59396ad8eab Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:37 -0800 Subject: thp: export maybe_mkwrite huge_memory.c needs it too when it fallbacks in copying hugepages into regular fragmented pages if hugepage allocation fails during COW. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 13 +++++++++++++ mm/memory.c | 13 ------------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index a2718e1ed58..6bef67d74ad 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -429,6 +429,19 @@ static inline void set_compound_order(struct page *page, unsigned long order) page[1].lru.prev = (void *)order; } +/* + * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when + * servicing faults for write access. In the normal case, do always want + * pte_mkwrite. But get_user_pages can cause write faults for mappings + * that do not have writing enabled, when used by access_process_vm. + */ +static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) +{ + if (likely(vma->vm_flags & VM_WRITE)) + pte = pte_mkwrite(pte); + return pte; +} + /* * Multiple processes may "see" the same page. E.g. for untouched * mappings of /dev/null, all processes see the same page full of diff --git a/mm/memory.c b/mm/memory.c index 1bbe9a22429..bdf19366b70 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2083,19 +2083,6 @@ static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd, return same; } -/* - * Do pte_mkwrite, but only if the vma says VM_WRITE. We do this when - * servicing faults for write access. In the normal case, do always want - * pte_mkwrite. But get_user_pages can cause write faults for mappings - * that do not have writing enabled, when used by access_process_vm. - */ -static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) -{ - if (likely(vma->vm_flags & VM_WRITE)) - pte = pte_mkwrite(pte); - return pte; -} - static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va, struct vm_area_struct *vma) { /* -- cgit v1.2.3-70-g09d2 From 59ff421631295cd54dbf75dcc53d27e84af6d9c0 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:38 -0800 Subject: thp: comment reminder in destroy_compound_page Warn destroy_compound_page that __split_huge_page_refcount is heavily dependent on its internal behavior. 
Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 8be81422d4b..6e62d5f9d40 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -357,6 +357,7 @@ void prep_compound_page(struct page *page, unsigned long order) } } +/* update __split_huge_page_refcount if you change this function */ static int destroy_compound_page(struct page *page, unsigned long order) { int i; -- cgit v1.2.3-70-g09d2 From 4c76d9d1fb9b21fa10c9e4c1fab2875018a88aa1 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:39 -0800 Subject: thp: CONFIG_TRANSPARENT_HUGEPAGE Add config option. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index c2c8a4a1189..3982be2d721 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -302,6 +302,20 @@ config NOMMU_INITIAL_TRIM_EXCESS See Documentation/nommu-mmap.txt for more information. +config TRANSPARENT_HUGEPAGE + bool "Transparent Hugepage Support" if EMBEDDED + depends on X86_64 && MMU + default y + help + Transparent Hugepages allows the kernel to use huge pages and + huge tlb transparently to the applications whenever possible. + This feature can improve computing performance to certain + applications by speeding up page faults during memory + allocation, by reducing the number of tlb misses and by speeding + up the pagetable walking. + + If memory constrained on embedded, you may want to say N. + # # UP and nommu archs use km based percpu allocator # -- cgit v1.2.3-70-g09d2 From e2cda322648122dc400c85ada80eaddbc612ef6a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:40 -0800 Subject: thp: add pmd mangling generic functions Some are needed to build but not actually used on archs not supporting transparent hugepages. Others like pmdp_clear_flush are used by x86 too. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 214 ++++++++++++++++++++++++++++++------------ mm/Makefile | 2 +- mm/pgtable-generic.c | 123 ++++++++++++++++++++++++ 3 files changed, 278 insertions(+), 61 deletions(-) create mode 100644 mm/pgtable-generic.c (limited to 'mm') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 0ab2cd27c60..f1eddf71dd0 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -5,67 +5,108 @@ #ifdef CONFIG_MMU #ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -/* - * Largely same as above, but only sets the access flags (dirty, - * accessed, and writable). Furthermore, we know it always gets set - * to a "more permissive" setting, which allows most architectures - * to optimize this. We return whether the PTE actually changed, which - * in turn instructs the caller to do things like update__mmu_cache. 
- * This used to be done in the caller, but sparc needs minor faults to - * force that call on sun4c so we changed this macro slightly - */ -#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \ -({ \ - int __changed = !pte_same(*(__ptep), __entry); \ - if (__changed) { \ - set_pte_at((__vma)->vm_mm, (__address), __ptep, __entry); \ - flush_tlb_page(__vma, __address); \ - } \ - __changed; \ -}) +extern int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty); +#endif + +#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +extern int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty); #endif #ifndef __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define ptep_test_and_clear_young(__vma, __address, __ptep) \ -({ \ - pte_t __pte = *(__ptep); \ - int r = 1; \ - if (!pte_young(__pte)) \ - r = 0; \ - else \ - set_pte_at((__vma)->vm_mm, (__address), \ - (__ptep), pte_mkold(__pte)); \ - r; \ -}) +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep) +{ + pte_t pte = *ptep; + int r = 1; + if (!pte_young(pte)) + r = 0; + else + set_pte_at(vma->vm_mm, address, ptep, pte_mkold(pte)); + return r; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + int r = 1; + if (!pmd_young(pmd)) + r = 0; + else + set_pmd_at(vma->vm_mm, address, pmdp, pmd_mkold(pmd)); + return r; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp) +{ + BUG(); + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH -#define ptep_clear_flush_young(__vma, __address, __ptep) \ -({ \ - int __young; \ - __young = ptep_test_and_clear_young(__vma, __address, __ptep); \ - if (__young) \ - flush_tlb_page(__vma, __address); \ - __young; \ -}) +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep); +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR -#define ptep_get_and_clear(__mm, __address, __ptep) \ -({ \ - pte_t __pte = *(__ptep); \ - pte_clear((__mm), (__address), (__ptep)); \ - __pte; \ +static inline pte_t ptep_get_and_clear(struct mm_struct *mm, + unsigned long address, + pte_t *ptep) +{ + pte_t pte = *ptep; + pte_clear(mm, address, ptep); + return pte; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_GET_AND_CLEAR +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, + unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + pmd_clear(mm, address, pmdp); + return pmd; }) +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, + unsigned long address, + pmd_t *pmdp) +{ + BUG(); + return __pmd(0); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL -#define ptep_get_and_clear_full(__mm, __address, __ptep, __full) \ -({ \ - pte_t __pte; \ - __pte = ptep_get_and_clear((__mm), (__address), (__ptep)); \ - __pte; \ -}) +static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, + unsigned long address, pte_t *ptep, 
+ int full) +{ + pte_t pte; + pte = ptep_get_and_clear(mm, address, ptep); + return pte; +} #endif /* @@ -74,20 +115,25 @@ * not present, or in the process of an address space destruction. */ #ifndef __HAVE_ARCH_PTE_CLEAR_NOT_PRESENT_FULL -#define pte_clear_not_present_full(__mm, __address, __ptep, __full) \ -do { \ - pte_clear((__mm), (__address), (__ptep)); \ -} while (0) +static inline void pte_clear_not_present_full(struct mm_struct *mm, + unsigned long address, + pte_t *ptep, + int full) +{ + pte_clear(mm, address, ptep); +} #endif #ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH -#define ptep_clear_flush(__vma, __address, __ptep) \ -({ \ - pte_t __pte; \ - __pte = ptep_get_and_clear((__vma)->vm_mm, __address, __ptep); \ - flush_tlb_page(__vma, __address); \ - __pte; \ -}) +extern pte_t ptep_clear_flush(struct vm_area_struct *vma, + unsigned long address, + pte_t *ptep); +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH +extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTEP_SET_WRPROTECT @@ -99,8 +145,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addres } #endif +#ifndef __HAVE_ARCH_PMDP_SET_WRPROTECT +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) +{ + pmd_t old_pmd = *pmdp; + set_pmd_at(mm, address, pmdp, pmd_wrprotect(old_pmd)); +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long address, pmd_t *pmdp) +{ + BUG(); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif + +#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH +extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp); +#endif + #ifndef __HAVE_ARCH_PTE_SAME -#define pte_same(A,B) (pte_val(A) == pte_val(B)) +static inline int pte_same(pte_t pte_a, pte_t pte_b) +{ + return pte_val(pte_a) == pte_val(pte_b); +} +#endif + +#ifndef __HAVE_ARCH_PMD_SAME +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + return pmd_val(pmd_a) == pmd_val(pmd_b); +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline int pmd_same(pmd_t pmd_a, pmd_t pmd_b) +{ + BUG(); + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PAGE_TEST_DIRTY @@ -357,6 +444,13 @@ static inline int pmd_trans_splitting(pmd_t pmd) { return 0; } +#ifndef __HAVE_ARCH_PMD_WRITE +static inline int pmd_write(pmd_t pmd) +{ + BUG(); + return 0; +} +#endif /* __HAVE_ARCH_PMD_WRITE */ #endif #endif /* !__ASSEMBLY__ */ diff --git a/mm/Makefile b/mm/Makefile index f73f75a29f8..380772a9ccb 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -5,7 +5,7 @@ mmu-y := nommu.o mmu-$(CONFIG_MMU) := fremap.o highmem.o madvise.o memory.o mincore.o \ mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \ - vmalloc.o pagewalk.o + vmalloc.o pagewalk.o pgtable-generic.o obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \ maccess.o page_alloc.o page-writeback.o \ diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c new file mode 100644 index 00000000000..d030548047e --- /dev/null +++ b/mm/pgtable-generic.c @@ -0,0 +1,123 @@ +/* + * mm/pgtable-generic.c + * + * Generic pgtable methods declared in asm-generic/pgtable.h + * + * Copyright (C) 2010 Linus Torvalds + */ + +#include +#include + +#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +/* + * Only sets the access flags (dirty, accessed, and + * writable). 
Furthermore, we know it always gets set to a "more + * permissive" setting, which allows most architectures to optimize + * this. We return whether the PTE actually changed, which in turn + * instructs the caller to do things like update__mmu_cache. This + * used to be done in the caller, but sparc needs minor faults to + * force that call on sun4c so we changed this macro slightly + */ +int ptep_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep, + pte_t entry, int dirty) +{ + int changed = !pte_same(*ptep, entry); + if (changed) { + set_pte_at(vma->vm_mm, address, ptep, entry); + flush_tlb_page(vma, address); + } + return changed; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + int changed = !pmd_same(*pmdp, entry); + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + if (changed) { + set_pmd_at(vma->vm_mm, address, pmdp, entry); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + } + return changed; +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ + BUG(); + return 0; +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} +#endif + +#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH +int ptep_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pte_t *ptep) +{ + int young; + young = ptep_test_and_clear_young(vma, address, ptep); + if (young) + flush_tlb_page(vma, address); + return young; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp) +{ + int young; +#ifndef CONFIG_TRANSPARENT_HUGEPAGE + BUG(); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + young = pmdp_test_and_clear_young(vma, address, pmdp); + if (young) + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return young; +} +#endif + +#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH +pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep) +{ + pte_t pte; + pte = ptep_get_and_clear((vma)->vm_mm, address, ptep); + flush_tlb_page(vma, address); + return pte; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH +pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ + pmd_t pmd; +#ifndef CONFIG_TRANSPARENT_HUGEPAGE + BUG(); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return pmd; +} +#endif + +#ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH +pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp) +{ +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + pmd_t pmd = pmd_mksplitting(*pmdp); + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + set_pmd_at(vma->vm_mm, address, pmdp, pmd); + /* tlb flush only to serialize against gup-fast */ + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ + BUG(); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +} +#endif -- cgit v1.2.3-70-g09d2 From 8ac1f8320a0073f28cf9e0491af4cd98f504f92a Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:43 -0800 Subject: thp: pte alloc trans splitting pte alloc routines must wait for split_huge_page if the pmd is not present and not null (i.e. pmd_trans_splitting). 
The additional branches are optimized away at compile time by pmd_trans_splitting if the config option is off. However we must pass the vma down in order to know the anon_vma lock to wait for. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/pgd.c | 2 +- arch/ia64/mm/hugetlbpage.c | 2 +- arch/sh/mm/hugetlbpage.c | 2 +- arch/sparc/mm/generic_32.c | 2 +- arch/sparc/mm/generic_64.c | 2 +- arch/sparc/mm/hugetlbpage.c | 2 +- arch/um/kernel/skas/mmu.c | 2 +- arch/x86/kernel/tboot.c | 2 +- include/linux/mm.h | 15 +++++++++------ mm/memory.c | 19 +++++++++++++------ mm/mremap.c | 8 +++++--- 11 files changed, 35 insertions(+), 23 deletions(-) (limited to 'mm') diff --git a/arch/arm/mm/pgd.c b/arch/arm/mm/pgd.c index 93292a18cf7..709244c66fa 100644 --- a/arch/arm/mm/pgd.c +++ b/arch/arm/mm/pgd.c @@ -50,7 +50,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) if (!new_pmd) goto no_pmd; - new_pte = pte_alloc_map(mm, new_pmd, 0); + new_pte = pte_alloc_map(mm, NULL, new_pmd, 0); if (!new_pte) goto no_pte; diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 1841ee7e65f..5ca674b7473 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -38,7 +38,7 @@ huge_pte_alloc(struct mm_struct *mm, unsigned long addr, unsigned long sz) if (pud) { pmd = pmd_alloc(mm, pud, taddr); if (pmd) - pte = pte_alloc_map(mm, pmd, taddr); + pte = pte_alloc_map(mm, NULL, pmd, taddr); } return pte; } diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 9163db3e8d1..d7762349ea4 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -35,7 +35,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, if (pud) { pmd = pmd_alloc(mm, pud, addr); if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, NULL, pmd, addr); } } diff --git a/arch/sparc/mm/generic_32.c b/arch/sparc/mm/generic_32.c index 5edcac184ea..e6067b75f11 100644 --- a/arch/sparc/mm/generic_32.c +++ b/arch/sparc/mm/generic_32.c @@ -50,7 +50,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned end = PGDIR_SIZE; offset -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, NULL, pmd, address); if (!pte) return -ENOMEM; io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space); diff --git a/arch/sparc/mm/generic_64.c b/arch/sparc/mm/generic_64.c index 04f2bf4cd57..3cb00dfd4bd 100644 --- a/arch/sparc/mm/generic_64.c +++ b/arch/sparc/mm/generic_64.c @@ -92,7 +92,7 @@ static inline int io_remap_pmd_range(struct mm_struct *mm, pmd_t * pmd, unsigned end = PGDIR_SIZE; offset -= address; do { - pte_t * pte = pte_alloc_map(mm, pmd, address); + pte_t *pte = pte_alloc_map(mm, NULL, pmd, address); if (!pte) return -ENOMEM; io_remap_pte_range(mm, pte, address, end - address, address + offset, prot, space); diff --git a/arch/sparc/mm/hugetlbpage.c b/arch/sparc/mm/hugetlbpage.c index 5fdddf134ca..f4e97646ce2 100644 --- a/arch/sparc/mm/hugetlbpage.c +++ b/arch/sparc/mm/hugetlbpage.c @@ -214,7 +214,7 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, if (pud) { pmd = pmd_alloc(mm, pud, addr); if (pmd) - pte = pte_alloc_map(mm, pmd, addr); + pte = pte_alloc_map(mm, NULL, pmd, addr); } return pte; } diff --git a/arch/um/kernel/skas/mmu.c b/arch/um/kernel/skas/mmu.c index 3d099f97478..1aee587e9c5 100644 --- a/arch/um/kernel/skas/mmu.c +++ b/arch/um/kernel/skas/mmu.c @@ 
-31,7 +31,7 @@ static int init_stub_pte(struct mm_struct *mm, unsigned long proc, if (!pmd) goto out_pmd; - pte = pte_alloc_map(mm, pmd, proc); + pte = pte_alloc_map(mm, NULL, pmd, proc); if (!pte) goto out_pte; diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index c2f1b26141e..998e972f3b1 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -133,7 +133,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn, pmd = pmd_alloc(&tboot_mm, pud, vaddr); if (!pmd) return -1; - pte = pte_alloc_map(&tboot_mm, pmd, vaddr); + pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr); if (!pte) return -1; set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); diff --git a/include/linux/mm.h b/include/linux/mm.h index 6bef67d74ad..14ddd98b063 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1131,7 +1131,8 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud, int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address); #endif -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address); +int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address); int __pte_alloc_kernel(pmd_t *pmd, unsigned long address); /* @@ -1200,16 +1201,18 @@ static inline void pgtable_page_dtor(struct page *page) pte_unmap(pte); \ } while (0) -#define pte_alloc_map(mm, pmd, address) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ - NULL: pte_offset_map(pmd, address)) +#define pte_alloc_map(mm, vma, pmd, address) \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, vma, \ + pmd, address))? \ + NULL: pte_offset_map(pmd, address)) #define pte_alloc_map_lock(mm, pmd, address, ptlp) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc(mm, pmd, address))? \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, NULL, \ + pmd, address))? \ NULL: pte_offset_map_lock(mm, pmd, address, ptlp)) #define pte_alloc_kernel(pmd, address) \ - ((unlikely(!pmd_present(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ + ((unlikely(pmd_none(*(pmd))) && __pte_alloc_kernel(pmd, address))? \ NULL: pte_offset_kernel(pmd, address)) extern void free_area_init(unsigned long * zones_size); diff --git a/mm/memory.c b/mm/memory.c index bdf19366b70..567bca80ea5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -394,9 +394,11 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, } } -int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma, + pmd_t *pmd, unsigned long address) { pgtable_t new = pte_alloc_one(mm, address); + int wait_split_huge_page; if (!new) return -ENOMEM; @@ -416,14 +418,18 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address) smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */ spin_lock(&mm->page_table_lock); - if (!pmd_present(*pmd)) { /* Has another populated it ? */ + wait_split_huge_page = 0; + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ mm->nr_ptes++; pmd_populate(mm, pmd, new); new = NULL; - } + } else if (unlikely(pmd_trans_splitting(*pmd))) + wait_split_huge_page = 1; spin_unlock(&mm->page_table_lock); if (new) pte_free(mm, new); + if (wait_split_huge_page) + wait_split_huge_page(vma->anon_vma, pmd); return 0; } @@ -436,10 +442,11 @@ int __pte_alloc_kernel(pmd_t *pmd, unsigned long address) smp_wmb(); /* See comment in __pte_alloc */ spin_lock(&init_mm.page_table_lock); - if (!pmd_present(*pmd)) { /* Has another populated it ? 
*/ + if (likely(pmd_none(*pmd))) { /* Has another populated it ? */ pmd_populate_kernel(&init_mm, pmd, new); new = NULL; - } + } else + VM_BUG_ON(pmd_trans_splitting(*pmd)); spin_unlock(&init_mm.page_table_lock); if (new) pte_free_kernel(&init_mm, new); @@ -3253,7 +3260,7 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd = pmd_alloc(mm, pud, address); if (!pmd) return VM_FAULT_OOM; - pte = pte_alloc_map(mm, pmd, address); + pte = pte_alloc_map(mm, vma, pmd, address); if (!pte) return VM_FAULT_OOM; diff --git a/mm/mremap.c b/mm/mremap.c index 563fbdd6293..b09eefaea0b 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -47,7 +47,8 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) return pmd; } -static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) +static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long addr) { pgd_t *pgd; pud_t *pud; @@ -62,7 +63,8 @@ static pmd_t *alloc_new_pmd(struct mm_struct *mm, unsigned long addr) if (!pmd) return NULL; - if (!pmd_present(*pmd) && __pte_alloc(mm, pmd, addr)) + VM_BUG_ON(pmd_trans_huge(*pmd)); + if (pmd_none(*pmd) && __pte_alloc(mm, vma, pmd, addr)) return NULL; return pmd; @@ -147,7 +149,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma, old_pmd = get_old_pmd(vma->vm_mm, old_addr); if (!old_pmd) continue; - new_pmd = alloc_new_pmd(vma->vm_mm, new_addr); + new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr); if (!new_pmd) break; next = (new_addr + PMD_SIZE) & PMD_MASK; -- cgit v1.2.3-70-g09d2 From bae9c19bf12bb2a914a8e530270f41d36cc87c63 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:46 -0800 Subject: thp: split_huge_page_mm/vma split_huge_page_pmd compat code. Each one of those call sites would need to be expanded into hundreds of lines of complex code without a fully reliable split_huge_page_pmd design.
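The compat side is plausibly nothing more than a build-time shim, so every caller in the hunks below can invoke it unconditionally. A sketch of the idea (not the exact header text):

/* Sketch: split_huge_page_pmd() as the callers below are assumed to see it */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd);
#define split_huge_page_pmd(mm, pmd)				\
	do {							\
		if (unlikely(pmd_trans_huge(*(pmd))))		\
			__split_huge_page_pmd(mm, pmd);		\
	} while (0)
#else
#define split_huge_page_pmd(mm, pmd) do { } while (0)	/* compiles away */
#endif

Each call site then stays a one-liner: split first, and the usual pmd_none_or_clear_bad() walk proceeds over regular page tables.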
Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/vm86_32.c | 1 + mm/mempolicy.c | 1 + mm/mincore.c | 1 + mm/mprotect.c | 1 + mm/mremap.c | 1 + mm/pagewalk.c | 1 + 6 files changed, 6 insertions(+) (limited to 'mm') diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index 61fb9851962..863f8753ab0 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm) if (pud_none_or_clear_bad(pud)) goto out; pmd = pmd_offset(pud, 0xA0000); + split_huge_page_pmd(mm, pmd); if (pmd_none_or_clear_bad(pmd)) goto out; pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e6d351265ae..83b7df309fc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -514,6 +514,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + split_huge_page_pmd(vma->vm_mm, pmd); if (pmd_none_or_clear_bad(pmd)) continue; if (check_pte_range(vma, pmd, addr, next, nodes, diff --git a/mm/mincore.c b/mm/mincore.c index 9ac42dc6d7b..9959bb41570 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -154,6 +154,7 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + split_huge_page_pmd(vma->vm_mm, pmd); if (pmd_none_or_clear_bad(pmd)) mincore_unmapped_range(vma, addr, next, vec); else diff --git a/mm/mprotect.c b/mm/mprotect.c index 4c513387309..bd27db6b992 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -88,6 +88,7 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + split_huge_page_pmd(mm, pmd); if (pmd_none_or_clear_bad(pmd)) continue; change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); diff --git a/mm/mremap.c b/mm/mremap.c index b09eefaea0b..9925b6391b8 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -41,6 +41,7 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) return NULL; pmd = pmd_offset(pud, addr); + split_huge_page_pmd(mm, pmd); if (pmd_none_or_clear_bad(pmd)) return NULL; diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 38cc58b8b2b..7cfa6ae0230 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -34,6 +34,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + split_huge_page_pmd(walk->mm, pmd); if (pmd_none_or_clear_bad(pmd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); -- cgit v1.2.3-70-g09d2 From 3f04f62f90d46a82dd73027c5fd7a15daed5c33d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:47 -0800 Subject: thp: split_huge_page paging Paging logic that splits the page before it is unmapped and added to swap to ensure backwards compatibility with the legacy swap code. Eventually swap should natively pageout the hugepages to increase performance and decrease seeking and fragmentation of swap space. swapoff can just skip over huge pmd as they cannot be part of swap yet. In add_to_swap be careful to split the page only if we got a valid swap entry so we don't split hugepages with a full swap. 
In theory we could split pages before isolating them during the lru scan, but for khugepaged to be safe, I'm relying on either mmap_sem write mode, or PG_lock taken, so split_huge_page has to run either with mmap_sem read/write mode or PG_lock taken. Calling it from isolate_lru_page would make locking more complicated, in addition to that split_huge_page would deadlock if called by __isolate_lru_page because it has to take the lru lock to add the tail pages. Signed-off-by: Andrea Arcangeli Acked-by: Mel Gorman Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 ++ mm/rmap.c | 1 + mm/swap_state.c | 6 ++++++ mm/swapfile.c | 2 ++ 4 files changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2323a8039a9..6a283cc9317 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -386,6 +386,8 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct task_struct *tsk; struct anon_vma *av; + if (unlikely(split_huge_page(page))) + return; read_lock(&tasklist_lock); av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ diff --git a/mm/rmap.c b/mm/rmap.c index c95d2ba27a0..a3197a8a295 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1400,6 +1400,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) int ret; BUG_ON(!PageLocked(page)); + BUG_ON(PageTransHuge(page)); if (unlikely(PageKsm(page))) ret = try_to_unmap_ksm(page, flags); diff --git a/mm/swap_state.c b/mm/swap_state.c index e10f5833167..5c8cfabbc9b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -157,6 +157,12 @@ int add_to_swap(struct page *page) if (!entry.val) return 0; + if (unlikely(PageTransHuge(page))) + if (unlikely(split_huge_page(page))) { + swapcache_free(entry, NULL); + return 0; + } + /* * Radix-tree node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. __GFP_NOMEMALLOC diff --git a/mm/swapfile.c b/mm/swapfile.c index b6adcfbf6f4..07a458d72fa 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -964,6 +964,8 @@ static inline int unuse_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (unlikely(pmd_trans_huge(*pmd))) + continue; if (pmd_none_or_clear_bad(pmd)) continue; ret = unuse_pte_range(vma, pmd, addr, next, entry, page); -- cgit v1.2.3-70-g09d2 From 47ad8475c000141eacb3ecda5e5ce4b43a9cd04d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:47 -0800 Subject: thp: clear_copy_huge_page Move the copy/clear_huge_page functions to common code to share between hugetlb.c and huge_memory.c. 
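Once the helpers are shared, both fault paths pass a plain subpage count instead of an hstate. Roughly, the expected call shapes look as follows, where HPAGE_PMD_NR is assumed to be the THP side's name for the number of subpages in a pmd-sized page:

	/* hugetlbfs fault path: size comes from the hstate */
	clear_huge_page(page, address, pages_per_huge_page(h));

	/* THP fault paths (huge_memory.c, later in this series) */
	clear_huge_page(page, haddr, HPAGE_PMD_NR);
	copy_user_huge_page(new_page, page, address, vma, HPAGE_PMD_NR);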
Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 +++++++ mm/hugetlb.c | 70 +++-------------------------------------------------- mm/memory.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 83 insertions(+), 67 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 14ddd98b063..cc6ab1038f6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1589,5 +1589,14 @@ static inline int is_hwpoison_address(unsigned long addr) extern void dump_page(struct page *page); +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +extern void clear_huge_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page); +extern void copy_user_huge_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma, + unsigned int pages_per_huge_page); +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 85855240933..7bf223d6677 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -394,71 +394,6 @@ static int vma_has_reserves(struct vm_area_struct *vma) return 0; } -static void clear_gigantic_page(struct page *page, - unsigned long addr, unsigned long sz) -{ - int i; - struct page *p = page; - - might_sleep(); - for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) { - cond_resched(); - clear_user_highpage(p, addr + i * PAGE_SIZE); - } -} -static void clear_huge_page(struct page *page, - unsigned long addr, unsigned long sz) -{ - int i; - - if (unlikely(sz/PAGE_SIZE > MAX_ORDER_NR_PAGES)) { - clear_gigantic_page(page, addr, sz); - return; - } - - might_sleep(); - for (i = 0; i < sz/PAGE_SIZE; i++) { - cond_resched(); - clear_user_highpage(page + i, addr + i * PAGE_SIZE); - } -} - -static void copy_user_gigantic_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma) -{ - int i; - struct hstate *h = hstate_vma(vma); - struct page *dst_base = dst; - struct page *src_base = src; - - for (i = 0; i < pages_per_huge_page(h); ) { - cond_resched(); - copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); - } -} - -static void copy_user_huge_page(struct page *dst, struct page *src, - unsigned long addr, struct vm_area_struct *vma) -{ - int i; - struct hstate *h = hstate_vma(vma); - - if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { - copy_user_gigantic_page(dst, src, addr, vma); - return; - } - - might_sleep(); - for (i = 0; i < pages_per_huge_page(h); i++) { - cond_resched(); - copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); - } -} - static void copy_gigantic_page(struct page *dst, struct page *src) { int i; @@ -2454,7 +2389,8 @@ retry_avoidcopy: return VM_FAULT_OOM; } - copy_user_huge_page(new_page, old_page, address, vma); + copy_user_huge_page(new_page, old_page, address, vma, + pages_per_huge_page(h)); __SetPageUptodate(new_page); /* @@ -2558,7 +2494,7 @@ retry: ret = -PTR_ERR(page); goto out; } - clear_huge_page(page, address, huge_page_size(h)); + clear_huge_page(page, address, pages_per_huge_page(h)); __SetPageUptodate(page); if (vma->vm_flags & VM_MAYSHARE) { diff --git a/mm/memory.c b/mm/memory.c index 567bca80ea5..60e1c68d821 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3645,3 +3645,74 @@ void might_fault(void) } 
EXPORT_SYMBOL(might_fault); #endif + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) || defined(CONFIG_HUGETLBFS) +static void clear_gigantic_page(struct page *page, + unsigned long addr, + unsigned int pages_per_huge_page) +{ + int i; + struct page *p = page; + + might_sleep(); + for (i = 0; i < pages_per_huge_page; + i++, p = mem_map_next(p, page, i)) { + cond_resched(); + clear_user_highpage(p, addr + i * PAGE_SIZE); + } +} +void clear_huge_page(struct page *page, + unsigned long addr, unsigned int pages_per_huge_page) +{ + int i; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + clear_gigantic_page(page, addr, pages_per_huge_page); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + cond_resched(); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); + } +} + +static void copy_user_gigantic_page(struct page *dst, struct page *src, + unsigned long addr, + struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + struct page *dst_base = dst; + struct page *src_base = src; + + for (i = 0; i < pages_per_huge_page; ) { + cond_resched(); + copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} + +void copy_user_huge_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma, + unsigned int pages_per_huge_page) +{ + int i; + + if (unlikely(pages_per_huge_page > MAX_ORDER_NR_PAGES)) { + copy_user_gigantic_page(dst, src, addr, vma, + pages_per_huge_page); + return; + } + + might_sleep(); + for (i = 0; i < pages_per_huge_page; i++) { + cond_resched(); + copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma); + } +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE || CONFIG_HUGETLBFS */ -- cgit v1.2.3-70-g09d2 From 32dba98e085f8b2b4345887df9abf5e0e93bfc12 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:49 -0800 Subject: thp: _GFP_NO_KSWAPD Transparent hugepage allocations must be allowed not to invoke kswapd or any other kind of indirect reclaim (especially when the defrag sysfs is control disabled). It's unacceptable to swap out anonymous pages (potentially anonymous transparent hugepages) in order to create new transparent hugepages. This is true for the MADV_HUGEPAGE areas too (swapping out a kvm virtual machine and so having it suffer an unbearable slowdown, so another one with guest physical memory marked MADV_HUGEPAGE can run 30% faster if it is running memory intensive workloads, makes no sense). If a transparent hugepage allocation fails the slowdown is minor and there is total fallback, so kswapd should never be asked to swapout memory to allow the high order allocation to succeed. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 5 ++++- mm/page_alloc.c | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f54adfcbec9..49d2356bb82 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -34,6 +34,7 @@ struct vm_area_struct; #else #define ___GFP_NOTRACK 0 #endif +#define ___GFP_NO_KSWAPD 0x400000u /* * GFP bitmasks.. 
@@ -81,13 +82,15 @@ struct vm_area_struct; #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */ #define __GFP_NOTRACK ((__force gfp_t)___GFP_NOTRACK) /* Don't track with kmemcheck */ +#define __GFP_NO_KSWAPD ((__force gfp_t)___GFP_NO_KSWAPD) + /* * This may seem redundant, but it's a way of annotating false positives vs. * allocations that simply cannot be supported (e.g. page tables). */ #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK) -#define __GFP_BITS_SHIFT 22 /* Room for 22 __GFP_FOO bits */ +#define __GFP_BITS_SHIFT 23 /* Room for 23 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* This equals 0, but use constants in case they ever change */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6e62d5f9d40..bbd0423f282 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2027,7 +2027,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, goto nopage; restart: - wake_all_kswapd(order, zonelist, high_zoneidx, + if (!(gfp_mask & __GFP_NO_KSWAPD)) + wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(preferred_zone)); /* -- cgit v1.2.3-70-g09d2 From 5c3240d92e29ae7bfb9cb58a9b37e80ab40894ff Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:49 -0800 Subject: thp: don't alloc harder for gfp nomemalloc even if nowait Not worth throwing away the precious reserved free memory pool for allocations that can fail gracefully (either through mempool or because they're transhuge allocations later falling back to 4k allocations). Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Reviewed-by: KOSAKI Motohiro Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bbd0423f282..e7664b9f706 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1971,7 +1971,12 @@ gfp_to_alloc_flags(gfp_t gfp_mask) alloc_flags |= (__force int) (gfp_mask & __GFP_HIGH); if (!wait) { - alloc_flags |= ALLOC_HARDER; + /* + * Not worth trying to allocate harder for + * __GFP_NOMEMALLOC even if it can't schedule. + */ + if (!(gfp_mask & __GFP_NOMEMALLOC)) + alloc_flags |= ALLOC_HARDER; /* * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc. * See also cpuset_zone_allowed() comment in kernel/cpuset.c. -- cgit v1.2.3-70-g09d2 From 71e3aac0724ffe8918992d76acfe3aad7d8724a5 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:52 -0800 Subject: thp: transparent hugepage core Lately I've been working to make KVM use hugepages transparently without the usual restrictions of hugetlbfs. Some of the restrictions I'd like to see removed:

1) hugepages have to be swappable or the guest physical memory remains locked in RAM and can't be paged out to swap

2) if a hugepage allocation fails, regular pages should be allocated instead and mixed in the same vma without any failure and without userland noticing (see the sketch after this list)

3) if some task quits and more hugepages become available in the buddy, guest physical memory backed by regular pages should be relocated onto hugepages automatically, in regions under madvise(MADV_HUGEPAGE) (ideally event driven, by waking up the kernel daemon if the order=HPAGE_PMD_SHIFT-PAGE_SHIFT list becomes non-empty)

4) avoidance of reservation and maximization of use of hugepages whenever possible
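As a userspace illustration of point 2, here is a minimal sketch (not part of the patch): the program behaves identically whether the kernel installed a huge pmd or fell back to 4k ptes; only an informational field such as AnonHugePages in /proc/self/smaps (an assumption here, exported by kernels of roughly this era onward) reveals which happened.

============
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
	size_t len = 16UL << 20;	/* 16M anonymous mapping */
	char line[256];
	FILE *smaps;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	memset(p, 0, len);	/* fault the region in, huge or 4k */

	/* AnonHugePages is assumed; if the field is absent this prints nothing */
	smaps = fopen("/proc/self/smaps", "r");
	if (smaps) {
		while (fgets(line, sizeof(line), smaps))
			if (!strncmp(line, "AnonHugePages:", 14))
				fputs(line, stdout);
		fclose(smaps);
	}
	return 0;
}
============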
Reservation (needed to avoid runtime fatal failures) may be ok for 1 machine with 1 database with 1 database cache with 1 database cache size known at boot time. It's definitely not feasible with a virtualization hypervisor usage like RHEV-H that runs an unknown number of virtual machines with an unknown size of each virtual machine with an unknown amount of pagecache that could be potentially useful in the host for guests not using O_DIRECT (aka cache=off).

Hugepages in the virtualization hypervisor (and also in the guest!) are much more important than in a regular host not using virtualization, because with NPT/EPT they decrease the tlb-miss cacheline accesses from 24 to 19 in case only the hypervisor uses transparent hugepages, and from 19 to 15 in case both the Linux hypervisor and the Linux guest use this patch (though the guest will limit the additional speedup to anonymous regions only for now...). Even more important is that the tlb miss handler is much slower on an NPT/EPT guest than for regular shadow paging or a no-virtualization scenario. So maximizing the amount of virtual memory cached by the TLB pays off significantly more with NPT/EPT than without (even if there were no significant speedup in the tlb-miss runtime itself).

The first (and more tedious) part of this work requires allowing the VM to handle anonymous hugepages mixed with regular pages transparently on regular anonymous vmas. This is what this patch tries to achieve in the least intrusive possible way. We want hugepages and hugetlb to be used in a way so that all applications can benefit without changes (as usual we leverage the KVM virtualization design: by improving the Linux VM at large, KVM gets the performance boost too).

The most important design choice is: always fall back to a 4k allocation if the hugepage allocation fails! This is the _very_ opposite of some large pagecache patches that failed with -EIO back then if a 64k (or similar) allocation failed...

The second important decision (to reduce the impact of the feature on the existing pagetable handling code) is that at any time we can split a hugepage into 512 regular pages, and it has to be done with an operation that can't fail. This way the reliability of the swapping isn't decreased (no need to allocate memory when we are short on memory to swap) and it's trivial to plug a split_huge_page* one-liner where needed without polluting the VM. Over time we can teach mprotect, mremap and friends to handle pmd_trans_huge natively without calling split_huge_page*. The fact it can't fail isn't just for swap: if split_huge_page could return -ENOMEM (instead of the current void) we'd need to roll back the mprotect from the middle of it (ideally including undoing the split_vma), which would be a big change and in the very wrong direction (it'd likely be simpler not to call split_huge_page at all and to teach mprotect and friends to handle hugepages instead of rolling them back from the middle). In short, the very value of split_huge_page is that it can't fail.

The collapsing and madvise(MADV_HUGEPAGE) part will remain separate and incremental, and it'll just be a "harmless" addition later if this initial part is agreed upon. It also should be noted that, locking-wise, replacing regular pages with hugepages is going to be very easy compared to what I'm doing below in split_huge_page, as it will only happen when page_count(page) matches page_mapcount(page) and we can take the PG_lock and mmap_sem in write mode.
collapse_huge_page will be a "best effort" that (unlike split_huge_page) can fail at the first sign of trouble, and we can try again later. collapse_huge_page will be similar to how KSM works, and madvise(MADV_HUGEPAGE) will work similarly to madvise(MADV_MERGEABLE).

The default I like is that transparent hugepages are used at page fault time. This can be changed with /sys/kernel/mm/transparent_hugepage/enabled. The control knob can be set to three values, "always", "madvise", "never", which mean respectively that hugepages are always used, only used inside madvise(MADV_HUGEPAGE) regions, or never used. /sys/kernel/mm/transparent_hugepage/defrag instead controls whether the hugepage allocation should defrag memory aggressively "always", only inside "madvise" regions, or "never".

The pmd_trans_splitting/pmd_trans_huge locking is very solid. The put_page (from get_user_page users that can't use the mmu notifier, like O_DIRECT) that runs against __split_huge_page_refcount instead was a pain to serialize in a way that would always result in a coherent page count for both tail and head. I think my locking solution, with a compound_lock taken only after the page_first is valid and is still a PageHead, should be safe, but it surely needs review from an SMP race point of view. In short there is no existing way to serialize the O_DIRECT final put_page against __split_huge_page_refcount, so I had to invent a new one (O_DIRECT loses knowledge of the mapping status by the time gup_fast returns, so...). And I didn't want to impact all gup/gup_fast users for now; maybe if we change the gup interface substantially we can avoid this locking. I admit I didn't think too much about it, because changing the gup unpinning interface would be invasive.

If we ignored O_DIRECT we could stick to the existing compound refcounting code, by simply adding a get_user_pages_fast_flags(foll_flags) where KVM (and any other mmu notifier user) would call it without FOLL_GET (and if FOLL_GET isn't set we'd just BUG_ON if nobody registered itself in the current task mmu notifier list yet). But O_DIRECT is fundamental for decent performance of virtualized I/O on fast storage, so we can't avoid it if we want to solve the race of put_page against __split_huge_page_refcount and achieve a complete hugepage feature for KVM.

Swap and oom work fine (well, just like with regular pages ;). MMU notifier is handled transparently too, with the exception of the young bit on the pmd, which didn't have a range check, but I think KVM will be fine because the whole point of hugepages is that EPT/NPT will also use a huge pmd when they notice gup returns pages with PageCompound set, so they won't care about a range and there's just the pmd young bit to check in that case.

NOTE: in some cases, if the L2 cache is small, this may slow down and waste memory during COWs because 4M of memory are accessed in a single fault instead of 8k (the payoff is that after COW the program can run faster). So we might want to switch copy_huge_page (and clear_huge_page too) to non-temporal stores.
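As an aside on the sysfs interface documented above, a minimal userspace sketch (the helper name thp_set is mine, not the kernel's; the paths and values are the ones described in this message) that restricts hugepages to madvise regions for both knobs:

============
#include <stdio.h>

/* Write one of "always", "madvise", "never" to a THP sysfs knob. */
static int thp_set(const char *knob, const char *policy)
{
	char path[128];
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/kernel/mm/transparent_hugepage/%s", knob);
	f = fopen(path, "w");
	if (!f)
		return -1;	/* knob missing or insufficient privilege */
	fprintf(f, "%s\n", policy);
	return fclose(f) ? -1 : 0;
}

int main(void)
{
	/* restrict hugepages to madvise(MADV_HUGEPAGE) regions only */
	return thp_set("enabled", "madvise") || thp_set("defrag", "madvise");
}
============

Run as root; on kernels without this series the knobs simply don't exist and the helper fails gracefully.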
I also extensively researched ways to avoid this COW cache thrashing with a full prefault logic that would cow in 8k/16k/32k/64k up to 1M (I can send those patches that fully implemented prefault), but I concluded they're not worth it: they add huge additional complexity and they remove all tlb benefits until the full hugepage has been faulted in, to save a little bit of memory and some cache during app startup; and they still don't substantially improve the cache thrashing during startup if the prefault happens in >4k chunks. One reason is that those 4k pte entries copied are still mapped on a perfectly cache-colored hugepage, so the thrashing is the worst one can generate in those copies (cow of 4k page copies aren't so well colored, so they thrash less, but again this results in software running faster after the page fault). Those prefault patches allowed things like a pte where post-cow pages were local 4k regular anon pages and the not-yet-cowed pte entries were pointing in the middle of some hugepage mapped read-only. If it doesn't pay off substantially with today's hardware it will pay off even less in the future with larger l2 caches, and the prefault logic would bloat the VM a lot.

For embedded systems, transparent_hugepage can be disabled during boot with sysfs or with the boot commandline parameter transparent_hugepage=0 (or transparent_hugepage=2 to restrict hugepages to madvise regions); that will ensure not a single hugepage is allocated at boot time. It is simple enough to just disable transparent hugepage globally and let transparent hugepages be allocated selectively by applications in MADV_HUGEPAGE regions (both at page fault time, and, if enabled, with collapse_huge_page too through the kernel daemon).

This patch supports only hugepages mapped in the pmd; archs that have smaller hugepages will not fit in this patch alone. Also some archs like power have certain tlb limits that prevent mixing different page sizes in the same regions, so they will not fit in this framework, which requires "graceful fallback" to basic PAGE_SIZE in case of physical memory fragmentation. hugetlbfs remains a perfect fit for those because its software limits happen to match the hardware limits. hugetlbfs also remains a perfect fit for hugepage sizes like 1GByte that cannot be hoped to be found unfragmented after a certain system uptime and that would be very expensive to defragment with relocation, so requiring reservation. hugetlbfs is the "reservation way"; the point of transparent hugepages is not to have any reservation at all and to maximize the use of cache and hugepages at all times automatically.
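To show what the selective MADV_HUGEPAGE usage looks like from an application, a minimal sketch (assuming the madvise(MADV_HUGEPAGE) support added later in this series; the fallback constant value is an assumption for headers that predate it):

============
#include <string.h>
#include <sys/mman.h>

#ifndef MADV_HUGEPAGE
#define MADV_HUGEPAGE 14	/* assumed value from this patch era */
#endif

int main(void)
{
	size_t len = 32UL << 20;	/* 32M region */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	/* advisory: the region may still end up backed by 4k pages */
	madvise(p, len, MADV_HUGEPAGE);
	memset(p, 0, len);	/* fault it in */
	return 0;
}
============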
Some performance results:

vmx andrea # LD_PRELOAD=/usr/lib64/libhugetlbfs.so HUGETLB_MORECORE=yes HUGETLB_PATH=/mnt/huge/ ./largepages3
memset page fault 1566023
memset tlb miss 453854
memset second tlb miss 453321
random access tlb miss 41635
random access second tlb miss 41658

vmx andrea # LD_PRELOAD=/usr/lib64/libhugetlbfs.so HUGETLB_MORECORE=yes HUGETLB_PATH=/mnt/huge/ ./largepages3
memset page fault 1566471
memset tlb miss 453375
memset second tlb miss 453320
random access tlb miss 41636
random access second tlb miss 41637

vmx andrea # ./largepages3
memset page fault 1566642
memset tlb miss 453417
memset second tlb miss 453313
random access tlb miss 41630
random access second tlb miss 41647

vmx andrea # ./largepages3
memset page fault 1566872
memset tlb miss 453418
memset second tlb miss 453315
random access tlb miss 41618
random access second tlb miss 41659

vmx andrea # echo 0 > /proc/sys/vm/transparent_hugepage
vmx andrea # ./largepages3
memset page fault 2182476
memset tlb miss 460305
memset second tlb miss 460179
random access tlb miss 44483
random access second tlb miss 44186

vmx andrea # ./largepages3
memset page fault 2182791
memset tlb miss 460742
memset second tlb miss 459962
random access tlb miss 43981
random access second tlb miss 43988

============
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>

#define SIZE (3UL*1024*1024*1024)

int main()
{
	char *p = malloc(SIZE), *p2;
	struct timeval before, after;

	gettimeofday(&before, NULL);
	memset(p, 0, SIZE);
	gettimeofday(&after, NULL);
	printf("memset page fault %Lu\n",
	       (after.tv_sec-before.tv_sec)*1000000UL +
	       after.tv_usec-before.tv_usec);

	gettimeofday(&before, NULL);
	memset(p, 0, SIZE);
	gettimeofday(&after, NULL);
	printf("memset tlb miss %Lu\n",
	       (after.tv_sec-before.tv_sec)*1000000UL +
	       after.tv_usec-before.tv_usec);

	gettimeofday(&before, NULL);
	memset(p, 0, SIZE);
	gettimeofday(&after, NULL);
	printf("memset second tlb miss %Lu\n",
	       (after.tv_sec-before.tv_sec)*1000000UL +
	       after.tv_usec-before.tv_usec);

	gettimeofday(&before, NULL);
	for (p2 = p; p2 < p+SIZE; p2 += 4096)
		*p2 = 0;
	gettimeofday(&after, NULL);
	printf("random access tlb miss %Lu\n",
	       (after.tv_sec-before.tv_sec)*1000000UL +
	       after.tv_usec-before.tv_usec);

	gettimeofday(&before, NULL);
	for (p2 = p; p2 < p+SIZE; p2 += 4096)
		*p2 = 0;
	gettimeofday(&after, NULL);
	printf("random access second tlb miss %Lu\n",
	       (after.tv_sec-before.tv_sec)*1000000UL +
	       after.tv_usec-before.tv_usec);
	return 0;
}
============

Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable_64.h | 5 + include/linux/gfp.h | 3 + include/linux/huge_mm.h | 118 +++++ include/linux/mm.h | 4 + include/linux/mm_inline.h | 11 +- include/linux/page-flags.h | 21 + include/linux/rmap.h | 2 + include/linux/swap.h | 2 + mm/Makefile | 1 + mm/huge_memory.c | 901 ++++++++++++++++++++++++++++++++++++++ mm/internal.h | 4 + mm/memory.c | 84 +++- mm/rmap.c | 62 ++- mm/swap.c | 37 ++ 14 files changed, 1220 insertions(+), 35 deletions(-) create mode 100644 include/linux/huge_mm.h create mode 100644 mm/huge_memory.c (limited to 'mm') diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 1fb61a74b2e..b2df039a411 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -286,6 +286,11 @@ static inline pmd_t pmd_mkwrite(pmd_t pmd) return pmd_set_flags(pmd, _PAGE_RW); } +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_PRESENT);
+} + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 49d2356bb82..d95082cc6f4 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -109,6 +109,9 @@ struct vm_area_struct; __GFP_HARDWALL | __GFP_HIGHMEM | \ __GFP_MOVABLE) #define GFP_IOFS (__GFP_IO | __GFP_FS) +#define GFP_TRANSHUGE (GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN | \ + __GFP_NO_KSWAPD) #ifdef CONFIG_NUMA #define GFP_THISNODE (__GFP_THISNODE | __GFP_NOWARN | __GFP_NORETRY) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h new file mode 100644 index 00000000000..9301824c749 --- /dev/null +++ b/include/linux/huge_mm.h @@ -0,0 +1,118 @@ +#ifndef _LINUX_HUGE_MM_H +#define _LINUX_HUGE_MM_H + +extern int do_huge_pmd_anonymous_page(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + unsigned int flags); +extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, + struct vm_area_struct *vma); +extern int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, + pmd_t orig_pmd); +extern pgtable_t get_pmd_huge_pte(struct mm_struct *mm); +extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, + unsigned long addr, + pmd_t *pmd, + unsigned int flags); +extern int zap_huge_pmd(struct mmu_gather *tlb, + struct vm_area_struct *vma, + pmd_t *pmd); + +enum transparent_hugepage_flag { + TRANSPARENT_HUGEPAGE_FLAG, + TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, +#ifdef CONFIG_DEBUG_VM + TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, +#endif +}; + +enum page_check_address_pmd_flag { + PAGE_CHECK_ADDRESS_PMD_FLAG, + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG, + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG, +}; +extern pmd_t *page_check_address_pmd(struct page *page, + struct mm_struct *mm, + unsigned long address, + enum page_check_address_pmd_flag flag); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +#define HPAGE_PMD_SHIFT HPAGE_SHIFT +#define HPAGE_PMD_MASK HPAGE_MASK +#define HPAGE_PMD_SIZE HPAGE_SIZE + +#define transparent_hugepage_enabled(__vma) \ + (transparent_hugepage_flags & (1<vm_flags & VM_HUGEPAGE)) +#define transparent_hugepage_defrag(__vma) \ + ((transparent_hugepage_flags & \ + (1<vm_flags & VM_HUGEPAGE)) +#ifdef CONFIG_DEBUG_VM +#define transparent_hugepage_debug_cow() \ + (transparent_hugepage_flags & \ + (1<root->lock); \ + /* \ + * spin_unlock_wait() is just a loop in C and so the \ + * CPU can reorder anything around it. 
\ + */ \ + smp_mb(); \ + BUG_ON(pmd_trans_splitting(*____pmd) || \ + pmd_trans_huge(*____pmd)); \ + } while (0) +#define HPAGE_PMD_ORDER (HPAGE_PMD_SHIFT-PAGE_SHIFT) +#define HPAGE_PMD_NR (1< MAX_ORDER +#error "hugepages can't be allocated by the buddy allocator" +#endif +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +#define HPAGE_PMD_SHIFT ({ BUG(); 0; }) +#define HPAGE_PMD_MASK ({ BUG(); 0; }) +#define HPAGE_PMD_SIZE ({ BUG(); 0; }) + +#define transparent_hugepage_enabled(__vma) 0 + +#define transparent_hugepage_flags 0UL +static inline int split_huge_page(struct page *page) +{ + return 0; +} +#define split_huge_page_pmd(__mm, __pmd) \ + do { } while (0) +#define wait_split_huge_page(__anon_vma, __pmd) \ + do { } while (0) +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#endif /* _LINUX_HUGE_MM_H */ diff --git a/include/linux/mm.h b/include/linux/mm.h index cc6ab1038f6..78adec4ba9f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -111,6 +111,9 @@ extern unsigned int kobjsize(const void *objp); #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ +#if BITS_PER_LONG > 32 +#define VM_HUGEPAGE 0x100000000UL /* MADV_HUGEPAGE marked this vma */ +#endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) @@ -243,6 +246,7 @@ struct inode; * files which need it (119 of them) */ #include +#include /* * Methods to modify the page usage count. diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 8835b877b8d..650f31eabdb 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -20,13 +20,20 @@ static inline int page_is_file_cache(struct page *page) } static inline void -add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) +__add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, + struct list_head *head) { - list_add(&page->lru, &zone->lru[l].list); + list_add(&page->lru, head); __inc_zone_state(zone, NR_LRU_BASE + l); mem_cgroup_add_lru_list(page, l); } +static inline void +add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l) +{ + __add_page_to_lru_list(zone, page, l, &zone->lru[l].list); +} + static inline void del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) { diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 907f1605926..4ca1241ef94 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -410,11 +410,32 @@ static inline void ClearPageCompound(struct page *page) #endif /* !PAGEFLAGS_EXTENDED */ #ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * PageHuge() only returns true for hugetlbfs pages, but not for + * normal or transparent huge pages. + * + * PageTransHuge() returns true for both transparent huge and + * hugetlbfs pages, but not normal pages. PageTransHuge() can only be + * called only in the core VM paths where hugetlbfs pages can't exist. 
+ */ +static inline int PageTransHuge(struct page *page) +{ + VM_BUG_ON(PageTail(page)); + return PageHead(page); +} + static inline int PageTransCompound(struct page *page) { return PageCompound(page); } + #else + +static inline int PageTransHuge(struct page *page) +{ + return 0; +} + static inline int PageTransCompound(struct page *page) { return 0; diff --git a/include/linux/rmap.h b/include/linux/rmap.h index bb83c0da207..e9fd04ca1e5 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -198,6 +198,8 @@ enum ttu_flags { }; #define TTU_ACTION(x) ((x) & TTU_ACTION_MASK) +bool is_vma_temporary_stack(struct vm_area_struct *vma); + int try_to_unmap(struct page *, enum ttu_flags flags); int try_to_unmap_one(struct page *, struct vm_area_struct *, unsigned long address, enum ttu_flags flags); diff --git a/include/linux/swap.h b/include/linux/swap.h index eba53e71d2c..4d559325d91 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -208,6 +208,8 @@ extern unsigned int nr_free_pagecache_pages(void); /* linux/mm/swap.c */ extern void __lru_cache_add(struct page *, enum lru_list lru); extern void lru_cache_add_lru(struct page *, enum lru_list lru); +extern void lru_add_page_tail(struct zone* zone, + struct page *page, struct page *page_tail); extern void activate_page(struct page *); extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); diff --git a/mm/Makefile b/mm/Makefile index 380772a9ccb..2b1b575ae71 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_QUICKLIST) += quicklist.o +obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o diff --git a/mm/huge_memory.c b/mm/huge_memory.c new file mode 100644 index 00000000000..0c1e8f939f7 --- /dev/null +++ b/mm/huge_memory.c @@ -0,0 +1,901 @@ +/* + * Copyright (C) 2009 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "internal.h" + +unsigned long transparent_hugepage_flags __read_mostly = + (1<page_table_lock); + + /* FIFO */ + pgtable = mm->pmd_huge_pte; + if (list_empty(&pgtable->lru)) + mm->pmd_huge_pte = NULL; + else { + mm->pmd_huge_pte = list_entry(pgtable->lru.next, + struct page, lru); + list_del(&pgtable->lru); + } + return pgtable; +} + +static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + struct page *page, + unsigned long haddr) +{ + pgtable_t pgtable; + pmd_t _pmd; + int ret = 0, i; + struct page **pages; + + pages = kmalloc(sizeof(struct page *) * HPAGE_PMD_NR, + GFP_KERNEL); + if (unlikely(!pages)) { + ret |= VM_FAULT_OOM; + goto out; + } + + for (i = 0; i < HPAGE_PMD_NR; i++) { + pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, + vma, address); + if (unlikely(!pages[i])) { + while (--i >= 0) + put_page(pages[i]); + kfree(pages); + ret |= VM_FAULT_OOM; + goto out; + } + } + + for (i = 0; i < HPAGE_PMD_NR; i++) { + copy_user_highpage(pages[i], page + i, + haddr + PAGE_SHIFT*i, vma); + __SetPageUptodate(pages[i]); + cond_resched(); + } + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto out_free_pages; + VM_BUG_ON(!PageHead(page)); + + pmdp_clear_flush_notify(vma, haddr, pmd); + /* leave pmd empty until pte is filled */ + + pgtable = get_pmd_huge_pte(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + entry = mk_pte(pages[i], vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + page_add_new_anon_rmap(pages[i], vma, haddr); + pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + kfree(pages); + + mm->nr_ptes++; + smp_wmb(); /* make pte visible before pmd */ + pmd_populate(mm, pmd, pgtable); + page_remove_rmap(page); + spin_unlock(&mm->page_table_lock); + + ret |= VM_FAULT_WRITE; + put_page(page); + +out: + return ret; + +out_free_pages: + spin_unlock(&mm->page_table_lock); + for (i = 0; i < HPAGE_PMD_NR; i++) + put_page(pages[i]); + kfree(pages); + goto out; +} + +int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, + unsigned long address, pmd_t *pmd, pmd_t orig_pmd) +{ + int ret = 0; + struct page *page, *new_page; + unsigned long haddr; + + VM_BUG_ON(!vma->anon_vma); + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto out_unlock; + + page = pmd_page(orig_pmd); + VM_BUG_ON(!PageCompound(page) || !PageHead(page)); + haddr = address & HPAGE_PMD_MASK; + if (page_mapcount(page) == 1) { + pmd_t entry; + entry = pmd_mkyoung(orig_pmd); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + if (pmdp_set_access_flags(vma, haddr, pmd, entry, 1)) + update_mmu_cache(vma, address, entry); + ret |= VM_FAULT_WRITE; + goto out_unlock; + } + get_page(page); + spin_unlock(&mm->page_table_lock); + + if (transparent_hugepage_enabled(vma) && + !transparent_hugepage_debug_cow()) + new_page = alloc_hugepage(transparent_hugepage_defrag(vma)); + else + new_page = NULL; + + if (unlikely(!new_page)) { + ret = do_huge_pmd_wp_page_fallback(mm, vma, address, + pmd, orig_pmd, page, haddr); + put_page(page); + goto out; + } + + copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); + __SetPageUptodate(new_page); + + spin_lock(&mm->page_table_lock); + 
put_page(page); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + put_page(new_page); + else { + pmd_t entry; + VM_BUG_ON(!PageHead(page)); + entry = mk_pmd(new_page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + pmdp_clear_flush_notify(vma, haddr, pmd); + page_add_new_anon_rmap(new_page, vma, haddr); + set_pmd_at(mm, haddr, pmd, entry); + update_mmu_cache(vma, address, entry); + page_remove_rmap(page); + put_page(page); + ret |= VM_FAULT_WRITE; + } +out_unlock: + spin_unlock(&mm->page_table_lock); +out: + return ret; +} + +struct page *follow_trans_huge_pmd(struct mm_struct *mm, + unsigned long addr, + pmd_t *pmd, + unsigned int flags) +{ + struct page *page = NULL; + + assert_spin_locked(&mm->page_table_lock); + + if (flags & FOLL_WRITE && !pmd_write(*pmd)) + goto out; + + page = pmd_page(*pmd); + VM_BUG_ON(!PageHead(page)); + if (flags & FOLL_TOUCH) { + pmd_t _pmd; + /* + * We should set the dirty bit only for FOLL_WRITE but + * for now the dirty bit in the pmd is meaningless. + * And if the dirty bit will become meaningful and + * we'll only set it with FOLL_WRITE, an atomic + * set_bit will be required on the pmd to set the + * young bit, instead of the current set_pmd_at. + */ + _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); + set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); + } + page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; + VM_BUG_ON(!PageCompound(page)); + if (flags & FOLL_GET) + get_page(page); + +out: + return page; +} + +int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, + pmd_t *pmd) +{ + int ret = 0; + + spin_lock(&tlb->mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&tlb->mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, + pmd); + } else { + struct page *page; + pgtable_t pgtable; + pgtable = get_pmd_huge_pte(tlb->mm); + page = pmd_page(*pmd); + pmd_clear(pmd); + page_remove_rmap(page); + VM_BUG_ON(page_mapcount(page) < 0); + add_mm_counter(tlb->mm, MM_ANONPAGES, -HPAGE_PMD_NR); + VM_BUG_ON(!PageHead(page)); + spin_unlock(&tlb->mm->page_table_lock); + tlb_remove_page(tlb, page); + pte_free(tlb->mm, pgtable); + ret = 1; + } + } else + spin_unlock(&tlb->mm->page_table_lock); + + return ret; +} + +pmd_t *page_check_address_pmd(struct page *page, + struct mm_struct *mm, + unsigned long address, + enum page_check_address_pmd_flag flag) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, *ret = NULL; + + if (address & ~HPAGE_PMD_MASK) + goto out; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (pmd_none(*pmd)) + goto out; + if (pmd_page(*pmd) != page) + goto out; + VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && + pmd_trans_splitting(*pmd)); + if (pmd_trans_huge(*pmd)) { + VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && + !pmd_trans_splitting(*pmd)); + ret = pmd; + } +out: + return ret; +} + +static int __split_huge_page_splitting(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t *pmd; + int ret = 0; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG); + if (pmd) { + /* + * We can't temporarily set the pmd to null in order + * to split it, the pmd must remain marked huge at all + * times or the VM won't take the pmd_trans_huge 
paths + * and it won't wait on the anon_vma->root->lock to + * serialize against split_huge_page*. + */ + pmdp_splitting_flush_notify(vma, address, pmd); + ret = 1; + } + spin_unlock(&mm->page_table_lock); + + return ret; +} + +static void __split_huge_page_refcount(struct page *page) +{ + int i; + unsigned long head_index = page->index; + struct zone *zone = page_zone(page); + + /* prevent PageLRU to go away from under us, and freeze lru stats */ + spin_lock_irq(&zone->lru_lock); + compound_lock(page); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + struct page *page_tail = page + i; + + /* tail_page->_count cannot change */ + atomic_sub(atomic_read(&page_tail->_count), &page->_count); + BUG_ON(page_count(page) <= 0); + atomic_add(page_mapcount(page) + 1, &page_tail->_count); + BUG_ON(atomic_read(&page_tail->_count) <= 0); + + /* after clearing PageTail the gup refcount can be released */ + smp_mb(); + + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + page_tail->flags |= (page->flags & + ((1L << PG_referenced) | + (1L << PG_swapbacked) | + (1L << PG_mlocked) | + (1L << PG_uptodate))); + page_tail->flags |= (1L << PG_dirty); + + /* + * 1) clear PageTail before overwriting first_page + * 2) clear PageTail before clearing PageHead for VM_BUG_ON + */ + smp_wmb(); + + /* + * __split_huge_page_splitting() already set the + * splitting bit in all pmd that could map this + * hugepage, that will ensure no CPU can alter the + * mapcount on the head page. The mapcount is only + * accounted in the head page and it has to be + * transferred to all tail pages in the below code. So + * for this code to be safe, the split the mapcount + * can't change. But that doesn't mean userland can't + * keep changing and reading the page contents while + * we transfer the mapcount, so the pmd splitting + * status is achieved setting a reserved bit in the + * pmd, not by clearing the present bit. + */ + BUG_ON(page_mapcount(page_tail)); + page_tail->_mapcount = page->_mapcount; + + BUG_ON(page_tail->mapping); + page_tail->mapping = page->mapping; + + page_tail->index = ++head_index; + + BUG_ON(!PageAnon(page_tail)); + BUG_ON(!PageUptodate(page_tail)); + BUG_ON(!PageDirty(page_tail)); + BUG_ON(!PageSwapBacked(page_tail)); + + lru_add_page_tail(zone, page, page_tail); + } + + ClearPageCompound(page); + compound_unlock(page); + spin_unlock_irq(&zone->lru_lock); + + for (i = 1; i < HPAGE_PMD_NR; i++) { + struct page *page_tail = page + i; + BUG_ON(page_count(page_tail) <= 0); + /* + * Tail pages may be freed if there wasn't any mapping + * like if add_to_swap() is running on a lru page that + * had its mapping zapped. And freeing these pages + * requires taking the lru_lock so we do the put_page + * of the tail pages after the split is complete. + */ + put_page(page_tail); + } + + /* + * Only the head page (now become a regular page) is required + * to be pinned by the caller. 
+ */ + BUG_ON(page_count(page) <= 0); +} + +static int __split_huge_page_map(struct page *page, + struct vm_area_struct *vma, + unsigned long address) +{ + struct mm_struct *mm = vma->vm_mm; + pmd_t *pmd, _pmd; + int ret = 0, i; + pgtable_t pgtable; + unsigned long haddr; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); + if (pmd) { + pgtable = get_pmd_huge_pte(mm); + pmd_populate(mm, &_pmd, pgtable); + + for (i = 0, haddr = address; i < HPAGE_PMD_NR; + i++, haddr += PAGE_SIZE) { + pte_t *pte, entry; + BUG_ON(PageCompound(page+i)); + entry = mk_pte(page + i, vma->vm_page_prot); + entry = maybe_mkwrite(pte_mkdirty(entry), vma); + if (!pmd_write(*pmd)) + entry = pte_wrprotect(entry); + else + BUG_ON(page_mapcount(page) != 1); + if (!pmd_young(*pmd)) + entry = pte_mkold(entry); + pte = pte_offset_map(&_pmd, haddr); + BUG_ON(!pte_none(*pte)); + set_pte_at(mm, haddr, pte, entry); + pte_unmap(pte); + } + + mm->nr_ptes++; + smp_wmb(); /* make pte visible before pmd */ + /* + * Up to this point the pmd is present and huge and + * userland has the whole access to the hugepage + * during the split (which happens in place). If we + * overwrite the pmd with the not-huge version + * pointing to the pte here (which of course we could + * if all CPUs were bug free), userland could trigger + * a small page size TLB miss on the small sized TLB + * while the hugepage TLB entry is still established + * in the huge TLB. Some CPU doesn't like that. See + * http://support.amd.com/us/Processor_TechDocs/41322.pdf, + * Erratum 383 on page 93. Intel should be safe but is + * also warns that it's only safe if the permission + * and cache attributes of the two entries loaded in + * the two TLB is identical (which should be the case + * here). But it is generally safer to never allow + * small and huge TLB entries for the same virtual + * address to be loaded simultaneously. So instead of + * doing "pmd_populate(); flush_tlb_range();" we first + * mark the current pmd notpresent (atomically because + * here the pmd_trans_huge and pmd_trans_splitting + * must remain set at all times on the pmd until the + * split is complete for this pmd), then we flush the + * SMP TLB and finally we write the non-huge version + * of the pmd entry with pmd_populate. 
+ */ + set_pmd_at(mm, address, pmd, pmd_mknotpresent(*pmd)); + flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + pmd_populate(mm, pmd, pgtable); + ret = 1; + } + spin_unlock(&mm->page_table_lock); + + return ret; +} + +/* must be called with anon_vma->root->lock hold */ +static void __split_huge_page(struct page *page, + struct anon_vma *anon_vma) +{ + int mapcount, mapcount2; + struct anon_vma_chain *avc; + + BUG_ON(!PageHead(page)); + BUG_ON(PageTail(page)); + + mapcount = 0; + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long addr = vma_address(page, vma); + BUG_ON(is_vma_temporary_stack(vma)); + if (addr == -EFAULT) + continue; + mapcount += __split_huge_page_splitting(page, vma, addr); + } + BUG_ON(mapcount != page_mapcount(page)); + + __split_huge_page_refcount(page); + + mapcount2 = 0; + list_for_each_entry(avc, &anon_vma->head, same_anon_vma) { + struct vm_area_struct *vma = avc->vma; + unsigned long addr = vma_address(page, vma); + BUG_ON(is_vma_temporary_stack(vma)); + if (addr == -EFAULT) + continue; + mapcount2 += __split_huge_page_map(page, vma, addr); + } + BUG_ON(mapcount != mapcount2); +} + +int split_huge_page(struct page *page) +{ + struct anon_vma *anon_vma; + int ret = 1; + + BUG_ON(!PageAnon(page)); + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + goto out; + ret = 0; + if (!PageCompound(page)) + goto out_unlock; + + BUG_ON(!PageSwapBacked(page)); + __split_huge_page(page, anon_vma); + + BUG_ON(PageCompound(page)); +out_unlock: + page_unlock_anon_vma(anon_vma); +out: + return ret; +} + +void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) +{ + struct page *page; + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_trans_huge(*pmd))) { + spin_unlock(&mm->page_table_lock); + return; + } + page = pmd_page(*pmd); + VM_BUG_ON(!page_count(page)); + get_page(page); + spin_unlock(&mm->page_table_lock); + + split_huge_page(page); + + put_page(page); + BUG_ON(pmd_trans_huge(*pmd)); +} diff --git a/mm/internal.h b/mm/internal.h index bd4f581f624..69488205723 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -134,6 +134,10 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) } } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern unsigned long vma_address(struct page *page, + struct vm_area_struct *vma); +#endif #else /* !CONFIG_MMU */ static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) { diff --git a/mm/memory.c b/mm/memory.c index 60e1c68d821..c50a195041e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -726,9 +726,9 @@ out_set_pte: return 0; } -static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, - pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, - unsigned long addr, unsigned long end) +int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, + pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma, + unsigned long addr, unsigned long end) { pte_t *orig_src_pte, *orig_dst_pte; pte_t *src_pte, *dst_pte; @@ -802,6 +802,16 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src src_pmd = pmd_offset(src_pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*src_pmd)) { + int err; + err = copy_huge_pmd(dst_mm, src_mm, + dst_pmd, src_pmd, addr, vma); + if (err == -ENOMEM) + return -ENOMEM; + if (!err) + continue; + /* fall through */ + } if (pmd_none_or_clear_bad(src_pmd)) continue; if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd, @@ -1004,6 
+1014,15 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); + if (pmd_trans_huge(*pmd)) { + if (next-addr != HPAGE_PMD_SIZE) + split_huge_page_pmd(vma->vm_mm, pmd); + else if (zap_huge_pmd(tlb, vma, pmd)) { + (*zap_work)--; + continue; + } + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) { (*zap_work)--; continue; @@ -1280,11 +1299,27 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) goto no_page_table; - if (pmd_huge(*pmd)) { + if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { BUG_ON(flags & FOLL_GET); page = follow_huge_pmd(mm, address, pmd, flags & FOLL_WRITE); goto out; } + if (pmd_trans_huge(*pmd)) { + spin_lock(&mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + page = follow_trans_huge_pmd(mm, address, + pmd, flags); + spin_unlock(&mm->page_table_lock); + goto out; + } + } else + spin_unlock(&mm->page_table_lock); + /* fall through */ + } if (unlikely(pmd_bad(*pmd))) goto no_page_table; @@ -3179,9 +3214,9 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma, * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static inline int handle_pte_fault(struct mm_struct *mm, - struct vm_area_struct *vma, unsigned long address, - pte_t *pte, pmd_t *pmd, unsigned int flags) +int handle_pte_fault(struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long address, + pte_t *pte, pmd_t *pmd, unsigned int flags) { pte_t entry; spinlock_t *ptl; @@ -3260,9 +3295,40 @@ int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma, pmd = pmd_alloc(mm, pud, address); if (!pmd) return VM_FAULT_OOM; - pte = pte_alloc_map(mm, vma, pmd, address); - if (!pte) + if (pmd_none(*pmd) && transparent_hugepage_enabled(vma)) { + if (!vma->vm_ops) + return do_huge_pmd_anonymous_page(mm, vma, address, + pmd, flags); + } else { + pmd_t orig_pmd = *pmd; + barrier(); + if (pmd_trans_huge(orig_pmd)) { + if (flags & FAULT_FLAG_WRITE && + !pmd_write(orig_pmd) && + !pmd_trans_splitting(orig_pmd)) + return do_huge_pmd_wp_page(mm, vma, address, + pmd, orig_pmd); + return 0; + } + } + + /* + * Use __pte_alloc instead of pte_alloc_map, because we can't + * run pte_offset_map on the pmd, if an huge pmd could + * materialize from under us from a different thread. + */ + if (unlikely(__pte_alloc(mm, vma, pmd, address))) return VM_FAULT_OOM; + /* if an huge pmd materialized from under us just retry later */ + if (unlikely(pmd_trans_huge(*pmd))) + return 0; + /* + * A regular pmd is established and it can't morph into a huge pmd + * from under us anymore at this point because we hold the mmap_sem + * read mode and khugepaged takes it in write mode. So now it's + * safe to run pte_offset_map(). + */ + pte = pte_offset_map(pmd, address); return handle_pte_fault(mm, vma, address, pte, pmd, flags); } diff --git a/mm/rmap.c b/mm/rmap.c index a3197a8a295..e41375a6b02 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -360,7 +360,7 @@ void page_unlock_anon_vma(struct anon_vma *anon_vma) * Returns virtual address or -EFAULT if page's index/offset is not * within the range mapped the @vma. 
*/ -static inline unsigned long +inline unsigned long vma_address(struct page *page, struct vm_area_struct *vma) { pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -435,6 +435,8 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) return NULL; + if (pmd_trans_huge(*pmd)) + return NULL; pte = pte_offset_map(pmd, address); /* Make a quick check before getting the lock */ @@ -489,35 +491,17 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, unsigned long *vm_flags) { struct mm_struct *mm = vma->vm_mm; - pte_t *pte; - spinlock_t *ptl; int referenced = 0; - pte = page_check_address(page, mm, address, &ptl, 0); - if (!pte) - goto out; - /* * Don't want to elevate referenced for mlocked page that gets this far, * in order that it progresses to try_to_unmap and is moved to the * unevictable list. */ if (vma->vm_flags & VM_LOCKED) { - *mapcount = 1; /* break early from loop */ + *mapcount = 0; /* break early from loop */ *vm_flags |= VM_LOCKED; - goto out_unmap; - } - - if (ptep_clear_flush_young_notify(vma, address, pte)) { - /* - * Don't treat a reference through a sequentially read - * mapping as such. If the page has been used in - * another mapping, we will catch it; if this other - * mapping is already gone, the unmap path will have - * set PG_referenced or activated the page. - */ - if (likely(!VM_SequentialReadHint(vma))) - referenced++; + goto out; } /* Pretend the page is referenced if the task has the @@ -526,9 +510,39 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, rwsem_is_locked(&mm->mmap_sem)) referenced++; -out_unmap: + if (unlikely(PageTransHuge(page))) { + pmd_t *pmd; + + spin_lock(&mm->page_table_lock); + pmd = page_check_address_pmd(page, mm, address, + PAGE_CHECK_ADDRESS_PMD_FLAG); + if (pmd && !pmd_trans_splitting(*pmd) && + pmdp_clear_flush_young_notify(vma, address, pmd)) + referenced++; + spin_unlock(&mm->page_table_lock); + } else { + pte_t *pte; + spinlock_t *ptl; + + pte = page_check_address(page, mm, address, &ptl, 0); + if (!pte) + goto out; + + if (ptep_clear_flush_young_notify(vma, address, pte)) { + /* + * Don't treat a reference through a sequentially read + * mapping as such. If the page has been used in + * another mapping, we will catch it; if this other + * mapping is already gone, the unmap path will have + * set PG_referenced or activated the page. 
+ */ + if (likely(!VM_SequentialReadHint(vma))) + referenced++; + } + pte_unmap_unlock(pte, ptl); + } + (*mapcount)--; - pte_unmap_unlock(pte, ptl); if (referenced) *vm_flags |= vma->vm_flags; @@ -1202,7 +1216,7 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, return ret; } -static bool is_vma_temporary_stack(struct vm_area_struct *vma) +bool is_vma_temporary_stack(struct vm_area_struct *vma) { int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP); diff --git a/mm/swap.c b/mm/swap.c index e0eeef94088..c02f93611a8 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -479,6 +479,43 @@ void __pagevec_release(struct pagevec *pvec) EXPORT_SYMBOL(__pagevec_release); +/* used by __split_huge_page_refcount() */ +void lru_add_page_tail(struct zone* zone, + struct page *page, struct page *page_tail) +{ + int active; + enum lru_list lru; + const int file = 0; + struct list_head *head; + + VM_BUG_ON(!PageHead(page)); + VM_BUG_ON(PageCompound(page_tail)); + VM_BUG_ON(PageLRU(page_tail)); + VM_BUG_ON(!spin_is_locked(&zone->lru_lock)); + + SetPageLRU(page_tail); + + if (page_evictable(page_tail, NULL)) { + if (PageActive(page)) { + SetPageActive(page_tail); + active = 1; + lru = LRU_ACTIVE_ANON; + } else { + active = 0; + lru = LRU_INACTIVE_ANON; + } + update_page_reclaim_stat(zone, page_tail, file, active); + if (likely(PageLRU(page))) + head = page->lru.prev; + else + head = &zone->lru[lru].list; + __add_page_to_lru_list(zone, page_tail, lru, head); + } else { + SetPageUnevictable(page_tail); + add_page_to_lru_list(zone, page_tail, LRU_UNEVICTABLE); + } +} + /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. -- cgit v1.2.3-70-g09d2 From 8a07651ee8cdaa9e27cb4ae372aed347533770f5 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 13 Jan 2011 15:46:52 -0800 Subject: thp: transparent hugepage core fixlet If you configure THP in addition to HUGETLB_PAGE on x86_32 without PAE, the p?d-folding works out that munlock_vma_pages_range() can crash at follow_page()'s pud_huge() BUG_ON(flags & FOLL_GET): it needs the same VM_HUGETLB check already there on the pmd_huge() line. Conveniently, openSUSE provides a "blogd" which tests this out at startup! Signed-off-by: Hugh Dickins Cc: Rik van Riel Cc: Johannes Weiner Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index c50a195041e..840ce9d98f8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1288,7 +1288,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, pud = pud_offset(pgd, address); if (pud_none(*pud)) goto no_page_table; - if (pud_huge(*pud)) { + if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) { BUG_ON(flags & FOLL_GET); page = follow_huge_pud(mm, address, pud, flags & FOLL_WRITE); goto out; -- cgit v1.2.3-70-g09d2 From 05759d380a9d7f131a475186c07fce58ceaa8902 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:53 -0800 Subject: thp: split_huge_page anon_vma ordering dependency This documents how split_huge_page is safe vs new vma insertions into the anon_vma that may have already released the anon_vma->lock but not established pmds yet when split_huge_page starts.
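A userspace analogy for why the tail matters (plain C, not kernel code): a forward walk over a linked list always visits entries appended at the tail while the walk is in progress, whereas entries pushed at the head behind the walker would be missed; this is the same property the second __split_huge_page() walk relies on for vmas linked with list_add_tail.

============
#include <stdio.h>
#include <stdlib.h>

struct node { int id; struct node *next; };

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *head = &a, *tail = &c, *n;

	for (n = head; n; n = n->next) {
		printf("visit %d\n", n->id);
		if (n->id == 2) {
			/* appended at the tail mid-walk: still visited */
			struct node *late = malloc(sizeof(*late));
			late->id = 4;
			late->next = NULL;
			tail->next = late;
			tail = late;
			/* a head insertion here would never be visited */
		}
	}
	return 0;	/* prints: visit 1, 2, 3, 4 */
}
============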
Signed-off-by: Andrea Arcangeli Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 16 ++++++++++++++++ mm/rmap.c | 4 ++++ 2 files changed, 20 insertions(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0c1e8f939f7..76350793289 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -841,6 +841,19 @@ static void __split_huge_page(struct page *page, continue; mapcount += __split_huge_page_splitting(page, vma, addr); } + /* + * It is critical that new vmas are added to the tail of the + * anon_vma list. This guarantees that if copy_huge_pmd() runs + * and establishes a child pmd before + * __split_huge_page_splitting() freezes the parent pmd (so if + * we fail to prevent copy_huge_pmd() from running until the + * whole __split_huge_page() is complete), we will still see + * the newly established pmd of the child later during the + * walk, to be able to set it as pmd_trans_splitting too. + */ + if (mapcount != page_mapcount(page)) + printk(KERN_ERR "mapcount %d page_mapcount %d\n", + mapcount, page_mapcount(page)); BUG_ON(mapcount != page_mapcount(page)); __split_huge_page_refcount(page); @@ -854,6 +867,9 @@ static void __split_huge_page(struct page *page, continue; mapcount2 += __split_huge_page_map(page, vma, addr); } + if (mapcount != mapcount2) + printk(KERN_ERR "mapcount %d mapcount2 %d page_mapcount %d\n", + mapcount, mapcount2, page_mapcount(page)); BUG_ON(mapcount != mapcount2); } diff --git a/mm/rmap.c b/mm/rmap.c index e41375a6b02..92e14dcfe73 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -177,6 +177,10 @@ static void anon_vma_chain_link(struct vm_area_struct *vma, list_add(&avc->same_vma, &vma->anon_vma_chain); anon_vma_lock(anon_vma); + /* + * It's critical to add new vmas to the tail of the anon_vma, + * see comment in huge_memory.c:__split_huge_page(). + */ list_add_tail(&avc->same_anon_vma, &anon_vma->head); anon_vma_unlock(anon_vma); } -- cgit v1.2.3-70-g09d2 From f66055ab6fb9731dbfce320c5202ef4441b5d77f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:54 -0800 Subject: thp: verify pmd_trans_huge isn't leaking pmd_trans_huge must not leak in certain vmas like the mmio special pfn or filebacked mappings. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 840ce9d98f8..c1a80e00458 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1451,6 +1451,7 @@ int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pmd = pmd_offset(pud, pg); if (pmd_none(*pmd)) return i ?
: -EFAULT; + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map(pmd, pg); if (pte_none(*pte)) { pte_unmap(pte); @@ -1675,8 +1676,10 @@ pte_t *__get_locked_pte(struct mm_struct *mm, unsigned long addr, pud_t * pud = pud_alloc(mm, pgd, addr); if (pud) { pmd_t * pmd = pmd_alloc(mm, pud, addr); - if (pmd) + if (pmd) { + VM_BUG_ON(pmd_trans_huge(*pmd)); return pte_alloc_map_lock(mm, pmd, addr, ptl); + } } return NULL; } @@ -1895,6 +1898,7 @@ static inline int remap_pmd_range(struct mm_struct *mm, pud_t *pud, pmd = pmd_alloc(mm, pud, addr); if (!pmd) return -ENOMEM; + VM_BUG_ON(pmd_trans_huge(*pmd)); do { next = pmd_addr_end(addr, end); if (remap_pte_range(mm, pmd, addr, next, @@ -3471,6 +3475,7 @@ static int __follow_pte(struct mm_struct *mm, unsigned long address, goto out; pmd = pmd_offset(pud, address); + VM_BUG_ON(pmd_trans_huge(*pmd)); if (pmd_none(*pmd) || unlikely(pmd_bad(*pmd))) goto out; -- cgit v1.2.3-70-g09d2 From 0af4e98b6b095c74588af04872f83d333c958c32 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:55 -0800 Subject: thp: madvise(MADV_HUGEPAGE) Add madvise MADV_HUGEPAGE to mark regions that are important to be hugepage backed. Return -EINVAL if the vma is not of an anonymous type, or the feature isn't built into the kernel. Never silently return success. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 6 ++++++ mm/huge_memory.c | 16 ++++++++++++++++ mm/madvise.c | 8 ++++++++ 3 files changed, 30 insertions(+) (limited to 'mm') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9301824c749..d9ab70d776e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -97,6 +97,7 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); #if HPAGE_PMD_ORDER > MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif +extern int hugepage_madvise(unsigned long *vm_flags); #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) @@ -113,6 +114,11 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) +static inline int hugepage_madvise(unsigned long *vm_flags) +{ + BUG(); + return 0; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 76350793289..620891f4e54 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -896,6 +896,22 @@ out: return ret; } +int hugepage_madvise(unsigned long *vm_flags) +{ + /* + * Be somewhat over-protective like KSM for now! 
+ */ + if (*vm_flags & (VM_HUGEPAGE | VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return -EINVAL; + + *vm_flags |= VM_HUGEPAGE; + + return 0; +} + void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) { struct page *page; diff --git a/mm/madvise.c b/mm/madvise.c index 319528b8db7..ecde40a401c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -71,6 +71,11 @@ static long madvise_behavior(struct vm_area_struct * vma, if (error) goto out; break; + case MADV_HUGEPAGE: + error = hugepage_madvise(&new_flags); + if (error) + goto out; + break; } if (new_flags == vma->vm_flags) { @@ -282,6 +287,9 @@ madvise_behavior_valid(int behavior) #ifdef CONFIG_KSM case MADV_MERGEABLE: case MADV_UNMERGEABLE: +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + case MADV_HUGEPAGE: #endif return 1; -- cgit v1.2.3-70-g09d2 From 500d65d471018d9a13b0d51b7e141ed2a3555c1d Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:55 -0800 Subject: thp: pmd_trans_huge migrate bugcheck No pmd_trans_huge should ever materialize in migration ptes areas, because we split the hugepage before migration ptes are instantiated. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/memory.c | 5 +++++ mm/migrate.c | 7 ++++++- 3 files changed, 12 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 78adec4ba9f..2ec5138bada 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1490,6 +1490,7 @@ struct page *follow_page(struct vm_area_struct *, unsigned long address, #define FOLL_DUMP 0x08 /* give error on hole if it would be zero */ #define FOLL_FORCE 0x10 /* get_user_pages read/write w/o permission */ #define FOLL_MLOCK 0x40 /* mark page as mlocked */ +#define FOLL_SPLIT 0x80 /* don't return transhuge pages, split them */ typedef int (*pte_fn_t)(pte_t *pte, pgtable_t token, unsigned long addr, void *data); diff --git a/mm/memory.c b/mm/memory.c index c1a80e00458..12ee1ea237f 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1305,6 +1305,10 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, goto out; } if (pmd_trans_huge(*pmd)) { + if (flags & FOLL_SPLIT) { + split_huge_page_pmd(mm, pmd); + goto split_fallthrough; + } spin_lock(&mm->page_table_lock); if (likely(pmd_trans_huge(*pmd))) { if (unlikely(pmd_trans_splitting(*pmd))) { @@ -1320,6 +1324,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, spin_unlock(&mm->page_table_lock); /* fall through */ } +split_fallthrough: if (unlikely(pmd_bad(*pmd))) goto no_page_table; diff --git a/mm/migrate.c b/mm/migrate.c index 690d0de993a..1a531b760b3 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -113,6 +113,8 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, goto out; pmd = pmd_offset(pud, addr); + if (pmd_trans_huge(*pmd)) + goto out; if (!pmd_present(*pmd)) goto out; @@ -632,6 +634,9 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, /* page was freed from under us. So we are done. 
*/ goto move_newpage; } + if (unlikely(PageTransHuge(page))) + if (unlikely(split_huge_page(page))) + goto move_newpage; /* prepare cgroup just returns 0 or -ENOMEM */ rc = -EAGAIN; @@ -1063,7 +1068,7 @@ static int do_move_page_to_node_array(struct mm_struct *mm, if (!vma || pp->addr < vma->vm_start || !vma_migratable(vma)) goto set_status; - page = follow_page(vma, pp->addr, FOLL_GET); + page = follow_page(vma, pp->addr, FOLL_GET|FOLL_SPLIT); err = PTR_ERR(page); if (IS_ERR(page)) -- cgit v1.2.3-70-g09d2 From ec1685109f1314a30919489ef2800ed626a38c1e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:56 -0800 Subject: thp: memcg compound Teach memcg to charge/uncharge compound pages. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 83 ++++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 53 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 00bb8a64d02..356d4964fe9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1027,6 +1027,10 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) { struct page_cgroup *pc; struct mem_cgroup_per_zone *mz; + int page_size = PAGE_SIZE; + + if (PageTransHuge(page)) + page_size <<= compound_order(page); if (mem_cgroup_disabled()) return NULL; @@ -1887,12 +1891,14 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, * oom-killer can be invoked. */ static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom) + gfp_t gfp_mask, + struct mem_cgroup **memcg, bool oom, + int page_size) { int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem = NULL; int ret; - int csize = CHARGE_SIZE; + int csize = max(CHARGE_SIZE, (unsigned long) page_size); /* * Unlike global-vm's OOM-kill, we're not in memory shortage @@ -1917,7 +1923,7 @@ again: VM_BUG_ON(css_is_removed(&mem->css)); if (mem_cgroup_is_root(mem)) goto done; - if (consume_stock(mem)) + if (page_size == PAGE_SIZE && consume_stock(mem)) goto done; css_get(&mem->css); } else { @@ -1940,7 +1946,7 @@ again: rcu_read_unlock(); goto done; } - if (consume_stock(mem)) { + if (page_size == PAGE_SIZE && consume_stock(mem)) { /* * It seems dangerous to access memcg without css_get(). * But considering how consume_stock works, it's not @@ -1981,7 +1987,7 @@ again: case CHARGE_OK: break; case CHARGE_RETRY: /* not in OOM situation but retry */ - csize = PAGE_SIZE; + csize = page_size; css_put(&mem->css); mem = NULL; goto again; @@ -2002,8 +2008,8 @@ again: } } while (ret != CHARGE_OK); - if (csize > PAGE_SIZE) - refill_stock(mem, csize - PAGE_SIZE); + if (csize > page_size) + refill_stock(mem, csize - page_size); css_put(&mem->css); done: *memcg = mem; @@ -2031,9 +2037,10 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem, } } -static void mem_cgroup_cancel_charge(struct mem_cgroup *mem) +static void mem_cgroup_cancel_charge(struct mem_cgroup *mem, + int page_size) { - __mem_cgroup_cancel_charge(mem, 1); + __mem_cgroup_cancel_charge(mem, page_size >> PAGE_SHIFT); } /* @@ -2089,8 +2096,9 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) */ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype) + struct page_cgroup *pc, + enum charge_type ctype, + int page_size) { /* try_charge() can return NULL to *memcg, taking care of it.
*/ if (!mem) @@ -2099,7 +2107,7 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, lock_page_cgroup(pc); if (unlikely(PageCgroupUsed(pc))) { unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem); + mem_cgroup_cancel_charge(mem, page_size); return; } @@ -2173,7 +2181,7 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, mem_cgroup_charge_statistics(from, pc, false); if (uncharge) /* This is not "cancel", but cancel_charge does all we need. */ - mem_cgroup_cancel_charge(from); + mem_cgroup_cancel_charge(from, PAGE_SIZE); /* caller should have done css_get */ pc->mem_cgroup = to; @@ -2234,13 +2242,14 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, goto put; parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, + PAGE_SIZE); if (ret || !parent) goto put_back; ret = mem_cgroup_move_account(pc, child, parent, true); if (ret) - mem_cgroup_cancel_charge(parent); + mem_cgroup_cancel_charge(parent, PAGE_SIZE); put_back: putback_lru_page(page); put: @@ -2261,6 +2270,10 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, struct mem_cgroup *mem = NULL; struct page_cgroup *pc; int ret; + int page_size = PAGE_SIZE; + + if (PageTransHuge(page)) + page_size <<= compound_order(page); pc = lookup_page_cgroup(page); /* can happen at boot */ @@ -2268,11 +2281,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, return 0; prefetchw(pc); - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size); if (ret || !mem) return ret; - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, pc, ctype, page_size); return 0; } @@ -2281,8 +2294,6 @@ int mem_cgroup_newpage_charge(struct page *page, { if (mem_cgroup_disabled()) return 0; - if (PageCompound(page)) - return 0; /* * If already mapped, we don't have to account. * If page cache, page->mapping has address_space. @@ -2388,13 +2399,13 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, if (!mem) goto charge_cur_mm; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, mask, ptr, true); + ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, PAGE_SIZE); css_put(&mem->css); return ret; charge_cur_mm: if (unlikely(!mm)) mm = &init_mm; - return __mem_cgroup_try_charge(mm, mask, ptr, true); + return __mem_cgroup_try_charge(mm, mask, ptr, true, PAGE_SIZE); } static void @@ -2410,7 +2421,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr, cgroup_exclude_rmdir(&ptr->css); pc = lookup_page_cgroup(page); mem_cgroup_lru_del_before_commit_swapcache(page); - __mem_cgroup_commit_charge(ptr, pc, ctype); + __mem_cgroup_commit_charge(ptr, pc, ctype, PAGE_SIZE); mem_cgroup_lru_add_after_commit_swapcache(page); /* * Now swap is on-memory. 
This means this page may be @@ -2459,11 +2470,12 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem) return; if (!mem) return; - mem_cgroup_cancel_charge(mem); + mem_cgroup_cancel_charge(mem, PAGE_SIZE); } static void -__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) +__do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype, + int page_size) { struct memcg_batch_info *batch = NULL; bool uncharge_memsw = true; @@ -2490,6 +2502,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) goto direct_uncharge; + if (page_size != PAGE_SIZE) + goto direct_uncharge; + /* * In typical case, batch->memcg == mem. This means we can * merge a series of uncharges to an uncharge of res_counter. @@ -2503,9 +2518,9 @@ __do_uncharge(struct mem_cgroup *mem, const enum charge_type ctype) batch->memsw_bytes += PAGE_SIZE; return; direct_uncharge: - res_counter_uncharge(&mem->res, PAGE_SIZE); + res_counter_uncharge(&mem->res, page_size); if (uncharge_memsw) - res_counter_uncharge(&mem->memsw, PAGE_SIZE); + res_counter_uncharge(&mem->memsw, page_size); if (unlikely(batch->memcg != mem)) memcg_oom_recover(mem); return; @@ -2519,6 +2534,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { struct page_cgroup *pc; struct mem_cgroup *mem = NULL; + int page_size = PAGE_SIZE; if (mem_cgroup_disabled()) return NULL; @@ -2526,6 +2542,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (PageSwapCache(page)) return NULL; + if (PageTransHuge(page)) + page_size <<= compound_order(page); + /* * Check if our page_cgroup is valid */ @@ -2579,7 +2598,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) mem_cgroup_get(mem); } if (!mem_cgroup_is_root(mem)) - __do_uncharge(mem, ctype); + __do_uncharge(mem, ctype, page_size); return mem; @@ -2774,6 +2793,7 @@ int mem_cgroup_prepare_migration(struct page *page, enum charge_type ctype; int ret = 0; + VM_BUG_ON(PageTransHuge(page)); if (mem_cgroup_disabled()) return 0; @@ -2823,7 +2843,7 @@ int mem_cgroup_prepare_migration(struct page *page, return 0; *ptr = mem; - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false, PAGE_SIZE); css_put(&mem->css);/* drop extra refcnt */ if (ret || *ptr == NULL) { if (PageAnon(page)) { @@ -2850,7 +2870,7 @@ int mem_cgroup_prepare_migration(struct page *page, ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; else ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; - __mem_cgroup_commit_charge(mem, pc, ctype); + __mem_cgroup_commit_charge(mem, pc, ctype, PAGE_SIZE); return ret; } @@ -4461,7 +4481,8 @@ one_by_one: batch_count = PRECHARGE_COUNT_AT_ONCE; cond_resched(); } - ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false); + ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false, + PAGE_SIZE); if (ret || !mem) /* mem_cgroup_clear_mc() will do uncharge later */ return -ENOMEM; @@ -4623,6 +4644,7 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) if (is_target_pte_for_mc(vma, addr, *pte, NULL)) @@ -4789,6 +4811,7 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, spinlock_t *ptl; retry: + VM_BUG_ON(pmd_trans_huge(*pmd)); pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { pte_t ptent = 
*(pte++); -- cgit v1.2.3-70-g09d2 From 152c9ccb75548c027fa3103efa4fa4e19a345449 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 13 Jan 2011 15:46:56 -0800 Subject: thp: transhuge-memcg: commit tail pages at charge With this patch, when a transparent hugepage is charged, not only the head page but also all the tail pages are committed; IOW, pc->mem_cgroup and pc->flags of the tail pages are set. Without this patch: - Tail pages are not linked to any memcg's LRU at splitting. This causes many problems, for example, the charged memcg's directory can never be rmdir'ed because it doesn't have enough pages to scan to make the usage decrease to 0. - The "rss" field in memory.stat would be incorrect. Moreover, since usage_in_bytes in the root cgroup is calculated from the stat rather than from res_counter (since 2.6.32), it would be incorrect too. Signed-off-by: Daisuke Nishimura Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 52 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 356d4964fe9..a1bb59d4c9d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2094,23 +2094,10 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be * USED state. If already USED, uncharge and return. */ - -static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype, - int page_size) +static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem, + struct page_cgroup *pc, + enum charge_type ctype) { - /* try_charge() can return NULL to *memcg, taking care of it. */ - if (!mem) - return; - - lock_page_cgroup(pc); - if (unlikely(PageCgroupUsed(pc))) { - unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem, page_size); - return; - } - pc->mem_cgroup = mem; /* * We access a page_cgroup asynchronously without lock_page_cgroup(). @@ -2135,6 +2122,33 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, } mem_cgroup_charge_statistics(mem, pc, true); +} + +static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, + struct page_cgroup *pc, + enum charge_type ctype, + int page_size) +{ + int i; + int count = page_size >> PAGE_SHIFT; + + /* try_charge() can return NULL to *memcg, taking care of it. */ + if (!mem) + return; + + lock_page_cgroup(pc); + if (unlikely(PageCgroupUsed(pc))) { + unlock_page_cgroup(pc); + mem_cgroup_cancel_charge(mem, page_size); + return; + } + + /* + * we don't need page_cgroup_lock for tail pages, because they are not + * accessed by any other context at this point.
+ */ + for (i = 0; i < count; i++) + ____mem_cgroup_commit_charge(mem, pc + i, ctype); unlock_page_cgroup(pc); /* @@ -2532,6 +2546,8 @@ direct_uncharge: static struct mem_cgroup * __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { + int i; + int count; struct page_cgroup *pc; struct mem_cgroup *mem = NULL; int page_size = PAGE_SIZE; @@ -2545,6 +2561,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (PageTransHuge(page)) page_size <<= compound_order(page); + count = page_size >> PAGE_SHIFT; /* * Check if our page_cgroup is valid */ @@ -2577,7 +2594,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - mem_cgroup_charge_statistics(mem, pc, false); + for (i = 0; i < count; i++) + mem_cgroup_charge_statistics(mem, pc + i, false); ClearPageCgroupUsed(pc); /* -- cgit v1.2.3-70-g09d2 From b9bbfbe30ae088cc88a4b2ba7732baeebd1a0162 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:57 -0800 Subject: thp: memcg huge memory Add memcg charge/uncharge to hugepage faults in huge_memory.c. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 36 +++++++++++++++++++++++++++++++----- 1 file changed, 31 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 620891f4e54..a313403b3c5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -233,6 +233,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, VM_BUG_ON(!PageCompound(page)); pgtable = pte_alloc_one(mm, haddr); if (unlikely(!pgtable)) { + mem_cgroup_uncharge_page(page); put_page(page); return VM_FAULT_OOM; } @@ -243,6 +244,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, spin_lock(&mm->page_table_lock); if (unlikely(!pmd_none(*pmd))) { spin_unlock(&mm->page_table_lock); + mem_cgroup_uncharge_page(page); put_page(page); pte_free(mm, pgtable); } else { @@ -286,6 +288,10 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, page = alloc_hugepage(transparent_hugepage_defrag(vma)); if (unlikely(!page)) goto out; + if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { + put_page(page); + goto out; + } return __do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page); } @@ -402,9 +408,17 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, for (i = 0; i < HPAGE_PMD_NR; i++) { pages[i] = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); - if (unlikely(!pages[i])) { - while (--i >= 0) + if (unlikely(!pages[i] || + mem_cgroup_newpage_charge(pages[i], mm, + GFP_KERNEL))) { + if (pages[i]) put_page(pages[i]); + mem_cgroup_uncharge_start(); + while (--i >= 0) { + mem_cgroup_uncharge_page(pages[i]); + put_page(pages[i]); + } + mem_cgroup_uncharge_end(); kfree(pages); ret |= VM_FAULT_OOM; goto out; @@ -455,8 +469,12 @@ out: out_free_pages: spin_unlock(&mm->page_table_lock); - for (i = 0; i < HPAGE_PMD_NR; i++) + mem_cgroup_uncharge_start(); + for (i = 0; i < HPAGE_PMD_NR; i++) { + mem_cgroup_uncharge_page(pages[i]); put_page(pages[i]); + } + mem_cgroup_uncharge_end(); kfree(pages); goto out; } @@ -501,14 +519,22 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, goto out; } + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + put_page(new_page); + put_page(page); + ret |= VM_FAULT_OOM; + goto out; + } + copy_user_huge_page(new_page, page, haddr, vma, HPAGE_PMD_NR); __SetPageUptodate(new_page); 
spin_lock(&mm->page_table_lock); put_page(page); - if (unlikely(!pmd_same(*pmd, orig_pmd))) + if (unlikely(!pmd_same(*pmd, orig_pmd))) { + mem_cgroup_uncharge_page(new_page); put_page(new_page); - else { + } else { pmd_t entry; VM_BUG_ON(!PageHead(page)); entry = mk_pmd(new_page, vma->vm_page_prot); -- cgit v1.2.3-70-g09d2 From 79134171df238171daa4c024a42b77b401ccb00b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:58 -0800 Subject: thp: transparent hugepage vmstat Add hugepage stat information to /proc/vmstat and /proc/meminfo. Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/meminfo.c | 14 +++++++++++++- include/linux/mmzone.h | 1 + mm/huge_memory.c | 3 +++ mm/rmap.c | 20 ++++++++++++++++---- mm/vmstat.c | 1 + 5 files changed, 34 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index a65239cfd97..ed257d14156 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -100,6 +100,9 @@ static int meminfo_proc_show(struct seq_file *m, void *v) "VmallocChunk: %8lu kB\n" #ifdef CONFIG_MEMORY_FAILURE "HardwareCorrupted: %5lu kB\n" +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + "AnonHugePages: %8lu kB\n" #endif , K(i.totalram), @@ -128,7 +131,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v) K(i.freeswap), K(global_page_state(NR_FILE_DIRTY)), K(global_page_state(NR_WRITEBACK)), - K(global_page_state(NR_ANON_PAGES)), + K(global_page_state(NR_ANON_PAGES) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR +#endif + ), K(global_page_state(NR_FILE_MAPPED)), K(global_page_state(NR_SHMEM)), K(global_page_state(NR_SLAB_RECLAIMABLE) + @@ -150,6 +158,10 @@ static int meminfo_proc_show(struct seq_file *m, void *v) vmi.largest_chunk >> 10 #ifdef CONFIG_MEMORY_FAILURE ,atomic_long_read(&mce_bad_pages) << (PAGE_SHIFT - 10) +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) * + HPAGE_PMD_NR) #endif ); diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index dad3612a7e3..02ecb0189b1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -114,6 +114,7 @@ enum zone_stat_item { NUMA_LOCAL, /* allocation from local node */ NUMA_OTHER, /* allocation from other node */ #endif + NR_ANON_TRANSPARENT_HUGEPAGES, NR_VM_ZONE_STAT_ITEMS }; /* diff --git a/mm/huge_memory.c b/mm/huge_memory.c index a313403b3c5..7101112a542 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -751,6 +751,9 @@ static void __split_huge_page_refcount(struct page *page) lru_add_page_tail(zone, page, page_tail); } + __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); + __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); + ClearPageCompound(page); compound_unlock(page); spin_unlock_irq(&zone->lru_lock); diff --git a/mm/rmap.c b/mm/rmap.c index 92e14dcfe73..3825ae4bc32 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -882,8 +882,13 @@ void do_page_add_anon_rmap(struct page *page, struct vm_area_struct *vma, unsigned long address, int exclusive) { int first = atomic_inc_and_test(&page->_mapcount); - if (first) - __inc_zone_page_state(page, NR_ANON_PAGES); + if (first) { + if (!PageTransHuge(page)) + __inc_zone_page_state(page, NR_ANON_PAGES); + else + __inc_zone_page_state(page, + NR_ANON_TRANSPARENT_HUGEPAGES); + } if (unlikely(PageKsm(page))) return; @@ -911,7 +916,10 @@ void page_add_new_anon_rmap(struct page *page, VM_BUG_ON(address < vma->vm_start 
|| address >= vma->vm_end); SetPageSwapBacked(page); atomic_set(&page->_mapcount, 0); /* increment count (starts at -1) */ - __inc_zone_page_state(page, NR_ANON_PAGES); + if (!PageTransHuge(page)) + __inc_zone_page_state(page, NR_ANON_PAGES); + else + __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __page_set_anon_rmap(page, vma, address, 1); if (page_evictable(page, vma)) lru_cache_add_lru(page, LRU_ACTIVE_ANON); @@ -964,7 +972,11 @@ void page_remove_rmap(struct page *page) return; if (PageAnon(page)) { mem_cgroup_uncharge_page(page); - __dec_zone_page_state(page, NR_ANON_PAGES); + if (!PageTransHuge(page)) + __dec_zone_page_state(page, NR_ANON_PAGES); + else + __dec_zone_page_state(page, + NR_ANON_TRANSPARENT_HUGEPAGES); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); mem_cgroup_update_file_mapped(page, -1); diff --git a/mm/vmstat.c b/mm/vmstat.c index 751a65e00aa..0c3b5048773 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -880,6 +880,7 @@ static const char * const vmstat_text[] = { "numa_local", "numa_other", #endif + "nr_anon_transparent_hugepages", "nr_dirty_threshold", "nr_dirty_background_threshold", -- cgit v1.2.3-70-g09d2 From ba76149f47d8c939efa0acc07a191237af900471 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:58 -0800 Subject: thp: khugepaged Add khugepaged to relocate fragmented pages into hugepages if new hugepages become available. (This is independent of the defrag logic, which will have to make new hugepages available.) The fundamental reason why khugepaged is unavoidable is that some memory can be fragmented and not everything can be relocated. So when a virtual machine quits and releases gigabytes of hugepages, we want to use those freely available hugepages to create huge pmds in the other virtual machines that may be running on fragmented memory, to maximize the CPU efficiency at all times. The scan is slow: it takes nearly zero CPU time, except when it copies data (in which case it means we definitely want to pay for that CPU time), so it seems a good tradeoff. In addition to the hugepages released by other processes releasing memory, we have the strong suspicion that the performance impact of potentially defragmenting hugepages during or before each page fault could lead to more performance inconsistency than allocating small pages at first and having them collapsed into large pages later... if they prove themselves to be long-lived mappings (the khugepaged scan is slow, so short-lived mappings have a low probability of running into khugepaged, compared to long-lived mappings).
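As a rough sketch of the intended usage (assuming 2MB huge pmds as on x86-64, and hardcoding the MADV_HUGEPAGE value proposed by the madvise patch earlier in this series in case the installed headers lack it), a process only has to advise the mapping and keep using it; collapse then happens in the background:

	#include <string.h>
	#include <unistd.h>
	#include <sys/mman.h>

	#ifndef MADV_HUGEPAGE
	#define MADV_HUGEPAGE 14	/* assumed value, from the mman.h patch */
	#endif

	int main(void)
	{
		size_t len = 32UL << 21;	/* 32 pmd-sized (2MB) units */
		char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
				 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (buf == MAP_FAILED)
			return 1;
		/* registers the mm with khugepaged at the next huge pmd fault */
		if (madvise(buf, len, MADV_HUGEPAGE))
			return 1;
		/* fault in with 4k pages; khugepaged may collapse them later */
		memset(buf, 0, len);
		pause();
		return 0;
	}

With /sys/kernel/mm/transparent_hugepage/enabled set to always, the madvise() call is not needed for registration, since khugepaged_enter() then accepts any anonymous mm at fault time; with it set to madvise, only regions flagged this way are scanned.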
Signed-off-by: Andrea Arcangeli Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 1 + include/linux/khugepaged.h | 66 +++ include/linux/sched.h | 1 + kernel/fork.c | 5 + mm/huge_memory.c | 1073 +++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 1136 insertions(+), 10 deletions(-) create mode 100644 include/linux/khugepaged.h (limited to 'mm') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index d9ab70d776e..43a694ef890 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -25,6 +25,7 @@ enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_FLAG, TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG, #ifdef CONFIG_DEBUG_VM TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG, #endif diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h new file mode 100644 index 00000000000..552f3184756 --- /dev/null +++ b/include/linux/khugepaged.h @@ -0,0 +1,66 @@ +#ifndef _LINUX_KHUGEPAGED_H +#define _LINUX_KHUGEPAGED_H + +#include <linux/sched.h> /* MMF_VM_HUGEPAGE */ + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +extern int __khugepaged_enter(struct mm_struct *mm); +extern void __khugepaged_exit(struct mm_struct *mm); +extern int khugepaged_enter_vma_merge(struct vm_area_struct *vma); + +#define khugepaged_enabled() \ + (transparent_hugepage_flags & \ + ((1<<TRANSPARENT_HUGEPAGE_FLAG) | \ + (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG))) +#define khugepaged_always() \ + (transparent_hugepage_flags & \ + (1<<TRANSPARENT_HUGEPAGE_FLAG)) +#define khugepaged_req_madv() \ + (transparent_hugepage_flags & \ + (1<<TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG)) + +static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) +{ + if (test_bit(MMF_VM_HUGEPAGE, &oldmm->flags)) + return __khugepaged_enter(mm); + return 0; +} + +static inline void khugepaged_exit(struct mm_struct *mm) +{ + if (test_bit(MMF_VM_HUGEPAGE, &mm->flags)) + __khugepaged_exit(mm); +} + +static inline int khugepaged_enter(struct vm_area_struct *vma) +{ + if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) + if (khugepaged_always() || + (khugepaged_req_madv() && + vma->vm_flags & VM_HUGEPAGE)) + if (__khugepaged_enter(vma->vm_mm)) + return -ENOMEM; + return 0; +} +#else /* CONFIG_TRANSPARENT_HUGEPAGE */ +static inline int khugepaged_fork(struct mm_struct *mm, struct mm_struct *oldmm) +{ + return 0; +} +static inline void khugepaged_exit(struct mm_struct *mm) +{ +} +static inline int khugepaged_enter(struct vm_area_struct *vma) +{ + return 0; +} +static inline int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +{ + return 0; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +#endif /* _LINUX_KHUGEPAGED_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index f23b5bb6f52..d747f948b34 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -434,6 +434,7 @@ extern int get_dumpable(struct mm_struct *mm); #endif /* leave room for more dump flags */ #define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */ +#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */ #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK) diff --git a/kernel/fork.c b/kernel/fork.c index f78f50ba6cb..25e429152dd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -66,6 +66,7 @@ #include #include #include +#include <linux/khugepaged.h> #include #include @@ -328,6 +329,9 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) rb_parent = NULL; pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); + if (retval) + goto out; + retval = khugepaged_fork(mm, oldmm); if (retval) goto out; @@ -546,6 +550,7 @@ void mmput(struct mm_struct *mm) if (atomic_dec_and_test(&mm->mm_users)) { exit_aio(mm); ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ exit_mmap(mm); set_mm_exe_file(mm, NULL); if (!list_empty(&mm->mmlist)) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7101112a542..ae2bf08b109 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -12,14 +12,111 @@ #include #include #include +#include +#include +#include #include #include #include "internal.h" +/* + * By default transparent hugepage support is enabled for all mappings + * and khugepaged scans all mappings. Defrag is only invoked by + * khugepaged hugepage allocations and by page faults inside + * MADV_HUGEPAGE regions to avoid the risk of slowing down short lived + * allocations. + */ unsigned long transparent_hugepage_flags __read_mostly = - (1< 0) { + int err = start_khugepaged(); + if (err) + ret = err; + } + + return ret; } static struct kobj_attribute enabled_attr = __ATTR(enabled, 0644, enabled_show, enabled_store); @@ -153,20 +260,212 @@ static struct attribute *hugepage_attr[] = { static struct attribute_group hugepage_attr_group = { .attrs = hugepage_attr, - .name = "transparent_hugepage", +}; + +static ssize_t scan_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs); +} + +static ssize_t scan_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_scan_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute scan_sleep_millisecs_attr = + __ATTR(scan_sleep_millisecs, 0644, scan_sleep_millisecs_show, + scan_sleep_millisecs_store); + +static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs); +} + +static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long msecs; + int err; + + err = strict_strtoul(buf, 10, &msecs); + if (err || msecs > UINT_MAX) + return -EINVAL; + + khugepaged_alloc_sleep_millisecs = msecs; + wake_up_interruptible(&khugepaged_wait); + + return count; +} +static struct kobj_attribute alloc_sleep_millisecs_attr = + __ATTR(alloc_sleep_millisecs, 0644, alloc_sleep_millisecs_show, + alloc_sleep_millisecs_store); + +static ssize_t pages_to_scan_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_to_scan); +} +static ssize_t pages_to_scan_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long pages; + + err = strict_strtoul(buf, 10, &pages); + if (err || !pages || pages > UINT_MAX) + return -EINVAL; + + khugepaged_pages_to_scan = pages; + + return count; +} +static struct kobj_attribute pages_to_scan_attr = + __ATTR(pages_to_scan, 0644, pages_to_scan_show, + pages_to_scan_store); + +static ssize_t pages_collapsed_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_pages_collapsed); +} +static struct kobj_attribute pages_collapsed_attr = + __ATTR_RO(pages_collapsed); + +static ssize_t full_scans_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_full_scans); +} +static struct kobj_attribute full_scans_attr = + __ATTR_RO(full_scans); + +static ssize_t khugepaged_defrag_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return 
single_flag_show(kobj, attr, buf, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static ssize_t khugepaged_defrag_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + return single_flag_store(kobj, attr, buf, count, + TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG); +} +static struct kobj_attribute khugepaged_defrag_attr = + __ATTR(defrag, 0644, khugepaged_defrag_show, + khugepaged_defrag_store); + +/* + * max_ptes_none controls if khugepaged should collapse hugepages over + * any unmapped ptes in turn potentially increasing the memory + * footprint of the vmas. When max_ptes_none is 0 khugepaged will not + * reduce the available free memory in the system as it + * runs. Increasing max_ptes_none will instead potentially reduce the + * free memory in the system during the khugepaged scan. + */ +static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%u\n", khugepaged_max_ptes_none); +} +static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + int err; + unsigned long max_ptes_none; + + err = strict_strtoul(buf, 10, &max_ptes_none); + if (err || max_ptes_none > HPAGE_PMD_NR-1) + return -EINVAL; + + khugepaged_max_ptes_none = max_ptes_none; + + return count; +} +static struct kobj_attribute khugepaged_max_ptes_none_attr = + __ATTR(max_ptes_none, 0644, khugepaged_max_ptes_none_show, + khugepaged_max_ptes_none_store); + +static struct attribute *khugepaged_attr[] = { + &khugepaged_defrag_attr.attr, + &khugepaged_max_ptes_none_attr.attr, + &pages_to_scan_attr.attr, + &pages_collapsed_attr.attr, + &full_scans_attr.attr, + &scan_sleep_millisecs_attr.attr, + &alloc_sleep_millisecs_attr.attr, + NULL, +}; + +static struct attribute_group khugepaged_attr_group = { + .attrs = khugepaged_attr, + .name = "khugepaged", }; #endif /* CONFIG_SYSFS */ static int __init hugepage_init(void) { -#ifdef CONFIG_SYSFS int err; +#ifdef CONFIG_SYSFS + static struct kobject *hugepage_kobj; - err = sysfs_create_group(mm_kobj, &hugepage_attr_group); - if (err) - printk(KERN_ERR "hugepage: register sysfs failed\n"); + err = -ENOMEM; + hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); + if (unlikely(!hugepage_kobj)) { + printk(KERN_ERR "hugepage: failed kobject create\n"); + goto out; + } + + err = sysfs_create_group(hugepage_kobj, &hugepage_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto out; + } + + err = sysfs_create_group(hugepage_kobj, &khugepaged_attr_group); + if (err) { + printk(KERN_ERR "hugepage: failed register hugeage group\n"); + goto out; + } #endif - return 0; + + err = khugepaged_slab_init(); + if (err) + goto out; + + err = mm_slots_hash_init(); + if (err) { + khugepaged_slab_free(); + goto out; + } + + start_khugepaged(); + +out: + return err; } module_init(hugepage_init) @@ -285,6 +584,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (haddr >= vma->vm_start && haddr + HPAGE_PMD_SIZE <= vma->vm_end) { if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; + if (unlikely(khugepaged_enter(vma))) + return VM_FAULT_OOM; page = alloc_hugepage(transparent_hugepage_defrag(vma)); if (unlikely(!page)) goto out; @@ -941,6 +1242,758 @@ int hugepage_madvise(unsigned long *vm_flags) return 0; } +static int __init khugepaged_slab_init(void) +{ + mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", + 
sizeof(struct mm_slot), + __alignof__(struct mm_slot), 0, NULL); + if (!mm_slot_cache) + return -ENOMEM; + + return 0; +} + +static void __init khugepaged_slab_free(void) +{ + kmem_cache_destroy(mm_slot_cache); + mm_slot_cache = NULL; +} + +static inline struct mm_slot *alloc_mm_slot(void) +{ + if (!mm_slot_cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); +} + +static inline void free_mm_slot(struct mm_slot *mm_slot) +{ + kmem_cache_free(mm_slot_cache, mm_slot); +} + +static int __init mm_slots_hash_init(void) +{ + mm_slots_hash = kzalloc(MM_SLOTS_HASH_HEADS * sizeof(struct hlist_head), + GFP_KERNEL); + if (!mm_slots_hash) + return -ENOMEM; + return 0; +} + +#if 0 +static void __init mm_slots_hash_free(void) +{ + kfree(mm_slots_hash); + mm_slots_hash = NULL; +} +#endif + +static struct mm_slot *get_mm_slot(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + struct hlist_head *bucket; + struct hlist_node *node; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + hlist_for_each_entry(mm_slot, node, bucket, hash) { + if (mm == mm_slot->mm) + return mm_slot; + } + return NULL; +} + +static void insert_to_mm_slots_hash(struct mm_struct *mm, + struct mm_slot *mm_slot) +{ + struct hlist_head *bucket; + + bucket = &mm_slots_hash[((unsigned long)mm / sizeof(struct mm_struct)) + % MM_SLOTS_HASH_HEADS]; + mm_slot->mm = mm; + hlist_add_head(&mm_slot->hash, bucket); +} + +static inline int khugepaged_test_exit(struct mm_struct *mm) +{ + return atomic_read(&mm->mm_users) == 0; +} + +int __khugepaged_enter(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int wakeup; + + mm_slot = alloc_mm_slot(); + if (!mm_slot) + return -ENOMEM; + + /* __khugepaged_exit() must not run from under us */ + VM_BUG_ON(khugepaged_test_exit(mm)); + if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { + free_mm_slot(mm_slot); + return 0; + } + + spin_lock(&khugepaged_mm_lock); + insert_to_mm_slots_hash(mm, mm_slot); + /* + * Insert just behind the scanning cursor, to let the area settle + * down a little. + */ + wakeup = list_empty(&khugepaged_scan.mm_head); + list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); + spin_unlock(&khugepaged_mm_lock); + + atomic_inc(&mm->mm_count); + if (wakeup) + wake_up_interruptible(&khugepaged_wait); + + return 0; +} + +int khugepaged_enter_vma_merge(struct vm_area_struct *vma) +{ + unsigned long hstart, hend; + if (!vma->anon_vma) + /* + * Not yet faulted in so we will register later in the + * page fault if needed. 
+ */ + return 0; + if (vma->vm_file || vma->vm_ops) + /* khugepaged not yet working on file or special mappings */ + return 0; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart < hend) + return khugepaged_enter(vma); + return 0; +} + +void __khugepaged_exit(struct mm_struct *mm) +{ + struct mm_slot *mm_slot; + int free = 0; + + spin_lock(&khugepaged_mm_lock); + mm_slot = get_mm_slot(mm); + if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { + hlist_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + free = 1; + } + + if (free) { + spin_unlock(&khugepaged_mm_lock); + clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + free_mm_slot(mm_slot); + mmdrop(mm); + } else if (mm_slot) { + spin_unlock(&khugepaged_mm_lock); + /* + * This is required to serialize against + * khugepaged_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we + * return all pagetables will be destroyed) until + * khugepaged has finished working on the pagetables + * under the mmap_sem. + */ + down_write(&mm->mmap_sem); + up_write(&mm->mmap_sem); + } else + spin_unlock(&khugepaged_mm_lock); +} + +static void release_pte_page(struct page *page) +{ + /* 0 stands for page_is_file_cache(page) == false */ + dec_zone_page_state(page, NR_ISOLATED_ANON + 0); + unlock_page(page); + putback_lru_page(page); +} + +static void release_pte_pages(pte_t *pte, pte_t *_pte) +{ + while (--_pte >= pte) { + pte_t pteval = *_pte; + if (!pte_none(pteval)) + release_pte_page(pte_page(pteval)); + } +} + +static void release_all_pte_pages(pte_t *pte) +{ + release_pte_pages(pte, pte + HPAGE_PMD_NR); +} + +static int __collapse_huge_page_isolate(struct vm_area_struct *vma, + unsigned long address, + pte_t *pte) +{ + struct page *page; + pte_t *_pte; + int referenced = 0, isolated = 0, none = 0; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval)) { + if (++none <= khugepaged_max_ptes_none) + continue; + else { + release_pte_pages(pte, _pte); + goto out; + } + } + if (!pte_present(pteval) || !pte_write(pteval)) { + release_pte_pages(pte, _pte); + goto out; + } + page = vm_normal_page(vma, address, pteval); + if (unlikely(!page)) { + release_pte_pages(pte, _pte); + goto out; + } + VM_BUG_ON(PageCompound(page)); + BUG_ON(!PageAnon(page)); + VM_BUG_ON(!PageSwapBacked(page)); + + /* cannot use mapcount: can't collapse if there's a gup pin */ + if (page_count(page) != 1) { + release_pte_pages(pte, _pte); + goto out; + } + /* + * We can do it before isolate_lru_page because the + * page can't be freed from under us. NOTE: PG_lock + * is needed to serialize against split_huge_page + * when invoked from the VM. + */ + if (!trylock_page(page)) { + release_pte_pages(pte, _pte); + goto out; + } + /* + * Isolate the page to avoid collapsing an hugepage + * currently in use by the VM. 
+ */ + if (isolate_lru_page(page)) { + unlock_page(page); + release_pte_pages(pte, _pte); + goto out; + } + /* 0 stands for page_is_file_cache(page) == false */ + inc_zone_page_state(page, NR_ISOLATED_ANON + 0); + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageLRU(page)); + + /* If there is no mapped pte young don't collapse the page */ + if (pte_young(pteval)) + referenced = 1; + } + if (unlikely(!referenced)) + release_all_pte_pages(pte); + else + isolated = 1; +out: + return isolated; +} + +static void __collapse_huge_page_copy(pte_t *pte, struct page *page, + struct vm_area_struct *vma, + unsigned long address, + spinlock_t *ptl) +{ + pte_t *_pte; + for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++) { + pte_t pteval = *_pte; + struct page *src_page; + + if (pte_none(pteval)) { + clear_user_highpage(page, address); + add_mm_counter(vma->vm_mm, MM_ANONPAGES, 1); + } else { + src_page = pte_page(pteval); + copy_user_highpage(page, src_page, address, vma); + VM_BUG_ON(page_mapcount(src_page) != 1); + VM_BUG_ON(page_count(src_page) != 2); + release_pte_page(src_page); + /* + * ptl mostly unnecessary, but preempt has to + * be disabled to update the per-cpu stats + * inside page_remove_rmap(). + */ + spin_lock(ptl); + /* + * paravirt calls inside pte_clear here are + * superfluous. + */ + pte_clear(vma->vm_mm, address, _pte); + page_remove_rmap(src_page); + spin_unlock(ptl); + free_page_and_swap_cache(src_page); + } + + address += PAGE_SIZE; + page++; + } +} + +static void collapse_huge_page(struct mm_struct *mm, + unsigned long address, + struct page **hpage) +{ + struct vm_area_struct *vma; + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd, _pmd; + pte_t *pte; + pgtable_t pgtable; + struct page *new_page; + spinlock_t *ptl; + int isolated; + unsigned long hstart, hend; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + VM_BUG_ON(!*hpage); + + /* + * Prevent all access to pagetables with the exception of + * gup_fast later hanlded by the ptep_clear_flush and the VM + * handled by the anon_vma lock + PG_lock. + */ + down_write(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + goto out; + + vma = find_vma(mm, address); + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (address < hstart || address + HPAGE_PMD_SIZE > hend) + goto out; + + if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) + goto out; + + /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + goto out; + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + /* pmd can't go away or become huge under us */ + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + goto out; + + new_page = *hpage; + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) + goto out; + + anon_vma_lock(vma->anon_vma); + + pte = pte_offset_map(pmd, address); + ptl = pte_lockptr(mm, pmd); + + spin_lock(&mm->page_table_lock); /* probably unnecessary */ + /* + * After this gup_fast can't run anymore. This also removes + * any huge TLB entry from the CPU so we won't allow + * huge and small TLB entries for the same virtual address + * to avoid the risk of CPU bugs in that area. 
+ */ + _pmd = pmdp_clear_flush_notify(vma, address, pmd); + spin_unlock(&mm->page_table_lock); + + spin_lock(ptl); + isolated = __collapse_huge_page_isolate(vma, address, pte); + spin_unlock(ptl); + pte_unmap(pte); + + if (unlikely(!isolated)) { + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + set_pmd_at(mm, address, pmd, _pmd); + spin_unlock(&mm->page_table_lock); + anon_vma_unlock(vma->anon_vma); + mem_cgroup_uncharge_page(new_page); + goto out; + } + + /* + * All pages are isolated and locked so anon_vma rmap + * can't run anymore. + */ + anon_vma_unlock(vma->anon_vma); + + __collapse_huge_page_copy(pte, new_page, vma, address, ptl); + __SetPageUptodate(new_page); + pgtable = pmd_pgtable(_pmd); + VM_BUG_ON(page_count(pgtable) != 1); + VM_BUG_ON(page_mapcount(pgtable) != 0); + + _pmd = mk_pmd(new_page, vma->vm_page_prot); + _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); + _pmd = pmd_mkhuge(_pmd); + + /* + * spin_lock() below is not the equivalent of smp_wmb(), so + * this is needed to avoid the copy_huge_page writes to become + * visible after the set_pmd_at() write. + */ + smp_wmb(); + + spin_lock(&mm->page_table_lock); + BUG_ON(!pmd_none(*pmd)); + page_add_new_anon_rmap(new_page, vma, address); + set_pmd_at(mm, address, pmd, _pmd); + update_mmu_cache(vma, address, entry); + prepare_pmd_huge_pte(pgtable, mm); + mm->nr_ptes--; + spin_unlock(&mm->page_table_lock); + + *hpage = NULL; + khugepaged_pages_collapsed++; +out: + up_write(&mm->mmap_sem); +} + +static int khugepaged_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + struct page **hpage) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + pte_t *pte, *_pte; + int ret = 0, referenced = 0, none = 0; + struct page *page; + unsigned long _address; + spinlock_t *ptl; + + VM_BUG_ON(address & ~HPAGE_PMD_MASK); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + goto out; + + pte = pte_offset_map_lock(mm, pmd, address, &ptl); + for (_address = address, _pte = pte; _pte < pte+HPAGE_PMD_NR; + _pte++, _address += PAGE_SIZE) { + pte_t pteval = *_pte; + if (pte_none(pteval)) { + if (++none <= khugepaged_max_ptes_none) + continue; + else + goto out_unmap; + } + if (!pte_present(pteval) || !pte_write(pteval)) + goto out_unmap; + page = vm_normal_page(vma, _address, pteval); + if (unlikely(!page)) + goto out_unmap; + VM_BUG_ON(PageCompound(page)); + if (!PageLRU(page) || PageLocked(page) || !PageAnon(page)) + goto out_unmap; + /* cannot use mapcount: can't collapse if there's a gup pin */ + if (page_count(page) != 1) + goto out_unmap; + if (pte_young(pteval)) + referenced = 1; + } + if (referenced) + ret = 1; +out_unmap: + pte_unmap_unlock(pte, ptl); + if (ret) { + up_read(&mm->mmap_sem); + collapse_huge_page(mm, address, hpage); + } +out: + return ret; +} + +static void collect_mm_slot(struct mm_slot *mm_slot) +{ + struct mm_struct *mm = mm_slot->mm; + + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_test_exit(mm)) { + /* free mm_slot */ + hlist_del(&mm_slot->hash); + list_del(&mm_slot->mm_node); + + /* + * Not strictly needed because the mm exited already. 
+ * + * clear_bit(MMF_VM_HUGEPAGE, &mm->flags); + */ + + /* khugepaged_mm_lock actually not necessary for the below */ + free_mm_slot(mm_slot); + mmdrop(mm); + } +} + +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, + struct page **hpage) +{ + struct mm_slot *mm_slot; + struct mm_struct *mm; + struct vm_area_struct *vma; + int progress = 0; + + VM_BUG_ON(!pages); + VM_BUG_ON(!spin_is_locked(&khugepaged_mm_lock)); + + if (khugepaged_scan.mm_slot) + mm_slot = khugepaged_scan.mm_slot; + else { + mm_slot = list_entry(khugepaged_scan.mm_head.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + khugepaged_scan.mm_slot = mm_slot; + } + spin_unlock(&khugepaged_mm_lock); + + mm = mm_slot->mm; + down_read(&mm->mmap_sem); + if (unlikely(khugepaged_test_exit(mm))) + vma = NULL; + else + vma = find_vma(mm, khugepaged_scan.address); + + progress++; + for (; vma; vma = vma->vm_next) { + unsigned long hstart, hend; + + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) { + progress++; + break; + } + + if (!(vma->vm_flags & VM_HUGEPAGE) && + !khugepaged_always()) { + progress++; + continue; + } + + /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) { + khugepaged_scan.address = vma->vm_end; + progress++; + continue; + } + VM_BUG_ON(is_linear_pfn_mapping(vma) || is_pfn_mapping(vma)); + + hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = vma->vm_end & HPAGE_PMD_MASK; + if (hstart >= hend) { + progress++; + continue; + } + if (khugepaged_scan.address < hstart) + khugepaged_scan.address = hstart; + if (khugepaged_scan.address > hend) { + khugepaged_scan.address = hend + HPAGE_PMD_SIZE; + progress++; + continue; + } + BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); + + while (khugepaged_scan.address < hend) { + int ret; + cond_resched(); + if (unlikely(khugepaged_test_exit(mm))) + goto breakouterloop; + + VM_BUG_ON(khugepaged_scan.address < hstart || + khugepaged_scan.address + HPAGE_PMD_SIZE > + hend); + ret = khugepaged_scan_pmd(mm, vma, + khugepaged_scan.address, + hpage); + /* move to next address */ + khugepaged_scan.address += HPAGE_PMD_SIZE; + progress += HPAGE_PMD_NR; + if (ret) + /* we released mmap_sem so break loop */ + goto breakouterloop_mmap_sem; + if (progress >= pages) + goto breakouterloop; + } + } +breakouterloop: + up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */ +breakouterloop_mmap_sem: + + spin_lock(&khugepaged_mm_lock); + BUG_ON(khugepaged_scan.mm_slot != mm_slot); + /* + * Release the current mm_slot if this mm is about to die, or + * if we scanned all vmas of this mm. + */ + if (khugepaged_test_exit(mm) || !vma) { + /* + * Make sure that if mm_users is reaching zero while + * khugepaged runs here, khugepaged_exit will find + * mm_slot not pointing to the exiting mm. 
+ */ + if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { + khugepaged_scan.mm_slot = list_entry( + mm_slot->mm_node.next, + struct mm_slot, mm_node); + khugepaged_scan.address = 0; + } else { + khugepaged_scan.mm_slot = NULL; + khugepaged_full_scans++; + } + + collect_mm_slot(mm_slot); + } + + return progress; +} + +static int khugepaged_has_work(void) +{ + return !list_empty(&khugepaged_scan.mm_head) && + khugepaged_enabled(); +} + +static int khugepaged_wait_event(void) +{ + return !list_empty(&khugepaged_scan.mm_head) || + !khugepaged_enabled(); +} + +static void khugepaged_do_scan(struct page **hpage) +{ + unsigned int progress = 0, pass_through_head = 0; + unsigned int pages = khugepaged_pages_to_scan; + + barrier(); /* write khugepaged_pages_to_scan to local stack */ + + while (progress < pages) { + cond_resched(); + + if (!*hpage) { + *hpage = alloc_hugepage(khugepaged_defrag()); + if (unlikely(!*hpage)) + break; + } + + spin_lock(&khugepaged_mm_lock); + if (!khugepaged_scan.mm_slot) + pass_through_head++; + if (khugepaged_has_work() && + pass_through_head < 2) + progress += khugepaged_scan_mm_slot(pages - progress, + hpage); + else + progress = pages; + spin_unlock(&khugepaged_mm_lock); + } +} + +static struct page *khugepaged_alloc_hugepage(void) +{ + struct page *hpage; + + do { + hpage = alloc_hugepage(khugepaged_defrag()); + if (!hpage) { + DEFINE_WAIT(wait); + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); + } + } while (unlikely(!hpage) && + likely(khugepaged_enabled())); + return hpage; +} + +static void khugepaged_loop(void) +{ + struct page *hpage; + + while (likely(khugepaged_enabled())) { + hpage = khugepaged_alloc_hugepage(); + if (unlikely(!hpage)) + break; + + khugepaged_do_scan(&hpage); + if (hpage) + put_page(hpage); + if (khugepaged_has_work()) { + DEFINE_WAIT(wait); + if (!khugepaged_scan_sleep_millisecs) + continue; + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_scan_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); + } else if (khugepaged_enabled()) + wait_event_interruptible(khugepaged_wait, + khugepaged_wait_event()); + } +} + +static int khugepaged(void *none) +{ + struct mm_slot *mm_slot; + + set_user_nice(current, 19); + + /* serialize with start_khugepaged() */ + mutex_lock(&khugepaged_mutex); + + for (;;) { + mutex_unlock(&khugepaged_mutex); + BUG_ON(khugepaged_thread != current); + khugepaged_loop(); + BUG_ON(khugepaged_thread != current); + + mutex_lock(&khugepaged_mutex); + if (!khugepaged_enabled()) + break; + } + + spin_lock(&khugepaged_mm_lock); + mm_slot = khugepaged_scan.mm_slot; + khugepaged_scan.mm_slot = NULL; + if (mm_slot) + collect_mm_slot(mm_slot); + spin_unlock(&khugepaged_mm_lock); + + khugepaged_thread = NULL; + mutex_unlock(&khugepaged_mutex); + + return 0; +} + void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) { struct page *page; -- cgit v1.2.3-70-g09d2 From b15d00b6af617251cc70a908df983e9aff57e169 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:46:59 -0800 Subject: thp: khugepaged vma merge register in khugepaged if the vma grows. 
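Whether a grown or merged vma is worth registering boils down to the hstart/hend check in khugepaged_enter_vma_merge(): the vma must span at least one naturally aligned HPAGE_PMD_SIZE range. A standalone mock of just that test follows (the constants assume 2MB huge pmds; the kernel takes them from huge_mm.h):

	#include <stdio.h>

	#define HPAGE_PMD_SHIFT	21	/* assumed: 2MB pmds as on x86-64 */
	#define HPAGE_PMD_SIZE	(1UL << HPAGE_PMD_SHIFT)
	#define HPAGE_PMD_MASK	(~(HPAGE_PMD_SIZE - 1))

	/* mirrors the rounding applied to vma->vm_start and vma->vm_end */
	static int spans_aligned_hugepage(unsigned long vm_start,
					  unsigned long vm_end)
	{
		/* round the start up and the end down to pmd boundaries */
		unsigned long hstart = (vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK;
		unsigned long hend = vm_end & HPAGE_PMD_MASK;

		return hstart < hend;
	}

	int main(void)
	{
		/* 4MB vma on a 2MB boundary: room for two huge pmds -> 1 */
		printf("%d\n", spans_aligned_hugepage(0x200000, 0x600000));
		/* 1MB vma: can never contain an aligned huge pmd -> 0 */
		printf("%d\n", spans_aligned_hugepage(0x200000, 0x300000));
		return 0;
	}

Rounding the start up and the end down means a vma smaller than HPAGE_PMD_SIZE, or one that straddles a boundary without covering a whole aligned unit, is never registered.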
Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 50a4aa0255a..753f44d1704 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -815,6 +816,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, end, prev->vm_pgoff, NULL); if (err) return NULL; + khugepaged_enter_vma_merge(prev); return prev; } @@ -833,6 +835,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, next->vm_pgoff - pglen, NULL); if (err) return NULL; + khugepaged_enter_vma_merge(area); return area; } @@ -1761,6 +1764,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } } vma_unlock_anon_vma(vma); + khugepaged_enter_vma_merge(vma); return error; } #endif /* CONFIG_STACK_GROWSUP || CONFIG_IA64 */ @@ -1808,6 +1812,7 @@ static int expand_downwards(struct vm_area_struct *vma, } } vma_unlock_anon_vma(vma); + khugepaged_enter_vma_merge(vma); return error; } -- cgit v1.2.3-70-g09d2 From 21ae5b01750f14140809508a478a4413792e0261 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:00 -0800 Subject: thp: skip transhuge pages in ksm for now Skip transhuge pages in ksm for now. Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index b5b907cb0f9..5e7d5d35ea8 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -430,7 +430,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) page = follow_page(vma, addr, FOLL_GET); if (IS_ERR_OR_NULL(page)) goto out; - if (PageAnon(page)) { + if (PageAnon(page) && !PageTransCompound(page)) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } else { @@ -1279,7 +1279,19 @@ next_mm: if (ksm_test_exit(mm)) break; *page = follow_page(vma, ksm_scan.address, FOLL_GET); - if (!IS_ERR_OR_NULL(*page) && PageAnon(*page)) { + if (IS_ERR_OR_NULL(*page)) { + ksm_scan.address += PAGE_SIZE; + cond_resched(); + continue; + } + if (PageTransCompound(*page)) { + put_page(*page); + ksm_scan.address &= HPAGE_PMD_MASK; + ksm_scan.address += HPAGE_PMD_SIZE; + cond_resched(); + continue; + } + if (PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); rmap_item = get_next_rmap_item(slot, @@ -1293,8 +1305,7 @@ next_mm: up_read(&mm->mmap_sem); return rmap_item; } - if (!IS_ERR_OR_NULL(*page)) - put_page(*page); + put_page(*page); ksm_scan.address += PAGE_SIZE; cond_resched(); } -- cgit v1.2.3-70-g09d2 From 5f24ce5fd34c3ca1b3d10d30da754732da64d5c0 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:00 -0800 Subject: thp: remove PG_buddy PG_buddy can be converted to _mapcount == -2. So the PG_compound_lock can be added to page->flags without overflowing (because of the sparse section bits increasing) with CONFIG_X86_PAE=y and CONFIG_X86_PAT=y. This also has to move the memory hotplug code from _mapcount to lru.next to avoid any risk of clashes. We can't use lru.next for PG_buddy removal, but memory hotplug can use lru.next even more easily than the mapcount instead. 
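To make the new encoding concrete, here is a minimal userspace mock of the PageBuddy() helpers this patch adds to include/linux/mm.h (a plain int stands in for the atomic_t _mapcount, and assert() for VM_BUG_ON; this is a sketch of the invariant, not kernel code):

	#include <assert.h>

	struct mock_page {
		int _mapcount;	/* -1 = free/unmapped, >= 0 = mapped, -2 = buddy */
	};

	static int PageBuddy(struct mock_page *page)
	{
		return page->_mapcount == -2;
	}

	static void __SetPageBuddy(struct mock_page *page)
	{
		assert(page->_mapcount == -1);	/* must not be mapped */
		page->_mapcount = -2;
	}

	static void __ClearPageBuddy(struct mock_page *page)
	{
		assert(PageBuddy(page));
		page->_mapcount = -1;
	}

	int main(void)
	{
		struct mock_page page = { ._mapcount = -1 };

		__SetPageBuddy(&page);		/* page enters the buddy lists */
		assert(PageBuddy(&page));
		__ClearPageBuddy(&page);	/* page is allocated again */
		assert(!PageBuddy(&page));
		return 0;
	}

The point of the -2 sentinel is that it lives in a field every struct page already has, which is what frees the PG_buddy bit for PG_compound_lock.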
Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 14 ++++++++------ include/linux/memory_hotplug.h | 14 +++++++++----- include/linux/mm.h | 21 +++++++++++++++++++++ include/linux/page-flags.h | 7 +------ mm/memory_hotplug.c | 14 ++++++++------ mm/page_alloc.c | 7 +++---- mm/sparse.c | 4 ++-- 7 files changed, 52 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/fs/proc/page.c b/fs/proc/page.c index b06c674624e..6d8e6a9e93a 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -116,15 +116,17 @@ u64 stable_page_flags(struct page *page) if (PageHuge(page)) u |= 1 << KPF_HUGE; - u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); - /* - * Caveats on high order pages: - * PG_buddy will only be set on the head page; SLUB/SLQB do the same - * for PG_slab; SLOB won't set PG_slab at all on compound pages. + * Caveats on high order pages: page->_count will only be set + * -1 on the head page; SLUB/SLQB do the same for PG_slab; + * SLOB won't set PG_slab at all on compound pages. */ + if (PageBuddy(page)) + u |= 1 << KPF_BUDDY; + + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); + u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); - u |= kpf_copy_bit(k, KPF_BUDDY, PG_buddy); u |= kpf_copy_bit(k, KPF_ERROR, PG_error); u |= kpf_copy_bit(k, KPF_DIRTY, PG_dirty); diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 31c237a00c4..24376fe7ee6 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -13,12 +13,16 @@ struct mem_section; #ifdef CONFIG_MEMORY_HOTPLUG /* - * Types for free bootmem. - * The normal smallest mapcount is -1. Here is smaller value than it. + * Types for free bootmem stored in page->lru.next. These have to be in + * some random range in unsigned long space for debugging purposes. */ -#define SECTION_INFO (-1 - 1) -#define MIX_SECTION_INFO (-1 - 2) -#define NODE_INFO (-1 - 3) +enum { + MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE = 12, + SECTION_INFO = MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE, + MIX_SECTION_INFO, + NODE_INFO, + MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE = NODE_INFO, +}; /* * pgdat resizing functions diff --git a/include/linux/mm.h b/include/linux/mm.h index 2ec5138bada..7ab7d2b6004 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -397,6 +397,27 @@ static inline void init_page_count(struct page *page) atomic_set(&page->_count, 1); } +/* + * PageBuddy() indicate that the page is free and in the buddy system + * (see mm/page_alloc.c). + */ +static inline int PageBuddy(struct page *page) +{ + return atomic_read(&page->_mapcount) == -2; +} + +static inline void __SetPageBuddy(struct page *page) +{ + VM_BUG_ON(atomic_read(&page->_mapcount) != -1); + atomic_set(&page->_mapcount, -2); +} + +static inline void __ClearPageBuddy(struct page *page) +{ + VM_BUG_ON(!PageBuddy(page)); + atomic_set(&page->_mapcount, -1); +} + void put_page(struct page *page); void put_pages_list(struct list_head *pages); diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 4ca1241ef94..0db8037e272 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -48,9 +48,6 @@ * struct page (these bits with information) are always mapped into kernel * address space... * - * PG_buddy is set to indicate that the page is free and in the buddy system - * (see mm/page_alloc.c). - * * PG_hwpoison indicates that a page got corrupted in hardware and contains * data with incorrect ECC bits that triggered a machine check. Accessing is * not safe since it may cause another machine check. 
Don't touch! @@ -96,7 +93,6 @@ enum pageflags { PG_swapcache, /* Swap page: swp_entry_t in private */ PG_mappedtodisk, /* Has blocks allocated on-disk */ PG_reclaim, /* To be reclaimed asap */ - PG_buddy, /* Page is free, on buddy lists */ PG_swapbacked, /* Page is backed by RAM/swap */ PG_unevictable, /* Page is "unevictable" */ #ifdef CONFIG_MMU @@ -233,7 +229,6 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1) * risky: they bypass page accounting. */ TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback) -__PAGEFLAG(Buddy, buddy) PAGEFLAG(MappedToDisk, mappedtodisk) /* PG_readahead is only used for file reads; PG_reclaim is only for writes */ @@ -461,7 +456,7 @@ static inline int PageTransCompound(struct page *page) #define PAGE_FLAGS_CHECK_AT_FREE \ (1 << PG_lru | 1 << PG_locked | \ 1 << PG_private | 1 << PG_private_2 | \ - 1 << PG_buddy | 1 << PG_writeback | 1 << PG_reserved | \ + 1 << PG_writeback | 1 << PG_reserved | \ 1 << PG_slab | 1 << PG_swapcache | 1 << PG_active | \ 1 << PG_unevictable | __PG_MLOCKED | __PG_HWPOISON | \ __PG_COMPOUND_LOCK) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index a2832c09250..e92f04749fc 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -82,9 +82,10 @@ static void release_memory_resource(struct resource *res) #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE #ifndef CONFIG_SPARSEMEM_VMEMMAP -static void get_page_bootmem(unsigned long info, struct page *page, int type) +static void get_page_bootmem(unsigned long info, struct page *page, + unsigned long type) { - atomic_set(&page->_mapcount, type); + page->lru.next = (struct list_head *) type; SetPagePrivate(page); set_page_private(page, info); atomic_inc(&page->_count); @@ -94,15 +95,16 @@ static void get_page_bootmem(unsigned long info, struct page *page, int type) * so use __ref to tell modpost not to generate a warning */ void __ref put_page_bootmem(struct page *page) { - int type; + unsigned long type; - type = atomic_read(&page->_mapcount); - BUG_ON(type >= -1); + type = (unsigned long) page->lru.next; + BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || + type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE); if (atomic_dec_return(&page->_count) == 1) { ClearPagePrivate(page); set_page_private(page, 0); - reset_page_mapcount(page); + INIT_LIST_HEAD(&page->lru); __free_pages_bootmem(page, 0); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index e7664b9f706..9dfe49bceff 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -449,8 +449,8 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * - * For recording whether a page is in the buddy system, we use PG_buddy. - * Setting, clearing, and testing PG_buddy is serialized by zone->lock. + * For recording whether a page is in the buddy system, we set ->_mapcount -2. + * Setting, clearing, and testing _mapcount -2 is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ @@ -483,7 +483,7 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with PG_buddy. Page's + * free pages of length of (1 << order) and marked with _mapcount -2. Page's * order is recorded in page_private(page) field. 
* So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were @@ -5574,7 +5574,6 @@ static struct trace_print_flags pageflag_names[] = { {1UL << PG_swapcache, "swapcache" }, {1UL << PG_mappedtodisk, "mappedtodisk" }, {1UL << PG_reclaim, "reclaim" }, - {1UL << PG_buddy, "buddy" }, {1UL << PG_swapbacked, "swapbacked" }, {1UL << PG_unevictable, "unevictable" }, #ifdef CONFIG_MMU diff --git a/mm/sparse.c b/mm/sparse.c index 95ac219af37..93250207c5c 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -671,10 +671,10 @@ static void __kfree_section_memmap(struct page *memmap, unsigned long nr_pages) static void free_map_bootmem(struct page *page, unsigned long nr_pages) { unsigned long maps_section_nr, removing_section_nr, i; - int magic; + unsigned long magic; for (i = 0; i < nr_pages; i++, page++) { - magic = atomic_read(&page->_mapcount); + magic = (unsigned long) page->lru.next; BUG_ON(magic == NODE_INFO); -- cgit v1.2.3-70-g09d2 From f2d6bfe9ff0acec30b713614260e78b03d20e909 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:01 -0800 Subject: thp: add x86 32bit support Add support for transparent hugepages to x86 32bit. Share the same VM_ bitflag for VM_MAPPED_COPY. mm/nommu.c will never support transparent hugepages. Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable-2level.h | 9 +++ arch/x86/include/asm/pgtable-3level.h | 23 +++++++ arch/x86/include/asm/pgtable.h | 117 ++++++++++++++++++++++++++++++++++ arch/x86/include/asm/pgtable_64.h | 109 ------------------------------- arch/x86/mm/pgtable.c | 4 +- include/linux/mm.h | 7 +- mm/Kconfig | 2 +- 7 files changed, 156 insertions(+), 115 deletions(-) (limited to 'mm') diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h index 2334982b339..98391db840c 100644 --- a/arch/x86/include/asm/pgtable-2level.h +++ b/arch/x86/include/asm/pgtable-2level.h @@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp) #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp) +{ + return __pmd(xchg((pmdval_t *)xp, 0)); +} +#else +#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) +#endif + /* * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, * split up the 29 bits of offset into this range: diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index 177b0165ea0..94b979d1b58 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -104,6 +104,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep) #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) #endif +#ifdef CONFIG_SMP +union split_pmd { + struct { + u32 pmd_low; + u32 pmd_high; + }; + pmd_t pmd; +}; +static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) +{ + union split_pmd res, *orig = (union split_pmd *)pmdp; + + /* xchg acts as a barrier before setting of the high bits */ + res.pmd_low = xchg(&orig->pmd_low, 0); + res.pmd_high = orig->pmd_high; + orig->pmd_high = 0; + + return res.pmd; +} +#else +#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) +#endif + /* * Bits 0, 6 and 7 are taken in the low part of the pte, * put the 32 bits of offset into the 
high part. diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 3278038e970..001a3831567 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -97,6 +97,11 @@ static inline int pte_young(pte_t pte) return pte_flags(pte) & _PAGE_ACCESSED; } +static inline int pmd_young(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_ACCESSED; +} + static inline int pte_write(pte_t pte) { return pte_flags(pte) & _PAGE_RW; @@ -145,6 +150,18 @@ static inline int pmd_large(pmd_t pte) (_PAGE_PSE | _PAGE_PRESENT); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static inline int pmd_trans_splitting(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_SPLITTING; +} + +static inline int pmd_trans_huge(pmd_t pmd) +{ + return pmd_val(pmd) & _PAGE_PSE; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + static inline pte_t pte_set_flags(pte_t pte, pteval_t set) { pteval_t v = native_pte_val(pte); @@ -219,6 +236,55 @@ static inline pte_t pte_mkspecial(pte_t pte) return pte_set_flags(pte, _PAGE_SPECIAL); } +static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) +{ + pmdval_t v = native_pmd_val(pmd); + + return __pmd(v | set); +} + +static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) +{ + pmdval_t v = native_pmd_val(pmd); + + return __pmd(v & ~clear); +} + +static inline pmd_t pmd_mkold(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_wrprotect(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mkdirty(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_DIRTY); +} + +static inline pmd_t pmd_mkhuge(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_PSE); +} + +static inline pmd_t pmd_mkyoung(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_ACCESSED); +} + +static inline pmd_t pmd_mkwrite(pmd_t pmd) +{ + return pmd_set_flags(pmd, _PAGE_RW); +} + +static inline pmd_t pmd_mknotpresent(pmd_t pmd) +{ + return pmd_clear_flags(pmd, _PAGE_PRESENT); +} + /* * Mask out unsupported bits in a present pgprot. Non-present pgprots * can use those bits for other purposes, so leave them be. 
@@ -527,6 +593,14 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) return res; } +static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp) +{ + pmd_t res = *pmdp; + + native_pmd_clear(pmdp); + return res; +} + static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep , pte_t pte) { @@ -616,6 +690,49 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, #define flush_tlb_fix_spurious_fault(vma, address) +#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) + +#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS +extern int pmdp_set_access_flags(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, + pmd_t entry, int dirty); + +#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG +extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH +extern int pmdp_clear_flush_young(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp); + + +#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH +extern void pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp); + +#define __HAVE_ARCH_PMD_WRITE +static inline int pmd_write(pmd_t pmd) +{ + return pmd_flags(pmd) & _PAGE_RW; +} + +#define __HAVE_ARCH_PMDP_GET_AND_CLEAR +static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp) +{ + pmd_t pmd = native_pmdp_get_and_clear(pmdp); + pmd_update(mm, addr, pmdp); + return pmd; +} + +#define __HAVE_ARCH_PMDP_SET_WRPROTECT +static inline void pmdp_set_wrprotect(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp) +{ + clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp); + pmd_update(mm, addr, pmdp); +} + /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index b2df039a411..975f709e09a 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -182,115 +182,6 @@ extern void cleanup_highmap(void); #define __HAVE_ARCH_PTE_SAME -#ifdef CONFIG_TRANSPARENT_HUGEPAGE -static inline int pmd_trans_splitting(pmd_t pmd) -{ - return pmd_val(pmd) & _PAGE_SPLITTING; -} - -static inline int pmd_trans_huge(pmd_t pmd) -{ - return pmd_val(pmd) & _PAGE_PSE; -} -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) - -#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS -extern int pmdp_set_access_flags(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp, - pmd_t entry, int dirty); - -#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG -extern int pmdp_test_and_clear_young(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp); - -#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH -extern int pmdp_clear_flush_young(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); - - -#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH -extern void pmdp_splitting_flush(struct vm_area_struct *vma, - unsigned long addr, pmd_t *pmdp); - -#define __HAVE_ARCH_PMD_WRITE -static inline int pmd_write(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_RW; -} - -#define __HAVE_ARCH_PMDP_GET_AND_CLEAR -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp) -{ - pmd_t pmd = native_pmdp_get_and_clear(pmdp); - pmd_update(mm, addr, pmdp); - return pmd; -} - -#define __HAVE_ARCH_PMDP_SET_WRPROTECT -static inline void pmdp_set_wrprotect(struct mm_struct *mm, - unsigned long addr, pmd_t *pmdp) -{ - clear_bit(_PAGE_BIT_RW, (unsigned 
long *)&pmdp->pmd); - pmd_update(mm, addr, pmdp); -} - -static inline int pmd_young(pmd_t pmd) -{ - return pmd_flags(pmd) & _PAGE_ACCESSED; -} - -static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set) -{ - pmdval_t v = native_pmd_val(pmd); - - return native_make_pmd(v | set); -} - -static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear) -{ - pmdval_t v = native_pmd_val(pmd); - - return native_make_pmd(v & ~clear); -} - -static inline pmd_t pmd_mkold(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_ACCESSED); -} - -static inline pmd_t pmd_wrprotect(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_RW); -} - -static inline pmd_t pmd_mkdirty(pmd_t pmd) -{ - return pmd_set_flags(pmd, _PAGE_DIRTY); -} - -static inline pmd_t pmd_mkhuge(pmd_t pmd) -{ - return pmd_set_flags(pmd, _PAGE_PSE); -} - -static inline pmd_t pmd_mkyoung(pmd_t pmd) -{ - return pmd_set_flags(pmd, _PAGE_ACCESSED); -} - -static inline pmd_t pmd_mkwrite(pmd_t pmd) -{ - return pmd_set_flags(pmd, _PAGE_RW); -} - -static inline pmd_t pmd_mknotpresent(pmd_t pmd) -{ - return pmd_clear_flags(pmd, _PAGE_PRESENT); -} - #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PGTABLE_64_H */ diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 65e92d58f94..500242d3c96 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -362,7 +362,7 @@ int pmdp_test_and_clear_young(struct vm_area_struct *vma, if (pmd_young(*pmdp)) ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, - (unsigned long *) &pmdp->pmd); + (unsigned long *)pmdp); if (ret) pmd_update(vma->vm_mm, addr, pmdp); @@ -404,7 +404,7 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, int set; VM_BUG_ON(address & ~HPAGE_PMD_MASK); set = !test_and_set_bit(_PAGE_BIT_SPLITTING, - (unsigned long *)&pmdp->pmd); + (unsigned long *)pmdp); if (set) { pmd_update(vma->vm_mm, address, pmdp); /* need tlb flush only to serialize against gup-fast */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 7ab7d2b6004..9c2695beab8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -102,7 +102,11 @@ extern unsigned int kobjsize(const void *objp); #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_NONLINEAR 0x00800000 /* Is non-linear (remap_file_pages) */ +#ifndef CONFIG_TRANSPARENT_HUGEPAGE #define VM_MAPPED_COPY 0x01000000 /* T if mapped copy of data (nommu mmap) */ +#else +#define VM_HUGEPAGE 0x01000000 /* MADV_HUGEPAGE marked this vma */ +#endif #define VM_INSERTPAGE 0x02000000 /* The vma has had "vm_insert_page()" done on it */ #define VM_ALWAYSDUMP 0x04000000 /* Always include in core dumps */ @@ -111,9 +115,6 @@ extern unsigned int kobjsize(const void *objp); #define VM_SAO 0x20000000 /* Strong Access Ordering (powerpc) */ #define VM_PFN_AT_MMAP 0x40000000 /* PFNMAP vma that is fully mapped at mmap time */ #define VM_MERGEABLE 0x80000000 /* KSM may merge identical pages */ -#if BITS_PER_LONG > 32 -#define VM_HUGEPAGE 0x100000000UL /* MADV_HUGEPAGE marked this vma */ -#endif /* Bits set in the VMA until the stack is in its final location */ #define VM_STACK_INCOMPLETE_SETUP (VM_RAND_READ | VM_SEQ_READ) diff --git a/mm/Kconfig b/mm/Kconfig index 3982be2d721..d774f77538c 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -304,7 +304,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" if EMBEDDED - depends on X86_64 && MMU + depends on X86 && MMU default y help Transparent Hugepages allows the kernel to use huge pages and -- cgit 
v1.2.3-70-g09d2 From 0ca1634d4143c3579273ca53b993df19f5c98e92 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:02 -0800 Subject: thp: mincore transparent hugepage support Handle transparent huge page pmd entries natively instead of splitting them into subpages. Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 3 +++ mm/huge_memory.c | 25 +++++++++++++++++++++++++ mm/mincore.c | 8 +++++++- 3 files changed, 35 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 43a694ef890..25125fb6acf 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -19,6 +19,9 @@ extern struct page *follow_trans_huge_pmd(struct mm_struct *mm, extern int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t *pmd); +extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned char *vec); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ae2bf08b109..37e89a32a0b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -923,6 +923,31 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, return ret; } +int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, unsigned long end, + unsigned char *vec) +{ + int ret = 0; + + spin_lock(&vma->vm_mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + ret = !pmd_trans_splitting(*pmd); + spin_unlock(&vma->vm_mm->page_table_lock); + if (unlikely(!ret)) + wait_split_huge_page(vma->anon_vma, pmd); + else { + /* + * All logical pages in the range are present + * if backed by a huge page. + */ + memset(vec, 1, (end - addr) >> PAGE_SHIFT); + } + } else + spin_unlock(&vma->vm_mm->page_table_lock); + + return ret; +} + pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, diff --git a/mm/mincore.c b/mm/mincore.c index 9959bb41570..a4e6b9d75c7 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -154,7 +154,13 @@ static void mincore_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - split_huge_page_pmd(vma->vm_mm, pmd); + if (pmd_trans_huge(*pmd)) { + if (mincore_huge_pmd(vma, pmd, addr, next, vec)) { + vec += (next - addr) >> PAGE_SHIFT; + continue; + } + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) mincore_unmapped_range(vma, addr, next, vec); else -- cgit v1.2.3-70-g09d2 From b36f5b0710e9e3b92484de32920fddcb17278664 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:03 -0800 Subject: thp: mprotect: pass vma down to page table walkers Flushing the tlb for huge pmds requires the vma's anon_vma, so pass along the vma instead of the mm, we can always get the latter when we need it. 
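The vma rather than the mm is needed because the huge pmd case has to do a ranged TLB flush, and ranged flushes take a vma; the mm stays reachable where required. A hypothetical helper (not part of the patch) illustrating both points:

	/* sketch: a huge-pmd flush is a ranged flush, so it needs the vma */
	static inline void flush_huge_pmd_range(struct vm_area_struct *vma,
						unsigned long addr)
	{
		/* the mm can always be derived from the vma as vma->vm_mm */
		flush_tlb_range(vma, addr & HPAGE_PMD_MASK,
				(addr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE);
	}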
Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/mprotect.c b/mm/mprotect.c index bd27db6b992..9402080d0d4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -78,7 +78,7 @@ static void change_pte_range(struct mm_struct *mm, pmd_t *pmd, pte_unmap_unlock(pte - 1, ptl); } -static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, +static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { @@ -88,14 +88,15 @@ static inline void change_pmd_range(struct mm_struct *mm, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - split_huge_page_pmd(mm, pmd); + split_huge_page_pmd(vma->vm_mm, pmd); if (pmd_none_or_clear_bad(pmd)) continue; - change_pte_range(mm, pmd, addr, next, newprot, dirty_accountable); + change_pte_range(vma->vm_mm, pmd, addr, next, newprot, + dirty_accountable); } while (pmd++, addr = next, addr != end); } -static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, +static inline void change_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, pgprot_t newprot, int dirty_accountable) { @@ -107,7 +108,8 @@ static inline void change_pud_range(struct mm_struct *mm, pgd_t *pgd, next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - change_pmd_range(mm, pud, addr, next, newprot, dirty_accountable); + change_pmd_range(vma, pud, addr, next, newprot, + dirty_accountable); } while (pud++, addr = next, addr != end); } @@ -127,7 +129,8 @@ static void change_protection(struct vm_area_struct *vma, next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - change_pud_range(mm, pgd, addr, next, newprot, dirty_accountable); + change_pud_range(vma, pgd, addr, next, newprot, + dirty_accountable); } while (pgd++, addr = next, addr != end); flush_tlb_range(vma, start, end); } -- cgit v1.2.3-70-g09d2 From cd7548ab360c462118568eebb8c6da3bc303b02e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:04 -0800 Subject: thp: mprotect: transparent huge page support Natively handle huge pmds when changing page tables on behalf of mprotect(). I left out update_mmu_cache() because we do not need it on x86 anyway but more importantly the interface works on ptes, not pmds. 
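The change_huge_pmd() below relies on pmd_modify(), which is not part of this diff; on x86 it is expected to mirror pte_modify(), roughly as in this sketch (assuming _HPAGE_CHG_MASK extends _PAGE_CHG_MASK with _PAGE_PSE):

	static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
	{
		pmdval_t val = pmd_val(pmd);

		val &= _HPAGE_CHG_MASK;		/* keep pfn, accessed, dirty, PSE */
		val |= pgprot_val(newprot) & ~_HPAGE_CHG_MASK;

		return __pmd(val);
	}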
Signed-off-by: Johannes Weiner Signed-off-by: Andrea Arcangeli Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 2 ++ mm/huge_memory.c | 27 +++++++++++++++++++++++++++ mm/mprotect.c | 8 +++++++- 3 files changed, 36 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 25125fb6acf..c590b08c6fa 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -22,6 +22,8 @@ extern int zap_huge_pmd(struct mmu_gather *tlb, extern int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec); +extern int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, pgprot_t newprot); enum transparent_hugepage_flag { TRANSPARENT_HUGEPAGE_FLAG, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 37e89a32a0b..7b55fe0e998 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -948,6 +948,33 @@ int mincore_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, return ret; } +int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, + unsigned long addr, pgprot_t newprot) +{ + struct mm_struct *mm = vma->vm_mm; + int ret = 0; + + spin_lock(&mm->page_table_lock); + if (likely(pmd_trans_huge(*pmd))) { + if (unlikely(pmd_trans_splitting(*pmd))) { + spin_unlock(&mm->page_table_lock); + wait_split_huge_page(vma->anon_vma, pmd); + } else { + pmd_t entry; + + entry = pmdp_get_and_clear(mm, addr, pmd); + entry = pmd_modify(entry, newprot); + set_pmd_at(mm, addr, pmd, entry); + spin_unlock(&vma->vm_mm->page_table_lock); + flush_tlb_range(vma, addr, addr + HPAGE_PMD_SIZE); + ret = 1; + } + } else + spin_unlock(&vma->vm_mm->page_table_lock); + + return ret; +} + pmd_t *page_check_address_pmd(struct page *page, struct mm_struct *mm, unsigned long address, diff --git a/mm/mprotect.c b/mm/mprotect.c index 9402080d0d4..5a688a2756b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -88,7 +88,13 @@ static inline void change_pmd_range(struct vm_area_struct *vma, pud_t *pud, pmd = pmd_offset(pud, addr); do { next = pmd_addr_end(addr, end); - split_huge_page_pmd(vma->vm_mm, pmd); + if (pmd_trans_huge(*pmd)) { + if (next - addr != HPAGE_PMD_SIZE) + split_huge_page_pmd(vma->vm_mm, pmd); + else if (change_huge_pmd(vma, pmd, addr, newprot)) + continue; + /* fall through */ + } if (pmd_none_or_clear_bad(pmd)) continue; change_pte_range(vma->vm_mm, pmd, addr, next, newprot, -- cgit v1.2.3-70-g09d2 From f000565adb770b14cebbafde0a4f3e61a3342a63 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:04 -0800 Subject: thp: set recommended min free kbytes If transparent hugepage is enabled, initialize min_free_kbytes to an optimal value by default. This moves the hugeadm algorithm into the kernel.
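A worked example of the computation below (assuming 4k pages with 2M pageblocks, i.e. pageblock_nr_pages = 512, two populated zones, and MIGRATE_PCPTYPES = 3): the MIGRATE_RESERVE term is 512 * 2 * 2 = 2048 pages, the fallback term adds 512 * 2 * 3 * 3 = 9216 pages, for 11264 pages in total; unless clamped by the 5%-of-lowmem cap, min_free_kbytes becomes 11264 << (PAGE_SHIFT - 10) = 45056 kB, about 44 MB.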
Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7b55fe0e998..4ed97a2a115 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -85,6 +85,47 @@ struct khugepaged_scan { .mm_head = LIST_HEAD_INIT(khugepaged_scan.mm_head), }; + +static int set_recommended_min_free_kbytes(void) +{ + struct zone *zone; + int nr_zones = 0; + unsigned long recommended_min; + extern int min_free_kbytes; + + if (!test_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags) && + !test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags)) + return 0; + + for_each_populated_zone(zone) + nr_zones++; + + /* Make sure at least 2 hugepages are free for MIGRATE_RESERVE */ + recommended_min = pageblock_nr_pages * nr_zones * 2; + + /* + * Make sure that on average at least two pageblocks are almost free + * of another type, one for a migratetype to fall back to and a + * second to avoid subsequent fallbacks of other types There are 3 + * MIGRATE_TYPES we care about. + */ + recommended_min += pageblock_nr_pages * nr_zones * + MIGRATE_PCPTYPES * MIGRATE_PCPTYPES; + + /* don't ever allow to reserve more than 5% of the lowmem */ + recommended_min = min(recommended_min, + (unsigned long) nr_free_buffer_pages() / 20); + recommended_min <<= (PAGE_SHIFT-10); + + if (recommended_min > min_free_kbytes) + min_free_kbytes = recommended_min; + setup_per_zone_wmarks(); + return 0; +} +late_initcall(set_recommended_min_free_kbytes); + static int start_khugepaged(void) { int err = 0; @@ -108,6 +149,8 @@ static int start_khugepaged(void) mutex_unlock(&khugepaged_mutex); if (wakeup) wake_up_interruptible(&khugepaged_wait); + + set_recommended_min_free_kbytes(); } else /* wakeup to exit */ wake_up_interruptible(&khugepaged_wait); @@ -177,6 +220,13 @@ static ssize_t enabled_store(struct kobject *kobj, ret = err; } + if (ret > 0 && + (test_bit(TRANSPARENT_HUGEPAGE_FLAG, + &transparent_hugepage_flags) || + test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, + &transparent_hugepage_flags))) + set_recommended_min_free_kbytes(); + return ret; } static struct kobj_attribute enabled_attr = @@ -464,6 +514,8 @@ static int __init hugepage_init(void) start_khugepaged(); + set_recommended_min_free_kbytes(); + out: return err; } -- cgit v1.2.3-70-g09d2 From d39d33c332c611094f84cee39715866f4cbf79e2 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:05 -0800 Subject: thp: enable direct defrag With memory compaction in, and lumpy-reclaim disabled, it seems safe enough to defrag memory during the (synchronous) transparent hugepage page faults (TRANSPARENT_HUGEPAGE_DEFRAG_FLAG) and not only during khugepaged (async) hugepage allocations that was already enabled even before memory compaction was in (TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG). 
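With this change the default value of transparent_hugepage_flags enables defrag on both the page fault and the khugepaged paths; spelled out, the initializer conceptually becomes (a sketch, the exact initializer is in mm/huge_memory.c):

	unsigned long transparent_hugepage_flags __read_mostly =
		(1<<TRANSPARENT_HUGEPAGE_FLAG)|
		(1<<TRANSPARENT_HUGEPAGE_DEFRAG_FLAG)|
		(1<<TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);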
Signed-off-by: Andrea Arcangeli Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4ed97a2a115..0415a83afd6 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -28,6 +28,7 @@ */ unsigned long transparent_hugepage_flags __read_mostly = (1< Date: Thu, 13 Jan 2011 15:47:05 -0800 Subject: thp: add numa awareness to hugepage allocations It's mostly a matter of replacing alloc_pages with alloc_pages_vma after introducing alloc_pages_vma. khugepaged needs special handling as the allocation has to happen inside collapse_huge_page where the vma is known and an error has to be returned to the outer loop to sleep alloc_sleep_millisecs in case of failure. But it retains the more efficient logic of handling allocation failures in khugepaged in case of CONFIG_NUMA=n. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 7 +++-- mm/huge_memory.c | 87 +++++++++++++++++++++++++++++++++++++++++++++-------- mm/mempolicy.c | 13 +++++--- 3 files changed, 87 insertions(+), 20 deletions(-) (limited to 'mm') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index d95082cc6f4..a3b148a9187 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -331,14 +331,17 @@ alloc_pages(gfp_t gfp_mask, unsigned int order) { return alloc_pages_current(gfp_mask, order); } -extern struct page *alloc_page_vma(gfp_t gfp_mask, +extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr); #else #define alloc_pages(gfp_mask, order) \ alloc_pages_node(numa_node_id(), gfp_mask, order) -#define alloc_page_vma(gfp_mask, vma, addr) alloc_pages(gfp_mask, 0) +#define alloc_pages_vma(gfp_mask, order, vma, addr) \ + alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) +#define alloc_page_vma(gfp_mask, vma, addr) \ + alloc_pages_vma(gfp_mask, 0, vma, addr) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0415a83afd6..f6559e7711b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -620,11 +620,26 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, return ret; } +static inline gfp_t alloc_hugepage_gfpmask(int defrag) +{ + return GFP_TRANSHUGE & ~(defrag ? 0 : __GFP_WAIT); +} + +static inline struct page *alloc_hugepage_vma(int defrag, + struct vm_area_struct *vma, + unsigned long haddr) +{ + return alloc_pages_vma(alloc_hugepage_gfpmask(defrag), + HPAGE_PMD_ORDER, vma, haddr); +} + +#ifndef CONFIG_NUMA static inline struct page *alloc_hugepage(int defrag) { - return alloc_pages(GFP_TRANSHUGE & ~(defrag ? 
0 : __GFP_WAIT), + return alloc_pages(alloc_hugepage_gfpmask(defrag), HPAGE_PMD_ORDER); } +#endif int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pmd_t *pmd, @@ -639,7 +654,8 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma))) return VM_FAULT_OOM; - page = alloc_hugepage(transparent_hugepage_defrag(vma)); + page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr); if (unlikely(!page)) goto out; if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { @@ -862,7 +878,8 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) - new_page = alloc_hugepage(transparent_hugepage_defrag(vma)); + new_page = alloc_hugepage_vma(transparent_hugepage_defrag(vma), + vma, haddr); else new_page = NULL; @@ -1661,7 +1678,11 @@ static void collapse_huge_page(struct mm_struct *mm, unsigned long hstart, hend; VM_BUG_ON(address & ~HPAGE_PMD_MASK); +#ifndef CONFIG_NUMA VM_BUG_ON(!*hpage); +#else + VM_BUG_ON(*hpage); +#endif /* * Prevent all access to pagetables with the exception of @@ -1699,9 +1720,17 @@ static void collapse_huge_page(struct mm_struct *mm, if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) goto out; +#ifndef CONFIG_NUMA new_page = *hpage; - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) +#else + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + if (unlikely(!new_page)) { + *hpage = ERR_PTR(-ENOMEM); goto out; + } +#endif + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) + goto out_put_page; anon_vma_lock(vma->anon_vma); @@ -1730,7 +1759,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_unlock(&mm->page_table_lock); anon_vma_unlock(vma->anon_vma); mem_cgroup_uncharge_page(new_page); - goto out; + goto out_put_page; } /* @@ -1765,10 +1794,19 @@ static void collapse_huge_page(struct mm_struct *mm, mm->nr_ptes--; spin_unlock(&mm->page_table_lock); +#ifndef CONFIG_NUMA *hpage = NULL; +#endif khugepaged_pages_collapsed++; out: up_write(&mm->mmap_sem); + return; + +out_put_page: +#ifdef CONFIG_NUMA + put_page(new_page); +#endif + goto out; } static int khugepaged_scan_pmd(struct mm_struct *mm, @@ -2001,11 +2039,16 @@ static void khugepaged_do_scan(struct page **hpage) while (progress < pages) { cond_resched(); +#ifndef CONFIG_NUMA if (!*hpage) { *hpage = alloc_hugepage(khugepaged_defrag()); if (unlikely(!*hpage)) break; } +#else + if (IS_ERR(*hpage)) + break; +#endif spin_lock(&khugepaged_mm_lock); if (!khugepaged_scan.mm_slot) @@ -2020,37 +2063,55 @@ static void khugepaged_do_scan(struct page **hpage) } } +static void khugepaged_alloc_sleep(void) +{ + DEFINE_WAIT(wait); + add_wait_queue(&khugepaged_wait, &wait); + schedule_timeout_interruptible( + msecs_to_jiffies( + khugepaged_alloc_sleep_millisecs)); + remove_wait_queue(&khugepaged_wait, &wait); +} + +#ifndef CONFIG_NUMA static struct page *khugepaged_alloc_hugepage(void) { struct page *hpage; do { hpage = alloc_hugepage(khugepaged_defrag()); - if (!hpage) { - DEFINE_WAIT(wait); - add_wait_queue(&khugepaged_wait, &wait); - schedule_timeout_interruptible( - msecs_to_jiffies( - khugepaged_alloc_sleep_millisecs)); - remove_wait_queue(&khugepaged_wait, &wait); - } + if (!hpage) + khugepaged_alloc_sleep(); } while (unlikely(!hpage) && likely(khugepaged_enabled())); return hpage; } +#endif static void khugepaged_loop(void) { 
struct page *hpage; +#ifdef CONFIG_NUMA + hpage = NULL; +#endif while (likely(khugepaged_enabled())) { +#ifndef CONFIG_NUMA hpage = khugepaged_alloc_hugepage(); if (unlikely(!hpage)) break; +#else + if (IS_ERR(hpage)) { + khugepaged_alloc_sleep(); + hpage = NULL; + } +#endif khugepaged_do_scan(&hpage); +#ifndef CONFIG_NUMA if (hpage) put_page(hpage); +#endif if (khugepaged_has_work()) { DEFINE_WAIT(wait); if (!khugepaged_scan_sleep_millisecs) continue; add_wait_queue(&khugepaged_wait, &wait); schedule_timeout_interruptible( msecs_to_jiffies( khugepaged_scan_sleep_millisecs)); remove_wait_queue(&khugepaged_wait, &wait); } else if (khugepaged_enabled()) wait_event_interruptible(khugepaged_wait, khugepaged_wait_event()); } diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 83b7df309fc..368fc9d2361 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1796,7 +1796,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, } /** - * alloc_page_vma - Allocate a page for a VMA. + * alloc_pages_vma - Allocate a page for a VMA. * * @gfp: * %GFP_USER user allocation. @@ -1805,6 +1805,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * %GFP_FS allocation should not call back into a file system. * %GFP_ATOMIC don't sleep. * + * @order:Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual Address of the allocation. Must be inside the VMA. * @@ -1818,7 +1819,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, * Should be called with the mm_sem of the vma hold. */ struct page * -alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) +alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, + unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); struct zonelist *zl; @@ -1830,7 +1832,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); mpol_cond_put(pol); - page = alloc_page_interleave(gfp, 0, nid); + page = alloc_page_interleave(gfp, order, nid); put_mems_allowed(); return page; } @@ -1839,7 +1841,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * slow path: ref counted shared policy */ - struct page *page = __alloc_pages_nodemask(gfp, 0, + struct page *page = __alloc_pages_nodemask(gfp, order, zl, policy_nodemask(gfp, pol)); __mpol_put(pol); put_mems_allowed(); @@ -1848,7 +1850,8 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) /* * fast path: default or task policy */ - page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol)); + page = __alloc_pages_nodemask(gfp, order, zl, + policy_nodemask(gfp, pol)); put_mems_allowed(); return page; } -- cgit v1.2.3-70-g09d2 From ce83d2174ea9c3d72d5821cf3ebc974e36391bf7 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:06 -0800 Subject: thp: allocate memory in khugepaged outside of mmap_sem write mode This tries to be more friendly to filesystems in userland, with userland backends that allocate memory in the I/O paths and that could deadlock if khugepaged holds the userland backend's mmap_sem in write mode while allocating memory. Memory allocation may wait for writeback I/O completion from the daemon, which may be blocked on the mmap_sem in read mode if a page fault happens and the daemon wasn't using mlock for the memory required for the I/O submission and completion.
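The resulting order of operations in collapse_huge_page(), in outline (a sketch of the pattern, not the full function):

	/* entered with mmap_sem held for read */
	new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address);
	up_read(&mm->mmap_sem);	/* no memory allocation past this point */
	down_write(&mm->mmap_sem);
	/* revalidate the vma, then collapse the range into new_page */
	up_write(&mm->mmap_sem);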
Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 56 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f6559e7711b..bce6e12140e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1664,9 +1664,9 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, static void collapse_huge_page(struct mm_struct *mm, unsigned long address, - struct page **hpage) + struct page **hpage, + struct vm_area_struct *vma) { - struct vm_area_struct *vma; pgd_t *pgd; pud_t *pud; pmd_t *pmd, _pmd; @@ -1680,9 +1680,34 @@ static void collapse_huge_page(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); #ifndef CONFIG_NUMA VM_BUG_ON(!*hpage); + new_page = *hpage; #else VM_BUG_ON(*hpage); + /* + * Allocate the page while the vma is still valid and under + * the mmap_sem read mode so there is no memory allocation + * later when we take the mmap_sem in write mode. This is more + * friendly behavior (OTOH it may actually hide bugs) to + * filesystems in userland with daemons allocating memory in + * the userland I/O paths. Allocating memory with the + * mmap_sem in read mode is good idea also to allow greater + * scalability. + */ + new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); + if (unlikely(!new_page)) { + up_read(&mm->mmap_sem); + *hpage = ERR_PTR(-ENOMEM); + return; + } #endif + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + up_read(&mm->mmap_sem); + put_page(new_page); + return; + } + + /* after allocating the hugepage upgrade to mmap_sem write mode */ + up_read(&mm->mmap_sem); /* * Prevent all access to pagetables with the exception of @@ -1720,18 +1745,6 @@ static void collapse_huge_page(struct mm_struct *mm, if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) goto out; -#ifndef CONFIG_NUMA - new_page = *hpage; -#else - new_page = alloc_hugepage_vma(khugepaged_defrag(), vma, address); - if (unlikely(!new_page)) { - *hpage = ERR_PTR(-ENOMEM); - goto out; - } -#endif - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) - goto out_put_page; - anon_vma_lock(vma->anon_vma); pte = pte_offset_map(pmd, address); @@ -1759,7 +1772,7 @@ static void collapse_huge_page(struct mm_struct *mm, spin_unlock(&mm->page_table_lock); anon_vma_unlock(vma->anon_vma); mem_cgroup_uncharge_page(new_page); - goto out_put_page; + goto out; } /* @@ -1798,15 +1811,15 @@ static void collapse_huge_page(struct mm_struct *mm, *hpage = NULL; #endif khugepaged_pages_collapsed++; -out: +out_up_write: up_write(&mm->mmap_sem); return; -out_put_page: +out: #ifdef CONFIG_NUMA put_page(new_page); #endif - goto out; + goto out_up_write; } static int khugepaged_scan_pmd(struct mm_struct *mm, @@ -1865,10 +1878,9 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, ret = 1; out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) { - up_read(&mm->mmap_sem); - collapse_huge_page(mm, address, hpage); - } + if (ret) + /* collapse_huge_page will return with the mmap_sem released */ + collapse_huge_page(mm, address, hpage, vma); out: return ret; } -- cgit v1.2.3-70-g09d2 From 13ece886d99cd668483113f7238e419d5331af26 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:07 -0800 Subject: thp: transparent hugepage config choice Allow to choose between the always|madvise default for page faults and khugepaged at config time. 
madvise guarantees zero risk of higher memory footprint for applications (applications using madvise(MADV_HUGEPAGE) won't risk using any more memory by backing their virtual regions with hugepages). Initially set the default to N and don't depend on EMBEDDED. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 27 +++++++++++++++++++++++++-- mm/huge_memory.c | 5 +++++ 2 files changed, 30 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index d774f77538c..3e81687263b 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -303,9 +303,8 @@ config NOMMU_INITIAL_TRIM_EXCESS See Documentation/nommu-mmap.txt for more information. config TRANSPARENT_HUGEPAGE - bool "Transparent Hugepage Support" if EMBEDDED + bool "Transparent Hugepage Support" depends on X86 && MMU - default y help Transparent Hugepages allows the kernel to use huge pages and huge tlb transparently to the applications whenever possible. @@ -316,6 +315,30 @@ config TRANSPARENT_HUGEPAGE If memory constrained on embedded, you may want to say N. +choice + prompt "Transparent Hugepage Support sysfs defaults" + depends on TRANSPARENT_HUGEPAGE + default TRANSPARENT_HUGEPAGE_ALWAYS + help + Selects the sysfs defaults for Transparent Hugepage Support. + + config TRANSPARENT_HUGEPAGE_ALWAYS + bool "always" + help + Enabling Transparent Hugepage always, can increase the + memory footprint of applications without a guaranteed + benefit but it will work automatically for all applications. + + config TRANSPARENT_HUGEPAGE_MADVISE + bool "madvise" + help + Enabling Transparent Hugepage madvise, will only provide a + performance improvement benefit to the applications using + madvise(MADV_HUGEPAGE) but it won't risk to increase the + memory footprint of applications without a guaranteed + benefit. +endchoice + # # UP and nommu archs use km based percpu allocator # diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bce6e12140e..30c3cec8202 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -27,7 +27,12 @@ * allocations. */ unsigned long transparent_hugepage_flags __read_mostly = +#ifdef CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS (1< Date: Thu, 13 Jan 2011 15:47:07 -0800 Subject: thp: select CONFIG_COMPACTION if TRANSPARENT_HUGEPAGE enabled With transparent hugepage support we need compaction for the "defrag" sysfs controls to be effective. At the moment THP hangs the system if COMPACTION isn't selected, as without COMPACTION lumpy reclaim wouldn't be entirely disabled. So at the moment it's not orthogonal. When lumpy reclaim is eventually removed from the VM I can in theory remove the select COMPACTION, but then 99% of THP users would still be making a mistake by disabling compaction, even if the mistake would no longer result in a fatal runtime failure, just slightly degraded performance. So from a theoretical standpoint forcing the below select is not needed (the dependency isn't strict at either compile time or runtime) but from a practical standpoint it is safer. If anybody really wants THP to run without compaction, it'd be such a weird setup that editing the Kconfig file to allow it will surely not be a problem.
Signed-off-by: Andrea Arcangeli Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 3e81687263b..3ad483bdf50 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -305,6 +305,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" depends on X86 && MMU + select COMPACTION help Transparent Hugepages allows the kernel to use huge pages and huge tlb transparently to the applications whenever possible. -- cgit v1.2.3-70-g09d2 From bc835011afbea3957217ee716093d791fb2fe44f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:08 -0800 Subject: thp: transhuge isolate_migratepages() It's not worth migrating transparent hugepages during compaction. Those hugepages don't create fragmentation. Signed-off-by: Andrea Arcangeli Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index b0fbfdfad29..da25b5a2e47 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -295,10 +295,25 @@ static unsigned long isolate_migratepages(struct zone *zone, continue; } + if (!PageLRU(page)) + continue; + + /* + * PageLRU is set, and lru_lock excludes isolation, + * splitting and collapsing (collapsing has already + * happened if PageLRU is set). + */ + if (PageTransHuge(page)) { + low_pfn += (1 << compound_order(page)) - 1; + continue; + } + /* Try isolate the page */ if (__isolate_lru_page(page, ISOLATE_BOTH, 0) != 0) continue; + VM_BUG_ON(PageTransCompound(page)); + /* Successfully isolated */ del_page_from_lru_list(zone, page, page_lru(page)); list_add(&page->lru, migratelist); -- cgit v1.2.3-70-g09d2 From 94fcc585fb85ad7b059c70872489b50044d401f3 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:08 -0800 Subject: thp: avoid breaking huge pmd invariants in case of vma_adjust failures A huge pmd can only be mapped if the corresponding 2M virtual range is fully contained in the vma. At times the VM calls split_vma twice: if the first split_vma succeeds and the second fails, the first split_vma remains in effect and is not rolled back. For split_vma or vma_adjust to fail, an allocation failure is needed, so it's a very unlikely event (the out-of-memory killer would normally fire before any allocation failure is visible to kernel and userland, and if an out-of-memory condition happens it's unlikely to happen exactly here). Nevertheless it's safer to ensure that no huge pmd can be left around if the vma is adjusted in a way that can't fit hugepages anymore at the new vm_start/vm_end address.
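Concretely (a hypothetical layout): with 2M hugepages, a vma spanning 0x400000-0x800000 may have a huge pmd backing 0x400000-0x600000. If split_vma() then cuts the vma at 0x500000, neither resulting vma fully contains that 2M range anymore, so the huge pmd has to be split to preserve the invariant.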
Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 19 ++++++++++++ mm/huge_memory.c | 80 +++++++++++++++++++++++++++++++++++++++++++++++-- mm/mmap.c | 2 ++ 3 files changed, 99 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index c590b08c6fa..82759522873 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -104,6 +104,19 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); #error "hugepages can't be allocated by the buddy allocator" #endif extern int hugepage_madvise(unsigned long *vm_flags); +extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next); +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + if (!vma->anon_vma || vma->vm_ops || vma->vm_file) + return; + __vma_adjust_trans_huge(vma, start, end, adjust_next); +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) @@ -125,6 +138,12 @@ static inline int hugepage_madvise(unsigned long *vm_flags) BUG(); return 0; } +static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif /* _LINUX_HUGE_MM_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 30c3cec8202..b6facc35e89 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1075,8 +1075,16 @@ pmd_t *page_check_address_pmd(struct page *page, goto out; if (pmd_page(*pmd) != page) goto out; - VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && - pmd_trans_splitting(*pmd)); + /* + * split_vma() may create temporary aliased mappings. There is + * no risk as long as all huge pmd are found and have their + * splitting bit set before __split_huge_page_refcount + * runs. Finding the same huge pmd more than once during the + * same rmap walk is not a problem. + */ + if (flag == PAGE_CHECK_ADDRESS_PMD_NOTSPLITTING_FLAG && + pmd_trans_splitting(*pmd)) + goto out; if (pmd_trans_huge(*pmd)) { VM_BUG_ON(flag == PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG && !pmd_trans_splitting(*pmd)); @@ -2196,3 +2204,71 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) put_page(page); BUG_ON(pmd_trans_huge(*pmd)); } + +static void split_huge_page_address(struct mm_struct *mm, + unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd; + + VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + return; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + return; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + return; + /* + * Caller holds the mmap_sem write mode, so a huge pmd cannot + * materialize from under us. + */ + split_huge_page_pmd(mm, pmd); +} + +void __vma_adjust_trans_huge(struct vm_area_struct *vma, + unsigned long start, + unsigned long end, + long adjust_next) +{ + /* + * If the new start address isn't hpage aligned and it could + * previously contain an hugepage: check if we need to split + * an huge pmd. 
+ */ + if (start & ~HPAGE_PMD_MASK && + (start & HPAGE_PMD_MASK) >= vma->vm_start && + (start & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) + split_huge_page_address(vma->vm_mm, start); + + /* + * If the new end address isn't hpage aligned and it could + * previously contain an hugepage: check if we need to split + * an huge pmd. + */ + if (end & ~HPAGE_PMD_MASK && + (end & HPAGE_PMD_MASK) >= vma->vm_start && + (end & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= vma->vm_end) + split_huge_page_address(vma->vm_mm, end); + + /* + * If we're also updating the vma->vm_next->vm_start, if the new + * vm_next->vm_start isn't page aligned and it could previously + * contain an hugepage: check if we need to split an huge pmd. + */ + if (adjust_next > 0) { + struct vm_area_struct *next = vma->vm_next; + unsigned long nstart = next->vm_start; + nstart += adjust_next << PAGE_SHIFT; + if (nstart & ~HPAGE_PMD_MASK && + (nstart & HPAGE_PMD_MASK) >= next->vm_start && + (nstart & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE <= next->vm_end) + split_huge_page_address(next->vm_mm, nstart); + } +} diff --git a/mm/mmap.c b/mm/mmap.c index 753f44d1704..73cc648873d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -589,6 +589,8 @@ again: remove_next = 1 + (end > next->vm_end); } } + vma_adjust_trans_huge(vma, start, end, adjust_next); + /* * When changing only vma->vm_end, we don't really need anon_vma * lock. This is a fairly rare case by itself, but the anon_vma -- cgit v1.2.3-70-g09d2 From 4b7167b9ff9b7f3f528cbc4c7d02ebd275b9b10c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:09 -0800 Subject: thp: don't allow transparent hugepage support without PSE Archs implementing Transparent Hugepage Support must implement a function called has_transparent_hugepage to be sure the virtual or physical CPU supports Transparent Hugepages. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable.h | 5 +++++ mm/huge_memory.c | 8 ++++++++ 2 files changed, 13 insertions(+) (limited to 'mm') diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index c48ba055f69..18601c86fab 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -160,6 +160,11 @@ static inline int pmd_trans_huge(pmd_t pmd) { return pmd_val(pmd) & _PAGE_PSE; } + +static inline int has_transparent_hugepage(void) +{ + return cpu_has_pse; +} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ static inline pte_t pte_set_flags(pte_t pte, pteval_t set) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b6facc35e89..915809b16ed 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -487,7 +487,15 @@ static int __init hugepage_init(void) int err; #ifdef CONFIG_SYSFS static struct kobject *hugepage_kobj; +#endif + err = -EINVAL; + if (!has_transparent_hugepage()) { + transparent_hugepage_flags = 0; + goto out; + } + +#ifdef CONFIG_SYSFS err = -ENOMEM; hugepage_kobj = kobject_create_and_add("transparent_hugepage", mm_kobj); if (unlikely(!hugepage_kobj)) { -- cgit v1.2.3-70-g09d2 From 8ee53820edfd1f3b6554c593f337148dd3d7fc91 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:10 -0800 Subject: thp: mmu_notifier_test_young For GRU and EPT, we need gup-fast to set referenced bit too (this is why it's correct to return 0 when shadow_access_mask is zero, it requires gup-fast to set the referenced bit). 
qemu-kvm access already sets the young bit in the pte if it isn't zero-copy; if it's zero-copy or a shadow paging EPT minor fault, we rely on gup-fast to signal the page is in use... We also need to check the young bits on the secondary pagetables for NPT and not nested shadow mmu as the data may never get accessed again by the primary pte. Without this closer accuracy, we'd have to remove the heuristic that avoids collapsing hugepages in hugepage virtual regions that have not even a single subpage in use. ->test_young is fully backwards compatible with GRU and other usages that don't have young bits in pagetables set by the hardware and that should nuke the secondary mmu mappings when ->clear_flush_young runs just like EPT does. Removing the heuristic that checks the young bit in khugepaged/collapse_huge_page entirely probably wouldn't be so bad either, but I thought it was worth keeping, and this makes it reliable. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/mmu.c | 34 ++++++++++++++++++++++++++++++++++ arch/x86/mm/gup.c | 3 +++ include/linux/mmu_notifier.h | 26 ++++++++++++++++++++++++++ mm/huge_memory.c | 6 ++++-- mm/mmu_notifier.c | 20 ++++++++++++++++++++ virt/kvm/kvm_main.c | 17 +++++++++++++++++ 7 files changed, 105 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index aa75f21a9fb..ffd7f8d2918 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -822,6 +822,7 @@ extern bool kvm_rebooting; #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); int kvm_age_hva(struct kvm *kvm, unsigned long hva); +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva); void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 47b2c3288b6..f02b8edc3d4 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -945,6 +945,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp, return young; } +static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp, + unsigned long data) +{ + u64 *spte; + int young = 0; + + /* + * If there's no access bit in the secondary pte set by the + * hardware it's up to gup-fast/gup to set the access bit in + * the primary pte or in the page structure.
+ */ + if (!shadow_accessed_mask) + goto out; + + spte = rmap_next(kvm, rmapp, NULL); + while (spte) { + u64 _spte = *spte; + BUG_ON(!(_spte & PT_PRESENT_MASK)); + young = _spte & PT_ACCESSED_MASK; + if (young) { + young = 1; + break; + } + spte = rmap_next(kvm, rmapp, spte); + } +out: + return young; +} + #define RMAP_RECYCLE_THRESHOLD 1000 static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) @@ -965,6 +994,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva) return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); } +int kvm_test_age_hva(struct kvm *kvm, unsigned long hva) +{ + return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp); +} + #ifdef MMU_DEBUG static int is_empty_shadow_page(u64 *spt) { diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 269aa53932e..dbe34b93137 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -8,6 +8,7 @@ #include #include #include +#include #include @@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); get_page(page); + SetPageReferenced(page); pages[*nr] = page; (*nr)++; @@ -103,6 +105,7 @@ static inline void get_head_page_multiple(struct page *page, int nr) VM_BUG_ON(page != compound_head(page)); VM_BUG_ON(page_count(page) == 0); atomic_add(nr, &page->_count); + SetPageReferenced(page); } static inline void get_huge_page_tail(struct page *page) diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index cbfab1e9957..cc2e7dfea9d 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -61,6 +61,16 @@ struct mmu_notifier_ops { struct mm_struct *mm, unsigned long address); + /* + * test_young is called to check the young/accessed bitflag in + * the secondary pte. This is used to know if the page is + * frequently used without actually clearing the flag or tearing + * down the secondary mapping on the page. + */ + int (*test_young)(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address); + /* * change_pte is called in cases that pte mapping to page is changed: * for example, when ksm remaps pte to point to a new shared page. 
@@ -163,6 +173,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm); extern void __mmu_notifier_release(struct mm_struct *mm); extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm, unsigned long address); +extern int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); extern void __mmu_notifier_invalidate_page(struct mm_struct *mm, @@ -186,6 +198,14 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline int mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + if (mm_has_notifiers(mm)) + return __mmu_notifier_test_young(mm, address); + return 0; +} + static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { @@ -313,6 +333,12 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm, return 0; } +static inline int mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + return 0; +} + static inline void mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 915809b16ed..39d7df40c06 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1632,7 +1632,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON(PageLRU(page)); /* If there is no mapped pte young don't collapse the page */ - if (pte_young(pteval)) + if (pte_young(pteval) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) referenced = 1; } if (unlikely(!referenced)) @@ -1892,7 +1893,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, /* cannot use mapcount: can't collapse if there's a gup pin */ if (page_count(page) != 1) goto out_unmap; - if (pte_young(pteval)) + if (pte_young(pteval) || PageReferenced(page) || + mmu_notifier_test_young(vma->vm_mm, address)) referenced = 1; } if (referenced) diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 438951d366f..8d032de4088 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -100,6 +100,26 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm, return young; } +int __mmu_notifier_test_young(struct mm_struct *mm, + unsigned long address) +{ + struct mmu_notifier *mn; + struct hlist_node *n; + int young = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(mn, n, &mm->mmu_notifier_mm->list, hlist) { + if (mn->ops->test_young) { + young = mn->ops->test_young(mn, mm, address); + if (young) + break; + } + } + rcu_read_unlock(); + + return young; +} + void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 85ab7db0d36..4286d476651 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -380,6 +380,22 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn, return young; } +static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct kvm *kvm = mmu_notifier_to_kvm(mn); + int young, idx; + + idx = srcu_read_lock(&kvm->srcu); + spin_lock(&kvm->mmu_lock); + young = kvm_test_age_hva(kvm, address); + spin_unlock(&kvm->mmu_lock); + srcu_read_unlock(&kvm->srcu, idx); + + return young; +} + static void kvm_mmu_notifier_release(struct mmu_notifier *mn, struct mm_struct *mm) { @@ -396,6 +412,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { .invalidate_range_start = 
kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, + .test_young = kvm_mmu_notifier_test_young, .change_pte = kvm_mmu_notifier_change_pte, .release = kvm_mmu_notifier_release, }; -- cgit v1.2.3-70-g09d2 From 878aee7d6b5504e01b9caffce080e792b6b8d090 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:10 -0800 Subject: thp: freeze khugepaged and ksmd It's unclear why schedule friendly kernel threads can't be taken away by the CPU through the scheduler itself. It's safer to stop them as they can trigger memory allocation, if kswapd also freezes itself to avoid generating I/O they have too. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 14 ++++++++++++-- mm/ksm.c | 8 ++++++-- 2 files changed, 18 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 39d7df40c06..45b6d53bcfb 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -2085,6 +2086,9 @@ static void khugepaged_do_scan(struct page **hpage) break; #endif + if (unlikely(kthread_should_stop() || freezing(current))) + break; + spin_lock(&khugepaged_mm_lock); if (!khugepaged_scan.mm_slot) pass_through_head++; @@ -2147,6 +2151,9 @@ static void khugepaged_loop(void) if (hpage) put_page(hpage); #endif + try_to_freeze(); + if (unlikely(kthread_should_stop())) + break; if (khugepaged_has_work()) { DEFINE_WAIT(wait); if (!khugepaged_scan_sleep_millisecs) @@ -2157,8 +2164,8 @@ static void khugepaged_loop(void) khugepaged_scan_sleep_millisecs)); remove_wait_queue(&khugepaged_wait, &wait); } else if (khugepaged_enabled()) - wait_event_interruptible(khugepaged_wait, - khugepaged_wait_event()); + wait_event_freezable(khugepaged_wait, + khugepaged_wait_event()); } } @@ -2166,6 +2173,7 @@ static int khugepaged(void *none) { struct mm_slot *mm_slot; + set_freezable(); set_user_nice(current, 19); /* serialize with start_khugepaged() */ @@ -2180,6 +2188,8 @@ static int khugepaged(void *none) mutex_lock(&khugepaged_mutex); if (!khugepaged_enabled()) break; + if (unlikely(kthread_should_stop())) + break; } spin_lock(&khugepaged_mm_lock); diff --git a/mm/ksm.c b/mm/ksm.c index 5e7d5d35ea8..e2b0afd0a03 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include "internal.h" @@ -1365,7 +1366,7 @@ static void ksm_do_scan(unsigned int scan_npages) struct rmap_item *rmap_item; struct page *uninitialized_var(page); - while (scan_npages--) { + while (scan_npages-- && likely(!freezing(current))) { cond_resched(); rmap_item = scan_get_next_rmap_item(&page); if (!rmap_item) @@ -1383,6 +1384,7 @@ static int ksmd_should_run(void) static int ksm_scan_thread(void *nothing) { + set_freezable(); set_user_nice(current, 5); while (!kthread_should_stop()) { @@ -1391,11 +1393,13 @@ static int ksm_scan_thread(void *nothing) ksm_do_scan(ksm_thread_pages_to_scan); mutex_unlock(&ksm_thread_mutex); + try_to_freeze(); + if (ksmd_should_run()) { schedule_timeout_interruptible( msecs_to_jiffies(ksm_thread_sleep_millisecs)); } else { - wait_event_interruptible(ksm_thread_wait, + wait_event_freezable(ksm_thread_wait, ksmd_should_run() || kthread_should_stop()); } } -- cgit v1.2.3-70-g09d2 From 5a03b051ed87e72b959f32a86054e1142ac4cf55 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 
2011 15:47:11 -0800 Subject: thp: use compaction in kswapd for GFP_ATOMIC order > 0 This takes advantage of memory compaction to properly generate pages of order > 0 if regular page reclaim fails and priority level becomes more severe and we don't reach the proper watermarks. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/compaction.h | 11 ++++++++--- mm/compaction.c | 31 ++++++++++++++++++++++++++----- mm/vmscan.c | 30 ++++++++++++++++++++---------- 3 files changed, 54 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/include/linux/compaction.h b/include/linux/compaction.h index 72cba403478..dfa2ed4c0d2 100644 --- a/include/linux/compaction.h +++ b/include/linux/compaction.h @@ -11,6 +11,9 @@ /* The full zone was compacted */ #define COMPACT_COMPLETE 3 +#define COMPACT_MODE_DIRECT_RECLAIM 0 +#define COMPACT_MODE_KSWAPD 1 + #ifdef CONFIG_COMPACTION extern int sysctl_compact_memory; extern int sysctl_compaction_handler(struct ctl_table *table, int write, @@ -25,7 +28,8 @@ extern unsigned long try_to_compact_pages(struct zonelist *zonelist, bool sync); extern unsigned long compaction_suitable(struct zone *zone, int order); extern unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, bool sync); + gfp_t gfp_mask, bool sync, + int compact_mode); /* Do not skip compaction more than 64 times */ #define COMPACT_MAX_DEFER_SHIFT 6 @@ -70,9 +74,10 @@ static inline unsigned long compaction_suitable(struct zone *zone, int order) } static inline unsigned long compact_zone_order(struct zone *zone, int order, - gfp_t gfp_mask, bool sync) + gfp_t gfp_mask, bool sync, + int compact_mode) { - return 0; + return COMPACT_CONTINUE; } static inline void defer_compaction(struct zone *zone) diff --git a/mm/compaction.c b/mm/compaction.c index da25b5a2e47..60d52a7298d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -42,6 +42,8 @@ struct compact_control { unsigned int order; /* order a direct compactor needs */ int migratetype; /* MOVABLE, RECLAIMABLE etc */ struct zone *zone; + + int compact_mode; }; static unsigned long release_freepages(struct list_head *freelist) @@ -382,10 +384,10 @@ static void update_nr_listpages(struct compact_control *cc) } static int compact_finished(struct zone *zone, - struct compact_control *cc) + struct compact_control *cc) { unsigned int order; - unsigned long watermark = low_wmark_pages(zone) + (1 << cc->order); + unsigned long watermark; if (fatal_signal_pending(current)) return COMPACT_PARTIAL; @@ -395,12 +397,27 @@ static int compact_finished(struct zone *zone, return COMPACT_COMPLETE; /* Compaction run is not finished if the watermark is not met */ + if (cc->compact_mode != COMPACT_MODE_KSWAPD) + watermark = low_wmark_pages(zone); + else + watermark = high_wmark_pages(zone); + watermark += (1 << cc->order); + if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) return COMPACT_CONTINUE; if (cc->order == -1) return COMPACT_CONTINUE; + /* + * Generating only one page of the right order is not enough + * for kswapd, we must continue until we're above the high + * watermark as a pool for high order GFP_ATOMIC allocations + * too. + */ + if (cc->compact_mode == COMPACT_MODE_KSWAPD) + return COMPACT_CONTINUE; + /* Direct compactor: Is a suitable page free? 
*/ for (order = cc->order; order < MAX_ORDER; order++) { /* Job done if page is free of the right migratetype */ @@ -514,8 +531,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) } unsigned long compact_zone_order(struct zone *zone, - int order, gfp_t gfp_mask, - bool sync) + int order, gfp_t gfp_mask, + bool sync, + int compact_mode) { struct compact_control cc = { .nr_freepages = 0, @@ -524,6 +542,7 @@ unsigned long compact_zone_order(struct zone *zone, .migratetype = allocflags_to_migratetype(gfp_mask), .zone = zone, .sync = sync, + .compact_mode = compact_mode, }; INIT_LIST_HEAD(&cc.freepages); INIT_LIST_HEAD(&cc.migratepages); @@ -569,7 +588,8 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, nodemask) { int status; - status = compact_zone_order(zone, order, gfp_mask, sync); + status = compact_zone_order(zone, order, gfp_mask, sync, + COMPACT_MODE_DIRECT_RECLAIM); rc = max(status, rc); /* If a normal allocation would succeed, stop compacting */ @@ -600,6 +620,7 @@ static int compact_node(int nid) .nr_freepages = 0, .nr_migratepages = 0, .order = -1, + .compact_mode = COMPACT_MODE_DIRECT_RECLAIM, }; zone = &pgdat->node_zones[zoneid]; diff --git a/mm/vmscan.c b/mm/vmscan.c index cfdef0bcc7a..f5b762ae23a 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -2382,6 +2383,7 @@ loop_again: * cause too much scanning of the lower zones. */ for (i = 0; i <= end_zone; i++) { + int compaction; struct zone *zone = pgdat->node_zones + i; int nr_slab; @@ -2411,9 +2413,26 @@ loop_again: lru_pages); sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; + + compaction = 0; + if (order && + zone_watermark_ok(zone, 0, + high_wmark_pages(zone), + end_zone, 0) && + !zone_watermark_ok(zone, order, + high_wmark_pages(zone), + end_zone, 0)) { + compact_zone_order(zone, + order, + sc.gfp_mask, false, + COMPACT_MODE_KSWAPD); + compaction = 1; + } + if (zone->all_unreclaimable) continue; - if (nr_slab == 0 && !zone_reclaimable(zone)) + if (!compaction && nr_slab == 0 && + !zone_reclaimable(zone)) zone->all_unreclaimable = 1; /* * If we've done a decent amount of scanning and @@ -2424,15 +2443,6 @@ loop_again: total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; - /* - * Compact the zone for higher orders to reduce - * latencies for higher-order allocations that - * would ordinarily call try_to_compact_pages() - */ - if (sc.order > PAGE_ALLOC_COSTLY_ORDER) - compact_zone_order(zone, sc.order, sc.gfp_mask, - false); - if (!zone_watermark_ok_safe(zone, order, high_wmark_pages(zone), end_zone, 0)) { all_zones_ok = 0; -- cgit v1.2.3-70-g09d2 From c5a73c3d55be1faadba35b41a862e036a3b12ddb Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:11 -0800 Subject: thp: use compaction for all allocation orders It makes no sense not to enable compaction for small order pages as we don't want to end up with bad order 2 allocations and good and graceful order 9 allocations. 
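For illustration, the gate this change relaxes in try_to_compact_pages() reads as follows afterwards (a condensed excerpt of the diff that follows; only order-0 requests now skip compaction, since a single free base page needs no contiguity help):

        /* was: if (order <= PAGE_ALLOC_COSTLY_ORDER || ...) */
        if (!order || !may_enter_fs || !may_perform_io)
                return rc;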
Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 60d52a7298d..6d592a02107 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -578,7 +578,7 @@ unsigned long try_to_compact_pages(struct zonelist *zonelist, * made because an assumption is made that the page allocator can satisfy * the "cheaper" orders without taking special steps */ - if (order <= PAGE_ALLOC_COSTLY_ORDER || !may_enter_fs || !may_perform_io) + if (!order || !may_enter_fs || !may_perform_io) return rc; count_vm_event(COMPACTSTALL); -- cgit v1.2.3-70-g09d2 From 97562cd243298acf573620c764a1037bd545c9bc Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Jan 2011 15:47:12 -0800 Subject: thp: disable transparent hugepages by default on small systems On small systems, the extra memory used by the anti-fragmentation memory reserve, together with the internal fragmentation from backing small allocations with huge pages, can easily outweigh the benefit of fewer TLB misses. A less obvious concern arises on a NUMA machine with asymmetric node sizes where one node is very small: the reserve could make that node unusable. In the case of the crashdump kernel, OOMs have been observed due to the anti-fragmentation memory reserve taking up a large fraction of the crashdump image. This patch disables transparent hugepages on systems with less than 512MB of RAM, but the hugepage subsystem is fully initialized so administrators can enable THP through /sys if desired. Signed-off-by: Rik van Riel Acked-by: Avi Kivity Signed-off-by: Andrea Arcangeli Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 45b6d53bcfb..892d8a17a7e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -527,6 +527,14 @@ static int __init hugepage_init(void) goto out; } + /* + * By default disable transparent hugepages on smaller systems, + * where the extra memory used could hurt more than TLB overhead + * is likely to save. The admin can still enable it through /sys. + */ + if (totalram_pages < (512 << (20 - PAGE_SHIFT))) + transparent_hugepage_flags = 0; + start_khugepaged(); set_recommended_min_free_kbytes(); -- cgit v1.2.3-70-g09d2 From 2c888cfbc1b45508a44763d85ba2e8ac43faff5f Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Jan 2011 15:47:13 -0800 Subject: thp: fix anon memory statistics with transparent hugepages Count each transparent hugepage as HPAGE_PMD_NR pages in the LRU statistics, so the Active(anon) and Inactive(anon) statistics in /proc/meminfo are correct.
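For scale: with 2MB PMDs a transparent hugepage spans HPAGE_PMD_NR = 512 base pages, so counting it as a single page understates the anon LRU by 511 pages per THP. The helper this patch introduces (shown in full in the diff below) is:

        static inline int hpage_nr_pages(struct page *page)
        {
                /* a THP occupies HPAGE_PMD_NR base pages, e.g. 512 with 2MB PMDs */
                if (unlikely(PageTransHuge(page)))
                        return HPAGE_PMD_NR;
                return 1;
        }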
Signed-off-by: Rik van Riel Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 8 ++++++++ include/linux/mm_inline.h | 8 +++++--- mm/huge_memory.c | 10 ++++++++++ mm/memcontrol.c | 2 +- mm/vmscan.c | 11 ++++++----- 5 files changed, 30 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 82759522873..9b48c24df26 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -117,11 +117,19 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, return; __vma_adjust_trans_huge(vma, start, end, adjust_next); } +static inline int hpage_nr_pages(struct page *page) +{ + if (unlikely(PageTransHuge(page))) + return HPAGE_PMD_NR; + return 1; +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) #define HPAGE_PMD_SIZE ({ BUG(); 0; }) +#define hpage_nr_pages(x) 1 + #define transparent_hugepage_enabled(__vma) 0 #define transparent_hugepage_flags 0UL diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 650f31eabdb..8f7d24712dc 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -1,6 +1,8 @@ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H +#include + /** * page_is_file_cache - should the page be on a file LRU or anon LRU? * @page: the page to test @@ -24,7 +26,7 @@ __add_page_to_lru_list(struct zone *zone, struct page *page, enum lru_list l, struct list_head *head) { list_add(&page->lru, head); - __inc_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, hpage_nr_pages(page)); mem_cgroup_add_lru_list(page, l); } @@ -38,7 +40,7 @@ static inline void del_page_from_lru_list(struct zone *zone, struct page *page, enum lru_list l) { list_del(&page->lru); - __dec_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); mem_cgroup_del_lru_list(page, l); } @@ -73,7 +75,7 @@ del_page_from_lru(struct zone *zone, struct page *page) l += LRU_ACTIVE; } } - __dec_zone_state(zone, NR_LRU_BASE + l); + __mod_zone_page_state(zone, NR_LRU_BASE + l, -hpage_nr_pages(page)); mem_cgroup_del_lru_list(page, l); } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 892d8a17a7e..f4f6041176a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1143,6 +1143,7 @@ static void __split_huge_page_refcount(struct page *page) int i; unsigned long head_index = page->index; struct zone *zone = page_zone(page); + int zonestat; /* prevent PageLRU to go away from under us, and freeze lru stats */ spin_lock_irq(&zone->lru_lock); @@ -1207,6 +1208,15 @@ static void __split_huge_page_refcount(struct page *page) __dec_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __mod_zone_page_state(zone, NR_ANON_PAGES, HPAGE_PMD_NR); + /* + * A hugepage counts for HPAGE_PMD_NR pages on the LRU statistics, + * so adjust those appropriately if this page is on the LRU. 
+ */ + if (PageLRU(page)) { + zonestat = NR_LRU_BASE + page_lru(page); + __mod_zone_page_state(zone, zonestat, -(HPAGE_PMD_NR-1)); + } + ClearPageCompound(page); compound_unlock(page); spin_unlock_irq(&zone->lru_lock); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a1bb59d4c9d..f4ea3410fb4 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1091,7 +1091,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: /* we don't affect global LRU but rotate in our LRU */ diff --git a/mm/vmscan.c b/mm/vmscan.c index f5b762ae23a..0882014d2ce 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1045,7 +1045,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, case 0: list_move(&page->lru, dst); mem_cgroup_del_lru(page); - nr_taken++; + nr_taken += hpage_nr_pages(page); break; case -EBUSY: @@ -1103,7 +1103,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); mem_cgroup_del_lru(cursor_page); - nr_taken++; + nr_taken += hpage_nr_pages(page); nr_lumpy_taken++; if (PageDirty(cursor_page)) nr_lumpy_dirty++; @@ -1158,14 +1158,15 @@ static unsigned long clear_active_flags(struct list_head *page_list, struct page *page; list_for_each_entry(page, page_list, lru) { + int numpages = hpage_nr_pages(page); lru = page_lru_base_type(page); if (PageActive(page)) { lru += LRU_ACTIVE; ClearPageActive(page); - nr_active++; + nr_active += numpages; } if (count) - count[lru]++; + count[lru] += numpages; } return nr_active; @@ -1483,7 +1484,7 @@ static void move_active_pages_to_lru(struct zone *zone, list_move(&page->lru, &zone->lru[lru].list); mem_cgroup_add_lru_list(page, lru); - pgmoved++; + pgmoved += hpage_nr_pages(page); if (!pagevec_add(&pvec, page) || list_empty(list)) { spin_unlock_irq(&zone->lru_lock); -- cgit v1.2.3-70-g09d2 From 9992af102974f3f8a02a1f2729c3461881539e26 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Thu, 13 Jan 2011 15:47:13 -0800 Subject: thp: scale nr_rotated to balance memory pressure Make sure we scale up nr_rotated when we encounter a referenced transparent huge page. This ensures pageout scanning balance is not distorted when there are huge pages on the LRU. Signed-off-by: Rik van Riel Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 0882014d2ce..47a50962ce8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1276,7 +1276,8 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, add_page_to_lru_list(zone, page, lru); if (is_active_lru(lru)) { int file = is_file_lru(lru); - reclaim_stat->recent_rotated[file]++; + int numpages = hpage_nr_pages(page); + reclaim_stat->recent_rotated[file] += numpages; } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); @@ -1552,7 +1553,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { - nr_rotated++; + nr_rotated += hpage_nr_pages(page); /* * Identify referenced, file-backed active pages and * give them one more trip around the active list. 
So -- cgit v1.2.3-70-g09d2 From 14d1a55cd26f1860f837f37ae42520c7c13b1347 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:15 -0800 Subject: thp: add debug checks for mapcount related invariants Add debug checks for invariants that, if broken, could cause the mapcount vs page_mapcount debug checks to trigger later in split_huge_page. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 12ee1ea237f..31250faff39 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -804,6 +804,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src next = pmd_addr_end(addr, end); if (pmd_trans_huge(*src_pmd)) { int err; + VM_BUG_ON(next-addr != HPAGE_PMD_SIZE); err = copy_huge_pmd(dst_mm, src_mm, dst_pmd, src_pmd, addr, vma); if (err == -ENOMEM) @@ -1015,9 +1016,10 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, do { next = pmd_addr_end(addr, end); if (pmd_trans_huge(*pmd)) { - if (next-addr != HPAGE_PMD_SIZE) + if (next-addr != HPAGE_PMD_SIZE) { + VM_BUG_ON(!rwsem_is_locked(&tlb->mm->mmap_sem)); split_huge_page_pmd(vma->vm_mm, pmd); - else if (zap_huge_pmd(tlb, vma, pmd)) { + } else if (zap_huge_pmd(tlb, vma, pmd)) { (*zap_work)--; continue; } -- cgit v1.2.3-70-g09d2 From 91600e9e592e48736e630851c83da2ad6bf0e91f Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:16 -0800 Subject: thp: fix memory-failure hugetlbfs vs THP collision hugetlbfs was changed to allow memory failure to migrate hugetlbfs pages, and that broke THP because split_huge_page was then called on hugetlbfs pages too. compound_head/compound_order were also run unsafely on THP pages, which can be split at any time. All compound_head() invocations in memory-failure.c that run on pages that aren't pinned and that can be freed and reused from under us (while compound_head is running) are buggy, because compound_head can return a dangling pointer. I'm not fixing that here, as it is a generic memory-failure bug, not specific to THP (it applies to hugetlbfs too), so it can be fixed later, after THP is merged upstream. Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 +- mm/rmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 6a283cc9317..1b43d0ffff6 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -386,7 +386,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct task_struct *tsk; struct anon_vma *av; - if (unlikely(split_huge_page(page))) + if (!PageHuge(page) && unlikely(split_huge_page(page))) return; read_lock(&tasklist_lock); av = page_lock_anon_vma(page); diff --git a/mm/rmap.c b/mm/rmap.c index 3825ae4bc32..c30f33854f9 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1430,7 +1430,7 @@ int try_to_unmap(struct page *page, enum ttu_flags flags) int ret; BUG_ON(!PageLocked(page)); - BUG_ON(PageTransHuge(page)); + VM_BUG_ON(!PageHuge(page) && PageTransHuge(page)); if (unlikely(PageKsm(page))) ret = try_to_unmap_ksm(page, flags); -- cgit v1.2.3-70-g09d2 From 37c2ac7872a9387542616f658d20ac25f5bdb32e Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:16 -0800 Subject: thp: compound_trans_order Add a compound_trans_order() helper to read a compound page's order safely; the extra locking it takes is a no-op for CONFIG_TRANSPARENT_HUGEPAGE=n.
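The helper, as added to include/linux/mm.h in the diff below, serializes the read against __split_huge_page_refcount() with the compound lock:

        static inline int compound_trans_order(struct page *page)
        {
                int order;
                unsigned long flags;

                if (!PageHead(page))
                        return 0;
                /* hold the compound lock so a concurrent split cannot
                 * change the order between the PageHead check and the read */
                flags = compound_lock_irqsave(page);
                order = compound_order(page);
                compound_unlock_irqrestore(page, flags);
                return order;
        }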
Signed-off-by: Andrea Arcangeli Cc: Daisuke Nishimura Cc: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 14 ++++++++++++++ mm/memcontrol.c | 12 ++++++------ mm/memory-failure.c | 12 ++++++------ 3 files changed, 26 insertions(+), 12 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 9c2695beab8..ce97a2bb0b1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -450,6 +450,20 @@ static inline int compound_order(struct page *page) return (unsigned long)page[1].lru.prev; } +static inline int compound_trans_order(struct page *page) +{ + int order; + unsigned long flags; + + if (!PageHead(page)) + return 0; + + flags = compound_lock_irqsave(page); + order = compound_order(page); + compound_unlock_irqrestore(page, flags); + return order; +} + static inline void set_compound_order(struct page *page, unsigned long order) { page[1].lru.prev = (void *)order; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f4ea3410fb4..741206ffdac 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1027,10 +1027,6 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) { struct page_cgroup *pc; struct mem_cgroup_per_zone *mz; - int page_size = PAGE_SIZE; - - if (PageTransHuge(page)) - page_size <<= compound_order(page); if (mem_cgroup_disabled()) return NULL; @@ -2286,8 +2282,10 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, int ret; int page_size = PAGE_SIZE; - if (PageTransHuge(page)) + if (PageTransHuge(page)) { page_size <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + } pc = lookup_page_cgroup(page); /* can happen at boot */ @@ -2558,8 +2556,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) if (PageSwapCache(page)) return NULL; - if (PageTransHuge(page)) + if (PageTransHuge(page)) { page_size <<= compound_order(page); + VM_BUG_ON(!PageTransHuge(page)); + } count = page_size >> PAGE_SHIFT; /* diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1b43d0ffff6..548fbd70f02 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -203,7 +203,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno, #ifdef __ARCH_SI_TRAPNO si.si_trapno = trapno; #endif - si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT; + si.si_addr_lsb = compound_trans_order(compound_head(page)) + PAGE_SHIFT; /* * Don't use force here, it's convenient if the signal * can be temporarily blocked. 
@@ -930,7 +930,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, static void set_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_order(hpage); + int nr_pages = 1 << compound_trans_order(hpage); for (i = 0; i < nr_pages; i++) SetPageHWPoison(hpage + i); } @@ -938,7 +938,7 @@ static void set_page_hwpoison_huge_page(struct page *hpage) static void clear_page_hwpoison_huge_page(struct page *hpage) { int i; - int nr_pages = 1 << compound_order(hpage); + int nr_pages = 1 << compound_trans_order(hpage); for (i = 0; i < nr_pages; i++) ClearPageHWPoison(hpage + i); } @@ -968,7 +968,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) return 0; } - nr_pages = 1 << compound_order(hpage); + nr_pages = 1 << compound_trans_order(hpage); atomic_long_add(nr_pages, &mce_bad_pages); /* @@ -1166,7 +1166,7 @@ int unpoison_memory(unsigned long pfn) return 0; } - nr_pages = 1 << compound_order(page); + nr_pages = 1 << compound_trans_order(page); if (!get_page_unless_zero(page)) { /* @@ -1304,7 +1304,7 @@ static int soft_offline_huge_page(struct page *page, int flags) } done: if (!PageHWPoison(hpage)) - atomic_long_add(1 << compound_order(hpage), &mce_bad_pages); + atomic_long_add(1 << compound_trans_order(hpage), &mce_bad_pages); set_page_hwpoison_huge_page(hpage); dequeue_hwpoisoned_huge_page(hpage); /* keep elevated page count for bad page */ -- cgit v1.2.3-70-g09d2 From a664b2d8555c659127bf8fe049a58449d394a707 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:17 -0800 Subject: thp: madvise(MADV_NOHUGEPAGE) Add madvise MADV_NOHUGEPAGE to mark regions that are not important to be hugepage backed. Return -EINVAL if the vma is not of an anonymous type, or the feature isn't built into the kernel. Never silently return success. 
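From userspace the new advice is used like any other madvise flag. A minimal sketch follows; the MADV_NOHUGEPAGE value of 15 is taken from the uapi headers of this series and is defined locally only in case the build headers predate the patch:

        #include <stdio.h>
        #include <string.h>
        #include <sys/mman.h>

        #ifndef MADV_NOHUGEPAGE
        #define MADV_NOHUGEPAGE 15      /* value from this patch series */
        #endif

        int main(void)
        {
                size_t len = 4 * 1024 * 1024;
                void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                               MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
                if (p == MAP_FAILED)
                        return 1;
                /* opt this anonymous region out of THP; per the changelog
                 * this fails with EINVAL on non-anonymous vmas or kernels
                 * built without THP */
                if (madvise(p, len, MADV_NOHUGEPAGE))
                        perror("madvise(MADV_NOHUGEPAGE)");
                memset(p, 0, len);      /* faults are served with 4k pages */
                return 0;
        }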
Signed-off-by: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 14 ++++++++------ include/linux/khugepaged.h | 7 ++++--- include/linux/mm.h | 1 + mm/huge_memory.c | 41 ++++++++++++++++++++++++++++++----------- mm/madvise.c | 4 +++- 5 files changed, 46 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 9b48c24df26..a8b7e42d19e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -52,10 +52,12 @@ extern pmd_t *page_check_address_pmd(struct page *page, #define HPAGE_PMD_SIZE HPAGE_SIZE #define transparent_hugepage_enabled(__vma) \ - (transparent_hugepage_flags & (1<vm_flags & VM_HUGEPAGE)) + ((transparent_hugepage_flags & \ + (1<vm_flags & VM_HUGEPAGE))) && \ + !((__vma)->vm_flags & VM_NOHUGEPAGE)) #define transparent_hugepage_defrag(__vma) \ ((transparent_hugepage_flags & \ (1< MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif -extern int hugepage_madvise(unsigned long *vm_flags); +extern int hugepage_madvise(unsigned long *vm_flags, int advice); extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -141,7 +143,7 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) -static inline int hugepage_madvise(unsigned long *vm_flags) +static inline int hugepage_madvise(unsigned long *vm_flags, int advice) { BUG(); return 0; diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 552f3184756..6b394f0b514 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -38,9 +38,10 @@ static inline void khugepaged_exit(struct mm_struct *mm) static inline int khugepaged_enter(struct vm_area_struct *vma) { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags)) - if (khugepaged_always() || - (khugepaged_req_madv() && - vma->vm_flags & VM_HUGEPAGE)) + if ((khugepaged_always() || + (khugepaged_req_madv() && + vma->vm_flags & VM_HUGEPAGE)) && + !(vma->vm_flags & VM_NOHUGEPAGE)) if (__khugepaged_enter(vma->vm_mm)) return -ENOMEM; return 0; diff --git a/include/linux/mm.h b/include/linux/mm.h index ce97a2bb0b1..956a35532f4 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -83,6 +83,7 @@ extern unsigned int kobjsize(const void *objp); #define VM_GROWSUP 0x00000200 #else #define VM_GROWSUP 0x00000000 +#define VM_NOHUGEPAGE 0x00000200 /* MADV_NOHUGEPAGE marked this vma */ #endif #define VM_PFNMAP 0x00000400 /* Page-ranges managed without "struct page", just pure PFN */ #define VM_DENYWRITE 0x00000800 /* ETXTBSY on write attempts.. */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f4f6041176a..fce667c0281 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include "internal.h" @@ -1388,18 +1389,36 @@ out: return ret; } -int hugepage_madvise(unsigned long *vm_flags) +int hugepage_madvise(unsigned long *vm_flags, int advice) { - /* - * Be somewhat over-protective like KSM for now! - */ - if (*vm_flags & (VM_HUGEPAGE | VM_SHARED | VM_MAYSHARE | - VM_PFNMAP | VM_IO | VM_DONTEXPAND | - VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | - VM_MIXEDMAP | VM_SAO)) - return -EINVAL; - - *vm_flags |= VM_HUGEPAGE; + switch (advice) { + case MADV_HUGEPAGE: + /* + * Be somewhat over-protective like KSM for now! 
+ */ + if (*vm_flags & (VM_HUGEPAGE | + VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return -EINVAL; + *vm_flags &= ~VM_NOHUGEPAGE; + *vm_flags |= VM_HUGEPAGE; + break; + case MADV_NOHUGEPAGE: + /* + * Be somewhat over-protective like KSM for now! + */ + if (*vm_flags & (VM_NOHUGEPAGE | + VM_SHARED | VM_MAYSHARE | + VM_PFNMAP | VM_IO | VM_DONTEXPAND | + VM_RESERVED | VM_HUGETLB | VM_INSERTPAGE | + VM_MIXEDMAP | VM_SAO)) + return -EINVAL; + *vm_flags &= ~VM_HUGEPAGE; + *vm_flags |= VM_NOHUGEPAGE; + break; + } return 0; } diff --git a/mm/madvise.c b/mm/madvise.c index ecde40a401c..bbac126e03e 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -72,7 +72,8 @@ static long madvise_behavior(struct vm_area_struct * vma, goto out; break; case MADV_HUGEPAGE: - error = hugepage_madvise(&new_flags); + case MADV_NOHUGEPAGE: + error = hugepage_madvise(&new_flags, behavior); if (error) goto out; break; @@ -290,6 +291,7 @@ madvise_behavior_valid(int behavior) #endif #ifdef CONFIG_TRANSPARENT_HUGEPAGE case MADV_HUGEPAGE: + case MADV_NOHUGEPAGE: #endif return 1; -- cgit v1.2.3-70-g09d2 From 60ab3244ec85c44276c585a2a20d3750402e1cf4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:18 -0800 Subject: thp: khugepaged: make khugepaged aware about madvise MADV_HUGEPAGE and MADV_NOHUGEPAGE were fully effective only if run after mmap and before touching the memory. While this is enough for most usages, it's little effort to make madvise more dynamic at runtime on an existing mapping by making khugepaged aware about madvise. MADV_HUGEPAGE: register in khugepaged immediately without waiting a page fault (that may not ever happen if all pages are already mapped and the "enabled" knob was set to madvise during the initial page faults). MADV_NOHUGEPAGE: skip vmas marked VM_NOHUGEPAGE in khugepaged to stop collapsing pages where not needed. 
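Condensed from the diff below, the scanner side of the change: khugepaged_scan_mm_slot() now skips vmas flagged VM_NOHUGEPAGE, while hugepage_madvise() registers the mm with khugepaged immediately on MADV_HUGEPAGE:

        if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) ||
            (vma->vm_flags & VM_NOHUGEPAGE)) {
                progress++;     /* marked MADV_NOHUGEPAGE: never collapse */
                continue;
        }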
[akpm@linux-foundation.org: tweak comment] Signed-off-by: Andrea Arcangeli Cc: Michael Kerrisk Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 6 ++++-- mm/huge_memory.c | 23 +++++++++++++++++++---- mm/madvise.c | 2 +- 3 files changed, 24 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index a8b7e42d19e..bddfba1d7b8 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -105,7 +105,8 @@ extern void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd); #if HPAGE_PMD_ORDER > MAX_ORDER #error "hugepages can't be allocated by the buddy allocator" #endif -extern int hugepage_madvise(unsigned long *vm_flags, int advice); +extern int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice); extern void __vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, @@ -143,7 +144,8 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) -static inline int hugepage_madvise(unsigned long *vm_flags, int advice) +static inline int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) { BUG(); return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index fce667c0281..004c9c2aac7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1389,7 +1389,8 @@ out: return ret; } -int hugepage_madvise(unsigned long *vm_flags, int advice) +int hugepage_madvise(struct vm_area_struct *vma, + unsigned long *vm_flags, int advice) { switch (advice) { case MADV_HUGEPAGE: @@ -1404,6 +1405,13 @@ int hugepage_madvise(unsigned long *vm_flags, int advice) return -EINVAL; *vm_flags &= ~VM_NOHUGEPAGE; *vm_flags |= VM_HUGEPAGE; + /* + * If the vma become good for khugepaged to scan, + * register it here without waiting a page fault that + * may not happen any time soon. + */ + if (unlikely(khugepaged_enter_vma_merge(vma))) + return -ENOMEM; break; case MADV_NOHUGEPAGE: /* @@ -1417,6 +1425,11 @@ int hugepage_madvise(unsigned long *vm_flags, int advice) return -EINVAL; *vm_flags &= ~VM_HUGEPAGE; *vm_flags |= VM_NOHUGEPAGE; + /* + * Setting VM_NOHUGEPAGE will prevent khugepaged from scanning + * this vma even if we leave the mm registered in khugepaged if + * it got registered before VM_NOHUGEPAGE was set. 
+ */ break; } @@ -1784,7 +1797,8 @@ static void collapse_huge_page(struct mm_struct *mm, if (address < hstart || address + HPAGE_PMD_SIZE > hend) goto out; - if (!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) + if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) goto out; /* VM_PFNMAP vmas may have vm_ops null but vm_file set */ @@ -2007,8 +2021,9 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, break; } - if (!(vma->vm_flags & VM_HUGEPAGE) && - !khugepaged_always()) { + if ((!(vma->vm_flags & VM_HUGEPAGE) && + !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) { progress++; continue; } diff --git a/mm/madvise.c b/mm/madvise.c index bbac126e03e..2221491ed50 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -73,7 +73,7 @@ static long madvise_behavior(struct vm_area_struct * vma, break; case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: - error = hugepage_madvise(&new_flags, behavior); + error = hugepage_madvise(vma, &new_flags, behavior); if (error) goto out; break; -- cgit v1.2.3-70-g09d2 From 29ad768cfc08611a4c1070d0f13f82eeea2bac7b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:19 -0800 Subject: thp: KSM on THP This makes KSM fully operational with THP pages. Subpages are scanned while the hugepage is still in place and delivering maximum CPU performance; only when there is a match and we are going to deduplicate memory is the hugepage containing the matching subpage split. There will be no false sharing between ksmd and khugepaged. khugepaged won't collapse 2MB virtual regions with KSM pages inside. ksmd should also only split pages when the checksum matches and we're likely to split a hugepage for some long-lived KSM page (the usual KSM heuristic to avoid sharing pages that get de-cowed). Signed-off-by: Andrea Arcangeli Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 58 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index e2b0afd0a03..4d5a681923b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -412,6 +412,29 @@ out: up_read(&mm->mmap_sem); } +static struct page *page_trans_compound_anon(struct page *page) +{ + if (PageTransCompound(page)) { + struct page *head; + head = compound_head(page); + /* + * head may be a dangling pointer. + * __split_huge_page_refcount clears PageTail + * before overwriting first_page, so if + * PageTail is still there it means the head + * pointer isn't dangling.
+ */ + if (head != page) { + smp_rmb(); + if (!PageTransCompound(page)) + return NULL; + } + if (PageAnon(head)) + return head; + } + return NULL; +} + static struct page *get_mergeable_page(struct rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; @@ -431,7 +454,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) page = follow_page(vma, addr, FOLL_GET); if (IS_ERR_OR_NULL(page)) goto out; - if (PageAnon(page) && !PageTransCompound(page)) { + if (PageAnon(page) || page_trans_compound_anon(page)) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } else { @@ -709,6 +732,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page, if (addr == -EFAULT) goto out; + BUG_ON(PageTransCompound(page)); ptep = page_check_address(page, mm, addr, &ptl, 0); if (!ptep) goto out; @@ -784,6 +808,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, goto out; pmd = pmd_offset(pud, addr); + BUG_ON(pmd_trans_huge(*pmd)); if (!pmd_present(*pmd)) goto out; @@ -811,6 +836,33 @@ out: return err; } +static int page_trans_compound_anon_split(struct page *page) +{ + int ret = 0; + struct page *transhuge_head = page_trans_compound_anon(page); + if (transhuge_head) { + /* Get the reference on the head to split it. */ + if (get_page_unless_zero(transhuge_head)) { + /* + * Recheck we got the reference while the head + * was still anonymous. + */ + if (PageAnon(transhuge_head)) + ret = split_huge_page(transhuge_head); + else + /* + * Retry later if split_huge_page run + * from under us. + */ + ret = 1; + put_page(transhuge_head); + } else + /* Retry later if split_huge_page run from under us. */ + ret = 1; + } + return ret; +} + /* * try_to_merge_one_page - take two pages and merge them into one * @vma: the vma that holds the pte pointing to page @@ -831,6 +883,9 @@ static int try_to_merge_one_page(struct vm_area_struct *vma, if (!(vma->vm_flags & VM_MERGEABLE)) goto out; + if (PageTransCompound(page) && page_trans_compound_anon_split(page)) + goto out; + BUG_ON(PageTransCompound(page)); if (!PageAnon(page)) goto out; @@ -1285,14 +1340,8 @@ next_mm: cond_resched(); continue; } - if (PageTransCompound(*page)) { - put_page(*page); - ksm_scan.address &= HPAGE_PMD_MASK; - ksm_scan.address += HPAGE_PMD_SIZE; - cond_resched(); - continue; - } - if (PageAnon(*page)) { + if (PageAnon(*page) || + page_trans_compound_anon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); rmap_item = get_next_rmap_item(slot, -- cgit v1.2.3-70-g09d2 From 22e5c47ee238abe636655c3862ed28d6eb084ad4 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 13 Jan 2011 15:47:20 -0800 Subject: thp: add compound_trans_head() helper Cleanup some code with common compound_trans_head helper. 
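The helper, as defined in the diff below. The ordering argument: __split_huge_page_refcount() clears PageTail before overwriting first_page, so a PageTail recheck after the smp_rmb() proves the head pointer that was read is not dangling:

        static inline struct page *compound_trans_head(struct page *page)
        {
                if (PageTail(page)) {
                        struct page *head = page->first_page;
                        smp_rmb();
                        if (PageTail(page))
                                return head;    /* head was valid when read */
                }
                return page;    /* not (or no longer) a tail page */
        }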
Signed-off-by: Andrea Arcangeli Cc: Hugh Dickins Cc: Johannes Weiner Cc: Marcelo Tosatti Cc: Avi Kivity Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/huge_mm.h | 18 ++++++++++++++++++ mm/ksm.c | 15 +++------------ virt/kvm/kvm_main.c | 38 ++++++++++++++------------------------ 3 files changed, 35 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index bddfba1d7b8..8e6c8c42bc3 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -126,6 +126,23 @@ static inline int hpage_nr_pages(struct page *page) return HPAGE_PMD_NR; return 1; } +static inline struct page *compound_trans_head(struct page *page) +{ + if (PageTail(page)) { + struct page *head; + head = page->first_page; + smp_rmb(); + /* + * head may be a dangling pointer. + * __split_huge_page_refcount clears PageTail before + * overwriting first_page, so if PageTail is still + * there it means the head pointer isn't dangling. + */ + if (PageTail(page)) + return head; + } + return page; +} #else /* CONFIG_TRANSPARENT_HUGEPAGE */ #define HPAGE_PMD_SHIFT ({ BUG(); 0; }) #define HPAGE_PMD_MASK ({ BUG(); 0; }) @@ -144,6 +161,7 @@ static inline int split_huge_page(struct page *page) do { } while (0) #define wait_split_huge_page(__anon_vma, __pmd) \ do { } while (0) +#define compound_trans_head(page) compound_head(page) static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { diff --git a/mm/ksm.c b/mm/ksm.c index 4d5a681923b..33781de0b6b 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -415,20 +415,11 @@ out: static struct page *page_trans_compound_anon(struct page *page) { if (PageTransCompound(page)) { - struct page *head; - head = compound_head(page); + struct page *head = compound_trans_head(page); /* - * head may be a dangling pointer. - * __split_huge_page_refcount clears PageTail - * before overwriting first_page, so if - * PageTail is still there it means the head - * pointer isn't dangling. + * head may actually be splitted and freed from under + * us but it's ok here. */ - if (head != page) { - smp_rmb(); - if (!PageTransCompound(page)) - return NULL; - } if (PageAnon(head)) return head; } diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 4286d476651..f29abeb6a91 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -104,34 +104,24 @@ static pfn_t fault_pfn; inline int kvm_is_mmio_pfn(pfn_t pfn) { if (pfn_valid(pfn)) { - struct page *head; + int reserved; struct page *tail = pfn_to_page(pfn); - head = compound_head(tail); + struct page *head = compound_trans_head(tail); + reserved = PageReserved(head); if (head != tail) { - smp_rmb(); /* - * head may be a dangling pointer. - * __split_huge_page_refcount clears PageTail - * before overwriting first_page, so if - * PageTail is still there it means the head - * pointer isn't dangling. + * "head" is not a dangling pointer + * (compound_trans_head takes care of that) + * but the hugepage may have been splitted + * from under us (and we may not hold a + * reference count on the head page so it can + * be reused before we run PageReferenced), so + * we've to check PageTail before returning + * what we just read. 
*/ - if (PageTail(tail)) { - /* - * the "head" is not a dangling - * pointer but the hugepage may have - * been splitted from under us (and we - * may not hold a reference count on - * the head page so it can be reused - * before we run PageReferenced), so - * we've to recheck PageTail before - * returning what we just read. - */ - int reserved = PageReserved(head); - smp_rmb(); - if (PageTail(tail)) - return reserved; - } + smp_rmb(); + if (PageTail(tail)) + return reserved; } return PageReserved(tail); } -- cgit v1.2.3-70-g09d2 From 29c1f677d424e8c5683a837fc4f03fc9f19201d7 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 13 Jan 2011 15:47:21 -0800 Subject: mm: migration: use rcu_dereference_protected when dereferencing the radix tree slot during file page migration migrate_pages() -> unmap_and_move() only calls rcu_read_lock() for anonymous pages, as introduced by git commit 989f89c57e6361e7d16fbd9572b5da7d313b073d ("fix rcu_read_lock() in page migraton"). The point of the RCU protection there is part of getting a stable reference to anon_vma and is only held for anon pages as file pages are locked which is sufficient protection against freeing. However, while a file page's mapping is being migrated, the radix tree is double checked to ensure it is the expected page. This uses radix_tree_deref_slot() -> rcu_dereference() without the RCU lock held triggering the following warning. [ 173.674290] =================================================== [ 173.676016] [ INFO: suspicious rcu_dereference_check() usage. ] [ 173.676016] --------------------------------------------------- [ 173.676016] include/linux/radix-tree.h:145 invoked rcu_dereference_check() without protection! [ 173.676016] [ 173.676016] other info that might help us debug this: [ 173.676016] [ 173.676016] [ 173.676016] rcu_scheduler_active = 1, debug_locks = 0 [ 173.676016] 1 lock held by hugeadm/2899: [ 173.676016] #0: (&(&inode->i_data.tree_lock)->rlock){..-.-.}, at: [] migrate_page_move_mapping+0x40/0x1ab [ 173.676016] [ 173.676016] stack backtrace: [ 173.676016] Pid: 2899, comm: hugeadm Not tainted 2.6.37-rc5-autobuild [ 173.676016] Call Trace: [ 173.676016] [] ? printk+0x14/0x1b [ 173.676016] [] lockdep_rcu_dereference+0x7d/0x86 [ 173.676016] [] migrate_page_move_mapping+0xca/0x1ab [ 173.676016] [] migrate_page+0x23/0x39 [ 173.676016] [] buffer_migrate_page+0x22/0x107 [ 173.676016] [] ? buffer_migrate_page+0x0/0x107 [ 173.676016] [] move_to_new_page+0x9a/0x1ae [ 173.676016] [] migrate_pages+0x1e7/0x2fa This patch introduces radix_tree_deref_slot_protected() which calls rcu_dereference_protected(). Users of it must pass in the mapping->tree_lock that is protecting this dereference. Holding the tree lock protects against parallel updaters of the radix tree meaning that rcu_dereference_protected is allowable. 
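After the patch (diff below), the call site in migrate_page_move_mapping() justifies the dereference by the tree lock it already holds rather than by rcu_read_lock():

        if (page_count(page) != expected_count ||
            radix_tree_deref_slot_protected(pslot,
                                            &mapping->tree_lock) != page) {
                spin_unlock_irq(&mapping->tree_lock);
                return -EAGAIN; /* raced with another mapping update */
        }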
[akpm@linux-foundation.org: remove unneeded casts] Signed-off-by: Mel Gorman Cc: Minchan Kim Cc: KAMEZAWA Hiroyuki Cc: Milton Miller Cc: Nick Piggin Cc: Wu Fengguang Cc: [2.6.37.early] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 16 ++++++++++++++++ mm/migrate.c | 4 ++-- 2 files changed, 18 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index ab2baa5c488..23241c2fecc 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -145,6 +145,22 @@ static inline void *radix_tree_deref_slot(void **pslot) return rcu_dereference(*pslot); } +/** + * radix_tree_deref_slot_protected - dereference a slot without RCU lock but with tree lock held + * @pslot: pointer to slot, returned by radix_tree_lookup_slot + * Returns: item that was stored in that slot with any direct pointer flag + * removed. + * + * Similar to radix_tree_deref_slot but only used during migration when a pages + * mapping is being moved. The caller does not hold the RCU read lock but it + * must hold the tree lock to prevent parallel updates. + */ +static inline void *radix_tree_deref_slot_protected(void **pslot, + spinlock_t *treelock) +{ + return rcu_dereference_protected(*pslot, lockdep_is_held(treelock)); +} + /** * radix_tree_deref_retry - check radix_tree_deref_slot * @arg: pointer returned by radix_tree_deref_slot diff --git a/mm/migrate.c b/mm/migrate.c index 1a531b760b3..89a6bc8cd30 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -248,7 +248,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - (struct page *)radix_tree_deref_slot(pslot) != page) { + radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } @@ -320,7 +320,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, expected_count = 2 + page_has_private(page); if (page_count(page) != expected_count || - (struct page *)radix_tree_deref_slot(pslot) != page) { + radix_tree_deref_slot_protected(pslot, &mapping->tree_lock) != page) { spin_unlock_irq(&mapping->tree_lock); return -EAGAIN; } -- cgit v1.2.3-70-g09d2 From 32d6feadf4e17ea9b98071be9bbf402a74a4f818 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 13 Jan 2011 15:47:22 -0800 Subject: mm/hugetlb.c: fix error-path memory leak in nr_hugepages_store_common() The NODEMASK_ALLOC macro may dynamically allocate memory for its second argument ('nodes_allowed' in this context). In nr_hugepages_store_common() we may abort early if strict_strtoul() fails, but in that case we do not free the memory already allocated to 'nodes_allowed', causing a memory leak. This patch closes the leak by freeing the memory in the error path. 
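The fixed error path, excerpted from the diff below:

        err = strict_strtoul(buf, 10, &count);
        if (err) {
                NODEMASK_FREE(nodes_allowed);   /* was leaked on this path */
                return 0;
        }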
[akpm@linux-foundation.org: use NODEMASK_FREE, per Minchan Kim] Signed-off-by: Jesper Juhl Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7bf223d6677..8e31cda6fc2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1374,8 +1374,10 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); err = strict_strtoul(buf, 10, &count); - if (err) + if (err) { + NODEMASK_FREE(nodes_allowed); return 0; + } h = kobj_to_hstate(kobj, &nid); if (nid == NUMA_NO_NODE) { -- cgit v1.2.3-70-g09d2 From 5520e89485252c759ee60d313e9422447659947b Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 13 Jan 2011 15:47:23 -0800 Subject: brk: fix min_brk lower bound computation for COMPAT_BRK Even if CONFIG_COMPAT_BRK is set in the kernel configuration, it can still be overridden by the randomize_va_space sysctl. If this is the case, the min_brk computation in the sys_brk() implementation is wrong, as it solely takes the COMPAT_BRK setting into account, assuming that the brk start is not randomized. But that might not be the case if the randomize_va_space sysctl was set to '2' at the time the binary was loaded from disk. In such a case, the check has to be done in the same way as in the !CONFIG_COMPAT_BRK case. In addition to that, the check for the COMPAT_BRK case introduced back in a5b4592c ("brk: make sys_brk() honor COMPAT_BRK when computing lower bound") is slightly wrong -- the lower bound shouldn't be mm->end_code, but mm->end_data instead, as that's where legacy applications expect the brk section to start (i.e. immediately after the last global variable). [akpm@linux-foundation.org: fix comment] Signed-off-by: Jiri Kosina Cc: Geert Uytterhoeven Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 73cc648873d..2ec8eb5a9cd 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -254,7 +254,15 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) down_write(&mm->mmap_sem); #ifdef CONFIG_COMPAT_BRK - min_brk = mm->end_code; + /* + * CONFIG_COMPAT_BRK can still be overridden by setting + * randomize_va_space to 2, which will still cause mm->start_brk + * to be arbitrarily shifted + */ + if (mm->start_brk > PAGE_ALIGN(mm->end_data)) + min_brk = mm->start_brk; + else + min_brk = mm->end_data; #else min_brk = mm->start_brk; #endif -- cgit v1.2.3-70-g09d2 From 43506fad21ca3d8dc59e768ff458f7c5e5c01086 Mon Sep 17 00:00:00 2001 From: KyongHo Cho Date: Thu, 13 Jan 2011 15:47:24 -0800 Subject: mm/page_alloc.c: simplify calculation of combined index of adjacent buddy lists The previous approach for calculating the combined index was (page_idx & ~(1 << order)), but (page_idx & buddy_idx) yields the same result. This reduces instructions slightly and enhances readability.
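A worked example with hypothetical values makes the equivalence concrete; take order = 2 and page_idx = 12:

	buddy_idx    = page_idx ^ (1 << order)  = 12 ^ 4 = 8
	combined_idx = buddy_idx & page_idx     =  8 & 12 = 8

which matches the old expression page_idx & ~(1 << order) = 12 & ~4 = 8, while letting __free_one_page() reuse the buddy_idx it has already computed.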
[akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: fix used-uninitialised warning] Signed-off-by: KyongHo Cho Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9dfe49bceff..bda1db301d4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -427,18 +427,10 @@ static inline void rmv_page_order(struct page *page) * * Assumption: *_mem_map is contiguous at least up to MAX_ORDER */ -static inline struct page * -__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) -{ - unsigned long buddy_idx = page_idx ^ (1 << order); - - return page + (buddy_idx - page_idx); -} - static inline unsigned long -__find_combined_index(unsigned long page_idx, unsigned int order) +__find_buddy_index(unsigned long page_idx, unsigned int order) { - return (page_idx & ~(1 << order)); + return page_idx ^ (1 << order); } /* @@ -500,6 +492,7 @@ static inline void __free_one_page(struct page *page, { unsigned long page_idx; unsigned long combined_idx; + unsigned long uninitialized_var(buddy_idx); struct page *buddy; if (unlikely(PageCompound(page))) @@ -514,7 +507,8 @@ static inline void __free_one_page(struct page *page, VM_BUG_ON(bad_range(zone, page)); while (order < MAX_ORDER-1) { - buddy = __page_find_buddy(page, page_idx, order); + buddy_idx = __find_buddy_index(page_idx, order); + buddy = page + (buddy_idx - page_idx); if (!page_is_buddy(page, buddy, order)) break; @@ -522,7 +516,7 @@ static inline void __free_one_page(struct page *page, list_del(&buddy->lru); zone->free_area[order].nr_free--; rmv_page_order(buddy); - combined_idx = __find_combined_index(page_idx, order); + combined_idx = buddy_idx & page_idx; page = page + (combined_idx - page_idx); page_idx = combined_idx; order++; @@ -539,9 +533,10 @@ static inline void __free_one_page(struct page *page, */ if ((order < MAX_ORDER-2) && pfn_valid_within(page_to_pfn(buddy))) { struct page *higher_page, *higher_buddy; - combined_idx = __find_combined_index(page_idx, order); - higher_page = page + combined_idx - page_idx; - higher_buddy = __page_find_buddy(higher_page, combined_idx, order + 1); + combined_idx = buddy_idx & page_idx; + higher_page = page + (combined_idx - page_idx); + buddy_idx = __find_buddy_index(combined_idx, order + 1); + higher_buddy = page + (buddy_idx - combined_idx); if (page_is_buddy(higher_page, higher_buddy, order + 1)) { list_add_tail(&page->lru, &zone->free_area[order].free_list[migratetype]); -- cgit v1.2.3-70-g09d2 From 84bc227d7fde049a568cd58a5610613feedc0dff Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Thu, 13 Jan 2011 15:47:24 -0800 Subject: mm/dmapool.c: take lock only once in dma_pool_free() dma_pool_free() scans for the page to free in the pool list while holding the pool lock. Then it releases the lock, basically to acquire it immediately again. Modify the code to only take the lock once. This will do some additional loops and computations with the lock held if memory debugging is activated. If it is not activated, the only new operations under the lock are one if and one subtraction.
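In outline, the locking now looks like this (a sketch of the patch below, not new API; pool_find_page() is simply called with pool->lock already held):

	spin_lock_irqsave(&pool->lock, flags);
	page = pool_find_page(pool, dma);	/* list walk under the lock */
	if (!page) {
		spin_unlock_irqrestore(&pool->lock, flags);
		return;				/* report bad dma */
	}
	/* DMAPOOL_DEBUG checks likewise unlock before bailing out */
	page->in_use--;
	*(int *)vaddr = page->offset;
	page->offset = offset;
	spin_unlock_irqrestore(&pool->lock, flags);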
Signed-off-by: Rolf Eike Beer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/dmapool.c b/mm/dmapool.c index 4df2de77e06..a2f6295b4df 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -355,20 +355,15 @@ EXPORT_SYMBOL(dma_pool_alloc); static struct dma_page *pool_find_page(struct dma_pool *pool, dma_addr_t dma) { - unsigned long flags; struct dma_page *page; - spin_lock_irqsave(&pool->lock, flags); list_for_each_entry(page, &pool->page_list, page_list) { if (dma < page->dma) continue; if (dma < (page->dma + pool->allocation)) - goto done; + return page; } - page = NULL; - done: - spin_unlock_irqrestore(&pool->lock, flags); - return page; + return NULL; } /** @@ -386,8 +381,10 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) unsigned long flags; unsigned int offset; + spin_lock_irqsave(&pool->lock, flags); page = pool_find_page(pool, dma); if (!page) { + spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, "dma_pool_free %s, %p/%lx (bad dma)\n", @@ -401,6 +398,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) offset = vaddr - page->vaddr; #ifdef DMAPOOL_DEBUG if ((dma - page->dma) != offset) { + spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, "dma_pool_free %s, %p (bad vaddr)/%Lx\n", @@ -418,6 +416,7 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) chain = *(int *)(page->vaddr + chain); continue; } + spin_unlock_irqrestore(&pool->lock, flags); if (pool->dev) dev_err(pool->dev, "dma_pool_free %s, dma %Lx " "already free\n", pool->name, @@ -432,7 +431,6 @@ void dma_pool_free(struct dma_pool *pool, void *vaddr, dma_addr_t dma) memset(vaddr, POOL_POISON_FREED, pool->size); #endif - spin_lock_irqsave(&pool->lock, flags); page->in_use--; *(int *)vaddr = page->offset; page->offset = offset; -- cgit v1.2.3-70-g09d2 From 684265d4a30f133162f06ddb2e5010608e60e4bb Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 13 Jan 2011 15:47:25 -0800 Subject: mm/dmapool.c: use TASK_UNINTERRUPTIBLE in dma_pool_alloc() As it stands this code will degenerate into a busy-wait if the calling task has signal_pending(). Cc: Rolf Eike Beer Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/dmapool.c b/mm/dmapool.c index a2f6295b4df..03bf3bb4519 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -324,7 +324,7 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, if (mem_flags & __GFP_WAIT) { DECLARE_WAITQUEUE(wait, current); - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_UNINTERRUPTIBLE); __add_wait_queue(&pool->waitq, &wait); spin_unlock_irqrestore(&pool->lock, flags); -- cgit v1.2.3-70-g09d2 From 08d4a24659f1284f33e574211435aa12ce968477 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 13 Jan 2011 15:47:26 -0800 Subject: hugetlb: check the return value of string conversion in sysctl handler proc_doulongvec_minmax may fail if the given buffer doesn't represent a valid number. If we provide something invalid we will initialize the resulting value (nr_overcommit_huge_pages in this case) to a random value from the stack. The issue was introduced by a3d0c6aa when the default handler has been replaced by the helper function where we do not check the return value. 
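The fix follows the usual sysctl-helper pattern (a sketch, not the exact hunk; the reproducer and diff follow below):

	ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
	if (ret)
		return ret;	/* on failure, tmp holds stack garbage */
	if (write) {
		/* ... only now commit tmp ... */
	}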
Reproducer:

	echo "" > /proc/sys/vm/nr_overcommit_hugepages

[akpm@linux-foundation.org: correctly propagate proc_doulongvec_minmax return code] Signed-off-by: Michal Hocko Cc: CAI Qian Cc: Nishanth Aravamudan Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8e31cda6fc2..363c4d22602 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1859,13 +1859,16 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, { struct hstate *h = &default_hstate; unsigned long tmp; + int ret; if (!write) tmp = h->max_huge_pages; table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret) + goto out; if (write) { NODEMASK_ALLOC(nodemask_t, nodes_allowed, @@ -1880,8 +1883,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, if (nodes_allowed != &node_states[N_HIGH_MEMORY]) NODEMASK_FREE(nodes_allowed); } - - return 0; +out: + return ret; } int hugetlb_sysctl_handler(struct ctl_table *table, int write, @@ -1919,21 +1922,24 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, { struct hstate *h = &default_hstate; unsigned long tmp; + int ret; if (!write) tmp = h->nr_overcommit_huge_pages; table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, buffer, length, ppos); + ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); + if (ret) + goto out; if (write) { spin_lock(&hugetlb_lock); h->nr_overcommit_huge_pages = tmp; spin_unlock(&hugetlb_lock); } - - return 0; +out: + return ret; } #endif /* CONFIG_SYSCTL */ -- cgit v1.2.3-70-g09d2 From adbe8726dc2a3805630d517270db17e3af86e526 Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 13 Jan 2011 15:47:27 -0800 Subject: hugetlb: do not allow pagesize >= MAX_ORDER pool adjustment Huge pages with order >= MAX_ORDER must be allocated at boot via the kernel command line; they cannot be allocated or freed once the kernel is up and running. Currently we allow values to be written to the sysfs and sysctl files controlling pool size for these huge page sizes. This patch makes the store functions for nr_hugepages and nr_overcommit_hugepages return -EINVAL when the pool for a page size >= MAX_ORDER is changed.
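For example (hedged, with typical x86_64 values where MAX_ORDER is 11): a 2MB huge page has order 9 and stays adjustable at runtime, while a 1GB page has order 18, so its pool stores now bail out:

	h = kobj_to_hstate(kobj, &nid);
	if (h->order >= MAX_ORDER)	/* gigantic pages: boot-time only */
		return -EINVAL;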
[akpm@linux-foundation.org: avoid multiple return paths in nr_hugepages_store_common()] [caiqian@redhat.com: add checking in hugetlb_overcommit_handler()] Signed-off-by: Eric B Munson Reported-by: CAI Qian Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Nishanth Aravamudan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 363c4d22602..ce8e5bb6f03 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1363,6 +1363,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj, return sprintf(buf, "%lu\n", nr_huge_pages); } + static ssize_t nr_hugepages_store_common(bool obey_mempolicy, struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t len) @@ -1375,11 +1376,16 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, err = strict_strtoul(buf, 10, &count); if (err) { - NODEMASK_FREE(nodes_allowed); - return 0; + err = 0; /* This seems wrong */ + goto out; } h = kobj_to_hstate(kobj, &nid); + if (h->order >= MAX_ORDER) { + err = -EINVAL; + goto out; + } + if (nid == NUMA_NO_NODE) { /* * global hstate attribute @@ -1405,6 +1411,9 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, NODEMASK_FREE(nodes_allowed); return len; +out: + NODEMASK_FREE(nodes_allowed); + return err; } static ssize_t nr_hugepages_show(struct kobject *kobj, @@ -1447,6 +1456,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, struct hstate *h = kobj_to_hstate(kobj, NULL); return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); } + static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { @@ -1454,6 +1464,9 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, unsigned long input; struct hstate *h = kobj_to_hstate(kobj, NULL); + if (h->order >= MAX_ORDER) + return -EINVAL; + err = strict_strtoul(buf, 10, &input); if (err) return 0; @@ -1864,6 +1877,9 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy, if (!write) tmp = h->max_huge_pages; + if (write && h->order >= MAX_ORDER) + return -EINVAL; + table->data = &tmp; table->maxlen = sizeof(unsigned long); ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); @@ -1927,6 +1943,9 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, if (!write) tmp = h->nr_overcommit_huge_pages; + if (write && h->order >= MAX_ORDER) + return -EINVAL; + table->data = &tmp; table->maxlen = sizeof(unsigned long); ret = proc_doulongvec_minmax(table, write, buffer, length, ppos); -- cgit v1.2.3-70-g09d2 From 73ae31e5986a4c0ee84bfd13ccd9b57a98956f6f Mon Sep 17 00:00:00 2001 From: Eric B Munson Date: Thu, 13 Jan 2011 15:47:28 -0800 Subject: hugetlb: fix handling of parse errors in sysfs When parsing changes to the huge page pool sizes made from userspace via the sysfs interface, bogus input values are being covered up by nr_hugepages_store_common and nr_overcommit_hugepages_store returning 0 when strict_strtoul returns an error. This can cause an infinite loop in the nr_hugepages_store code. This patch changes the return value for these functions to -EINVAL when strict_strtoul returns an error. 
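The underlying rule, sketched: a sysfs ->store() method must never return 0 on bad input, because the write() then returns 0 ("nothing consumed") and callers such as echo re-issue the same write forever:

	err = strict_strtoul(buf, 10, &input);
	if (err)
		return err;	/* -EINVAL, not 0 */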
Signed-off-by: Eric B Munson Reported-by: CAI Qian Cc: Andrea Arcangeli Cc: Eric B Munson Cc: Michal Hocko Cc: Nishanth Aravamudan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index ce8e5bb6f03..bb0b7c12801 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1375,10 +1375,8 @@ static ssize_t nr_hugepages_store_common(bool obey_mempolicy, NODEMASK_ALLOC(nodemask_t, nodes_allowed, GFP_KERNEL | __GFP_NORETRY); err = strict_strtoul(buf, 10, &count); - if (err) { - err = 0; /* This seems wrong */ + if (err) goto out; - } h = kobj_to_hstate(kobj, &nid); if (h->order >= MAX_ORDER) { @@ -1469,7 +1467,7 @@ static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, err = strict_strtoul(buf, 10, &input); if (err) - return 0; + return err; spin_lock(&hugetlb_lock); h->nr_overcommit_huge_pages = input; -- cgit v1.2.3-70-g09d2 From 2919bfd0758257c469abef8c26c3e516bbebb851 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 13 Jan 2011 15:47:29 -0800 Subject: ksm: drain pagevecs to lru It was hard to explain the page counts which were causing new LTP tests of KSM to fail: we need to drain the per-cpu pagevecs to LRU occasionally. Signed-off-by: Hugh Dickins Reported-by: CAI Qian Cc:Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 33781de0b6b..c2b2a94f9d6 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1296,6 +1296,18 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) slot = ksm_scan.mm_slot; if (slot == &ksm_mm_head) { + /* + * A number of pages can hang around indefinitely on per-cpu + * pagevecs, raised page count preventing write_protect_page + * from merging them. Though it doesn't really matter much, + * it is puzzling to see some stuck in pages_volatile until + * other activity jostles them out, and they also prevented + * LTP's KSM test from succeeding deterministically; so drain + * them here (here rather than on entry to ksm_do_scan(), + * so we don't IPI too often when pages_to_scan is set low). + */ + lru_add_drain_all(); + root_unstable_tree = RB_ROOT; spin_lock(&ksm_mmlist_lock); -- cgit v1.2.3-70-g09d2 From 1ce82b69e96c838d007f316b8347b911fdfa9842 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 13 Jan 2011 15:47:30 -0800 Subject: mm: fix migration hangs on anon_vma lock Increased usage of page migration in mmotm reveals that the anon_vma locking in unmap_and_move() has been deficient since 2.6.36 (or even earlier). Review at the time of f18194275c39835cb84563500995e0d503a32d9a ("mm: fix hang on anon_vma->root->lock") missed the issue here: the anon_vma to which we get a reference may already have been freed back to its slab (it is in use when we check page_mapped, but that can change), and so its anon_vma->root may be switched at any moment by reuse in anon_vma_prepare. Perhaps we could fix that with a get_anon_vma_unless_zero(), but let's not: just rely on page_lock_anon_vma() to do all the hard thinking for us, then we don't need any rcu read locking over here. In removing the rcu_unlock label: since PageAnon is a bit in page->mapping, it's impossible for a !page->mapping page to be anon; but insert VM_BUG_ON in case the implementation ever changes. 
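The resulting lifetime rule, in outline (a sketch of the pattern the diff below adopts):

	anon_vma = page_lock_anon_vma(page);	/* revalidates page->mapping under the lock */
	if (anon_vma) {
		get_anon_vma(anon_vma);		/* pin across the migration */
		page_unlock_anon_vma(anon_vma);
	}
	/* ... unmap, copy, remap ... */
	if (anon_vma)
		drop_anon_vma(anon_vma);	/* drop the pin */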
[akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Hugh Dickins Reviewed-by: Mel Gorman Reviewed-by: Rik van Riel Cc: Naoya Horiguchi Cc: "Jun'ichi Nomura" Cc: Andi Kleen Cc: [2.6.37, 2.6.36] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 48 +++++++++++++++++++----------------------------- 1 file changed, 19 insertions(+), 29 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 89a6bc8cd30..a20cf12eded 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -622,7 +622,6 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, int *result = NULL; struct page *newpage = get_new_page(page, private, &result); int remap_swapcache = 1; - int rcu_locked = 0; int charge = 0; struct mem_cgroup *mem = NULL; struct anon_vma *anon_vma = NULL; @@ -694,20 +693,26 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, /* * By try_to_unmap(), page->mapcount goes down to 0 here. In this case, * we cannot notice that anon_vma is freed while we migrates a page. - * This rcu_read_lock() delays freeing anon_vma pointer until the end + * This get_anon_vma() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. */ if (PageAnon(page)) { - rcu_read_lock(); - rcu_locked = 1; - - /* Determine how to safely use anon_vma */ - if (!page_mapped(page)) { - if (!PageSwapCache(page)) - goto rcu_unlock; - + /* + * Only page_lock_anon_vma() understands the subtleties of + * getting a hold on an anon_vma from outside one of its mms. + */ + anon_vma = page_lock_anon_vma(page); + if (anon_vma) { + /* + * Take a reference count on the anon_vma if the + * page is mapped so that it is guaranteed to + * exist when the page is remapped later + */ + get_anon_vma(anon_vma); + page_unlock_anon_vma(anon_vma); + } else if (PageSwapCache(page)) { /* * We cannot be sure that the anon_vma of an unmapped * swapcache page is safe to use because we don't @@ -722,13 +727,7 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, */ remap_swapcache = 0; } else { - /* - * Take a reference count on the anon_vma if the - * page is mapped so that it is guaranteed to - * exist when the page is remapped later - */ - anon_vma = page_anon_vma(page); - get_anon_vma(anon_vma); + goto uncharge; } } @@ -745,16 +744,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, * free the metadata, so the page can be freed. */ if (!page->mapping) { - if (!PageAnon(page) && page_has_private(page)) { - /* - * Go direct to try_to_free_buffers() here because - * a) that's what try_to_release_page() would do anyway - * b) we may be under rcu_read_lock() here, so we can't - * use GFP_KERNEL which is what try_to_release_page() - * needs to be effective. 
- */ + VM_BUG_ON(PageAnon(page)); + if (page_has_private(page)) { try_to_free_buffers(page); - goto rcu_unlock; + goto uncharge; } goto skip_unmap; } @@ -768,14 +761,11 @@ skip_unmap: if (rc && remap_swapcache) remove_migration_ptes(page, page); -rcu_unlock: /* Drop an anon_vma reference if we took one */ if (anon_vma) drop_anon_vma(anon_vma); - if (rcu_locked) - rcu_read_unlock(); uncharge: if (!charge) mem_cgroup_end_migration(mem, page, newpage); -- cgit v1.2.3-70-g09d2 From fd4a4663db293bfd5dc20fb4113977f62895e550 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Thu, 13 Jan 2011 15:47:31 -0800 Subject: mm: fix hugepage migration 2.6.37 added an unmap_and_move_huge_page() for memory failure recovery, but its anon_vma handling was still based around the 2.6.35 conventions. Update it to use page_lock_anon_vma, get_anon_vma, page_unlock_anon_vma, drop_anon_vma in the same way as we're now changing unmap_and_move(). I don't particularly like to propose this for stable when I've not seen its problems in practice nor tested the solution: but it's clearly out of synch at present. Signed-off-by: Hugh Dickins Cc: Mel Gorman Cc: Rik van Riel Cc: Naoya Horiguchi Cc: "Jun'ichi Nomura" Cc: Andi Kleen Cc: [2.6.37, 2.6.36] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index a20cf12eded..5b7d1fd2962 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -827,7 +827,6 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, int rc = 0; int *result = NULL; struct page *new_hpage = get_new_page(hpage, private, &result); - int rcu_locked = 0; struct anon_vma *anon_vma = NULL; if (!new_hpage) @@ -842,12 +841,10 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, } if (PageAnon(hpage)) { - rcu_read_lock(); - rcu_locked = 1; - - if (page_mapped(hpage)) { - anon_vma = page_anon_vma(hpage); - atomic_inc(&anon_vma->external_refcount); + anon_vma = page_lock_anon_vma(hpage); + if (anon_vma) { + get_anon_vma(anon_vma); + page_unlock_anon_vma(anon_vma); } } @@ -859,16 +856,8 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (rc) remove_migration_ptes(hpage, hpage); - if (anon_vma && atomic_dec_and_lock(&anon_vma->external_refcount, - &anon_vma->lock)) { - int empty = list_empty(&anon_vma->head); - spin_unlock(&anon_vma->lock); - if (empty) - anon_vma_free(anon_vma); - } - - if (rcu_locked) - rcu_read_unlock(); + if (anon_vma) + drop_anon_vma(anon_vma); out: unlock_page(hpage); -- cgit v1.2.3-70-g09d2 From c06b1fca18c3ad868bfcaca230146e3038583422 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 13 Jan 2011 15:47:32 -0800 Subject: mm/page_alloc.c: don't cache `current' in a local It's old-fashioned and unneeded. 
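For context (an editor's note, not from the changelog): current is already a macro around get_current(), which on x86 compiles to a single per-cpu load, so caching it in a local saves nothing -- the object-size comparison below even shrinks slightly. Roughly, from arch/x86/include/asm/current.h of this era:

	static __always_inline struct task_struct *get_current(void)
	{
		return percpu_read_stable(current_task);
	}
	#define current get_current()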
akpm:/usr/src/25> size mm/page_alloc.o
   text    data     bss     dec     hex filename
  39884 1241317   18808 1300009  13d629 mm/page_alloc.o (before)
  39838 1241317   18808 1299963  13d5fb mm/page_alloc.o (after)

Acked-by: David Rientjes Acked-by: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bda1db301d4..90c1439549f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1809,15 +1809,14 @@ __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order, bool sync_migration) { struct page *page; - struct task_struct *tsk = current; if (!order || compaction_deferred(preferred_zone)) return NULL; - tsk->flags |= PF_MEMALLOC; + current->flags |= PF_MEMALLOC; *did_some_progress = try_to_compact_pages(zonelist, order, gfp_mask, nodemask, sync_migration); - tsk->flags &= ~PF_MEMALLOC; + current->flags &= ~PF_MEMALLOC; if (*did_some_progress != COMPACT_SKIPPED) { /* Page migration frees to the PCP lists but we want merging */ @@ -1869,23 +1868,22 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, { struct page *page = NULL; struct reclaim_state reclaim_state; - struct task_struct *p = current; bool drained = false; cond_resched(); /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - p->flags |= PF_MEMALLOC; + current->flags |= PF_MEMALLOC; lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; - p->reclaim_state = &reclaim_state; + current->reclaim_state = &reclaim_state; *did_some_progress = try_to_free_pages(zonelist, order, gfp_mask, nodemask); - p->reclaim_state = NULL; + current->reclaim_state = NULL; lockdep_clear_current_reclaim_state(); - p->flags &= ~PF_MEMALLOC; + current->flags &= ~PF_MEMALLOC; cond_resched(); @@ -1950,7 +1948,6 @@ void wake_all_kswapd(unsigned int order, struct zonelist *zonelist, static inline int gfp_to_alloc_flags(gfp_t gfp_mask) { - struct task_struct *p = current; int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET; const gfp_t wait = gfp_mask & __GFP_WAIT; @@ -1977,12 +1974,12 @@ gfp_to_alloc_flags(gfp_t gfp_mask) * See also cpuset_zone_allowed() comment in kernel/cpuset.c. */ alloc_flags &= ~ALLOC_CPUSET; - } else if (unlikely(rt_task(p)) && !in_interrupt()) + } else if (unlikely(rt_task(current)) && !in_interrupt()) alloc_flags |= ALLOC_HARDER; if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) { if (!in_interrupt() && - ((p->flags & PF_MEMALLOC) || + ((current->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))) alloc_flags |= ALLOC_NO_WATERMARKS; } @@ -2001,7 +1998,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, int alloc_flags; unsigned long pages_reclaimed = 0; unsigned long did_some_progress; - struct task_struct *p = current; bool sync_migration = false; /* @@ -2060,7 +2056,7 @@ rebalance: goto nopage; /* Avoid recursion of direct reclaim */ - if (p->flags & PF_MEMALLOC) + if (current->flags & PF_MEMALLOC) goto nopage; /* Avoid allocations with no watermarks from looping endlessly */ @@ -2153,7 +2149,7 @@ nopage: if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) { printk(KERN_WARNING "%s: page allocation failure."
" order:%d, mode:0x%x\n", - p->comm, order, gfp_mask); + current->comm, order, gfp_mask); dump_stack(); show_mem(); } -- cgit v1.2.3-70-g09d2 From d8505dee1a87b8d41b9c4ee1325cd72258226fbc Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 13 Jan 2011 15:47:33 -0800 Subject: mm: simplify code of swap.c Clean up code and remove duplicate code. Next patch will use pagevec_lru_move_fn introduced here too. Signed-off-by: Shaohua Li Cc: Andi Kleen Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 101 +++++++++++++++++++++++++++++++++----------------------------- 1 file changed, 54 insertions(+), 47 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index c02f93611a8..ab498ea04ae 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -178,15 +178,13 @@ void put_pages_list(struct list_head *pages) } EXPORT_SYMBOL(put_pages_list); -/* - * pagevec_move_tail() must be called with IRQ disabled. - * Otherwise this may cause nasty races. - */ -static void pagevec_move_tail(struct pagevec *pvec) +static void pagevec_lru_move_fn(struct pagevec *pvec, + void (*move_fn)(struct page *page, void *arg), + void *arg) { int i; - int pgmoved = 0; struct zone *zone = NULL; + unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -194,29 +192,49 @@ static void pagevec_move_tail(struct pagevec *pvec) if (pagezone != zone) { if (zone) - spin_unlock(&zone->lru_lock); + spin_unlock_irqrestore(&zone->lru_lock, flags); zone = pagezone; - spin_lock(&zone->lru_lock); - } - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int lru = page_lru_base_type(page); - list_move_tail(&page->lru, &zone->lru[lru].list); - pgmoved++; + spin_lock_irqsave(&zone->lru_lock, flags); } + + (*move_fn)(page, arg); } if (zone) - spin_unlock(&zone->lru_lock); - __count_vm_events(PGROTATED, pgmoved); - release_pages(pvec->pages, pvec->nr, pvec->cold); + spin_unlock_irqrestore(&zone->lru_lock, flags); + release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); pagevec_reinit(pvec); } +static void pagevec_move_tail_fn(struct page *page, void *arg) +{ + int *pgmoved = arg; + struct zone *zone = page_zone(page); + + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + int lru = page_lru_base_type(page); + list_move_tail(&page->lru, &zone->lru[lru].list); + (*pgmoved)++; + } +} + +/* + * pagevec_move_tail() must be called with IRQ disabled. + * Otherwise this may cause nasty races. + */ +static void pagevec_move_tail(struct pagevec *pvec) +{ + int pgmoved = 0; + + pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved); + __count_vm_events(PGROTATED, pgmoved); +} + /* * Writeback is about to end against a page which has been marked for immediate * reclaim. If it still appears to be reclaimable, move it to the tail of the * inactive list. 
*/ -void rotate_reclaimable_page(struct page *page) +void rotate_reclaimable_page(struct page *page) { if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && !PageUnevictable(page) && PageLRU(page)) { @@ -516,44 +534,33 @@ void lru_add_page_tail(struct zone* zone, } } +static void ____pagevec_lru_add_fn(struct page *page, void *arg) +{ + enum lru_list lru = (enum lru_list)arg; + struct zone *zone = page_zone(page); + int file = is_file_lru(lru); + int active = is_active_lru(lru); + + VM_BUG_ON(PageActive(page)); + VM_BUG_ON(PageUnevictable(page)); + VM_BUG_ON(PageLRU(page)); + + SetPageLRU(page); + if (active) + SetPageActive(page); + update_page_reclaim_stat(zone, page, file, active); + add_page_to_lru_list(zone, page, lru); +} + /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { - int i; - struct zone *zone = NULL; - VM_BUG_ON(is_unevictable_lru(lru)); - for (i = 0; i < pagevec_count(pvec); i++) { - struct page *page = pvec->pages[i]; - struct zone *pagezone = page_zone(page); - int file; - int active; - - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); - } - VM_BUG_ON(PageActive(page)); - VM_BUG_ON(PageUnevictable(page)); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - active = is_active_lru(lru); - file = is_file_lru(lru); - if (active) - SetPageActive(page); - update_page_reclaim_stat(zone, page, file, active); - add_page_to_lru_list(zone, page, lru); - } - if (zone) - spin_unlock_irq(&zone->lru_lock); - release_pages(pvec->pages, pvec->nr, pvec->cold); - pagevec_reinit(pvec); + pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); } EXPORT_SYMBOL(____pagevec_lru_add); -- cgit v1.2.3-70-g09d2 From 744ed1442757767ffede5008bb13e0805085902e Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 13 Jan 2011 15:47:34 -0800 Subject: mm: batch activate_page() to reduce lock contention The zone->lru_lock is heavily contended in workloads where activate_page() is frequently used. We can batch the activate_page() calls to reduce the lock contention. The batched pages will be added to the zone's LRU lists when the per-cpu pool is full or when page reclaim tries to drain them. For example, on a 4-socket, 64-CPU system, create a sparse file and 64 processes that shared-map the file. Each process reads the whole file and then exits. The process exit does unmap_vmas() and causes a lot of activate_page() calls. In such a workload, we saw about a 58% total-time reduction with the patch below. Other workloads with many activate_page() calls benefit a lot too.
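The core of the batching, sketched from the diff below (a per-cpu pagevec, flushed under a single lru_lock acquisition); the microbenchmark numbers follow:

	static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);

	void activate_page(struct page *page)
	{
		if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
			struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);

			page_cache_get(page);
			/* flush the whole batch once the pagevec fills up */
			if (!pagevec_add(pvec, page))
				pagevec_lru_move_fn(pvec, __activate_page, NULL);
			put_cpu_var(activate_page_pvecs);
		}
	}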
I tested some microbenchmarks:

	case-anon-cow-rand-mt		 0.58%
	case-anon-cow-rand		-3.30%
	case-anon-cow-seq-mt		-0.51%
	case-anon-cow-seq		-5.68%
	case-anon-r-rand-mt		 0.23%
	case-anon-r-rand		 0.81%
	case-anon-r-seq-mt		-0.71%
	case-anon-r-seq			-1.99%
	case-anon-rx-rand-mt		 2.11%
	case-anon-rx-seq-mt		 3.46%
	case-anon-w-rand-mt		-0.03%
	case-anon-w-rand		-0.50%
	case-anon-w-seq-mt		-1.08%
	case-anon-w-seq			-0.12%
	case-anon-wx-rand-mt		-5.02%
	case-anon-wx-seq-mt		-1.43%
	case-fork			 1.65%
	case-fork-sleep			-0.07%
	case-fork-withmem		 1.39%
	case-hugetlb			-0.59%
	case-lru-file-mmap-read-mt	-0.54%
	case-lru-file-mmap-read		 0.61%
	case-lru-file-mmap-read-rand	-2.24%
	case-lru-file-readonce		-0.64%
	case-lru-file-readtwice		-11.69%
	case-lru-memcg			-1.35%
	case-mmap-pread-rand-mt		 1.88%
	case-mmap-pread-rand		-15.26%
	case-mmap-pread-seq-mt		 0.89%
	case-mmap-pread-seq		-69.72%
	case-mmap-xread-rand-mt		 0.71%
	case-mmap-xread-seq-mt		 0.38%

The most significant are:

	case-lru-file-readtwice		-11.69%
	case-mmap-pread-rand		-15.26%
	case-mmap-pread-seq		-69.72%

which use activate_page a lot. The others are basically run-to-run variation, since each run differs slightly. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Shaohua Li Cc: Andi Kleen Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 9 ++++++ mm/swap.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++++-------- mm/vmscan.c | 6 ++-- 3 files changed, 92 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 69488205723..4c98630f0f7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -39,6 +39,15 @@ static inline void __put_page(struct page *page) extern unsigned long highest_memmap_pfn; +#ifdef CONFIG_SMP +extern int putback_active_lru_page(struct zone *zone, struct page *page); +#else +static inline int putback_active_lru_page(struct zone *zone, struct page *page) +{ + return 0; +} +#endif + /* * in mm/vmscan.c: */ diff --git a/mm/swap.c b/mm/swap.c index ab498ea04ae..bbc1ce9f946 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -271,27 +271,94 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page, } /* - * FIXME: speed this up? + * A page will go to active list either by activate_page or putback_lru_page. + * In the activate_page case, the page hasn't active bit set. The page might + * not in LRU list because it's isolated before it gets a chance to be moved to + * active list. The window is small because pagevec just stores several pages. + * For such case, we do nothing for such page.
+ * In the putback_lru_page case, the page isn't in lru list but has active + * bit set */ -void activate_page(struct page *page) +static void __activate_page(struct page *page, void *arg) { struct zone *zone = page_zone(page); + int file = page_is_file_cache(page); + int lru = page_lru_base_type(page); + bool putback = !PageLRU(page); - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int file = page_is_file_cache(page); - int lru = page_lru_base_type(page); + /* The page is isolated before it's moved to active list */ + if (!PageLRU(page) && !PageActive(page)) + return; + if ((PageLRU(page) && PageActive(page)) || PageUnevictable(page)) + return; + + if (!putback) del_page_from_lru_list(zone, page, lru); + else + SetPageLRU(page); - SetPageActive(page); - lru += LRU_ACTIVE; - add_page_to_lru_list(zone, page, lru); - __count_vm_event(PGACTIVATE); + SetPageActive(page); + lru += LRU_ACTIVE; + add_page_to_lru_list(zone, page, lru); + + if (putback) + return; + __count_vm_event(PGACTIVATE); + update_page_reclaim_stat(zone, page, file, 1); +} + +#ifdef CONFIG_SMP +static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); + +static void activate_page_drain(int cpu) +{ + struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); + + if (pagevec_count(pvec)) + pagevec_lru_move_fn(pvec, __activate_page, NULL); +} + +void activate_page(struct page *page) +{ + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); + + page_cache_get(page); + if (!pagevec_add(pvec, page)) + pagevec_lru_move_fn(pvec, __activate_page, NULL); + put_cpu_var(activate_page_pvecs); + } +} - update_page_reclaim_stat(zone, page, file, 1); +/* Caller should hold zone->lru_lock */ +int putback_active_lru_page(struct zone *zone, struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); + + if (!pagevec_add(pvec, page)) { + spin_unlock_irq(&zone->lru_lock); + pagevec_lru_move_fn(pvec, __activate_page, NULL); + spin_lock_irq(&zone->lru_lock); } + put_cpu_var(activate_page_pvecs); + return 1; +} + +#else +static inline void activate_page_drain(int cpu) +{ +} + +void activate_page(struct page *page) +{ + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) + __activate_page(page, NULL); spin_unlock_irq(&zone->lru_lock); } +#endif /* * Mark a page as having seen activity. 
@@ -390,6 +457,7 @@ static void drain_cpu_pagevecs(int cpu) pagevec_move_tail(pvec); local_irq_restore(flags); } + activate_page_drain(cpu); } void lru_add_drain(void) diff --git a/mm/vmscan.c b/mm/vmscan.c index 47a50962ce8..99999a9b2b0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1271,14 +1271,16 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, spin_lock_irq(&zone->lru_lock); continue; } - SetPageLRU(page); lru = page_lru(page); - add_page_to_lru_list(zone, page, lru); if (is_active_lru(lru)) { int file = is_file_lru(lru); int numpages = hpage_nr_pages(page); reclaim_stat->recent_rotated[file] += numpages; + if (putback_active_lru_page(zone, page)) + continue; } + SetPageLRU(page); + add_page_to_lru_list(zone, page, lru); if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); -- cgit v1.2.3-70-g09d2 From 2a7106f2cb0768d00fe8c1eb42a754a7d8518f08 Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 13 Jan 2011 15:47:37 -0800 Subject: memcg: create extensible page stat update routines Replace usage of the mem_cgroup_update_file_mapped() memcg statistic update routine with two new routines: * mem_cgroup_inc_page_stat() * mem_cgroup_dec_page_stat() As before, only the file_mapped statistic is managed. However, these more general interfaces allow for new statistics to be more easily added. New statistics are added with memcg dirty page accounting. Signed-off-by: Greg Thelen Signed-off-by: Andrea Righi Acked-by: KAMEZAWA Hiroyuki Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 31 ++++++++++++++++++++++++++++--- mm/memcontrol.c | 16 +++++++--------- mm/rmap.c | 4 ++-- 3 files changed, 37 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 159a0762aea..067115ce6b3 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -25,6 +25,11 @@ struct page_cgroup; struct page; struct mm_struct; +/* Stats that can be updated by kernel. 
*/ +enum mem_cgroup_page_stat_item { + MEMCG_NR_FILE_MAPPED, /* # of pages charged as file rss */ +}; + extern unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, struct list_head *dst, unsigned long *scanned, int order, @@ -121,7 +126,22 @@ static inline bool mem_cgroup_disabled(void) return false; } -void mem_cgroup_update_file_mapped(struct page *page, int val); +void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx, + int val); + +static inline void mem_cgroup_inc_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ + mem_cgroup_update_page_stat(page, idx, 1); +} + +static inline void mem_cgroup_dec_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ + mem_cgroup_update_page_stat(page, idx, -1); +} + unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask); u64 mem_cgroup_get_limit(struct mem_cgroup *mem); @@ -293,8 +313,13 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) { } -static inline void mem_cgroup_update_file_mapped(struct page *page, - int val) +static inline void mem_cgroup_inc_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) +{ +} + +static inline void mem_cgroup_dec_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 741206ffdac..3d8a0c79dec 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1600,7 +1600,8 @@ bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask) * possibility of race condition. If there is, we take a lock. */ -static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) +void mem_cgroup_update_page_stat(struct page *page, + enum mem_cgroup_page_stat_item idx, int val) { struct mem_cgroup *mem; struct page_cgroup *pc = lookup_page_cgroup(page); @@ -1623,30 +1624,27 @@ static void mem_cgroup_update_file_stat(struct page *page, int idx, int val) goto out; } - this_cpu_add(mem->stat->count[idx], val); - switch (idx) { - case MEM_CGROUP_STAT_FILE_MAPPED: + case MEMCG_NR_FILE_MAPPED: if (val > 0) SetPageCgroupFileMapped(pc); else if (!page_mapped(page)) ClearPageCgroupFileMapped(pc); + idx = MEM_CGROUP_STAT_FILE_MAPPED; break; default: BUG(); } + this_cpu_add(mem->stat->count[idx], val); + out: if (unlikely(need_unlock)) unlock_page_cgroup(pc); rcu_read_unlock(); return; } - -void mem_cgroup_update_file_mapped(struct page *page, int val) -{ - mem_cgroup_update_file_stat(page, MEM_CGROUP_STAT_FILE_MAPPED, val); -} +EXPORT_SYMBOL(mem_cgroup_update_page_stat); /* * size of first charge trial. "32" comes from vmscan.c's magic value. 
diff --git a/mm/rmap.c b/mm/rmap.c index c30f33854f9..f21f4a1d6a1 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -937,7 +937,7 @@ void page_add_file_rmap(struct page *page) { if (atomic_inc_and_test(&page->_mapcount)) { __inc_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_file_mapped(page, 1); + mem_cgroup_inc_page_stat(page, MEMCG_NR_FILE_MAPPED); } } @@ -979,7 +979,7 @@ void page_remove_rmap(struct page *page) NR_ANON_TRANSPARENT_HUGEPAGES); } else { __dec_zone_page_state(page, NR_FILE_MAPPED); - mem_cgroup_update_file_mapped(page, -1); + mem_cgroup_dec_page_stat(page, MEMCG_NR_FILE_MAPPED); } /* * It would be tidy to reset the PageAnon mapping here, -- cgit v1.2.3-70-g09d2 From dbd4ea78f002df283c95d9774837041735fa1bf9 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 13 Jan 2011 15:47:38 -0800 Subject: memcg: add lock to synchronize page accounting and migration Introduce a new bit spin lock, PCG_MOVE_LOCK, to synchronize the page accounting and migration code. This reworks the locking scheme of _update_stat() and _move_account() by adding the new lock bit PCG_MOVE_LOCK, which is always taken with IRQs disabled. 1. If pages are being migrated from a memcg, then updates to that memcg's page statistics are protected by grabbing PCG_MOVE_LOCK using move_lock_page_cgroup(). In an upcoming commit, memcg dirty page accounting will be updating memcg page accounting (specifically: the number of writeback pages) from IRQ context (softirq). Avoid a deadlocking nested spin lock attempt by disabling irq on the local processor when grabbing the PCG_MOVE_LOCK. 2. The lock for update_page_stat is used only for avoiding a race with move_account(). So, IRQ awareness of lock_page_cgroup() itself is not a problem. The problem is between mem_cgroup_update_page_stat() and mem_cgroup_move_account_page(). Trade-off:

	* Changing lock_page_cgroup() to always disable IRQ (or local_bh) has
	  some impact on performance, and I think it's bad to disable IRQ
	  when it's not necessary.
	* Adding a new lock makes move_account() slower. The scores are below.

Performance impact: moving an 8G anon process.

Before:
	real	0m0.792s
	user	0m0.000s
	sys	0m0.780s

After:
	real	0m0.854s
	user	0m0.000s
	sys	0m0.842s

This score is bad, but planned optimization patches will reduce the impact. Signed-off-by: KAMEZAWA Hiroyuki Signed-off-by: Greg Thelen Reviewed-by: Minchan Kim Acked-by: Daisuke Nishimura Cc: Andrea Righi Cc: Balbir Singh Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page_cgroup.h | 31 ++++++++++++++++++++++++++++--- mm/memcontrol.c | 9 +++++++-- 2 files changed, 35 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/include/linux/page_cgroup.h b/include/linux/page_cgroup.h index fdb5a92b5ac..5b0c971d7ca 100644 --- a/include/linux/page_cgroup.h +++ b/include/linux/page_cgroup.h @@ -35,15 +35,18 @@ struct page_cgroup *lookup_page_cgroup(struct page *page); enum { /* flags for mem_cgroup */ - PCG_LOCK, /* page cgroup is locked */ + PCG_LOCK, /* Lock for pc->mem_cgroup and following bits. */ PCG_CACHE, /* charged as cache */ PCG_USED, /* this object is in use. */ - PCG_ACCT_LRU, /* page has been accounted for */ + PCG_MIGRATION, /* under page migration */ + /* flags for mem_cgroup and file and I/O status */ + PCG_MOVE_LOCK, /* For race between move_account v.s.
following bits */ PCG_FILE_MAPPED, /* page is accounted as "mapped" */ PCG_FILE_DIRTY, /* page is dirty */ PCG_FILE_WRITEBACK, /* page is under writeback */ PCG_FILE_UNSTABLE_NFS, /* page is NFS unstable */ - PCG_MIGRATION, /* under page migration */ + /* No lock in page_cgroup */ + PCG_ACCT_LRU, /* page has been accounted for (under lru_lock) */ }; #define TESTPCGFLAG(uname, lname) \ @@ -117,6 +120,10 @@ static inline enum zone_type page_cgroup_zid(struct page_cgroup *pc) static inline void lock_page_cgroup(struct page_cgroup *pc) { + /* + * Don't take this lock in IRQ context. + * This lock is for pc->mem_cgroup, USED, CACHE, MIGRATION + */ bit_spin_lock(PCG_LOCK, &pc->flags); } @@ -130,6 +137,24 @@ static inline int page_is_cgroup_locked(struct page_cgroup *pc) return bit_spin_is_locked(PCG_LOCK, &pc->flags); } +static inline void move_lock_page_cgroup(struct page_cgroup *pc, + unsigned long *flags) +{ + /* + * We know updates to pc->flags of page cache's stats are from both of + * usual context or IRQ context. Disable IRQ to avoid deadlock. + */ + local_irq_save(*flags); + bit_spin_lock(PCG_MOVE_LOCK, &pc->flags); +} + +static inline void move_unlock_page_cgroup(struct page_cgroup *pc, + unsigned long *flags) +{ + bit_spin_unlock(PCG_MOVE_LOCK, &pc->flags); + local_irq_restore(*flags); +} + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct page_cgroup; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3d8a0c79dec..d888956a2cf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1606,6 +1606,7 @@ void mem_cgroup_update_page_stat(struct page *page, struct mem_cgroup *mem; struct page_cgroup *pc = lookup_page_cgroup(page); bool need_unlock = false; + unsigned long uninitialized_var(flags); if (unlikely(!pc)) return; @@ -1617,7 +1618,7 @@ void mem_cgroup_update_page_stat(struct page *page, /* pc->mem_cgroup is unstable ? */ if (unlikely(mem_cgroup_stealed(mem))) { /* take a lock against to access pc->mem_cgroup */ - lock_page_cgroup(pc); + move_lock_page_cgroup(pc, &flags); need_unlock = true; mem = pc->mem_cgroup; if (!mem || !PageCgroupUsed(pc)) @@ -1640,7 +1641,7 @@ void mem_cgroup_update_page_stat(struct page *page, out: if (unlikely(need_unlock)) - unlock_page_cgroup(pc); + move_unlock_page_cgroup(pc, &flags); rcu_read_unlock(); return; } @@ -2211,9 +2212,13 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) { int ret = -EINVAL; + unsigned long flags; + lock_page_cgroup(pc); if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { + move_lock_page_cgroup(pc, &flags); __mem_cgroup_move_account(pc, from, to, uncharge); + move_unlock_page_cgroup(pc, &flags); ret = 0; } unlock_page_cgroup(pc); -- cgit v1.2.3-70-g09d2 From f3e8eb70b1807d1b30aa6972af0cf30077c40112 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 13 Jan 2011 15:47:39 -0800 Subject: memcg: fix unit mismatch in memcg oom limit calculation Adding the number of swap pages to the byte limit of a memory control group makes no sense. Convert the pages to bytes before adding them. The only user of this code is the OOM killer, and the way it is used means that the error results in a higher OOM badness value. Since the cgroup limit is the same for all tasks in the cgroup, the error should have no practical impact at the moment. But let's not wait for future or changing users to trip over it. 
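To see the scale of the error (hypothetical numbers, not from the changelog): with 4KiB pages, 1GiB of swap means total_swap_pages = 262144, so the old code inflated the byte limit by just 256KiB instead of 1GiB. The corrected computation:

	limit = res_counter_read_u64(&memcg->res, RES_LIMIT);	/* bytes */
	limit += total_swap_pages << PAGE_SHIFT;		/* pages -> bytes */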
Signed-off-by: Johannes Weiner Cc: Greg Thelen Cc: David Rientjes Acked-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d888956a2cf..9f8ad77f091 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1312,8 +1312,9 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) u64 limit; u64 memsw; - limit = res_counter_read_u64(&memcg->res, RES_LIMIT) + - total_swap_pages; + limit = res_counter_read_u64(&memcg->res, RES_LIMIT); + limit += total_swap_pages << PAGE_SHIFT; + memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); /* * If memsw is finite and limits the amount of swap space available -- cgit v1.2.3-70-g09d2 From 043d18b1e5bdfc4870b8a19d00f0d5c636a5c231 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 13 Jan 2011 15:47:40 -0800 Subject: memcg: remove unnecessary return from void-returning mem_cgroup_del_lru_list() Signed-off-by: Minchan Kim Acked-by: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9f8ad77f091..1b44ad64f28 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -821,7 +821,6 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) return; VM_BUG_ON(list_empty(&pc->lru)); list_del_init(&pc->lru); - return; } void mem_cgroup_del_lru(struct page *page) -- cgit v1.2.3-70-g09d2 From dfe076b0971a783469bc2066e85d46e23c8acb1c Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 13 Jan 2011 15:47:41 -0800 Subject: memcg: fix deadlock between cpuset and memcg Commit b1dd693e ("memcg: avoid deadlock between move charge and try_charge()") can cause another deadlock on mmap_sem during task migration if cpuset and memcg are mounted onto the same mount point. After the commit, cgroup_attach_task() has a sequence like:

	cgroup_attach_task()
	  ss->can_attach()
	    cpuset_can_attach()
	    mem_cgroup_can_attach()
	      down_read(&mmap_sem)        (1)
	  ss->attach()
	    cpuset_attach()
	      mpol_rebind_mm()
	        down_write(&mmap_sem)     (2)
	        up_write(&mmap_sem)
	      cpuset_migrate_mm()
	        do_migrate_pages()
	          down_read(&mmap_sem)
	          up_read(&mmap_sem)
	    mem_cgroup_move_task()
	      mem_cgroup_clear_mc()
	        up_read(&mmap_sem)

We can cause deadlock at (2) because we've already acquired the mmap_sem at (1). But the commit itself is necessary to fix deadlocks which existed before the commit, like:

	Ex.1)
	           move charge                  |        try charge
	  --------------------------------------+------------------------------
	    mem_cgroup_can_attach()             |  down_write(&mmap_sem)
	      mc.moving_task = current          |    ..
	      mem_cgroup_precharge_mc()         |  __mem_cgroup_try_charge()
	        mem_cgroup_count_precharge()    |    prepare_to_wait()
	          down_read(&mmap_sem)          |    if (mc.moving_task)
	          -> cannot acquire the lock    |    -> true
	                                        |      schedule()
	                                        |  -> move charge should wake it up

	Ex.2)
	           move charge                  |        try charge
	  --------------------------------------+------------------------------
	    mem_cgroup_can_attach()             |
	      mc.moving_task = current          |
	      mem_cgroup_precharge_mc()         |
	        mem_cgroup_count_precharge()    |
	          down_read(&mmap_sem)          |
	          ..                            |
	          up_read(&mmap_sem)            |
	                                        |  down_write(&mmap_sem)
	    mem_cgroup_move_task()              |    ..
	      mem_cgroup_move_charge()          |  __mem_cgroup_try_charge()
	        down_read(&mmap_sem)            |    prepare_to_wait()
	        -> cannot acquire the lock      |    if (mc.moving_task)
	                                        |    -> true
	                                        |      schedule()
	                                        |  -> move charge should wake it up

This patch fixes all of these problems by:
1. reverting the commit.
2. To fix Ex.1, we set mc.moving_task after mem_cgroup_count_precharge() has released the mmap_sem.
3. To fix Ex.2, we use down_read_trylock() instead of down_read() in mem_cgroup_move_charge() and, if it fails to acquire the lock, cancel all extra charges, wake up all waiters, and retry the trylock.

Signed-off-by: Daisuke Nishimura Reported-by: Ben Blum Cc: Miao Xie Cc: David Rientjes Cc: Paul Menage Cc: Hiroyuki Kamezawa Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 84 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 49 insertions(+), 35 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1b44ad64f28..c339d7431bd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -292,7 +292,6 @@ static struct move_charge_struct { unsigned long moved_charge; unsigned long moved_swap; struct task_struct *moving_task; /* a task moving charges */ - struct mm_struct *mm; wait_queue_head_t waitq; /* a waitq for other context */ } mc = { .lock = __SPIN_LOCK_UNLOCKED(mc.lock), @@ -4681,7 +4680,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; struct vm_area_struct *vma; - /* We've already held the mmap_sem */ + down_read(&mm->mmap_sem); for (vma = mm->mmap; vma; vma = vma->vm_next) { struct mm_walk mem_cgroup_count_precharge_walk = { .pmd_entry = mem_cgroup_count_precharge_pte_range, @@ -4693,6 +4692,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) walk_page_range(vma->vm_start, vma->vm_end, &mem_cgroup_count_precharge_walk); } + up_read(&mm->mmap_sem); precharge = mc.precharge; mc.precharge = 0; @@ -4702,10 +4702,15 @@ static int mem_cgroup_precharge_mc(struct mm_struct *mm) { - return mem_cgroup_do_precharge(mem_cgroup_count_precharge(mm)); + unsigned long precharge = mem_cgroup_count_precharge(mm); + + VM_BUG_ON(mc.moving_task); + mc.moving_task = current; + return mem_cgroup_do_precharge(precharge); } -static void mem_cgroup_clear_mc(void) +/* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */ +static void __mem_cgroup_clear_mc(void) { struct mem_cgroup *from = mc.from; struct mem_cgroup *to = mc.to; @@ -4740,23 +4745,28 @@ static void mem_cgroup_clear_mc(void) PAGE_SIZE * mc.moved_swap); } /* we've already done mem_cgroup_get(mc.to) */ - mc.moved_swap = 0; } - if (mc.mm) { - up_read(&mc.mm->mmap_sem); - mmput(mc.mm); - } + memcg_oom_recover(from); + memcg_oom_recover(to); + wake_up_all(&mc.waitq); +} + +static void mem_cgroup_clear_mc(void) +{ + struct mem_cgroup *from = mc.from; + + /* + * we must clear moving_task before waking up waiters at the end of + * task migration. + */ + mc.moving_task = NULL; + __mem_cgroup_clear_mc(); spin_lock(&mc.lock); mc.from = NULL; mc.to = NULL; spin_unlock(&mc.lock); - mc.moving_task = NULL; - mc.mm = NULL; mem_cgroup_end_move(from); - memcg_oom_recover(from); - memcg_oom_recover(to); - wake_up_all(&mc.waitq); } static int mem_cgroup_can_attach(struct cgroup_subsys *ss, @@ -4778,38 +4788,23 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss, return 0; /* We move charges only when we move a owner of the mm */ if (mm->owner == p) { -
- */ - down_read(&mm->mmap_sem); - VM_BUG_ON(mc.from); VM_BUG_ON(mc.to); VM_BUG_ON(mc.precharge); VM_BUG_ON(mc.moved_charge); VM_BUG_ON(mc.moved_swap); - VM_BUG_ON(mc.moving_task); - VM_BUG_ON(mc.mm); - mem_cgroup_start_move(from); spin_lock(&mc.lock); mc.from = from; mc.to = mem; - mc.precharge = 0; - mc.moved_charge = 0; - mc.moved_swap = 0; spin_unlock(&mc.lock); - mc.moving_task = current; - mc.mm = mm; + /* We set mc.moving_task later */ ret = mem_cgroup_precharge_mc(mm); if (ret) mem_cgroup_clear_mc(); - /* We call up_read() and mmput() in clear_mc(). */ - } else - mmput(mm); + } + mmput(mm); } return ret; } @@ -4898,7 +4893,19 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) struct vm_area_struct *vma; lru_add_drain_all(); -retry: + if (unlikely(!down_read_trylock(&mm->mmap_sem))) { + /* + * Someone who are holding the mmap_sem might be waiting in + * waitq. So we cancel all extra charges, wake up all waiters, + * and retry. Because we cancel precharges, we might not be able + * to move enough charges, but moving charge is a best-effort + * feature anyway, so it wouldn't be a big problem. + */ + __mem_cgroup_clear_mc(); + cond_resched(); + goto retry; + } for (vma = mm->mmap; vma; vma = vma->vm_next) { int ret; struct mm_walk mem_cgroup_move_charge_walk = { @@ -4917,6 +4924,7 @@ static void mem_cgroup_move_charge(struct mm_struct *mm) */ break; } + up_read(&mm->mmap_sem); } static void mem_cgroup_move_task(struct cgroup_subsys *ss, @@ -4925,11 +4933,17 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss, struct task_struct *p, bool threadgroup) { - if (!mc.mm) + struct mm_struct *mm; + + if (!mc.to) /* no need to move charge */ return; - mem_cgroup_move_charge(mc.mm); + mm = get_task_mm(p); + if (mm) { + mem_cgroup_move_charge(mm); + mmput(mm); + } mem_cgroup_clear_mc(); } #else /* !CONFIG_MMU */ -- cgit v1.2.3-70-g09d2 From 17295c88a160c6eea3fcf46cec9d08a0fcb02db9 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 13 Jan 2011 15:47:42 -0800 Subject: memcg: use [kv]zalloc[_node] rather than [kv]malloc+memset In mem_cgroup_alloc() we currently do either kmalloc() or vmalloc(), followed by memset() to zero the memory. This can be more efficiently achieved by using kzalloc() and vzalloc(). There's also one situation where we can use kzalloc_node() - this is what's new in this version of the patch.
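A condensed before/after sketch of this transformation (illustrative only; the size < PAGE_SIZE split follows mem_cgroup_alloc() as shown in the hunk below, with error handling elided):

        /* Before: allocate, check for failure, then zero in a separate pass. */
        mem = (size < PAGE_SIZE) ? kmalloc(size, GFP_KERNEL) : vmalloc(size);
        if (!mem)
                return NULL;
        memset(mem, 0, size);

        /* After: the z-variants return already-zeroed memory in one call. */
        mem = (size < PAGE_SIZE) ? kzalloc(size, GFP_KERNEL) : vzalloc(size);
        if (!mem)
                return NULL;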
Signed-off-by: Jesper Juhl Cc: KAMEZAWA Hiroyuki Cc: Minchan Kim Cc: Wu Fengguang Cc: Balbir Singh Cc: Li Zefan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c339d7431bd..6424ba0fce8 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4216,13 +4216,11 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) */ if (!node_state(node, N_NORMAL_MEMORY)) tmp = -1; - pn = kmalloc_node(sizeof(*pn), GFP_KERNEL, tmp); + pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); if (!pn) return 1; mem->info.nodeinfo[node] = pn; - memset(pn, 0, sizeof(*pn)); - for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; for_each_lru(l) @@ -4246,14 +4244,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void) /* Can be very big if MAX_NUMNODES is very big */ if (size < PAGE_SIZE) - mem = kmalloc(size, GFP_KERNEL); + mem = kzalloc(size, GFP_KERNEL); else - mem = vmalloc(size); + mem = vzalloc(size); if (!mem) return NULL; - memset(mem, 0, size); mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu); if (!mem->stat) goto out_free; -- cgit v1.2.3-70-g09d2 From 50de1dd967d4ba3b8a90ebe7a4f5feca24191317 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 13 Jan 2011 15:47:43 -0800 Subject: memcg: fix memory migration of shmem swapcache In the current implementation mem_cgroup_end_migration() decides whether the page migration has succeeded or not by checking "oldpage->mapping". But if we are trying to migrate a shmem swapcache page, its page->mapping is NULL from the beginning, so the check would be invalid. As a result, mem_cgroup_end_migration() assumes the migration has succeeded even if it's not, so "newpage" would be freed while it's not uncharged. This patch fixes it by passing mem_cgroup_end_migration() the result of the page migration. Signed-off-by: Daisuke Nishimura Reviewed-by: Minchan Kim Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Cc: Minchan Kim Reviewed-by: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 5 ++--- mm/memcontrol.c | 5 ++--- mm/migrate.c | 2 +- 3 files changed, 5 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 067115ce6b3..6a576f98943 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -98,7 +98,7 @@ extern int mem_cgroup_prepare_migration(struct page *page, struct page *newpage, struct mem_cgroup **ptr); extern void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage); + struct page *oldpage, struct page *newpage, bool migration_ok); /* * For memory reclaim.
@@ -251,8 +251,7 @@ mem_cgroup_prepare_migration(struct page *page, struct page *newpage, } static inline void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, - struct page *newpage) + struct page *oldpage, struct page *newpage, bool migration_ok) { } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6424ba0fce8..8ab84103143 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2896,7 +2896,7 @@ int mem_cgroup_prepare_migration(struct page *page, /* remove redundant charge if migration failed*/ void mem_cgroup_end_migration(struct mem_cgroup *mem, - struct page *oldpage, struct page *newpage) + struct page *oldpage, struct page *newpage, bool migration_ok) { struct page *used, *unused; struct page_cgroup *pc; @@ -2905,8 +2905,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem, return; /* blocks rmdir() */ cgroup_exclude_rmdir(&mem->css); - /* at migration success, oldpage->mapping is NULL. */ - if (oldpage->mapping) { + if (!migration_ok) { used = oldpage; unused = newpage; } else { diff --git a/mm/migrate.c b/mm/migrate.c index 5b7d1fd2962..46fe8cc13d6 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -768,7 +768,7 @@ skip_unmap: uncharge: if (!charge) - mem_cgroup_end_migration(mem, page, newpage); + mem_cgroup_end_migration(mem, page, newpage, rc == 0); unlock: unlock_page(page); -- cgit v1.2.3-70-g09d2 From 68a1b1955957e222d890f550d2a44ae598db3de9 Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Tue, 11 Jan 2011 17:49:32 -0600 Subject: mm/slab.c: make local symbols static Local symbols should be static. Signed-off-by: H Hartley Sweeten Cc: Christoph Lameter Cc: Pekka Enberg Cc: Matt Mackall Signed-off-by: Pekka Enberg --- mm/slab.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/slab.c b/mm/slab.c index 264037449f0..37961d1f584 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -284,7 +284,7 @@ struct kmem_list3 { * Need this for bootstrapping a per node allocator. */ #define NUM_INIT_LISTS (3 * MAX_NUMNODES) -struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; +static struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS]; #define CACHE_CACHE 0 #define SIZE_AC MAX_NUMNODES #define SIZE_L3 (2 * MAX_NUMNODES) @@ -4053,7 +4053,7 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp) * necessary. Note that the l3 listlock also protects the array_cache * if drain_array() is used on the shared array. */ -void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, +static void drain_array(struct kmem_cache *cachep, struct kmem_list3 *l3, struct array_cache *ac, int force, int node) { int tofree; @@ -4317,7 +4317,7 @@ static const struct seq_operations slabinfo_op = { * @count: data length * @ppos: unused */ -ssize_t slabinfo_write(struct file *file, const char __user * buffer, +static ssize_t slabinfo_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { char kbuf[MAX_SLABINFO_WRITE + 1], *tmp; -- cgit v1.2.3-70-g09d2 From b3697c0255d9d73eaaa4deb4512e3f0ff97b3b71 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Sun, 16 Jan 2011 13:10:39 -0800 Subject: fix non-x86 build failure in pmdp_get_and_clear pmdp_get_and_clear/pmdp_clear_flush/pmdp_splitting_flush were trapped as BUG() and they were defined only to diminish the risk of build issues on not-x86 archs and to be consistent with the generic pte methods previously defined in include/asm-generic/pgtable.h. 
But they are causing more trouble than they were supposed to solve, so it's simpler not to define them when THP is off. This is also correcting the export of pmdp_splitting_flush which is currently unused (x86 isn't using the generic implementation in mm/pgtable-generic.c and no other arch needs that [yet]). Signed-off-by: Andrea Arcangeli Sam Ravnborg Cc: Stephen Rothwell Cc: "David S. Miller" Cc: Benjamin Herrenschmidt Cc: "Luck, Tony" Cc: James Bottomley Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 14 +++----------- mm/pgtable-generic.c | 11 ++++------- 2 files changed, 7 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index f1eddf71dd0..31b6188df22 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -87,14 +87,6 @@ static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, pmd_clear(mm, address, pmdp); return pmd; }) -#else /* CONFIG_TRANSPARENT_HUGEPAGE */ -static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, - unsigned long address, - pmd_t *pmdp) -{ - BUG(); - return __pmd(0); -} #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif @@ -163,9 +155,9 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, #endif #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH -extern pmd_t pmdp_clear_flush(struct vm_area_struct *vma, - unsigned long address, - pmd_t *pmdp); +extern pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmdp); #endif #ifndef __HAVE_ARCH_PTE_SAME diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index d030548047e..0369f5b3ba1 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -92,32 +92,29 @@ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address, #endif #ifndef __HAVE_ARCH_PMDP_CLEAR_FLUSH +#ifdef CONFIG_TRANSPARENT_HUGEPAGE pmd_t pmdp_clear_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { pmd_t pmd; -#ifndef CONFIG_TRANSPARENT_HUGEPAGE - BUG(); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ VM_BUG_ON(address & ~HPAGE_PMD_MASK); pmd = pmdp_get_and_clear(vma->vm_mm, address, pmdp); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif #ifndef __HAVE_ARCH_PMDP_SPLITTING_FLUSH +#ifdef CONFIG_TRANSPARENT_HUGEPAGE pmd_t pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { -#ifdef CONFIG_TRANSPARENT_HUGEPAGE pmd_t pmd = pmd_mksplitting(*pmdp); VM_BUG_ON(address & ~HPAGE_PMD_MASK); set_pmd_at(vma->vm_mm, address, pmdp, pmd); /* tlb flush only to serialize against gup-fast */ flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); -#else /* CONFIG_TRANSPARENT_HUGEPAGE */ - BUG(); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ } +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif -- cgit v1.2.3-70-g09d2 From 7a608572a282a74978e10fd6cd63090aebe29f5c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 17 Jan 2011 14:42:19 -0800 Subject: Revert "mm: batch activate_page() to reduce lock contention" This reverts commit 744ed1442757767ffede5008bb13e0805085902e. Chris Mason ended up chasing down some page allocation errors and pages stuck waiting on the IO scheduler, and was able to narrow it down to two commits: commit 744ed1442757 ("mm: batch activate_page() to reduce lock contention") and d8505dee1a87 ("mm: simplify code of swap.c"). This reverts the first of them. 
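For context, the per-CPU batching shape being reverted looked roughly like this (condensed from the removed lines in the diff below):

        static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);

        void activate_page(struct page *page)
        {
                if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
                        struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);

                        page_cache_get(page);
                        /* zone->lru_lock is taken once per full pagevec, not per page */
                        if (!pagevec_add(pvec, page))
                                pagevec_lru_move_fn(pvec, __activate_page, NULL);
                        put_cpu_var(activate_page_pvecs);
                }
        }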
Reported-and-debugged-by: Chris Mason Cc: Mel Gorman Cc: Andrew Morton Cc: Jens Axboe Cc: linux-mm Cc: KAMEZAWA Hiroyuki Cc: Andrea Arcangeli Cc: Shaohua Li Signed-off-by: Linus Torvalds --- mm/internal.h | 9 ------ mm/swap.c | 90 ++++++++--------------------------------------------------- mm/vmscan.c | 6 ++-- 3 files changed, 13 insertions(+), 92 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 4c98630f0f7..69488205723 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -39,15 +39,6 @@ static inline void __put_page(struct page *page) extern unsigned long highest_memmap_pfn; -#ifdef CONFIG_SMP -extern int putback_active_lru_page(struct zone *zone, struct page *page); -#else -static inline int putback_active_lru_page(struct zone *zone, struct page *page) -{ - return 0; -} -#endif - /* * in mm/vmscan.c: */ diff --git a/mm/swap.c b/mm/swap.c index bbc1ce9f946..ab498ea04ae 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -271,94 +271,27 @@ static void update_page_reclaim_stat(struct zone *zone, struct page *page, } /* - * A page will go to active list either by activate_page or putback_lru_page. - * In the activate_page case, the page hasn't active bit set. The page might - * not in LRU list because it's isolated before it gets a chance to be moved to - * active list. The window is small because pagevec just stores several pages. - * For such case, we do nothing for such page. - * In the putback_lru_page case, the page isn't in lru list but has active - * bit set + * FIXME: speed this up? */ -static void __activate_page(struct page *page, void *arg) +void activate_page(struct page *page) { struct zone *zone = page_zone(page); - int file = page_is_file_cache(page); - int lru = page_lru_base_type(page); - bool putback = !PageLRU(page); - - /* The page is isolated before it's moved to active list */ - if (!PageLRU(page) && !PageActive(page)) - return; - if ((PageLRU(page) && PageActive(page)) || PageUnevictable(page)) - return; - - if (!putback) - del_page_from_lru_list(zone, page, lru); - else - SetPageLRU(page); - - SetPageActive(page); - lru += LRU_ACTIVE; - add_page_to_lru_list(zone, page, lru); - - if (putback) - return; - __count_vm_event(PGACTIVATE); - update_page_reclaim_stat(zone, page, file, 1); -} - -#ifdef CONFIG_SMP -static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs); - -static void activate_page_drain(int cpu) -{ - struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu); - if (pagevec_count(pvec)) - pagevec_lru_move_fn(pvec, __activate_page, NULL); -} - -void activate_page(struct page *page) -{ + spin_lock_irq(&zone->lru_lock); if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); - - page_cache_get(page); - if (!pagevec_add(pvec, page)) - pagevec_lru_move_fn(pvec, __activate_page, NULL); - put_cpu_var(activate_page_pvecs); - } -} + int file = page_is_file_cache(page); + int lru = page_lru_base_type(page); + del_page_from_lru_list(zone, page, lru); -/* Caller should hold zone->lru_lock */ -int putback_active_lru_page(struct zone *zone, struct page *page) -{ - struct pagevec *pvec = &get_cpu_var(activate_page_pvecs); + SetPageActive(page); + lru += LRU_ACTIVE; + add_page_to_lru_list(zone, page, lru); + __count_vm_event(PGACTIVATE); - if (!pagevec_add(pvec, page)) { - spin_unlock_irq(&zone->lru_lock); - pagevec_lru_move_fn(pvec, __activate_page, NULL); - spin_lock_irq(&zone->lru_lock); + update_page_reclaim_stat(zone, page, file, 1); } - put_cpu_var(activate_page_pvecs); - return 
1; -} - -#else -static inline void activate_page_drain(int cpu) -{ -} - -void activate_page(struct page *page) -{ - struct zone *zone = page_zone(page); - - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) - __activate_page(page, NULL); spin_unlock_irq(&zone->lru_lock); } -#endif /* * Mark a page as having seen activity. @@ -457,7 +390,6 @@ static void drain_cpu_pagevecs(int cpu) pagevec_move_tail(pvec); local_irq_restore(flags); } - activate_page_drain(cpu); } void lru_add_drain(void) diff --git a/mm/vmscan.c b/mm/vmscan.c index 99999a9b2b0..47a50962ce8 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1271,16 +1271,14 @@ putback_lru_pages(struct zone *zone, struct scan_control *sc, spin_lock_irq(&zone->lru_lock); continue; } + SetPageLRU(page); lru = page_lru(page); + add_page_to_lru_list(zone, page, lru); if (is_active_lru(lru)) { int file = is_file_lru(lru); int numpages = hpage_nr_pages(page); reclaim_stat->recent_rotated[file] += numpages; - if (putback_active_lru_page(zone, page)) - continue; } - SetPageLRU(page); - add_page_to_lru_list(zone, page, lru); if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); -- cgit v1.2.3-70-g09d2 From 83896fb5e51594281720d145164f866ba769abd5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 17 Jan 2011 14:42:34 -0800 Subject: Revert "mm: simplify code of swap.c" This reverts commit d8505dee1a87b8d41b9c4ee1325cd72258226fbc. Chris Mason ended up chasing down some page allocation errors and pages stuck waiting on the IO scheduler, and was able to narrow it down to two commits: commit 744ed1442757 ("mm: batch activate_page() to reduce lock contention") and d8505dee1a87 ("mm: simplify code of swap.c"). This reverts the second one. Reported-and-debugged-by: Chris Mason Cc: Mel Gorman Cc: Andrew Morton Cc: Jens Axboe Cc: linux-mm Cc: KAMEZAWA Hiroyuki Cc: Andrea Arcangeli Cc: Shaohua Li Signed-off-by: Linus Torvalds --- mm/swap.c | 101 +++++++++++++++++++++++++++++--------------------------------- 1 file changed, 47 insertions(+), 54 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index ab498ea04ae..c02f93611a8 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -178,13 +178,15 @@ void put_pages_list(struct list_head *pages) } EXPORT_SYMBOL(put_pages_list); -static void pagevec_lru_move_fn(struct pagevec *pvec, - void (*move_fn)(struct page *page, void *arg), - void *arg) +/* + * pagevec_move_tail() must be called with IRQ disabled. + * Otherwise this may cause nasty races. 
+ */ +static void pagevec_move_tail(struct pagevec *pvec) { int i; + int pgmoved = 0; struct zone *zone = NULL; - unsigned long flags = 0; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -192,41 +194,21 @@ static void pagevec_lru_move_fn(struct pagevec *pvec, if (pagezone != zone) { if (zone) - spin_unlock_irqrestore(&zone->lru_lock, flags); + spin_unlock(&zone->lru_lock); zone = pagezone; - spin_lock_irqsave(&zone->lru_lock, flags); + spin_lock(&zone->lru_lock); + } + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + int lru = page_lru_base_type(page); + list_move_tail(&page->lru, &zone->lru[lru].list); + pgmoved++; } - - (*move_fn)(page, arg); } if (zone) - spin_unlock_irqrestore(&zone->lru_lock, flags); - release_pages(pvec->pages, pagevec_count(pvec), pvec->cold); - pagevec_reinit(pvec); -} - -static void pagevec_move_tail_fn(struct page *page, void *arg) -{ - int *pgmoved = arg; - struct zone *zone = page_zone(page); - - if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { - int lru = page_lru_base_type(page); - list_move_tail(&page->lru, &zone->lru[lru].list); - (*pgmoved)++; - } -} - -/* - * pagevec_move_tail() must be called with IRQ disabled. - * Otherwise this may cause nasty races. - */ -static void pagevec_move_tail(struct pagevec *pvec) -{ - int pgmoved = 0; - - pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved); + spin_unlock(&zone->lru_lock); __count_vm_events(PGROTATED, pgmoved); + release_pages(pvec->pages, pvec->nr, pvec->cold); + pagevec_reinit(pvec); } /* @@ -234,7 +216,7 @@ static void pagevec_move_tail(struct pagevec *pvec) * reclaim. If it still appears to be reclaimable, move it to the tail of the * inactive list. */ -void rotate_reclaimable_page(struct page *page) +void rotate_reclaimable_page(struct page *page) { if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && !PageUnevictable(page) && PageLRU(page)) { @@ -534,33 +516,44 @@ void lru_add_page_tail(struct zone* zone, } } -static void ____pagevec_lru_add_fn(struct page *page, void *arg) -{ - enum lru_list lru = (enum lru_list)arg; - struct zone *zone = page_zone(page); - int file = is_file_lru(lru); - int active = is_active_lru(lru); - - VM_BUG_ON(PageActive(page)); - VM_BUG_ON(PageUnevictable(page)); - VM_BUG_ON(PageLRU(page)); - - SetPageLRU(page); - if (active) - SetPageActive(page); - update_page_reclaim_stat(zone, page, file, active); - add_page_to_lru_list(zone, page, lru); -} - /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. 
*/ void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { + int i; + struct zone *zone = NULL; + VM_BUG_ON(is_unevictable_lru(lru)); - pagevec_lru_move_fn(pvec, ____pagevec_lru_add_fn, (void *)lru); + for (i = 0; i < pagevec_count(pvec); i++) { + struct page *page = pvec->pages[i]; + struct zone *pagezone = page_zone(page); + int file; + int active; + + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + VM_BUG_ON(PageActive(page)); + VM_BUG_ON(PageUnevictable(page)); + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + active = is_active_lru(lru); + file = is_file_lru(lru); + if (active) + SetPageActive(page); + update_page_reclaim_stat(zone, page, file, active); + add_page_to_lru_list(zone, page, lru); + } + if (zone) + spin_unlock_irq(&zone->lru_lock); + release_pages(pvec->pages, pvec->nr, pvec->cold); + pagevec_reinit(pvec); } EXPORT_SYMBOL(____pagevec_lru_add); -- cgit v1.2.3-70-g09d2 From 453c719261c0b4030b2676124adb6e81c5fb6833 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 20 Jan 2011 14:44:18 -0800 Subject: thp: keep highpte mapped until it is no longer needed Two users reported THP-related crashes on 32-bit x86 machines. Their oops reports indicated an invalid pte, and subsequent code inspection showed that the highpte is actually used after unmap. The fix is to unmap the pte only after all operations against it are finished. Signed-off-by: Johannes Weiner Reported-by: Ilya Dryomov Reported-by: werner Cc: Andrea Arcangeli Tested-by: Ilya Dryomov Tested-by: Steven Rostedt Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 004c9c2aac7..c4f634b3a48 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1837,9 +1837,9 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(ptl); isolated = __collapse_huge_page_isolate(vma, address, pte); spin_unlock(ptl); - pte_unmap(pte); if (unlikely(!isolated)) { + pte_unmap(pte); spin_lock(&mm->page_table_lock); BUG_ON(!pmd_none(*pmd)); set_pmd_at(mm, address, pmd, _pmd); @@ -1856,6 +1856,7 @@ static void collapse_huge_page(struct mm_struct *mm, anon_vma_unlock(vma->anon_vma); __collapse_huge_page_copy(pte, new_page, vma, address, ptl); + pte_unmap(pte); __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); VM_BUG_ON(page_count(pgtable) != 1); -- cgit v1.2.3-70-g09d2 From abb65272a190660790096628859e752172d822fd Mon Sep 17 00:00:00 2001 From: Tomi Valkeinen Date: Thu, 20 Jan 2011 14:44:20 -0800 Subject: memblock: fix memblock_is_region_memory() memblock_is_region_memory() uses reserved memblocks to search for the given region, while it should use the memory memblocks. I encountered the problem with OMAP's framebuffer ram allocation. Normally the ram is allocated dynamically, and this function is not called. However, if we want to pass the framebuffer from the bootloader to the kernel (to retain the boot image), this function is used to check the validity of the kernel parameters for the framebuffer ram area. Signed-off-by: Tomi Valkeinen Acked-by: Yinghai Lu Cc: Benjamin Herrenschmidt Cc: "H. 
Peter Anvin" Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index 400dc62697d..bdba245d8af 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -683,13 +683,13 @@ int __init_memblock memblock_is_memory(phys_addr_t addr) int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size) { - int idx = memblock_search(&memblock.reserved, base); + int idx = memblock_search(&memblock.memory, base); if (idx == -1) return 0; - return memblock.reserved.regions[idx].base <= base && - (memblock.reserved.regions[idx].base + - memblock.reserved.regions[idx].size) >= (base + size); + return memblock.memory.regions[idx].base <= base && + (memblock.memory.regions[idx].base + + memblock.memory.regions[idx].size) >= (base + size); } int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size) -- cgit v1.2.3-70-g09d2 From 3305de51bf612603c9a4e4dc98ceb839ef933749 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Thu, 20 Jan 2011 14:44:20 -0800 Subject: mm/vmscan.c: remove duplicate include of compaction.h Signed-off-by: Jesper Juhl Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 1 - 1 file changed, 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 47a50962ce8..f5d90dedebb 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -41,7 +41,6 @@ #include #include #include -#include #include #include -- cgit v1.2.3-70-g09d2 From 82478fb7bca28e3ca2f3c55c14e690f749dd4dbb Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 20 Jan 2011 14:44:21 -0800 Subject: mm: compaction: prevent division-by-zero during user-requested compaction Up until 3e7d344 ("mm: vmscan: reclaim order-0 and use compaction instead of lumpy reclaim"), compaction skipped calculating the fragmentation index of a zone when compaction was explicitely requested through the procfs knob. However, when compaction_suitable was introduced, it did not come with an extra check for order == -1, set on explicit compaction requests, and passed this order on to the fragmentation index calculation, where it overshifts the number of requested pages, leading to a division by zero. This patch makes sure that order == -1 is recognized as the flag it is rather than passing it along as valid order parameter. 
[akpm@linux-foundation.org: add comment, per Mel] Signed-off-by: Johannes Weiner Reviewed-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 6d592a02107..8be430b812d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -406,6 +406,10 @@ static int compact_finished(struct zone *zone, if (!zone_watermark_ok(zone, cc->order, watermark, 0, 0)) return COMPACT_CONTINUE; + /* + * order == -1 is expected when compacting via + * /proc/sys/vm/compact_memory + */ if (cc->order == -1) return COMPACT_CONTINUE; @@ -453,6 +457,13 @@ unsigned long compaction_suitable(struct zone *zone, int order) if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) return COMPACT_SKIPPED; + /* + * order == -1 is expected when compacting via + * /proc/sys/vm/compact_memory + */ + if (order == -1) + return COMPACT_CONTINUE; + /* * fragmentation index determines if allocation failures are due to * low memory or external fragmentation -- cgit v1.2.3-70-g09d2 From e401f1761c0b01966e36e41e2c385d455a7b44ee Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 20 Jan 2011 14:44:23 -0800 Subject: memcg: modify accounting function for supporting THP better mem_cgroup_charge_statistics() was designed for charging a page, but now we have transparent hugepages. To fix problems (in a following patch) it's required to change the function to take the number of pages as an argument. The new function takes the following as arguments: - type of page rather than 'pc' - size of page which is accounted. Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8ab84103143..6d59a2bd520 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -600,23 +600,22 @@ static void mem_cgroup_swap_statistics(struct mem_cgroup *mem, } static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, - struct page_cgroup *pc, - bool charge) + bool file, int nr_pages) { - int val = (charge) ? 1 : -1; - preempt_disable(); - if (PageCgroupCache(pc)) - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val); + if (file) + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], nr_pages); else - __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val); + __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], nr_pages); - if (charge) + /* pagein of a big page is an event.
So, ignore page size */ + if (nr_pages > 0) __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); else __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); - __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]); + + __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); preempt_enable(); } @@ -2115,7 +2114,7 @@ static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem, break; } - mem_cgroup_charge_statistics(mem, pc, true); + mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), 1); } static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, @@ -2186,14 +2185,14 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); preempt_enable(); } - mem_cgroup_charge_statistics(from, pc, false); + mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -1); if (uncharge) /* This is not "cancel", but cancel_charge does all we need. */ mem_cgroup_cancel_charge(from, PAGE_SIZE); /* caller should have done css_get */ pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, pc, true); + mem_cgroup_charge_statistics(to, PageCgroupCache(pc), 1); /* * We charges against "to" which may not have any tasks. Then, "to" * can be under rmdir(). But in current implementation, caller of @@ -2597,7 +2596,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) } for (i = 0; i < count; i++) - mem_cgroup_charge_statistics(mem, pc + i, false); + mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -1); ClearPageCgroupUsed(pc); /* -- cgit v1.2.3-70-g09d2 From ca3e021417eed30ec2b64ce88eb0acf64aa9bc29 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 20 Jan 2011 14:44:24 -0800 Subject: memcg: fix USED bit handling at uncharge in THP Now, under THP: at charge: - PageCgroupUsed bit is set to all page_cgroup on a hugepage. ....set to 512 pages. at uncharge - PageCgroupUsed bit is unset on the head page. So, some pages will remain with "Used" bit. This patch fixes that Used bit is set only to the head page. Used bits for tail pages will be set at splitting if necessary. This patch adds this lock order: compound_lock() -> page_cgroup_move_lock(). 
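Expressed as a sketch, the nesting this establishes looks like the following (simplified; the concrete lock calls appear in the hunks of this and the following patches):

        flags = compound_lock_irqsave(page);       /* outer: stabilize against THP split */
        move_lock_page_cgroup(pc, &move_flags);    /* inner: protects pc->mem_cgroup and flags */
        /* ... move charges or fix up tail page_cgroups ... */
        move_unlock_page_cgroup(pc, &move_flags);
        compound_unlock_irqrestore(page, flags);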
[akpm@linux-foundation.org: fix warning] Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 9 +++++ mm/huge_memory.c | 2 + mm/memcontrol.c | 91 ++++++++++++++++++++++++++-------------------- 3 files changed, 62 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 6a576f98943..f512e189be5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -146,6 +146,10 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, gfp_t gfp_mask); u64 mem_cgroup_get_limit(struct mem_cgroup *mem); +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail); +#endif + #else /* CONFIG_CGROUP_MEM_RES_CTLR */ struct mem_cgroup; @@ -335,6 +339,11 @@ u64 mem_cgroup_get_limit(struct mem_cgroup *mem) return 0; } +static inline void mem_cgroup_split_huge_fixup(struct page *head, + struct page *tail) +{ +} + #endif /* CONFIG_CGROUP_MEM_CONT */ #endif /* _LINUX_MEMCONTROL_H */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c4f634b3a48..e187454d82f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1203,6 +1203,8 @@ static void __split_huge_page_refcount(struct page *page) BUG_ON(!PageDirty(page_tail)); BUG_ON(!PageSwapBacked(page_tail)); + mem_cgroup_split_huge_fixup(page, page_tail); + lru_add_page_tail(zone, page, page_tail); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6d59a2bd520..848b42195e5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1614,7 +1614,7 @@ void mem_cgroup_update_page_stat(struct page *page, if (unlikely(!mem || !PageCgroupUsed(pc))) goto out; /* pc->mem_cgroup is unstable ? */ - if (unlikely(mem_cgroup_stealed(mem))) { + if (unlikely(mem_cgroup_stealed(mem)) || PageTransHuge(page)) { /* take a lock against to access pc->mem_cgroup */ move_lock_page_cgroup(pc, &flags); need_unlock = true; @@ -2083,14 +2083,27 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) return mem; } -/* - * commit a charge got by __mem_cgroup_try_charge() and makes page_cgroup to be - * USED state. If already USED, uncharge and return. - */ -static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype) +static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, + struct page_cgroup *pc, + enum charge_type ctype, + int page_size) { + int nr_pages = page_size >> PAGE_SHIFT; + + /* try_charge() can return NULL to *memcg, taking care of it. */ + if (!mem) + return; + + lock_page_cgroup(pc); + if (unlikely(PageCgroupUsed(pc))) { + unlock_page_cgroup(pc); + mem_cgroup_cancel_charge(mem, page_size); + return; + } + /* + * we don't need page_cgroup_lock about tail pages, becase they are not + * accessed by any other context at this point. + */ pc->mem_cgroup = mem; /* * We access a page_cgroup asynchronously without lock_page_cgroup(). @@ -2114,35 +2127,7 @@ static void ____mem_cgroup_commit_charge(struct mem_cgroup *mem, break; } - mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), 1); -} - -static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, - struct page_cgroup *pc, - enum charge_type ctype, - int page_size) -{ - int i; - int count = page_size >> PAGE_SHIFT; - - /* try_charge() can return NULL to *memcg, taking care of it. 
*/ - if (!mem) - return; - - lock_page_cgroup(pc); - if (unlikely(PageCgroupUsed(pc))) { - unlock_page_cgroup(pc); - mem_cgroup_cancel_charge(mem, page_size); - return; - } - - /* - * we don't need page_cgroup_lock about tail pages, becase they are not - * accessed by any other context at this point. - */ - for (i = 0; i < count; i++) - ____mem_cgroup_commit_charge(mem, pc + i, ctype); - + mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), nr_pages); unlock_page_cgroup(pc); /* * "charge_statistics" updated event counter. Then, check it. @@ -2152,6 +2137,34 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem, memcg_check_events(mem, pc->page); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + +#define PCGF_NOCOPY_AT_SPLIT ((1 << PCG_LOCK) | (1 << PCG_MOVE_LOCK) |\ + (1 << PCG_ACCT_LRU) | (1 << PCG_MIGRATION)) +/* + * Because tail pages are not marked as "used", set it. We're under + * zone->lru_lock, 'splitting on pmd' and compund_lock. + */ +void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) +{ + struct page_cgroup *head_pc = lookup_page_cgroup(head); + struct page_cgroup *tail_pc = lookup_page_cgroup(tail); + unsigned long flags; + + /* + * We have no races witch charge/uncharge but will have races with + * page state accounting. + */ + move_lock_page_cgroup(head_pc, &flags); + + tail_pc->mem_cgroup = head_pc->mem_cgroup; + smp_wmb(); /* see __commit_charge() */ + /* we don't need to copy all flags...*/ + tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; + move_unlock_page_cgroup(head_pc, &flags); +} +#endif + /** * __mem_cgroup_move_account - move account of the page * @pc: page_cgroup of the page. @@ -2545,7 +2558,6 @@ direct_uncharge: static struct mem_cgroup * __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) { - int i; int count; struct page_cgroup *pc; struct mem_cgroup *mem = NULL; @@ -2595,8 +2607,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) break; } - for (i = 0; i < count; i++) - mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -1); + mem_cgroup_charge_statistics(mem, PageCgroupCache(pc), -count); ClearPageCgroupUsed(pc); /* -- cgit v1.2.3-70-g09d2 From ece35ca810326946ddc930c43356312ad5de44d4 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 20 Jan 2011 14:44:24 -0800 Subject: memcg: fix LRU accounting with THP memory cgroup's LRU stat should take care of size of pages because Transparent Hugepage inserts hugepage into LRU. If this value is the number wrong, memory reclaim will not work well. Note: only head page of THP's huge page is linked into LRU. Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 848b42195e5..7a94ef6b35e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -814,7 +814,8 @@ void mem_cgroup_del_lru_list(struct page *page, enum lru_list lru) * removed from global LRU. */ mz = page_cgroup_zoneinfo(pc); - MEM_CGROUP_ZSTAT(mz, lru) -= 1; + /* huge page split is done under lru_lock. so, we have no races. 
*/ + MEM_CGROUP_ZSTAT(mz, lru) -= 1 << compound_order(page); if (mem_cgroup_is_root(pc->mem_cgroup)) return; VM_BUG_ON(list_empty(&pc->lru)); @@ -865,7 +866,8 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) return; mz = page_cgroup_zoneinfo(pc); - MEM_CGROUP_ZSTAT(mz, lru) += 1; + /* huge page split is done under lru_lock. so, we have no races. */ + MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); SetPageCgroupAcctLRU(pc); if (mem_cgroup_is_root(pc->mem_cgroup)) return; @@ -2152,14 +2154,26 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) unsigned long flags; /* - * We have no races witch charge/uncharge but will have races with + * We have no races with charge/uncharge but will have races with * page state accounting. */ move_lock_page_cgroup(head_pc, &flags); tail_pc->mem_cgroup = head_pc->mem_cgroup; smp_wmb(); /* see __commit_charge() */ - /* we don't need to copy all flags...*/ + if (PageCgroupAcctLRU(head_pc)) { + enum lru_list lru; + struct mem_cgroup_per_zone *mz; + + /* + * LRU flags cannot be copied because we need to add tail + *.page to LRU by generic call and our hook will be called. + * We hold lru_lock, then, reduce counter directly. + */ + lru = page_lru(head); + mz = page_cgroup_zoneinfo(head_pc); + MEM_CGROUP_ZSTAT(mz, lru) -= 1; + } tail_pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; move_unlock_page_cgroup(head_pc, &flags); } -- cgit v1.2.3-70-g09d2 From 987eba66e0e6aa654d60881a14731a353ee0acb4 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 20 Jan 2011 14:44:25 -0800 Subject: memcg: fix rmdir, force_empty with THP Now, when THP is enabled, memcg's rmdir() function is broken because move_account() for THP page is not supported. This will cause account leak or -EBUSY issue at rmdir(). This patch fixes the issue by supporting move_account() THP pages. Signed-off-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7a94ef6b35e..5b562b375cb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2197,8 +2197,11 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) */ static void __mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) + struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge, + int charge_size) { + int nr_pages = charge_size >> PAGE_SHIFT; + VM_BUG_ON(from == to); VM_BUG_ON(PageLRU(pc->page)); VM_BUG_ON(!page_is_cgroup_locked(pc)); @@ -2212,14 +2215,14 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]); preempt_enable(); } - mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -1); + mem_cgroup_charge_statistics(from, PageCgroupCache(pc), -nr_pages); if (uncharge) /* This is not "cancel", but cancel_charge does all we need. */ - mem_cgroup_cancel_charge(from, PAGE_SIZE); + mem_cgroup_cancel_charge(from, charge_size); /* caller should have done css_get */ pc->mem_cgroup = to; - mem_cgroup_charge_statistics(to, PageCgroupCache(pc), 1); + mem_cgroup_charge_statistics(to, PageCgroupCache(pc), nr_pages); /* * We charges against "to" which may not have any tasks. Then, "to" * can be under rmdir(). 
But in current implementation, caller of @@ -2234,15 +2237,19 @@ static void __mem_cgroup_move_account(struct page_cgroup *pc, * __mem_cgroup_move_account() */ static int mem_cgroup_move_account(struct page_cgroup *pc, - struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge) + struct mem_cgroup *from, struct mem_cgroup *to, + bool uncharge, int charge_size) { int ret = -EINVAL; unsigned long flags; + if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page)) + return -EBUSY; + lock_page_cgroup(pc); if (PageCgroupUsed(pc) && pc->mem_cgroup == from) { move_lock_page_cgroup(pc, &flags); - __mem_cgroup_move_account(pc, from, to, uncharge); + __mem_cgroup_move_account(pc, from, to, uncharge, charge_size); move_unlock_page_cgroup(pc, &flags); ret = 0; } @@ -2267,6 +2274,8 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, struct cgroup *cg = child->css.cgroup; struct cgroup *pcg = cg->parent; struct mem_cgroup *parent; + int charge = PAGE_SIZE; + unsigned long flags; int ret; /* Is ROOT ? */ @@ -2278,17 +2287,23 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, goto out; if (isolate_lru_page(page)) goto put; + /* The page is isolated from LRU and we have no race with splitting */ + charge = PAGE_SIZE << compound_order(page); parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, - PAGE_SIZE); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, charge); if (ret || !parent) goto put_back; - ret = mem_cgroup_move_account(pc, child, parent, true); + if (charge > PAGE_SIZE) + flags = compound_lock_irqsave(page); + + ret = mem_cgroup_move_account(pc, child, parent, true, charge); if (ret) - mem_cgroup_cancel_charge(parent, PAGE_SIZE); + mem_cgroup_cancel_charge(parent, charge); put_back: + if (charge > PAGE_SIZE) + compound_unlock_irqrestore(page, flags); putback_lru_page(page); put: put_page(page); @@ -4868,7 +4883,7 @@ retry: goto put; pc = lookup_page_cgroup(page); if (!mem_cgroup_move_account(pc, - mc.from, mc.to, false)) { + mc.from, mc.to, false, PAGE_SIZE)) { mc.precharge--; /* we uncharge from mc.from later. */ mc.moved_charge++; -- cgit v1.2.3-70-g09d2 From 382e27daa542ce97c500dc357841c6416c735cc2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 20 Jan 2011 14:44:26 -0800 Subject: mm: fix truncate_setsize() comment Contrary to what the comment says, truncate_setsize() should be called *before* filesystem truncated blocks. Signed-off-by: Jan Kara Cc: Christoph Hellwig Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/truncate.c b/mm/truncate.c index 3c2d5ddfa0d..49feb46e77b 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -549,13 +549,12 @@ EXPORT_SYMBOL(truncate_pagecache); * @inode: inode * @newsize: new file size * - * truncate_setsize updastes i_size update and performs pagecache - * truncation (if necessary) for a file size updates. It will be - * typically be called from the filesystem's setattr function when - * ATTR_SIZE is passed in. + * truncate_setsize updates i_size and performs pagecache truncation (if + * necessary) to @newsize. It will be typically be called from the filesystem's + * setattr function when ATTR_SIZE is passed in. * - * Must be called with inode_mutex held and after all filesystem - * specific block truncation has been performed. 
+ * Must be called with inode_mutex held and before all filesystem specific + * block truncation has been performed. */ void truncate_setsize(struct inode *inode, loff_t newsize) { -- cgit v1.2.3-70-g09d2 From 713735b4233fad3ae35b5cad656baa41413887ca Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 20 Jan 2011 14:44:31 -0800 Subject: memcg: correctly order reading PCG_USED and pc->mem_cgroup The placement of the read-side barrier is confused: the writer first sets pc->mem_cgroup, then PCG_USED. The read-side barrier has to be between testing PCG_USED and reading pc->mem_cgroup. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Acked-by: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5b562b375cb..db76ef72629 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -836,13 +836,12 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) return; pc = lookup_page_cgroup(page); - /* - * Used bit is set without atomic ops but after smp_wmb(). - * For making pc->mem_cgroup visible, insert smp_rmb() here. - */ - smp_rmb(); /* unused or root page is not rotated. */ - if (!PageCgroupUsed(pc) || mem_cgroup_is_root(pc->mem_cgroup)) + if (!PageCgroupUsed(pc)) + return; + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); + if (mem_cgroup_is_root(pc->mem_cgroup)) return; mz = page_cgroup_zoneinfo(pc); list_move(&pc->lru, &mz->lists[lru]); @@ -857,14 +856,10 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) return; pc = lookup_page_cgroup(page); VM_BUG_ON(PageCgroupAcctLRU(pc)); - /* - * Used bit is set without atomic ops but after smp_wmb(). - * For making pc->mem_cgroup visible, insert smp_rmb() here. - */ - smp_rmb(); if (!PageCgroupUsed(pc)) return; - + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); mz = page_cgroup_zoneinfo(pc); /* huge page split is done under lru_lock. so, we have no races. */ MEM_CGROUP_ZSTAT(mz, lru) += 1 << compound_order(page); @@ -1031,14 +1026,10 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return NULL; pc = lookup_page_cgroup(page); - /* - * Used bit is set without atomic ops but after smp_wmb(). - * For making pc->mem_cgroup visible, insert smp_rmb() here. - */ - smp_rmb(); if (!PageCgroupUsed(pc)) return NULL; - + /* Ensure pc->mem_cgroup is visible after reading PCG_USED. */ + smp_rmb(); mz = page_cgroup_zoneinfo(pc); if (!mz) return NULL; -- cgit v1.2.3-70-g09d2 From f95ba941d1bee594d536cdcbf879a0865381b903 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 25 Jan 2011 15:07:11 -0800 Subject: mm/pgtable-generic.c: fix CONFIG_SWAP=n build mips (and sparc32): In file included from arch/mips/include/asm/tlb.h:21, from mm/pgtable-generic.c:9: include/asm-generic/tlb.h: In function `tlb_flush_mmu': include/asm-generic/tlb.h:76: error: implicit declaration of function `release_pages' include/asm-generic/tlb.h: In function `tlb_remove_page': include/asm-generic/tlb.h:105: error: implicit declaration of function `page_cache_release' free_pages_and_swap_cache() and free_page_and_swap_cache() are macros which call release_pages() and page_cache_release(). The obvious fix is to include pagemap.h in swap.h, where those macros are defined. But that breaks sparc for weird reasons. So fix it within mm/pgtable-generic.c instead. 
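A condensed sketch of the breakage; the macro bodies are paraphrased from the 2.6.38-era swap.h and should be read as illustrative rather than verbatim:

        /* include/linux/swap.h with CONFIG_SWAP=n -- macros, not functions: */
        #define free_page_and_swap_cache(page)          page_cache_release(page)
        #define free_pages_and_swap_cache(pages, nr)    release_pages((pages), (nr), 0)

        /*
         * Both expansions rely on declarations from linux/pagemap.h, so a
         * .c file that expands them (here via asm-generic/tlb.h) must see
         * pagemap.h itself -- hence the one-line include added below.
         */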
Reported-by: Yoichi Yuasa Cc: Geert Uytterhoeven Acked-by: Sam Ravnborg Cc: Sergei Shtylyov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/pgtable-generic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 0369f5b3ba1..eb663fb533e 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -6,6 +6,7 @@ * Copyright (C) 2010 Linus Torvalds */ +#include <linux/pagemap.h> #include #include -- cgit v1.2.3-70-g09d2 From f33261d75b88f55a08e6a9648cef73509979bfba Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 25 Jan 2011 15:07:20 -0800 Subject: mm: fix deferred congestion timeout if preferred zone is not allowed Before 0e093d99763e ("writeback: do not sleep on the congestion queue if there are no congested BDIs or if significant congestion is not being encountered in the current zone"), preferred_zone was only used for NUMA statistics, to determine the zoneidx from which to allocate given the type requested, and whether to utilize memory compaction. wait_iff_congested(), though, uses preferred_zone to determine if the congestion wait should be deferred because its dirty pages are backed by a congested bdi. This incorrectly defers the timeout and busy loops in the page allocator with various cond_resched() calls if preferred_zone is not allowed in the current context, usually consuming 100% of a cpu. This patch ensures preferred_zone is an allowed zone in the fastpath depending on whether current is constrained by its cpuset or nodes in its mempolicy (when the nodemask passed is non-NULL). This is correct since the fastpath allocation always passes ALLOC_CPUSET when trying to allocate memory. In the slowpath, this patch resets preferred_zone to the first zone of the allowed type when the allocation is not constrained by current's cpuset, i.e. it does not pass ALLOC_CPUSET. This patch also ensures preferred_zone is from the set of allowed nodes when called from within direct reclaim since allocations are always constrained by cpusets in this context (it is blockable). Both of these uses of cpuset_current_mems_allowed are protected by get_mems_allowed(). Signed-off-by: David Rientjes Cc: Mel Gorman Cc: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Acked-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 12 +++++++++++- mm/vmscan.c | 3 ++- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 90c1439549f..f4967910c96 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2034,6 +2034,14 @@ restart: */ alloc_flags = gfp_to_alloc_flags(gfp_mask); + /* + * Find the true preferred zone if the allocation is unconstrained by + * cpusets. + */ + if (!(alloc_flags & ALLOC_CPUSET) && !nodemask) + first_zones_zonelist(zonelist, high_zoneidx, NULL, + &preferred_zone); + /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -2192,7 +2200,9 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, get_mems_allowed(); /* The preferred zone is used for statistics later */ - first_zones_zonelist(zonelist, high_zoneidx, nodemask, &preferred_zone); + first_zones_zonelist(zonelist, high_zoneidx, + nodemask ?
: &cpuset_current_mems_allowed, + &preferred_zone); if (!preferred_zone) { put_mems_allowed(); return NULL; diff --git a/mm/vmscan.c b/mm/vmscan.c index f5d90dedebb..148c6e630df 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2083,7 +2083,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zone *preferred_zone; first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), - NULL, &preferred_zone); + &cpuset_current_mems_allowed, + &preferred_zone); wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); } } -- cgit v1.2.3-70-g09d2 From 2ff754fa8f416e82327f2d8f1354a033b66286df Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 25 Jan 2011 15:07:23 -0800 Subject: mm: clear pages_scanned only if draining a pcp adds pages to the buddy allocator Commit 0e093d99763e ("writeback: do not sleep on the congestion queue if there are no congested BDIs or if significant congestion is not being encountered in the current zone") uncovered a livelock in the page allocator that resulted in tasks infinitely looping trying to find memory and kswapd running at 100% cpu. The issue occurs because drain_all_pages() is called immediately following direct reclaim when no memory is freed and try_to_free_pages() returns non-zero because all zones in the zonelist do not have their all_unreclaimable flag set. When draining the per-cpu pagesets back to the buddy allocator for each zone, the zone->pages_scanned counter is cleared to avoid erroneously setting zone->all_unreclaimable later. The problem is that no pages may actually be drained and, thus, the unreclaimable logic never fails direct reclaim so the oom killer may be invoked. This apparently only manifested after wait_iff_congested() was introduced and the zone was full of anonymous memory that would not congest the backing store. The page allocator would infinitely loop if there were no other tasks waiting to be scheduled and clear zone->pages_scanned because of drain_all_pages() as the result of this change before kswapd could scan enough pages to trigger the reclaim logic. Additionally, with every loop of the page allocator and in the reclaim path, kswapd would be kicked and would end up running at 100% cpu. In this scenario, current and kswapd are all running continuously with kswapd incrementing zone->pages_scanned and current clearing it. The problem is even more pronounced when current swaps some of its memory to swap cache and the reclaimable logic then considers all active anonymous memory in the all_unreclaimable logic, which requires a much higher zone->pages_scanned value for try_to_free_pages() to return zero that is never attainable in this scenario. Before wait_iff_congested(), the page allocator would incur an unconditional timeout and allow kswapd to elevate zone->pages_scanned to a level that the oom killer would be called the next time it loops. The fix is to only attempt to drain pcp pages if there is actually a quantity to be drained. The unconditional clearing of zone->pages_scanned in free_pcppages_bulk() need not be changed since other callers already ensure that draining will occur. This patch ensures that free_pcppages_bulk() will actually free memory before calling into it from drain_all_pages() so zone->pages_scanned is only cleared if appropriate. 
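In sketch form, the change is a guard around the bulk free, mirroring the drain_pages() hunk below:

        /* Before: always calls free_pcppages_bulk(), which clears
         * zone->pages_scanned even when there is nothing to drain. */
        free_pcppages_bulk(zone, pcp->count, pcp);
        pcp->count = 0;

        /* After: only drain -- and thus only touch pages_scanned -- when
         * the per-cpu list actually holds pages. */
        if (pcp->count) {
                free_pcppages_bulk(zone, pcp->count, pcp);
                pcp->count = 0;
        }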
Signed-off-by: David Rientjes Cc: Mel Gorman Reviewed-by: Johannes Weiner Cc: Minchan Kim Cc: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f4967910c96..a873e61e312 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1088,8 +1088,10 @@ static void drain_pages(unsigned int cpu) pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - free_pcppages_bulk(zone, pcp->count, pcp); - pcp->count = 0; + if (pcp->count) { + free_pcppages_bulk(zone, pcp->count, pcp); + pcp->count = 0; + } local_irq_restore(flags); } } -- cgit v1.2.3-70-g09d2 From 8dba474f034c322d96ada39cb20cac711d80dcb2 Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Tue, 25 Jan 2011 15:07:24 -0800 Subject: mm/memcontrol.c: fix uninitialized variable use in mem_cgroup_move_parent() In mm/memcontrol.c::mem_cgroup_move_parent() there's a path that jumps to the 'put_back' label ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, charge); if (ret || !parent) goto put_back; where we'll if (charge > PAGE_SIZE) compound_unlock_irqrestore(page, flags); but, we have not assigned anything to 'flags' at this point, nor have we called 'compound_lock_irqsave()' (which is what sets 'flags'). The 'put_back' label should be moved below the call to compound_unlock_irqrestore() as per this patch. Signed-off-by: Jesper Juhl Cc: Balbir Singh Cc: Daisuke Nishimura Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelianov Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index db76ef72629..4fcf47a6255 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2292,9 +2292,10 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, ret = mem_cgroup_move_account(pc, child, parent, true, charge); if (ret) mem_cgroup_cancel_charge(parent, charge); -put_back: + if (charge > PAGE_SIZE) compound_unlock_irqrestore(page, flags); +put_back: putback_lru_page(page); put: put_page(page); -- cgit v1.2.3-70-g09d2 From 33a938774fdb9933e9c77504b035f4f87c0859df Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 25 Jan 2011 15:07:25 -0800 Subject: mm: compaction: don't depend on HUGETLB_PAGE Commit 5d6892407 ("thp: select CONFIG_COMPACTION if TRANSPARENT_HUGEPAGE enabled") causes this warning during the configuration process: warning: (TRANSPARENT_HUGEPAGE) selects COMPACTION which has unmet direct dependencies (EXPERIMENTAL && HUGETLB_PAGE && MMU) COMPACTION doesn't depend on HUGETLB_PAGE, it doesn't depend on THP either, it is also useful for regular alloc_pages(order > 0) including the very kernel stack during fork (THREAD_ORDER = 1). It's always better to enable COMPACTION. The warning should be an error because we would end up with MIGRATION not selected, and COMPACTION wouldn't work without migration (despite it seems to build with an inline migrate_pages returning -ENOSYS). I'd also like to remove EXPERIMENTAL: compaction has been in the kernel for some releases (for full safety the default remains disabled which I think is enough). 
Signed-off-by: Andrea Arcangeli Reported-by: Luca Tettamanti Tested-by: Luca Tettamanti Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index 3ad483bdf50..e9c0c61f2dd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -179,7 +179,7 @@ config SPLIT_PTLOCK_CPUS config COMPACTION bool "Allow for memory compaction" select MIGRATION - depends on EXPERIMENTAL && HUGETLB_PAGE && MMU + depends on MMU help Allows the compaction of memory for the allocation of huge pages. -- cgit v1.2.3-70-g09d2 From 28bd65781c848d95ba6a7f58b5c4b8265a804ec6 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 25 Jan 2011 15:07:26 -0800 Subject: mm: migration: clarify migrate_pages() comment Callers of migrate_pages should call putback_lru_pages to return the isolated pages to the LRU or free list. The current comment is rather confusing: it says the caller always has to call it. It is clearer to point out that the caller has to call it only if migrate_pages's return value isn't zero. Signed-off-by: Minchan Kim Cc: Christoph Lameter Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 46fe8cc13d6..9f29a3b7aac 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -888,7 +888,7 @@ out: * are movable anymore because to has become empty * or no retryable pages exist anymore. * Caller should call putback_lru_pages to return pages to the LRU - * or free list. + * or free list only if ret != 0. * * Return: Number of pages not migrated or error code. */ -- cgit v1.2.3-70-g09d2 From 01c88e2d6b7330c0cc5867fe2297e7d826e1337d Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 25 Jan 2011 15:07:27 -0800 Subject: memcg: fix account leak at failure of memsw accounting Commit 4b53433468 ("memcg: clean up try_charge main loop") removed the cancellation of the charge for the case: memory charge -> success, mem+swap charge -> failure. This leaks memory usage. Fix it (a sketch of the corrected ordering follows below). Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Johannes Weiner Acked-by: Daisuke Nishimura Cc: Balbir Singh Cc: [2.6.36+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4fcf47a6255..1eb1a04f874 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1832,6 +1832,7 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, if (likely(!ret)) return CHARGE_OK; + res_counter_uncharge(&mem->res, csize); mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else -- cgit v1.2.3-70-g09d2 From 3d37c4a9199920964ffdfaec6335d93b9dcf9ca5 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 25 Jan 2011 15:07:28 -0800 Subject: memcg: bugfix check mem_cgroup_disabled() at split fixup mem_cgroup_disabled() should be checked at splitting. If disabled, no heavy work is necessary.
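For the memsw accounting fix above, a minimal sketch of the charge ordering it restores (simplified signatures; res is the memory counter, memsw the memory+swap counter):

	ret = res_counter_charge(&mem->res, csize, &fail_res);
	if (!ret && do_swap_account) {
		ret = res_counter_charge(&mem->memsw, csize, &fail_res);
		if (ret)
			/* memsw failed after res succeeded: roll back the
			 * memory charge, or its usage is leaked forever */
			res_counter_uncharge(&mem->res, csize);
	}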
Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Daisuke Nishimura Reviewed-by: Johannes Weiner Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1eb1a04f874..8ab1d42664f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2145,6 +2145,8 @@ void mem_cgroup_split_huge_fixup(struct page *head, struct page *tail) struct page_cgroup *tail_pc = lookup_page_cgroup(tail); unsigned long flags; + if (mem_cgroup_disabled()) + return; /* * We have no races with charge/uncharge but will have races with * page state accounting. -- cgit v1.2.3-70-g09d2 From 52dbb9050936fd33ceb45f10529dbc992507c058 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 25 Jan 2011 15:07:29 -0800 Subject: memcg: fix race at move_parent around compound_order() Fix up mem_cgroup_move_parent(), which used compound_order() in an asynchronous manner: because we don't take the compound page lock there, compound_order() may return a stale value. Use PageTransHuge() and HPAGE_SIZE instead of it. Also clean up mem_cgroup_move_parent(): - remove the unnecessary initialization of a local variable. - rename charge_size -> page_size. - remove an unnecessary (wrong) comment. - add a comment about THP. Note: the current design takes compound_page_lock() in the caller of move_account(). This should be revisited when we implement direct move_task of a hugepage without splitting. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Johannes Weiner Acked-by: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8ab1d42664f..3878cfe399d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2236,7 +2236,12 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, { int ret = -EINVAL; unsigned long flags; - + /* + * The page is isolated from LRU. So, collapse function + * will not handle this page. But page splitting can happen. + * Do this check under compound_page_lock(). The caller should + * hold it.
+ */ if ((charge_size > PAGE_SIZE) && !PageTransHuge(pc->page)) return -EBUSY; @@ -2268,7 +2273,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, struct cgroup *cg = child->css.cgroup; struct cgroup *pcg = cg->parent; struct mem_cgroup *parent; - int charge = PAGE_SIZE; + int page_size = PAGE_SIZE; unsigned long flags; int ret; @@ -2281,22 +2286,24 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, goto out; if (isolate_lru_page(page)) goto put; - /* The page is isolated from LRU and we have no race with splitting */ - charge = PAGE_SIZE << compound_order(page); + + if (PageTransHuge(page)) + page_size = HPAGE_SIZE; parent = mem_cgroup_from_cont(pcg); - ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, charge); + ret = __mem_cgroup_try_charge(NULL, gfp_mask, + &parent, false, page_size); if (ret || !parent) goto put_back; - if (charge > PAGE_SIZE) + if (page_size > PAGE_SIZE) flags = compound_lock_irqsave(page); - ret = mem_cgroup_move_account(pc, child, parent, true, charge); + ret = mem_cgroup_move_account(pc, child, parent, true, page_size); if (ret) - mem_cgroup_cancel_charge(parent, charge); + mem_cgroup_cancel_charge(parent, page_size); - if (charge > PAGE_SIZE) + if (page_size > PAGE_SIZE) compound_unlock_irqrestore(page, flags); put_back: putback_lru_page(page); -- cgit v1.2.3-70-g09d2 From 0a08739e81671de2cb690774937fe510c000b27f Mon Sep 17 00:00:00 2001 From: Jesper Juhl Date: Sat, 30 Oct 2010 23:43:05 +0200 Subject: kmemleak: remove memset by using kzalloc We don't need to memset if we just use kzalloc() rather than kmalloc() in kmemleak_test_init(). Signed-off-by: Jesper Juhl Reviewed-by: Minchan Kim Signed-off-by: Catalin Marinas --- mm/kmemleak-test.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak-test.c b/mm/kmemleak-test.c index 177a5169bbd..ff0d9779cec 100644 --- a/mm/kmemleak-test.c +++ b/mm/kmemleak-test.c @@ -75,13 +75,11 @@ static int __init kmemleak_test_init(void) * after the module is removed. */ for (i = 0; i < 10; i++) { - elem = kmalloc(sizeof(*elem), GFP_KERNEL); - pr_info("kmemleak: kmalloc(sizeof(*elem)) = %p\n", elem); + elem = kzalloc(sizeof(*elem), GFP_KERNEL); + pr_info("kmemleak: kzalloc(sizeof(*elem)) = %p\n", elem); if (!elem) return -ENOMEM; - memset(elem, 0, sizeof(*elem)); INIT_LIST_HEAD(&elem->list); - list_add_tail(&elem->list, &test_list); } -- cgit v1.2.3-70-g09d2 From 6ae4bd1f0bc479984f30061b5e5116060c24a267 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 27 Jan 2011 10:30:26 +0000 Subject: kmemleak: Allow kmemleak metadata allocations to fail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch adds __GFP_NORETRY and __GFP_NOMEMALLOC flags to the kmemleak metadata allocations so that it has a smaller effect on the users of the kernel slab allocator. Since kmemleak allocations can now fail more often, this patch also reduces the verbosity by passing __GFP_NOWARN and not dumping the stack trace when a kmemleak allocation fails. 
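A flag-by-flag reading of the new gfp_kmemleak_mask() (the commentary is an interpretation, not part of the patch):

	gfp_t g;

	g = gfp & (GFP_KERNEL | GFP_ATOMIC); /* keep only the caller's basic reclaim context */
	g |= __GFP_NORETRY;	/* give up early instead of retrying hard */
	g |= __GFP_NOMEMALLOC;	/* never dip into the emergency reserves */
	g |= __GFP_NOWARN;	/* no allocation-failure warning and backtrace */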
Signed-off-by: Catalin Marinas Reported-by: Toralf Förster Acked-by: Pekka Enberg Acked-by: David Rientjes Cc: Ted Ts'o --- mm/kmemleak.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/kmemleak.c b/mm/kmemleak.c index bd9bc214091..84225f3b719 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -113,7 +113,9 @@ #define BYTES_PER_POINTER sizeof(void *) /* GFP bitmask for kmemleak internal allocations */ -#define GFP_KMEMLEAK_MASK (GFP_KERNEL | GFP_ATOMIC) +#define gfp_kmemleak_mask(gfp) (((gfp) & (GFP_KERNEL | GFP_ATOMIC)) | \ + __GFP_NORETRY | __GFP_NOMEMALLOC | \ + __GFP_NOWARN) /* scanning area inside a memory block */ struct kmemleak_scan_area { @@ -511,9 +513,10 @@ static struct kmemleak_object *create_object(unsigned long ptr, size_t size, struct kmemleak_object *object; struct prio_tree_node *node; - object = kmem_cache_alloc(object_cache, gfp & GFP_KMEMLEAK_MASK); + object = kmem_cache_alloc(object_cache, gfp_kmemleak_mask(gfp)); if (!object) { - kmemleak_stop("Cannot allocate a kmemleak_object structure\n"); + pr_warning("Cannot allocate a kmemleak_object structure\n"); + kmemleak_disable(); return NULL; } @@ -734,9 +737,9 @@ static void add_scan_area(unsigned long ptr, size_t size, gfp_t gfp) return; } - area = kmem_cache_alloc(scan_area_cache, gfp & GFP_KMEMLEAK_MASK); + area = kmem_cache_alloc(scan_area_cache, gfp_kmemleak_mask(gfp)); if (!area) { - kmemleak_warn("Cannot allocate a scan area\n"); + pr_warning("Cannot allocate a scan area\n"); goto out; } -- cgit v1.2.3-70-g09d2 From fdf4c587a793ba87935e38e7f25a9540bc9a7b95 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Mon, 31 Jan 2011 17:03:41 -0800 Subject: mlock: operate on any regions with protection != PROT_NONE As Tao Ma noticed, change 5ecfda0 breaks blktrace. This is because blktrace mmaps a file with PROT_WRITE permissions but without PROT_READ, so my attempt to not unnecessarily break COW during mlock ended up causing mlock to fail with a permission problem. I am proposing to let mlock ignore vma protection in all cases except PROT_NONE. In particular, mlock should not fail for PROT_WRITE regions (as in the blktrace case, which broke at 5ecfda0) or for PROT_EXEC regions (which seem to me like they were always broken). Signed-off-by: Michel Lespinasse Acked-by: Rik van Riel Signed-off-by: Linus Torvalds --- mm/mlock.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'mm') diff --git a/mm/mlock.c b/mm/mlock.c index 13e81ee8be9..c3924c7f00b 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -178,6 +178,13 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma, if ((vma->vm_flags & (VM_WRITE | VM_SHARED)) == VM_WRITE) gup_flags |= FOLL_WRITE; + /* + * We want mlock to succeed for regions that have any permissions + * other than PROT_NONE. + */ + if (vma->vm_flags & (VM_READ | VM_WRITE | VM_EXEC)) + gup_flags |= FOLL_FORCE; + if (vma->vm_flags & VM_LOCKED) gup_flags |= FOLL_MLOCK; -- cgit v1.2.3-70-g09d2 From fceda1bf498677501befc7da72fd2e4de7f18466 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 1 Feb 2011 15:52:30 -0800 Subject: memsw: handle swapaccount kernel parameter correctly __setup-based kernel command line parameter handlers, which are handled in obsolete_checksetup, are provided with the parameter value including = (more precisely, everything right after the parameter name). This means that the current implementation of swapaccount[=1|0] doesn't work at all: if there is a value for the parameter, we are testing for "0" resp. "1" but we are getting "=0" resp. "=1", and if there is no parameter value we are getting an empty string rather than NULL. The original noswapaccount parameter, which doesn't care about the value, works correctly. Signed-off-by: Michal Hocko Acked-by: KAMEZAWA Hiroyuki Cc: Daisuke Nishimura Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3878cfe399d..44f9f9c89f0 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5024,9 +5024,9 @@ struct cgroup_subsys mem_cgroup_subsys = { static int __init enable_swap_account(char *s) { /* consider enabled if no parameter or 1 is given */ - if (!s || !strcmp(s, "1")) + if (!(*s) || !strcmp(s, "=1")) really_do_swap_account = 1; - else if (!strcmp(s, "0")) + else if (!strcmp(s, "=0")) really_do_swap_account = 0; return 1; } @@ -5034,7 +5034,7 @@ __setup("swapaccount", enable_swap_account); static int __init disable_swap_account(char *s) { - enable_swap_account("0"); + enable_swap_account("=0"); return 1; } __setup("noswapaccount", disable_swap_account); -- cgit v1.2.3-70-g09d2 From 552b372ba9db85751e7db2998f07cca2e51f5865 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 1 Feb 2011 15:52:31 -0800 Subject: memsw: deprecate noswapaccount kernel parameter and schedule it for removal noswapaccount couldn't be used to control memsw for both the on and off cases, so we have added the swapaccount[=0|1] parameter. This way we can turn the feature off in two ways: noswapaccount resp. swapaccount=0. We have kept the original noswapaccount, but I think we should remove it after some time, as it just adds more command line parameters without any advantage, and the code to handle the parameters is uglier if we want to support both. Signed-off-by: Michal Hocko Requested-by: KAMEZAWA Hiroyuki Acked-by: KAMEZAWA Hiroyuki Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/feature-removal-schedule.txt | 16 ++++++++++++++++ mm/memcontrol.c | 1 + 2 files changed, 17 insertions(+) (limited to 'mm') diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt index b959659c5df..b3f35e5f9c9 100644 --- a/Documentation/feature-removal-schedule.txt +++ b/Documentation/feature-removal-schedule.txt @@ -603,3 +603,19 @@ Why: The adm9240, w83792d and w83793 hardware monitoring drivers have Who: Jean Delvare ---------------------------- + +What: noswapaccount kernel command line parameter +When: 2.6.40 +Why: The original implementation of the memsw feature enabled by + CONFIG_CGROUP_MEM_RES_CTLR_SWAP could be disabled by the noswapaccount + kernel parameter (introduced in 2.6.29-rc1). Later on, this decision + turned out to be not ideal because we cannot have the feature compiled + in and disabled by default and let only interested users enable it + (e.g. general distribution kernels might need it). Therefore we have + added the swapaccount[=0|1] parameter (introduced in 2.6.37) which provides + both possibilities. If we remove noswapaccount we will have + fewer command line parameters with the same functionality and we + can also clean up the parameter handling a bit.
+Who: Michal Hocko + +---------------------------- diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 44f9f9c89f0..79abb1fd39d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5034,6 +5034,7 @@ __setup("swapaccount", enable_swap_account); static int __init disable_swap_account(char *s) { + printk_once("noswapaccount is deprecated and will be removed in 2.6.40. Use swapaccount=0 instead\n"); enable_swap_account("=0"); return 1; } -- cgit v1.2.3-70-g09d2 From 57fc4a5ee322cde96c33f101d3c2d3b79011c05c Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Tue, 1 Feb 2011 15:52:32 -0800 Subject: mm: when migrate_pages returns 0, all pages must have been released In some cases migrate_pages could return zero while still leaving a few pages in the pagelist (and some caller wouldn't notice it has to call putback_lru_pages after commit cf608ac19c9 ("mm: compaction: fix COMPACTPAGEFAILED counting")). Add one missing putback_lru_pages not added by commit cf608ac19c95 ("mm: compaction: fix COMPACTPAGEFAILED counting"). Signed-off-by: Andrea Arcangeli Signed-off-by: Minchan Kim Reviewed-by: Minchan Kim Cc: Christoph Lameter Acked-by: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 1 + mm/migrate.c | 3 +-- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 548fbd70f02..75398b0bfed 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1419,6 +1419,7 @@ int soft_offline_page(struct page *page, int flags) ret = migrate_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, true); if (ret) { + putback_lru_pages(&pagelist); pr_info("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) diff --git a/mm/migrate.c b/mm/migrate.c index 9f29a3b7aac..155a2e9a805 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -772,6 +772,7 @@ uncharge: unlock: unlock_page(page); +move_newpage: if (rc != -EAGAIN) { /* * A page that has been migrated has all references * removed and will be freed. A page that has not been * migrated will have kepts its references and be restored. */ list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); putback_lru_page(page); } -move_newpage: - /* * Move the new page to the LRU. If migration was not successful * then this will free the page. -- cgit v1.2.3-70-g09d2 From 48db54ee2f41e8ae2faf330b55db34a9fffb5b3c Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Tue, 1 Feb 2011 15:52:33 -0800 Subject: mm/migration: fix page corruption during hugepage migration If migrate_huge_page, as called by memory-failure, fails, it calls put_page itself to drop the page reference, and the caller of migrate_huge_page also calls putback_lru_pages. This can double-free the page and so corrupt it for the page holder. In addition, cleaning up the pages in the caller is behavior consistent with migrate_pages, as done by cf608ac19c ("mm: compaction: fix COMPACTPAGEFAILED counting").
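The ownership rule after this fix, sketched (list walk as in the patch; the isolation step is omitted): whoever isolated the pages drops the final reference on failure, and the migration core no longer does its own put_page():

	ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, true);
	if (ret) {
		struct page *p, *p2;
		/* pages still on the list were not migrated; drop exactly
		 * one reference each, here and nowhere else */
		list_for_each_entry_safe(p, p2, &pagelist, lru)
			put_page(p);
	}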
Signed-off-by: Minchan Kim Cc: Andrea Arcangeli Cc: Christoph Lameter Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 5 ++++- mm/migrate.c | 4 ---- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 75398b0bfed..237aaa488f4 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1295,7 +1295,10 @@ static int soft_offline_huge_page(struct page *page, int flags) ret = migrate_huge_pages(&pagelist, new_page, MPOL_MF_MOVE_ALL, 0, true); if (ret) { - putback_lru_pages(&pagelist); + struct page *page1, *page2; + list_for_each_entry_safe(page1, page2, &pagelist, lru) + put_page(page1); + pr_debug("soft offline: %#lx: migration failed %d, type %lx\n", pfn, ret, page->flags); if (ret > 0) diff --git a/mm/migrate.c b/mm/migrate.c index 155a2e9a805..76611525380 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -980,10 +980,6 @@ int migrate_huge_pages(struct list_head *from, } rc = 0; out: - - list_for_each_entry_safe(page, page2, from, lru) - put_page(page); - if (rc) return rc; -- cgit v1.2.3-70-g09d2 From efeda7a41e09efce506a68c3549b60b16dd7dedd Mon Sep 17 00:00:00 2001 From: Jin Dongming Date: Tue, 1 Feb 2011 15:52:39 -0800 Subject: thp: fix splitting of hwpoisoned hugepages The poisoned THP is now split with split_huge_page() in collect_procs_anon(). If kmalloc() fails in collect_procs(), split_huge_page() is not called, and the work after split_huge_page() for collecting the processes using the poisoned page will not be done either. So the processes using the poisoned page cannot be killed. The condition becomes worse when CONFIG_DEBUG_VM == "Y". Because the poisoned THP could not be split, a system panic will be caused by VM_BUG_ON(PageTransHuge(page)) in try_to_unmap(). This patch does: 1. move split_huge_page() to a place before collect_procs(). This makes sure that a failure to split the THP is caused by the splitting itself. 2. when splitting the THP fails, stop the operations after it. This avoids an unexpected system panic and pointless work. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Jin Dongming Reviewed-by: Hidetoshi Seto Cc: Andrea Arcangeli Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 237aaa488f4..1e9c30b241c 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -386,8 +386,6 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, struct task_struct *tsk; struct anon_vma *av; - if (!PageHuge(page) && unlikely(split_huge_page(page))) - return; read_lock(&tasklist_lock); av = page_lock_anon_vma(page); if (av == NULL) /* Not actually mapped anymore */ @@ -896,6 +894,34 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } } + if (PageTransHuge(hpage)) { + /* + * Verify that this isn't a hugetlbfs head page, the check for + * PageAnon is just to avoid tripping a split_huge_page + * internal debug check, as split_huge_page refuses to deal with + * anything that isn't an anon page. PageAnon can't go away from + * under us because we hold a refcount on the hpage; without a + * refcount on the hpage, split_huge_page can't be safely called + * in the first place, and having a refcount on the tail isn't + * enough to be safe.
+ */ + if (!PageHuge(hpage) && PageAnon(hpage)) { + if (unlikely(split_huge_page(hpage))) { + /* + * FIXME: if splitting THP is failed, it is + * better to stop the following operation rather + * than causing panic by unmapping. System might + * survive if the page is freed later. + */ + printk(KERN_INFO + "MCE %#lx: failed to split THP\n", pfn); + + BUG_ON(!PageHWPoison(p)); + return SWAP_FAIL; + } + } + } + /* * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, -- cgit v1.2.3-70-g09d2 From a6d30dddae4648837be5a0c0cb2c0ae9ad0377db Mon Sep 17 00:00:00 2001 From: Jin Dongming Date: Tue, 1 Feb 2011 15:52:40 -0800 Subject: thp: fix the wrong reported address of hwpoisoned hugepages When the tail page of a THP is poisoned, the head page will be poisoned too, and the wrong address, the address of the head page, will always be reported with the sigbus. So when the poisoned page is used by a Guest OS running on KVM, after the address translation (hva->gpa) by qemu, an unexpected process on the Guest OS will be killed by sigbus. What we expect is that the process using the poisoned tail page is killed on the Guest OS, not the process using the healthy head page. Since it is not good to poison a healthy page, avoid poisoning any page other than the one which is really poisoned. (While we poison all pages in a huge page in case of hugetlb, we can do this for THP thanks to split_huge_page().) Here we fix two parts: 1. Isolate only the poisoned page, to make sure the reported address is the address of the poisoned page. 2. Make the poisoned page work like a poisoned regular page. [akpm@linux-foundation.org: fix spello in comment] Signed-off-by: Jin Dongming Reviewed-by: Hidetoshi Seto Cc: Andrea Arcangeli Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 7 ++++++- mm/memory-failure.c | 27 ++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index e187454d82f..b6c1ce3c53b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1162,7 +1162,12 @@ static void __split_huge_page_refcount(struct page *page) /* after clearing PageTail the gup refcount can be released */ smp_mb(); - page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + /* + * retain hwpoison flag of the poisoned tail page: + * fix for the unsuitable process killed on Guest Machine(KVM) + * by the memory-failure. + */ + page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP | __PG_HWPOISON; page_tail->flags |= (page->flags & ((1L << PG_referenced) | (1L << PG_swapbacked) | diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 1e9c30b241c..04158d6f44d 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -854,6 +854,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, int ret; int kill = 1; struct page *hpage = compound_head(p); + struct page *ppage; if (PageReserved(p) || PageSlab(p)) return SWAP_SUCCESS; @@ -894,6 +895,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn, } } + /* + * ppage: poisoned page + * if p is regular page(4k page) + * ppage == real poisoned page; + * else p is hugetlb or THP, ppage == head page.
+ */ + ppage = hpage; + if (PageTransHuge(hpage)) { /* * Verify that this isn't a hugetlbfs head page, the check for @@ -919,6 +928,8 @@ BUG_ON(!PageHWPoison(p)); return SWAP_FAIL; } + /* THP is split, so ppage should be the real poisoned page. */ + ppage = p; } } @@ -931,12 +942,18 @@ * there's nothing that can be done. */ if (kill) - collect_procs(hpage, &tokill); + collect_procs(ppage, &tokill); - ret = try_to_unmap(hpage, ttu); + if (hpage != ppage) + lock_page_nosync(ppage); + + ret = try_to_unmap(ppage, ttu); if (ret != SWAP_SUCCESS) printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n", - pfn, page_mapcount(hpage)); + pfn, page_mapcount(ppage)); + + if (hpage != ppage) + unlock_page(ppage); /* * Now that the dirty bit has been propagated to the * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - kill_procs_ao(&tokill, !!PageDirty(hpage), trapno, + kill_procs_ao(&tokill, !!PageDirty(ppage), trapno, ret != SWAP_SUCCESS, p, pfn); return ret; @@ -1090,7 +1107,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * For error on the tail page, we should set PG_hwpoison * on the head page to show that the hugepage is hwpoisoned */ - if (PageTail(p) && TestSetPageHWPoison(hpage)) { + if (PageHuge(p) && PageTail(p) && TestSetPageHWPoison(hpage)) { action_result(pfn, "hugepage already hardware poisoned", IGNORED); unlock_page(hpage); -- cgit v1.2.3-70-g09d2 From af241a083404acda7ba3690e5b7697949d729fcc Mon Sep 17 00:00:00 2001 From: Jin Dongming Date: Tue, 1 Feb 2011 15:52:41 -0800 Subject: thp: fix unsuitable behavior for hwpoisoned tail page When a tail page of a THP is poisoned, memory-failure will do nothing except set PG_hwpoison, while the expected behavior is that the process using the poisoned tail page is killed. The above problem is caused by the LRU check on the poisoned tail page of the THP. Because the PG_lru flag is only set on the head page of a THP, the check always considers the poisoned tail page a non-LRU page. So the LRU check for the tail page of a THP should be avoided, as is already done for hugetlb. This patch adds !PageTransCompound() before the LRU check for THP; thanks to the check (!PageHuge() && !PageTransCompound()), the whole branch can be optimized away at build time when both hugetlbfs and THP are set to "N" (or on archs supporting neither of them). [akpm@linux-foundation.org: fix unrelated typo in shake_page() comment] Signed-off-by: Jin Dongming Reviewed-by: Hidetoshi Seto Cc: Andrea Arcangeli Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 04158d6f44d..0207c2f6f8b 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -233,8 +233,8 @@ void shake_page(struct page *p, int access) } /* - * Only all shrink_slab here (which would also - * shrink other caches) if access is not potentially fatal. + * Only call shrink_slab here (which would also shrink other caches) if + * access is not potentially fatal.
*/ if (access) { int nr; @@ -1065,19 +1065,22 @@ int __memory_failure(unsigned long pfn, int trapno, int flags) * The check (unnecessarily) ignores LRU pages being isolated and * walked by the page reclaim code, however that's not a big loss. */ - if (!PageLRU(p) && !PageHuge(p)) - shake_page(p, 0); - if (!PageLRU(p) && !PageHuge(p)) { - /* - * shake_page could have turned it free. - */ - if (is_free_buddy_page(p)) { - action_result(pfn, "free buddy, 2nd try", DELAYED); - return 0; + if (!PageHuge(p) && !PageTransCompound(p)) { + if (!PageLRU(p)) + shake_page(p, 0); + if (!PageLRU(p)) { + /* + * shake_page could have turned it free. + */ + if (is_free_buddy_page(p)) { + action_result(pfn, "free buddy, 2nd try", + DELAYED); + return 0; + } + action_result(pfn, "non LRU", IGNORED); + put_page(p); + return -EBUSY; } - action_result(pfn, "non LRU", IGNORED); - put_page(p); - return -EBUSY; } /* -- cgit v1.2.3-70-g09d2 From 9221edb7120e2dc3ae90f1c58514979f7ba40e46 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 1 Feb 2011 15:52:42 -0800 Subject: memcg: prevent endless loop when charging huge pages The charging code can encounter a charge size that is bigger than a regular page in two situations: one is a batched charge to fill the per-cpu stocks, the other is a huge page charge. This code is distributed over two functions, however, and only the outer one is aware of huge pages. In case the charging fails, the inner function will tell the outer function to retry if the charge size is bigger than regular pages--assuming batched charging is the only case. And the outer function will retry forever charging a huge page. This patch makes sure the inner function can distinguish between batch charging and a single huge page charge. It will only signal another attempt if batch charging failed, and go into regular reclaim when it is called on behalf of a huge page. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 79abb1fd39d..50eb50e100f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1837,8 +1837,15 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, flags |= MEM_CGROUP_RECLAIM_NOSWAP; } else mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); - - if (csize > PAGE_SIZE) /* change csize and retry */ + /* + * csize can be either a huge page (HPAGE_SIZE), a batch of + * regular pages (CHARGE_SIZE), or a single regular page + * (PAGE_SIZE). + * + * Never reclaim on behalf of optional batching, retry with a + * single page instead. + */ + if (csize == CHARGE_SIZE) return CHARGE_RETRY; if (!(gfp_mask & __GFP_WAIT)) -- cgit v1.2.3-70-g09d2 From 19942822df65ee4a47c2e6d6d70cace1b7f01710 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 1 Feb 2011 15:52:43 -0800 Subject: memcg: prevent endless loop when charging huge pages to near-limit group If reclaim after a failed charging was unsuccessful, the limits are checked again, just in case they settled by means of other tasks. This is all fine as long as every charge is of size PAGE_SIZE, because in that case, being below the limit means having at least PAGE_SIZE bytes available. 
But with transparent huge pages, we may end up in an endless loop where charging and reclaim fail, but we keep going because the limits are not yet exceeded, even though there is not enough room for a huge page. Fix this up by explicitly checking for enough room, not just whether we are within limits. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/res_counter.h | 20 ++++++++++++++++++++ mm/memcontrol.c | 35 ++++++++++++++++++++++++++++------- 2 files changed, 48 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h index fcb9884df61..a5930cb6614 100644 --- a/include/linux/res_counter.h +++ b/include/linux/res_counter.h @@ -182,6 +182,26 @@ static inline bool res_counter_check_under_limit(struct res_counter *cnt) return ret; } +/** + * res_counter_check_margin - check if the counter allows charging + * @cnt: the resource counter to check + * @bytes: the number of bytes to check the remaining space against + * + * Returns a boolean value on whether the counter can be charged + * @bytes or whether this would exceed the limit. + */ +static inline bool res_counter_check_margin(struct res_counter *cnt, + unsigned long bytes) +{ + bool ret; + unsigned long flags; + + spin_lock_irqsave(&cnt->lock, flags); + ret = cnt->limit - cnt->usage >= bytes; + spin_unlock_irqrestore(&cnt->lock, flags); + return ret; +} + static inline bool res_counter_check_under_soft_limit(struct res_counter *cnt) { bool ret; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 50eb50e100f..0e81eb5f0ae 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1111,6 +1111,23 @@ static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) return false; } +/** + * mem_cgroup_check_margin - check if the memory cgroup allows charging + * @mem: memory cgroup to check + * @bytes: the number of bytes the caller intends to charge + * + * Returns a boolean value on whether @mem can be charged @bytes or + * whether this would exceed the limit. + */ +static bool mem_cgroup_check_margin(struct mem_cgroup *mem, unsigned long bytes) +{ + if (!res_counter_check_margin(&mem->res, bytes)) + return false; + if (do_swap_account && !res_counter_check_margin(&mem->memsw, bytes)) + return false; + return true; +} + static unsigned int get_swappiness(struct mem_cgroup *memcg) { struct cgroup *cgrp = memcg->css.cgroup; @@ -1852,15 +1869,19 @@ static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask, return CHARGE_WOULDBLOCK; ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL, - gfp_mask, flags); + gfp_mask, flags); + if (mem_cgroup_check_margin(mem_over_limit, csize)) + return CHARGE_RETRY; /* - * try_to_free_mem_cgroup_pages() might not give us a full - * picture of reclaim. Some pages are reclaimed and might be - * moved to swap cache or just unmapped from the cgroup. - * Check the limit again to see if the reclaim reduced the - * current usage of the cgroup before giving up + * Even though the limit is exceeded at this point, reclaim + * may have been able to free some pages. Retry the charge + * before killing the task. + * + * Only for regular pages, though: huge pages are rather + * unlikely to succeed so close to the limit, and we fall back + * to regular pages anyway in case of failure.
*/ - if (ret || mem_cgroup_check_under_limit(mem_over_limit)) + if (csize == PAGE_SIZE && ret) return CHARGE_RETRY; /* -- cgit v1.2.3-70-g09d2 From 8493ae439f7038b502df1d687e61dde54c27ca92 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 1 Feb 2011 15:52:44 -0800 Subject: memcg: never OOM when charging huge pages Huge page coverage should obviously have less priority than the continued execution of a process. Never kill a process when charging it a huge page fails. Instead, give up after the first failed reclaim attempt and fall back to regular pages. Signed-off-by: Johannes Weiner Acked-by: KAMEZAWA Hiroyuki Reviewed-by: Minchan Kim Cc: Balbir Singh Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0e81eb5f0ae..fc75f34ba60 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2351,13 +2351,19 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, enum charge_type ctype) { struct mem_cgroup *mem = NULL; + int page_size = PAGE_SIZE; struct page_cgroup *pc; + bool oom = true; int ret; - int page_size = PAGE_SIZE; if (PageTransHuge(page)) { page_size <<= compound_order(page); VM_BUG_ON(!PageTransHuge(page)); + /* + * Never OOM-kill a process for a huge page. The + * fault handler will fall back to regular pages. + */ + oom = false; } pc = lookup_page_cgroup(page); @@ -2366,7 +2372,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, return 0; prefetchw(pc); - ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page_size); + ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, oom, page_size); if (ret || !mem) return ret; -- cgit v1.2.3-70-g09d2 From 3751d60430fe4c26460a5ca8ad8672d32f93bcb1 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 1 Feb 2011 15:52:45 -0800 Subject: memcg: fix event counting breakage from recent THP update Commit e401f1761 ("memcg: modify accounting function for supporting THP better") added nr_pages to support multiple page sizes in memory_cgroup_charge_statistics. But counting the number of events needs abs(nr_pages) when increasing the counters. This patch fixes the event counting. Signed-off-by: KAMEZAWA Hiroyuki Reviewed-by: Johannes Weiner Cc: Daisuke Nishimura Cc: Balbir Singh Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fc75f34ba60..da53a252b25 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -612,8 +612,10 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, /* pagein of a big page is an event. So, ignore page size */ if (nr_pages > 0) __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]); - else + else { __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]); + nr_pages = -nr_pages; /* for event */ + } __this_cpu_add(mem->stat->count[MEM_CGROUP_EVENTS], nr_pages); -- cgit v1.2.3-70-g09d2 From e6d2e2b2b1e1455df16d68a78f4a3874c7b3ad20 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 10 Feb 2011 15:01:30 -0800 Subject: memblock: don't adjust size in memblock_find_base() While applying a patch to use memblock to find the aperture for 64bit x86, Ingo found a system with 1g + force_iommu: > No AGP bridge found > Node 0: aperture @ 38000000 size 32 MB > Aperture pointing to e820 RAM. Ignoring.
> Your BIOS doesn't leave a aperture memory hole > Please enable the IOMMU option in the BIOS setup > This costs you 64 MB of RAM > Cannot allocate aperture memory hole (0,65536K) the corresponding code: addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20); if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) { printk(KERN_ERR "Cannot allocate aperture memory hole (%lx,%uK)\n", addr, aper_size>>10); return 0; } memblock_x86_reserve_range(addr, addr + aper_size, "aperture64") fails because the memblock core code aligns the size up to 512M. That could make the size way too big. So don't align the size in that function. Actually __memblock_alloc_base, the other caller, already aligns the size before calling this function. BTW, x86 does not use __memblock_alloc_base... Signed-off-by: Yinghai Lu Cc: Ingo Molnar Cc: David Miller Cc: "H. Peter Anvin" Cc: Benjamin Herrenschmidt Cc: Dave Airlie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/memblock.c b/mm/memblock.c index bdba245d8af..4618fda975a 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -137,8 +137,6 @@ static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size, BUG_ON(0 == size); - size = memblock_align_up(size, align); - /* Pump up max_addr */ if (end == MEMBLOCK_ALLOC_ACCESSIBLE) end = memblock.current_limit; -- cgit v1.2.3-70-g09d2 From e15f8c01af924e611bc7be1e45449c4a74e5dfdd Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 10 Feb 2011 15:01:32 -0800 Subject: mlock: fix race when munlocking pages in do_wp_page() vmscan can lazily find pages that are mapped within VM_LOCKED vmas, and set the PageMlocked bit on these pages, transferring them onto the unevictable list. When do_wp_page() breaks COW within a VM_LOCKED vma, it may need to clear PageMlocked on the old page and set it on the new page instead. This change fixes an issue where do_wp_page() was clearing PageMlocked on the old page while the pte was still pointing to it (as well as rmap). Therefore, we were not protected against vmscan immediately transferring the old page back onto the unevictable list. This could cause pages to get stranded there forever. I propose to move the corresponding code to the end of do_wp_page(), after the pte (and rmap) have been pointed to the new page. Additionally, we can use munlock_vma_page() instead of clear_page_mlock(), so that the old page stays mlocked if there are still other VM_LOCKED vmas mapping it. Signed-off-by: Michel Lespinasse Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Hugh Dickins Cc: Rik van Riel Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 31250faff39..32df03cf13a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2219,7 +2219,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); - page_cache_release(old_page); goto unlock; } page_cache_release(old_page); @@ -2289,7 +2288,6 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, &ptl); if (!pte_same(*page_table, orig_pte)) { unlock_page(old_page); - page_cache_release(old_page); goto unlock; } @@ -2367,16 +2365,6 @@ gotten: } __SetPageUptodate(new_page); - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page.
- */ - if ((vma->vm_flags & VM_LOCKED) && old_page) { - lock_page(old_page); /* for LRU manipulation */ - clear_page_mlock(old_page); - unlock_page(old_page); - } - if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) goto oom_free_new; @@ -2444,10 +2432,20 @@ gotten: if (new_page) page_cache_release(new_page); - if (old_page) - page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); + if (old_page) { + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if ((ret & VM_FAULT_WRITE) && (vma->vm_flags & VM_LOCKED)) { + lock_page(old_page); /* LRU manipulation */ + munlock_vma_page(old_page); + unlock_page(old_page); + } + page_cache_release(old_page); + } return ret; oom_free_new: page_cache_release(new_page); -- cgit v1.2.3-70-g09d2 From 419d8c96dbfa558f00e623023917d0a5afc46129 Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Thu, 10 Feb 2011 15:01:33 -0800 Subject: mlock: do not munlock pages in __do_fault() If the page is going to be written to, __do_fault() needs to break COW. However, the old page (before breaking COW) was never mapped into the current pte (__do_fault is only called when the pte is not present), so vmscan can't have marked the old page as PageMlocked due to being mapped in __do_fault's VMA. Therefore, __do_fault() does not need to worry about clearing PageMlocked() on the old page. Signed-off-by: Michel Lespinasse Reviewed-by: KAMEZAWA Hiroyuki Acked-by: Hugh Dickins Cc: Rik van Riel Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 32df03cf13a..8e8c1832486 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3051,12 +3051,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out; } charged = 1; - /* - * Don't let another task, with possibly unlocked vma, - * keep the mlocked page. - */ - if (vma->vm_flags & VM_LOCKED) - clear_page_mlock(vmf.page); copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { -- cgit v1.2.3-70-g09d2 From f0fdc5e8e6f579310458aef43d1610a0bb5e81a4 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 10 Feb 2011 15:01:34 -0800 Subject: vmscan: fix zone shrinking exit when scan work is done Commit 3e7d34497067 ("mm: vmscan: reclaim order-0 and use compaction instead of lumpy reclaim") introduced an indefinite loop in shrink_zone(). It meant to break out of this loop when no pages had been reclaimed and not a single page was even scanned. The way it would detect the latter is by taking a snapshot of sc->nr_scanned at the beginning of the function and comparing it against the new sc->nr_scanned after the scan loop. But it would re-iterate without updating that snapshot, looping forever if sc->nr_scanned changed at least once since shrink_zone() was invoked. This is not the sole condition that would exit that loop, but it requires other processes to change the zone state, as the reclaimer that is stuck obviously can not anymore. This is only happening for higher-order allocations, where reclaim is run back to back with compaction.
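Sketched control flow of the fix (do_scan_pass and should_continue are hypothetical stand-ins for the real LRU scan loop and should_continue_reclaim()):

	restart:
		nr_reclaimed = 0;
		nr_scanned = sc->nr_scanned;	/* snapshot retaken every pass */

		do_scan_pass(zone, sc, &nr_reclaimed);	/* updates sc->nr_scanned */
		sc->nr_reclaimed += nr_reclaimed;

		/* with the snapshot taken only once, before the loop, the
		 * scanned delta could stay non-zero forever */
		if (should_continue(nr_reclaimed, sc->nr_scanned - nr_scanned))
			goto restart;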
Signed-off-by: Johannes Weiner Reported-by: Michal Hocko Tested-by: Kent Overstreet Reported-by: Kent Overstreet Acked-by: Mel Gorman Cc: Andrea Arcangeli Cc: Rik van Riel Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 148c6e630df..17497d0cd8b 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1882,12 +1882,12 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list l; - unsigned long nr_reclaimed; + unsigned long nr_reclaimed, nr_scanned; unsigned long nr_to_reclaim = sc->nr_to_reclaim; - unsigned long nr_scanned = sc->nr_scanned; restart: nr_reclaimed = 0; + nr_scanned = sc->nr_scanned; get_scan_count(zone, sc, nr, priority); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || -- cgit v1.2.3-70-g09d2 From 678ff896a37afdbca292c7846ec895463aed35a5 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Thu, 10 Feb 2011 15:01:36 -0800 Subject: memcg: fix leak of accounting at failure path of hugepage collapsing mem_cgroup_uncharge_page() should be called in all failure cases after mem_cgroup_charge_newpage() is called in huge_memory.c::collapse_huge_page() [ 4209.076861] BUG: Bad page state in process khugepaged pfn:1e9800 [ 4209.077601] page:ffffea0006b14000 count:0 mapcount:0 mapping: (null) index:0x2800 [ 4209.078674] page flags: 0x40000000004000(head) [ 4209.079294] pc:ffff880214a30000 pc->flags:2146246697418756 pc->mem_cgroup:ffffc9000177a000 [ 4209.082177] (/A) [ 4209.082500] Pid: 31, comm: khugepaged Not tainted 2.6.38-rc3-mm1 #1 [ 4209.083412] Call Trace: [ 4209.083678] [] ? bad_page+0xe4/0x140 [ 4209.084240] [] ? free_pages_prepare+0xd6/0x120 [ 4209.084837] [] ? rwsem_down_failed_common+0xbd/0x150 [ 4209.085509] [] ? __free_pages_ok+0x32/0xe0 [ 4209.086110] [] ? free_compound_page+0x1b/0x20 [ 4209.086699] [] ? __put_compound_page+0x1c/0x30 [ 4209.087333] [] ? put_compound_page+0x4d/0x200 [ 4209.087935] [] ? put_page+0x45/0x50 [ 4209.097361] [] ? khugepaged+0x9e9/0x1430 [ 4209.098364] [] ? autoremove_wake_function+0x0/0x40 [ 4209.099121] [] ? khugepaged+0x0/0x1430 [ 4209.099780] [] ? kthread+0x96/0xa0 [ 4209.100452] [] ? kernel_thread_helper+0x4/0x10 [ 4209.101214] [] ? kthread+0x0/0xa0 [ 4209.101842] [] ? kernel_thread_helper+0x0/0x10 Signed-off-by: KAMEZAWA Hiroyuki Acked-by: Daisuke Nishimura Reviewed-by: Johannes Weiner Cc: Andrea Arcangeli Reviewed-by: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b6c1ce3c53b..e62ddb8f24b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1852,7 +1852,6 @@ static void collapse_huge_page(struct mm_struct *mm, set_pmd_at(mm, address, pmd, _pmd); spin_unlock(&mm->page_table_lock); anon_vma_unlock(vma->anon_vma); - mem_cgroup_uncharge_page(new_page); goto out; } @@ -1898,6 +1897,7 @@ out_up_write: return; out: + mem_cgroup_uncharge_page(new_page); #ifdef CONFIG_NUMA put_page(new_page); #endif -- cgit v1.2.3-70-g09d2
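The invariant this last fix restores in collapse_huge_page(), sketched (try_collapse is a hypothetical stand-in for the body of the function): a page charged with mem_cgroup_newpage_charge() must be uncharged on every failure path, so the uncharge belongs at the common error label rather than on one specific branch:

	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
		goto out_nocharge;

	if (!try_collapse(mm, address, new_page))
		goto out;	/* every failure path funnels here */

	return;			/* success: the charge stays with the page */

out:
	mem_cgroup_uncharge_page(new_page);
out_nocharge:
	put_page(new_page);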