From ae8e3a915aef5af5ace5936c56f05f0b1502ded1 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 29 Apr 2013 15:08:17 -0700 Subject: resource: add __adjust_resource() for internal use Add __adjust_resource(), which is called by adjust_resource() internally after the resource_lock is held. There is no interface change to adjust_resource(). This change allows other functions to call __adjust_resource() internally while the resource_lock is held. Signed-off-by: Toshi Kani Reviewed-by: Yasuaki Ishimatsu Acked-by: David Rientjes Cc: Ram Pai Cc: T Makphaibulchoke Cc: Wen Congyang Cc: Tang Chen Cc: Jiang Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 35 ++++++++++++++++++++++------------- 1 file changed, 22 insertions(+), 13 deletions(-) (limited to 'kernel/resource.c') diff --git a/kernel/resource.c b/kernel/resource.c index 73f35d4b30b..ae246f97c5d 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -706,24 +706,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) write_unlock(&resource_lock); } -/** - * adjust_resource - modify a resource's start and size - * @res: resource to modify - * @start: new start value - * @size: new size - * - * Given an existing resource, change its start and size to match the - * arguments. Returns 0 on success, -EBUSY if it can't fit. - * Existing children of the resource are assumed to be immutable. 
- */ -int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) +static int __adjust_resource(struct resource *res, resource_size_t start, + resource_size_t size) { struct resource *tmp, *parent = res->parent; resource_size_t end = start + size - 1; int result = -EBUSY; - write_lock(&resource_lock); - if (!parent) goto skip; @@ -751,6 +740,26 @@ skip: result = 0; out: + return result; +} + +/** + * adjust_resource - modify a resource's start and size + * @res: resource to modify + * @start: new start value + * @size: new size + * + * Given an existing resource, change its start and size to match the + * arguments. Returns 0 on success, -EBUSY if it can't fit. + * Existing children of the resource are assumed to be immutable. + */ +int adjust_resource(struct resource *res, resource_size_t start, + resource_size_t size) +{ + int result; + + write_lock(&resource_lock); + result = __adjust_resource(res, start, size); write_unlock(&resource_lock); return result; } -- cgit v1.2.3-70-g09d2 From 825f787bb49676083b97c1de1f8f2f8f26b5c908 Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Mon, 29 Apr 2013 15:08:19 -0700 Subject: resource: add release_mem_region_adjustable() Add release_mem_region_adjustable(), which releases a requested region from a currently busy memory resource. This interface adjusts the matched memory resource accordingly even if the requested region does not match exactly but still fits into it. This new interface is intended for memory hot-delete. During bootup, memory resources are inserted from the boot descriptor table, such as EFI Memory Table and e820. Each memory resource entry usually covers the whole contiguous memory range. Memory hot-delete request, on the other hand, may target a particular range of a memory resource, and its size can be much smaller than the whole contiguous memory.
Since the existing release interfaces like __release_region() require a requested region to be exactly matched to a resource entry, they do not allow a partial resource to be released. This new interface is restrictive (i.e. release under certain conditions), which is consistent with other release interfaces, __release_region() and __release_resource(). Additional release conditions, such as an overlapping region to a resource entry, can be supported after they are confirmed as valid cases. There is no change to the existing interfaces since their restriction is valid for I/O resources. [akpm@linux-foundation.org: use GFP_ATOMIC under write_lock()] [akpm@linux-foundation.org: switch back to GFP_KERNEL, less buggily] [akpm@linux-foundation.org: remove unneeded and wrong kfree(), per Toshi] Signed-off-by: Toshi Kani Reviewed-by: Yasuaki Ishimatsu Cc: David Rientjes Reviewed-by: Ram Pai Cc: T Makphaibulchoke Cc: Wen Congyang Cc: Tang Chen Cc: Jiang Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ioport.h | 4 ++ kernel/resource.c | 103 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 107 insertions(+) (limited to 'kernel/resource.c') diff --git a/include/linux/ioport.h b/include/linux/ioport.h index 85ac9b9b72a..89b7c24a36e 100644 --- a/include/linux/ioport.h +++ b/include/linux/ioport.h @@ -192,6 +192,10 @@ extern struct resource * __request_region(struct resource *, extern int __check_region(struct resource *, resource_size_t, resource_size_t); extern void __release_region(struct resource *, resource_size_t, resource_size_t); +#ifdef CONFIG_MEMORY_HOTREMOVE +extern int release_mem_region_adjustable(struct resource *, resource_size_t, + resource_size_t); +#endif static inline int __deprecated check_region(resource_size_t s, resource_size_t n) diff --git a/kernel/resource.c b/kernel/resource.c index ae246f97c5d..4aef8867fd4 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1021,6 +1021,109 @@ void
__release_region(struct resource *parent, resource_size_t start, } EXPORT_SYMBOL(__release_region); +#ifdef CONFIG_MEMORY_HOTREMOVE +/** + * release_mem_region_adjustable - release a previously reserved memory region + * @parent: parent resource descriptor + * @start: resource start address + * @size: resource region size + * + * This interface is intended for memory hot-delete. The requested region + * is released from a currently busy memory resource. The requested region + * must either match exactly or fit into a single busy resource entry. In + * the latter case, the remaining resource is adjusted accordingly. + * Existing children of the busy memory resource must be immutable in the + * request. + * + * Note: + * - Additional release conditions, such as overlapping region, can be + * supported after they are confirmed as valid cases. + * - When a busy memory resource gets split into two entries, the code + * assumes that all children remain in the lower address entry for + * simplicity. Enhance this logic when necessary. 
+ */ +int release_mem_region_adjustable(struct resource *parent, + resource_size_t start, resource_size_t size) +{ + struct resource **p; + struct resource *res; + struct resource *new_res; + resource_size_t end; + int ret = -EINVAL; + + end = start + size - 1; + if ((start < parent->start) || (end > parent->end)) + return ret; + + /* The kzalloc() result gets checked later */ + new_res = kzalloc(sizeof(struct resource), GFP_KERNEL); + + p = &parent->child; + write_lock(&resource_lock); + + while ((res = *p)) { + if (res->start >= end) + break; + + /* look for the next resource if it does not fit into */ + if (res->start > start || res->end < end) { + p = &res->sibling; + continue; + } + + if (!(res->flags & IORESOURCE_MEM)) + break; + + if (!(res->flags & IORESOURCE_BUSY)) { + p = &res->child; + continue; + } + + /* found the target resource; let's adjust accordingly */ + if (res->start == start && res->end == end) { + /* free the whole entry */ + *p = res->sibling; + kfree(res); + ret = 0; + } else if (res->start == start && res->end != end) { + /* adjust the start */ + ret = __adjust_resource(res, end + 1, + res->end - end); + } else if (res->start != start && res->end == end) { + /* adjust the end */ + ret = __adjust_resource(res, res->start, + start - res->start); + } else { + /* split into two entries */ + if (!new_res) { + ret = -ENOMEM; + break; + } + new_res->name = res->name; + new_res->start = end + 1; + new_res->end = res->end; + new_res->flags = res->flags; + new_res->parent = res->parent; + new_res->sibling = res->sibling; + new_res->child = NULL; + + ret = __adjust_resource(res, res->start, + start - res->start); + if (ret) + break; + res->sibling = new_res; + new_res = NULL; + } + + break; + } + + write_unlock(&resource_lock); + kfree(new_res); + return ret; +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + /* * Managed region resource */ -- cgit v1.2.3-70-g09d2 From ebff7d8f270d045338d9f4796014f4db429a17f9 Mon Sep 17 00:00:00 2001 From: Yasuaki Ishimatsu 
Date: Mon, 29 Apr 2013 15:08:56 -0700 Subject: mem hotunplug: fix kfree() of bootmem memory When hot removing memory presented at boot time, following messages are shown: kernel BUG at mm/slub.c:3409! invalid opcode: 0000 [#1] SMP Modules linked in: ebtable_nat ebtables xt_CHECKSUM iptable_mangle bridge stp llc ipmi_devintf ipmi_msghandler sunrpc ipt_REJECT nf_conntrack_ipv4 nf_defrag_ipv4 iptable_filter ip_tables ip6t_REJECT nf_conntrack_ipv6 nf_defrag_ipv6 xt_state nf_conntrack ip6table_filter ip6_tables binfmt_misc vfat fat dm_mirror dm_region_hash dm_log dm_mod vhost_net macvtap macvlan tun uinput iTCO_wdt iTCO_vendor_support coretemp kvm_intel kvm crc32c_intel ghash_clmulni_intel microcode pcspkr sg i2c_i801 lpc_ich mfd_core igb i2c_algo_bit i2c_core e1000e ptp pps_core tpm_infineon ioatdma dca sr_mod cdrom sd_mod crc_t10dif usb_storage megaraid_sas lpfc scsi_transport_fc scsi_tgt scsi_mod CPU 0 Pid: 5091, comm: kworker/0:2 Tainted: G W 3.9.0-rc6+ #15 RIP: kfree+0x232/0x240 Process kworker/0:2 (pid: 5091, threadinfo ffff88084678c000, task ffff88083928ca80) Call Trace: __release_region+0xd4/0xe0 __remove_pages+0x52/0x110 arch_remove_memory+0x89/0xd0 remove_memory+0xc4/0x100 acpi_memory_device_remove+0x6d/0xb1 acpi_device_remove+0x89/0xab __device_release_driver+0x7c/0xf0 device_release_driver+0x2f/0x50 acpi_bus_device_detach+0x6c/0x70 acpi_ns_walk_namespace+0x11a/0x250 acpi_walk_namespace+0xee/0x137 acpi_bus_trim+0x33/0x7a acpi_bus_hot_remove_device+0xc4/0x1a1 acpi_os_execute_deferred+0x27/0x34 process_one_work+0x1f7/0x590 worker_thread+0x11a/0x370 kthread+0xee/0x100 ret_from_fork+0x7c/0xb0 RIP [] kfree+0x232/0x240 RSP The reason why the messages are shown is to release a resource structure, allocated by bootmem, by kfree(). So when we release a resource structure, we should check whether it is allocated by bootmem or not. But even if we know a resource structure is allocated by bootmem, we cannot release it since SLxB cannot treat it. 
So for reusing a resource structure, this patch remembers it by using bootmem_resource as follows: When releasing a resource structure by free_resource(), free_resource() checks whether the resource structure is allocated by bootmem or not. If it is allocated by bootmem, free_resource() adds it to bootmem_resource. If it is not allocated by bootmem, free_resource() releases it by kfree(). And when getting a new resource structure by get_resource(), get_resource() checks whether bootmem_resource has released resource structures or not. If there is a released resource structure, get_resource() returns it. If there is not a released resource structure, get_resource() returns a new resource structure allocated by kzalloc(). [akpm@linux-foundation.org: s/get_resource/alloc_resource/] Signed-off-by: Yasuaki Ishimatsu Reviewed-by: Toshi Kani Cc: Johannes Weiner Cc: Ram Pai Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 68 ++++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 55 insertions(+), 13 deletions(-) (limited to 'kernel/resource.c') diff --git a/kernel/resource.c b/kernel/resource.c index 4aef8867fd4..d7386986e10 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -21,6 +21,7 @@ #include #include #include +#include #include @@ -50,6 +51,14 @@ struct resource_constraint { static DEFINE_RWLOCK(resource_lock); +/* + * For memory hotplug, there is no way to free resource entries allocated + * by boot mem after the system is up. So for reusing the resource entry + * we need to remember the resource.
+ */ +static struct resource *bootmem_resource_free; +static DEFINE_SPINLOCK(bootmem_resource_lock); + static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; @@ -151,6 +160,40 @@ __initcall(ioresources_init); #endif /* CONFIG_PROC_FS */ +static void free_resource(struct resource *res) +{ + if (!res) + return; + + if (!PageSlab(virt_to_head_page(res))) { + spin_lock(&bootmem_resource_lock); + res->sibling = bootmem_resource_free; + bootmem_resource_free = res; + spin_unlock(&bootmem_resource_lock); + } else { + kfree(res); + } +} + +static struct resource *alloc_resource(gfp_t flags) +{ + struct resource *res = NULL; + + spin_lock(&bootmem_resource_lock); + if (bootmem_resource_free) { + res = bootmem_resource_free; + bootmem_resource_free = res->sibling; + } + spin_unlock(&bootmem_resource_lock); + + if (res) + memset(res, 0, sizeof(struct resource)); + else + res = kzalloc(sizeof(struct resource), flags); + + return res; +} + /* Return the conflict entry if you can't request it */ static struct resource * __request_resource(struct resource *root, struct resource *new) { @@ -771,7 +814,7 @@ static void __init __reserve_region_with_split(struct resource *root, { struct resource *parent = root; struct resource *conflict; - struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); + struct resource *res = alloc_resource(GFP_ATOMIC); struct resource *next_res = NULL; if (!res) @@ -796,7 +839,7 @@ static void __init __reserve_region_with_split(struct resource *root, /* conflict covered whole area */ if (conflict->start <= res->start && conflict->end >= res->end) { - kfree(res); + free_resource(res); WARN_ON(next_res); break; } @@ -806,10 +849,9 @@ static void __init __reserve_region_with_split(struct resource *root, end = res->end; res->end = conflict->start - 1; if (conflict->end < end) { - next_res = kzalloc(sizeof(*next_res), - GFP_ATOMIC); + next_res = alloc_resource(GFP_ATOMIC); if (!next_res) { - kfree(res); + free_resource(res); 
break; } next_res->name = name; @@ -899,7 +941,7 @@ struct resource * __request_region(struct resource *parent, const char *name, int flags) { DECLARE_WAITQUEUE(wait, current); - struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); + struct resource *res = alloc_resource(GFP_KERNEL); if (!res) return NULL; @@ -933,7 +975,7 @@ struct resource * __request_region(struct resource *parent, continue; } /* Uhhuh, that didn't work out.. */ - kfree(res); + free_resource(res); res = NULL; break; } @@ -967,7 +1009,7 @@ int __check_region(struct resource *parent, resource_size_t start, return -EBUSY; release_resource(res); - kfree(res); + free_resource(res); return 0; } EXPORT_SYMBOL(__check_region); @@ -1007,7 +1049,7 @@ void __release_region(struct resource *parent, resource_size_t start, write_unlock(&resource_lock); if (res->flags & IORESOURCE_MUXED) wake_up(&muxed_resource_wait); - kfree(res); + free_resource(res); return; } p = &res->sibling; @@ -1055,8 +1097,8 @@ int release_mem_region_adjustable(struct resource *parent, if ((start < parent->start) || (end > parent->end)) return ret; - /* The kzalloc() result gets checked later */ - new_res = kzalloc(sizeof(struct resource), GFP_KERNEL); + /* The alloc_resource() result gets checked later */ + new_res = alloc_resource(GFP_KERNEL); p = &parent->child; write_lock(&resource_lock); @@ -1083,7 +1125,7 @@ int release_mem_region_adjustable(struct resource *parent, if (res->start == start && res->end == end) { /* free the whole entry */ *p = res->sibling; - kfree(res); + free_resource(res); ret = 0; } else if (res->start == start && res->end != end) { /* adjust the start */ @@ -1119,7 +1161,7 @@ int release_mem_region_adjustable(struct resource *parent, } write_unlock(&resource_lock); - kfree(new_res); + free_resource(new_res); return ret; } #endif /* CONFIG_MEMORY_HOTREMOVE */ -- cgit v1.2.3-70-g09d2