diff options
Diffstat (limited to 'mm/memory_hotplug.c')
-rw-r--r-- | mm/memory_hotplug.c | 332 |
1 files changed, 309 insertions, 23 deletions
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e4eeacae2b9..de9cb14ae75 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -205,7 +205,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, zone_span_writelock(zone); old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; - if (start_pfn < zone->zone_start_pfn) + if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn) zone->zone_start_pfn = start_pfn; zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - @@ -214,13 +214,134 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, zone_span_writeunlock(zone); } +static void resize_zone(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + zone_span_writelock(zone); + + if (end_pfn - start_pfn) { + zone->zone_start_pfn = start_pfn; + zone->spanned_pages = end_pfn - start_pfn; + } else { + /* + * make it consist as free_area_init_core(), + * if spanned_pages = 0, then keep start_pfn = 0 + */ + zone->zone_start_pfn = 0; + zone->spanned_pages = 0; + } + + zone_span_writeunlock(zone); +} + +static void fix_zone_id(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + enum zone_type zid = zone_idx(zone); + int nid = zone->zone_pgdat->node_id; + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn++) + set_page_links(pfn_to_page(pfn), zid, nid, pfn); +} + +static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, + unsigned long start_pfn, unsigned long end_pfn) +{ + int ret; + unsigned long flags; + unsigned long z1_start_pfn; + + if (!z1->wait_table) { + ret = init_currently_empty_zone(z1, start_pfn, + end_pfn - start_pfn, MEMMAP_HOTPLUG); + if (ret) + return ret; + } + + pgdat_resize_lock(z1->zone_pgdat, &flags); + + /* can't move pfns which are higher than @z2 */ + if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) + goto out_fail; + /* the move out part mast at the left most of @z2 */ + if (start_pfn > z2->zone_start_pfn) + goto out_fail; + /* must included/overlap */ + if (end_pfn <= z2->zone_start_pfn) + goto out_fail; + + /* use start_pfn for z1's start_pfn if z1 is empty */ + if (z1->spanned_pages) + z1_start_pfn = z1->zone_start_pfn; + else + z1_start_pfn = start_pfn; + + resize_zone(z1, z1_start_pfn, end_pfn); + resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); + + pgdat_resize_unlock(z1->zone_pgdat, &flags); + + fix_zone_id(z1, start_pfn, end_pfn); + + return 0; +out_fail: + pgdat_resize_unlock(z1->zone_pgdat, &flags); + return -1; +} + +static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, + unsigned long start_pfn, unsigned long end_pfn) +{ + int ret; + unsigned long flags; + unsigned long z2_end_pfn; + + if (!z2->wait_table) { + ret = init_currently_empty_zone(z2, start_pfn, + end_pfn - start_pfn, MEMMAP_HOTPLUG); + if (ret) + return ret; + } + + pgdat_resize_lock(z1->zone_pgdat, &flags); + + /* can't move pfns which are lower than @z1 */ + if (z1->zone_start_pfn > start_pfn) + goto out_fail; + /* the move out part mast at the right most of @z1 */ + if (z1->zone_start_pfn + z1->spanned_pages > end_pfn) + goto out_fail; + /* must included/overlap */ + if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) + goto out_fail; + + /* use end_pfn for z2's end_pfn if z2 is empty */ + if (z2->spanned_pages) + z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages; + else + z2_end_pfn = end_pfn; + + resize_zone(z1, z1->zone_start_pfn, start_pfn); + resize_zone(z2, start_pfn, z2_end_pfn); + + pgdat_resize_unlock(z1->zone_pgdat, &flags); + + fix_zone_id(z2, start_pfn, end_pfn); + + return 0; +out_fail: + pgdat_resize_unlock(z1->zone_pgdat, &flags); + return -1; +} + static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long end_pfn) { unsigned long old_pgdat_end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; - if (start_pfn < pgdat->node_start_pfn) + if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) pgdat->node_start_pfn = start_pfn; pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - @@ -460,8 +581,61 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, return 0; } +/* ensure every online node has NORMAL memory */ +static bool can_online_high_movable(struct zone *zone) +{ + return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); +} + +/* check which state of node_states will be changed when online memory */ +static void node_states_check_changes_online(unsigned long nr_pages, + struct zone *zone, struct memory_notify *arg) +{ + int nid = zone_to_nid(zone); + enum zone_type zone_last = ZONE_NORMAL; + + /* + * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes + * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. + * + * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes + * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. + */ + if (N_HIGH_MEMORY == N_NORMAL_MEMORY) + zone_last = ZONE_MOVABLE; + + /* + * if the memory to be online is in a zone of 0...zone_last, and + * the zones of 0...zone_last don't have memory before online, we will + * need to set the node to node_states[N_NORMAL_MEMORY] after + * the memory is online. + */ + if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) + arg->status_change_nid_normal = nid; + else + arg->status_change_nid_normal = -1; + + /* + * if the node don't have memory befor online, we will need to + * set the node to node_states[N_HIGH_MEMORY] after the memory + * is online. + */ + if (!node_state(nid, N_HIGH_MEMORY)) + arg->status_change_nid = nid; + else + arg->status_change_nid = -1; +} + +static void node_states_set_node(int node, struct memory_notify *arg) +{ + if (arg->status_change_nid_normal >= 0) + node_set_state(node, N_NORMAL_MEMORY); + + node_set_state(node, N_HIGH_MEMORY); +} -int __ref online_pages(unsigned long pfn, unsigned long nr_pages) + +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { unsigned long onlined_pages = 0; struct zone *zone; @@ -471,13 +645,40 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) struct memory_notify arg; lock_memory_hotplug(); + /* + * This doesn't need a lock to do pfn_to_page(). + * The section can't be removed here because of the + * memory_block->state_mutex. + */ + zone = page_zone(pfn_to_page(pfn)); + + if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && + !can_online_high_movable(zone)) { + unlock_memory_hotplug(); + return -1; + } + + if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { + if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { + unlock_memory_hotplug(); + return -1; + } + } + if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { + if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { + unlock_memory_hotplug(); + return -1; + } + } + + /* Previous code may changed the zone of the pfn range */ + zone = page_zone(pfn_to_page(pfn)); + arg.start_pfn = pfn; arg.nr_pages = nr_pages; - arg.status_change_nid = -1; + node_states_check_changes_online(nr_pages, zone, &arg); nid = page_to_nid(pfn_to_page(pfn)); - if (node_present_pages(nid) == 0) - arg.status_change_nid = nid; ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); @@ -487,23 +688,21 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) return ret; } /* - * This doesn't need a lock to do pfn_to_page(). - * The section can't be removed here because of the - * memory_block->state_mutex. - */ - zone = page_zone(pfn_to_page(pfn)); - /* * If this zone is not populated, then it is not in zonelist. * This means the page allocator ignores this zone. * So, zonelist must be updated after online. */ mutex_lock(&zonelists_mutex); - if (!populated_zone(zone)) + if (!populated_zone(zone)) { need_zonelists_rebuild = 1; + build_all_zonelists(NULL, zone); + } ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, online_pages_range); if (ret) { + if (need_zonelists_rebuild) + zone_pcp_reset(zone); mutex_unlock(&zonelists_mutex); printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", (unsigned long long) pfn << PAGE_SHIFT, @@ -517,9 +716,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; if (onlined_pages) { - node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); + node_states_set_node(zone_to_nid(zone), &arg); if (need_zonelists_rebuild) - build_all_zonelists(NULL, zone); + build_all_zonelists(NULL, NULL); else zone_pcp_update(zone); } @@ -847,7 +1046,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, { int ret; long offlined = *(long *)data; - ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); + ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); offlined = nr_pages; if (!ret) *(long *)data += offlined; @@ -867,6 +1066,91 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } +/* ensure the node has NORMAL memory if it is still online */ +static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long present_pages = 0; + enum zone_type zt; + + for (zt = 0; zt <= ZONE_NORMAL; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + + if (present_pages > nr_pages) + return true; + + present_pages = 0; + for (; zt <= ZONE_MOVABLE; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + + /* + * we can't offline the last normal memory until all + * higher memory is offlined. + */ + return present_pages == 0; +} + +/* check which state of node_states will be changed when offline memory */ +static void node_states_check_changes_offline(unsigned long nr_pages, + struct zone *zone, struct memory_notify *arg) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long present_pages = 0; + enum zone_type zt, zone_last = ZONE_NORMAL; + + /* + * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes + * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. + * + * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes + * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. + */ + if (N_HIGH_MEMORY == N_NORMAL_MEMORY) + zone_last = ZONE_MOVABLE; + + /* + * check whether node_states[N_NORMAL_MEMORY] will be changed. + * If the memory to be offline is in a zone of 0...zone_last, + * and it is the last present memory, 0...zone_last will + * become empty after offline , thus we can determind we will + * need to clear the node from node_states[N_NORMAL_MEMORY]. + */ + for (zt = 0; zt <= zone_last; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) + arg->status_change_nid_normal = zone_to_nid(zone); + else + arg->status_change_nid_normal = -1; + + /* + * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE + */ + zone_last = ZONE_MOVABLE; + + /* + * check whether node_states[N_HIGH_MEMORY] will be changed + * If we try to offline the last present @nr_pages from the node, + * we can determind we will need to clear the node from + * node_states[N_HIGH_MEMORY]. + */ + for (; zt <= zone_last; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + if (nr_pages >= present_pages) + arg->status_change_nid = zone_to_nid(zone); + else + arg->status_change_nid = -1; +} + +static void node_states_clear_node(int node, struct memory_notify *arg) +{ + if (arg->status_change_nid_normal >= 0) + node_clear_state(node, N_NORMAL_MEMORY); + + if ((N_HIGH_MEMORY != N_NORMAL_MEMORY) && + (arg->status_change_nid >= 0)) + node_clear_state(node, N_HIGH_MEMORY); +} + static int __ref __offline_pages(unsigned long start_pfn, unsigned long end_pfn, unsigned long timeout) { @@ -893,16 +1177,19 @@ static int __ref __offline_pages(unsigned long start_pfn, node = zone_to_nid(zone); nr_pages = end_pfn - start_pfn; + ret = -EINVAL; + if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) + goto out; + /* set above range as isolated */ - ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + ret = start_isolate_page_range(start_pfn, end_pfn, + MIGRATE_MOVABLE, true); if (ret) goto out; arg.start_pfn = start_pfn; arg.nr_pages = nr_pages; - arg.status_change_nid = -1; - if (nr_pages >= node_present_pages(node)) - arg.status_change_nid = node; + node_states_check_changes_offline(nr_pages, zone, &arg); ret = memory_notify(MEM_GOING_OFFLINE, &arg); ret = notifier_to_errno(ret); @@ -975,10 +1262,9 @@ repeat: } else zone_pcp_update(zone); - if (!node_present_pages(node)) { - node_clear_state(node, N_HIGH_MEMORY); + node_states_clear_node(node, &arg); + if (arg.status_change_nid >= 0) kswapd_stop(node); - } vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); |