Diffstat (limited to 'drivers/hv/hv_balloon.c')
-rw-r--r-- | drivers/hv/hv_balloon.c | 544
1 file changed, 488 insertions, 56 deletions
diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c
index 37873213e24..4c605c70ebf 100644
--- a/drivers/hv/hv_balloon.c
+++ b/drivers/hv/hv_balloon.c
@@ -117,7 +117,14 @@ union dm_caps {
 	struct {
 		__u64 balloon:1;
 		__u64 hot_add:1;
-		__u64 reservedz:62;
+		/*
+		 * To support guests that may have alignment
+		 * limitations on hot-add, the guest can specify
+		 * its alignment requirements; a value of n
+		 * represents an alignment of 2^n in mega bytes.
+		 */
+		__u64 hot_add_alignment:4;
+		__u64 reservedz:58;
 	} cap_bits;
 	__u64 caps;
 } __packed;
@@ -412,13 +419,45 @@ struct dm_info_msg {
  * End protocol definitions.
  */
 
-static bool hot_add;
+/*
+ * State to manage hot adding memory into the guest.
+ * The range start_pfn : end_pfn specifies the range
+ * that the host has asked us to hot add. The range
+ * start_pfn : ha_end_pfn specifies the range that we have
+ * currently hot added. We hot add in multiples of 128M
+ * chunks; it is possible that we may not be able to bring
+ * online all the pages in the region. The range
+ * covered_start_pfn : covered_end_pfn defines the pages that can
+ * be brough online.
+ */
+
+struct hv_hotadd_state {
+	struct list_head list;
+	unsigned long start_pfn;
+	unsigned long covered_start_pfn;
+	unsigned long covered_end_pfn;
+	unsigned long ha_end_pfn;
+	unsigned long end_pfn;
+};
+
+struct balloon_state {
+	__u32 num_pages;
+	struct work_struct wrk;
+};
+
+struct hot_add_wrk {
+	union dm_mem_page_range ha_page_range;
+	union dm_mem_page_range ha_region_range;
+	struct work_struct wrk;
+};
+
+static bool hot_add = true;
 static bool do_hot_add;
 /*
  * Delay reporting memory pressure by
  * the specified number of seconds.
  */
-static uint pressure_report_delay = 30;
+static uint pressure_report_delay = 45;
 
 module_param(hot_add, bool, (S_IRUGO | S_IWUSR));
 MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add");
@@ -446,6 +485,7 @@ enum hv_dm_state {
 static __u8 recv_buffer[PAGE_SIZE];
 static __u8 *send_buffer;
 #define PAGES_IN_2M 512
+#define HA_CHUNK (32 * 1024)
 
 struct hv_dynmem_device {
 	struct hv_device *dev;
@@ -459,7 +499,28 @@ struct hv_dynmem_device {
 	unsigned int num_pages_ballooned;
 
 	/*
-	 * This thread handles both balloon/hot-add
+	 * State to manage the ballooning (up) operation.
+	 */
+	struct balloon_state balloon_wrk;
+
+	/*
+	 * State to execute the "hot-add" operation.
+	 */
+	struct hot_add_wrk ha_wrk;
+
+	/*
+	 * This state tracks if the host has specified a hot-add
+	 * region.
+	 */
+	bool host_specified_ha_region;
+
+	/*
+	 * State to synchronize hot-add.
+	 */
+	struct completion ol_waitevent;
+	bool ha_waiting;
+	/*
+	 * This thread handles hot-add
 	 * requests from the host as well as notifying
 	 * the host with regards to memory pressure in
 	 * the guest.
@@ -467,6 +528,11 @@ struct hv_dynmem_device {
 	struct task_struct *thread;
 
 	/*
+	 * A list of hot-add regions.
+	 */
+	struct list_head ha_region_list;
+
+	/*
 	 * We start with the highest version we can support
 	 * and downgrade based on the host; we save here the
 	 * next version to try.
@@ -476,35 +542,358 @@ struct hv_dynmem_device {
 
 static struct hv_dynmem_device dm_device;
 
-static void hot_add_req(struct hv_dynmem_device *dm, struct dm_hot_add *msg)
+#ifdef CONFIG_MEMORY_HOTPLUG
+
+static void hv_bring_pgs_online(unsigned long start_pfn, unsigned long size)
 {
+	int i;
 
-	struct dm_hot_add_response resp;
+	for (i = 0; i < size; i++) {
+		struct page *pg;
+		pg = pfn_to_page(start_pfn + i);
+		__online_page_set_limits(pg);
+		__online_page_increment_counters(pg);
+		__online_page_free(pg);
+	}
+}
 
-	if (do_hot_add) {
+static void hv_mem_hot_add(unsigned long start, unsigned long size,
+				unsigned long pfn_count,
+				struct hv_hotadd_state *has)
+{
+	int ret = 0;
+	int i, nid, t;
+	unsigned long start_pfn;
+	unsigned long processed_pfn;
+	unsigned long total_pfn = pfn_count;
+
+	for (i = 0; i < (size/HA_CHUNK); i++) {
+		start_pfn = start + (i * HA_CHUNK);
+		has->ha_end_pfn += HA_CHUNK;
+
+		if (total_pfn > HA_CHUNK) {
+			processed_pfn = HA_CHUNK;
+			total_pfn -= HA_CHUNK;
+		} else {
+			processed_pfn = total_pfn;
+			total_pfn = 0;
+		}
+
+		has->covered_end_pfn += processed_pfn;
+
+		init_completion(&dm_device.ol_waitevent);
+		dm_device.ha_waiting = true;
+
+		nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn));
+		ret = add_memory(nid, PFN_PHYS((start_pfn)),
+				(HA_CHUNK << PAGE_SHIFT));
 
-		pr_info("Memory hot add not supported\n");
+		if (ret) {
+			pr_info("hot_add memory failed error is %d\n", ret);
+			if (ret == -EEXIST) {
+				/*
+				 * This error indicates that the error
+				 * is not a transient failure. This is the
+				 * case where the guest's physical address map
+				 * precludes hot adding memory. Stop all further
+				 * memory hot-add.
+				 */
+				do_hot_add = false;
+			}
+			has->ha_end_pfn -= HA_CHUNK;
+			has->covered_end_pfn -= processed_pfn;
+			break;
+		}
 
 		/*
-		 * Currently we do not support hot add.
-		 * Just fail the request.
+		 * Wait for the memory block to be onlined.
 		 */
+		t = wait_for_completion_timeout(&dm_device.ol_waitevent, 5*HZ);
+		if (t == 0) {
+			pr_info("hot_add memory timedout\n");
+			has->ha_end_pfn -= HA_CHUNK;
+			has->covered_end_pfn -= processed_pfn;
+			break;
+		}
+
+	}
+
+	return;
+}
+
+static void hv_online_page(struct page *pg)
+{
+	struct list_head *cur;
+	struct hv_hotadd_state *has;
+	unsigned long cur_start_pgp;
+	unsigned long cur_end_pgp;
+
+	if (dm_device.ha_waiting) {
+		dm_device.ha_waiting = false;
+		complete(&dm_device.ol_waitevent);
+	}
+
+	list_for_each(cur, &dm_device.ha_region_list) {
+		has = list_entry(cur, struct hv_hotadd_state, list);
+		cur_start_pgp = (unsigned long)
+			pfn_to_page(has->covered_start_pfn);
+		cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn);
+
+		if (((unsigned long)pg >= cur_start_pgp) &&
+			((unsigned long)pg < cur_end_pgp)) {
+			/*
+			 * This frame is currently backed; online the
+			 * page.
+			 */
+			__online_page_set_limits(pg);
+			__online_page_increment_counters(pg);
+			__online_page_free(pg);
+			has->covered_start_pfn++;
+		}
+	}
+}
+
+static bool pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt)
+{
+	struct list_head *cur;
+	struct hv_hotadd_state *has;
+	unsigned long residual, new_inc;
+
+	if (list_empty(&dm_device.ha_region_list))
+		return false;
+
+	list_for_each(cur, &dm_device.ha_region_list) {
+		has = list_entry(cur, struct hv_hotadd_state, list);
+
+		/*
+		 * If the pfn range we are dealing with is not in the current
+		 * "hot add block", move on.
+		 */
+		if ((start_pfn >= has->end_pfn))
+			continue;
+		/*
+		 * If the current hot add-request extends beyond
+		 * our current limit; extend it.
+		 */
+		if ((start_pfn + pfn_cnt) > has->end_pfn) {
+			residual = (start_pfn + pfn_cnt - has->end_pfn);
+			/*
+			 * Extend the region by multiples of HA_CHUNK.
+			 */
+			new_inc = (residual / HA_CHUNK) * HA_CHUNK;
+			if (residual % HA_CHUNK)
+				new_inc += HA_CHUNK;
+
+			has->end_pfn += new_inc;
+		}
+
+		/*
+		 * If the current start pfn is not where the covered_end
+		 * is, update it.
+		 */
+
+		if (has->covered_end_pfn != start_pfn) {
+			has->covered_end_pfn = start_pfn;
+			has->covered_start_pfn = start_pfn;
+		}
+		return true;
+	}
+	return false;
+}
+
+static unsigned long handle_pg_range(unsigned long pg_start,
+					unsigned long pg_count)
+{
+	unsigned long start_pfn = pg_start;
+	unsigned long pfn_cnt = pg_count;
+	unsigned long size;
+	struct list_head *cur;
+	struct hv_hotadd_state *has;
+	unsigned long pgs_ol = 0;
+	unsigned long old_covered_state;
+
+	if (list_empty(&dm_device.ha_region_list))
+		return 0;
+
+	list_for_each(cur, &dm_device.ha_region_list) {
+		has = list_entry(cur, struct hv_hotadd_state, list);
+
+		/*
+		 * If the pfn range we are dealing with is not in the current
+		 * "hot add block", move on.
+		 */
+		if ((start_pfn >= has->end_pfn))
+			continue;
+
+		old_covered_state = has->covered_end_pfn;
+
+		if (start_pfn < has->ha_end_pfn) {
+			/*
+			 * This is the case where we are backing pages
+			 * in an already hot added region. Bring
+			 * these pages online first.
+			 */
+			pgs_ol = has->ha_end_pfn - start_pfn;
+			if (pgs_ol > pfn_cnt)
+				pgs_ol = pfn_cnt;
+			hv_bring_pgs_online(start_pfn, pgs_ol);
+			has->covered_end_pfn += pgs_ol;
+			has->covered_start_pfn += pgs_ol;
+			pfn_cnt -= pgs_ol;
+		}
+
+		if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) {
+			/*
+			 * We have some residual hot add range
+			 * that needs to be hot added; hot add
+			 * it now. Hot add a multiple of
+			 * of HA_CHUNK that fully covers the pages
+			 * we have.
+			 */
+			size = (has->end_pfn - has->ha_end_pfn);
+			if (pfn_cnt <= size) {
+				size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK);
+				if (pfn_cnt % HA_CHUNK)
+					size += HA_CHUNK;
+			} else {
+				pfn_cnt = size;
+			}
+			hv_mem_hot_add(has->ha_end_pfn, size, pfn_cnt, has);
+		}
+		/*
+		 * If we managed to online any pages that were given to us,
+		 * we declare success.
+		 */
+		return has->covered_end_pfn - old_covered_state;
+
+	}
+
+	return 0;
+}
+
+static unsigned long process_hot_add(unsigned long pg_start,
+					unsigned long pfn_cnt,
+					unsigned long rg_start,
+					unsigned long rg_size)
+{
+	struct hv_hotadd_state *ha_region = NULL;
+
+	if (pfn_cnt == 0)
+		return 0;
+
+	if (!dm_device.host_specified_ha_region)
+		if (pfn_covered(pg_start, pfn_cnt))
+			goto do_pg_range;
+
+	/*
+	 * If the host has specified a hot-add range; deal with it first.
+	 */
+
+	if (rg_size != 0) {
+		ha_region = kzalloc(sizeof(struct hv_hotadd_state), GFP_KERNEL);
+		if (!ha_region)
+			return 0;
+
+		INIT_LIST_HEAD(&ha_region->list);
+
+		list_add_tail(&ha_region->list, &dm_device.ha_region_list);
+		ha_region->start_pfn = rg_start;
+		ha_region->ha_end_pfn = rg_start;
+		ha_region->covered_start_pfn = pg_start;
+		ha_region->covered_end_pfn = pg_start;
+		ha_region->end_pfn = rg_start + rg_size;
+	}
+
+do_pg_range:
+	/*
+	 * Process the page range specified; bringing them
+	 * online if possible.
+	 */
+	return handle_pg_range(pg_start, pfn_cnt);
+}
+
+#endif
+
+static void hot_add_req(struct work_struct *dummy)
+{
+	struct dm_hot_add_response resp;
+#ifdef CONFIG_MEMORY_HOTPLUG
+	unsigned long pg_start, pfn_cnt;
+	unsigned long rg_start, rg_sz;
+#endif
+	struct hv_dynmem_device *dm = &dm_device;
+
 	memset(&resp, 0, sizeof(struct dm_hot_add_response));
 	resp.hdr.type = DM_MEM_HOT_ADD_RESPONSE;
 	resp.hdr.size = sizeof(struct dm_hot_add_response);
 	resp.hdr.trans_id = atomic_inc_return(&trans_id);
 
-	resp.page_count = 0;
-	resp.result = 0;
+#ifdef CONFIG_MEMORY_HOTPLUG
+	pg_start = dm->ha_wrk.ha_page_range.finfo.start_page;
+	pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt;
+
+	rg_start = dm->ha_wrk.ha_region_range.finfo.start_page;
+	rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt;
+
+	if ((rg_start == 0) && (!dm->host_specified_ha_region)) {
+		unsigned long region_size;
+		unsigned long region_start;
+
+		/*
+		 * The host has not specified the hot-add region.
+		 * Based on the hot-add page range being specified,
+		 * compute a hot-add region that can cover the pages
+		 * that need to be hot-added while ensuring the alignment
+		 * and size requirements of Linux as it relates to hot-add.
+		 */
+		region_start = pg_start;
+		region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;
+		if (pfn_cnt % HA_CHUNK)
+			region_size += HA_CHUNK;
+
+		region_start = (pg_start / HA_CHUNK) * HA_CHUNK;
+
+		rg_start = region_start;
+		rg_sz = region_size;
+	}
+
+	if (do_hot_add)
+		resp.page_count = process_hot_add(pg_start, pfn_cnt,
+						rg_start, rg_sz);
+#endif
+	/*
+	 * The result field of the response structure has the
+	 * following semantics:
+	 *
+	 * 1. If all or some pages hot-added: Guest should return success.
+	 *
+	 * 2. If no pages could be hot-added:
+	 *
+	 * If the guest returns success, then the host
+	 * will not attempt any further hot-add operations. This
+	 * signifies a permanent failure.
+	 *
+	 * If the guest returns failure, then this failure will be
+	 * treated as a transient failure and the host may retry the
+	 * hot-add operation after some delay.
+	 */
+	if (resp.page_count > 0)
+		resp.result = 1;
+	else if (!do_hot_add)
+		resp.result = 1;
+	else
+		resp.result = 0;
+
+	if (!do_hot_add || (resp.page_count == 0))
+		pr_info("Memory hot add failed\n");
 
 	dm->state = DM_INITIALIZED;
 	vmbus_sendpacket(dm->dev->channel, &resp,
 			sizeof(struct dm_hot_add_response),
 			(unsigned long)NULL,
 			VM_PKT_DATA_INBAND, 0);
-
 }
 
 static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg)
@@ -523,7 +912,7 @@ static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg)
 	}
 }
 
-unsigned long compute_balloon_floor(void)
+static unsigned long compute_balloon_floor(void)
 {
 	unsigned long min_pages;
 #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT))
@@ -644,6 +1033,14 @@ static int alloc_balloon_pages(struct hv_dynmem_device *dm, int num_pages,
 
 		dm->num_pages_ballooned += alloc_unit;
 
+		/*
+		 * If we allocatted 2M pages; split them so we
+		 * can free them in any order we get.
+		 */
+
+		if (alloc_unit != 1)
+			split_page(pg, get_order(alloc_unit << PAGE_SHIFT));
+
 		bl_resp->range_count++;
 		bl_resp->range_array[i].finfo.start_page =
 			page_to_pfn(pg);
@@ -657,9 +1054,9 @@ static int alloc_balloon_pages(struct hv_dynmem_device *dm, int num_pages,
 
 
 
-static void balloon_up(struct hv_dynmem_device *dm, struct dm_balloon *req)
+static void balloon_up(struct work_struct *dummy)
 {
-	int num_pages = req->num_pages;
+	int num_pages = dm_device.balloon_wrk.num_pages;
 	int num_ballooned = 0;
 	struct dm_balloon_response *bl_resp;
 	int alloc_unit;
@@ -670,9 +1067,10 @@ static void balloon_up(struct hv_dynmem_device *dm, struct dm_balloon *req)
 
 
 	/*
-	 * Currently, we only support 4k allocations.
+	 * We will attempt 2M allocations. However, if we fail to
+	 * allocate 2M chunks, we will go back to 4k allocations.
 	 */
-	alloc_unit = 1;
+	alloc_unit = 512;
 
 	while (!done) {
 		bl_resp = (struct dm_balloon_response *)send_buffer;
@@ -684,14 +1082,19 @@ static void balloon_up(struct hv_dynmem_device *dm, struct dm_balloon *req)
 
 
 		num_pages -= num_ballooned;
-		num_ballooned = alloc_balloon_pages(dm, num_pages,
+		num_ballooned = alloc_balloon_pages(&dm_device, num_pages,
 						bl_resp, alloc_unit, &alloc_error);
 
+		if ((alloc_error) && (alloc_unit != 1)) {
+			alloc_unit = 1;
+			continue;
+		}
+
 		if ((alloc_error) || (num_ballooned == num_pages)) {
 			bl_resp->more_pages = 0;
 			done = true;
-			dm->state = DM_INITIALIZED;
+			dm_device.state = DM_INITIALIZED;
 		}
 
 		/*
@@ -719,7 +1122,7 @@ static void balloon_up(struct hv_dynmem_device *dm, struct dm_balloon *req)
 				pr_info("Balloon response failed\n");
 
 				for (i = 0; i < bl_resp->range_count; i++)
-					free_balloon_pages(dm,
+					free_balloon_pages(&dm_device,
 							 &bl_resp->range_array[i]);
 
 				done = true;
@@ -761,7 +1164,6 @@ static int dm_thread_func(void *dm_dev)
 {
 	struct hv_dynmem_device *dm = dm_dev;
 	int t;
-	unsigned long scan_start;
 
 	while (!kthread_should_stop()) {
 		t = wait_for_completion_timeout(&dm_device.config_event, 1*HZ);
@@ -773,22 +1175,6 @@ static int dm_thread_func(void *dm_dev)
 		if (t == 0)
 			post_status(dm);
 
-		scan_start = jiffies;
-		switch (dm->state) {
-		case DM_BALLOON_UP:
-			balloon_up(dm, (struct dm_balloon *)recv_buffer);
-			break;
-
-		case DM_HOT_ADD:
-			hot_add_req(dm, (struct dm_hot_add *)recv_buffer);
-			break;
-		default:
-			break;
-		}
-
-		if (!time_in_range(jiffies, scan_start, scan_start + HZ))
-			post_status(dm);
-
 	}
 
 	return 0;
@@ -861,6 +1247,10 @@ static void balloon_onchannelcallback(void *context)
 	struct dm_message *dm_msg;
 	struct dm_header *dm_hdr;
 	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
+	struct dm_balloon *bal_msg;
+	struct dm_hot_add *ha_msg;
+	union dm_mem_page_range *ha_pg_range;
+	union dm_mem_page_range *ha_region;
 
 	memset(recv_buffer, 0, sizeof(recv_buffer));
 	vmbus_recvpacket(dev->channel, recv_buffer,
@@ -882,8 +1272,12 @@ static void balloon_onchannelcallback(void *context)
 			break;
 
 		case DM_BALLOON_REQUEST:
+			if (dm->state == DM_BALLOON_UP)
+				pr_warn("Currently ballooning\n");
+			bal_msg = (struct dm_balloon *)recv_buffer;
 			dm->state = DM_BALLOON_UP;
-			complete(&dm->config_event);
+			dm_device.balloon_wrk.num_pages = bal_msg->num_pages;
+			schedule_work(&dm_device.balloon_wrk.wrk);
 			break;
 
 		case DM_UNBALLOON_REQUEST:
@@ -893,8 +1287,31 @@ static void balloon_onchannelcallback(void *context)
 			break;
 
 		case DM_MEM_HOT_ADD_REQUEST:
+			if (dm->state == DM_HOT_ADD)
+				pr_warn("Currently hot-adding\n");
 			dm->state = DM_HOT_ADD;
-			complete(&dm->config_event);
+			ha_msg = (struct dm_hot_add *)recv_buffer;
+			if (ha_msg->hdr.size == sizeof(struct dm_hot_add)) {
+				/*
+				 * This is a normal hot-add request specifying
+				 * hot-add memory.
+				 */
+				ha_pg_range = &ha_msg->range;
+				dm->ha_wrk.ha_page_range = *ha_pg_range;
+				dm->ha_wrk.ha_region_range.page_range = 0;
+			} else {
+				/*
+				 * Host is specifying that we first hot-add
+				 * a region and then partially populate this
+				 * region.
+				 */
+				dm->host_specified_ha_region = true;
+				ha_pg_range = &ha_msg->range;
+				ha_region = &ha_pg_range[1];
+				dm->ha_wrk.ha_page_range = *ha_pg_range;
+				dm->ha_wrk.ha_region_range = *ha_region;
+			}
+			schedule_work(&dm_device.ha_wrk.wrk);
 			break;
 
 		case DM_INFO_MESSAGE:
@@ -937,6 +1354,10 @@ static int balloon_probe(struct hv_device *dev,
 	dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN7;
 	init_completion(&dm_device.host_event);
 	init_completion(&dm_device.config_event);
+	INIT_LIST_HEAD(&dm_device.ha_region_list);
+	INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up);
+	INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req);
+	dm_device.host_specified_ha_region = false;
 
 	dm_device.thread =
 		 kthread_run(dm_thread_func, &dm_device, "hv_balloon");
@@ -945,6 +1366,10 @@ static int balloon_probe(struct hv_device *dev,
 		goto probe_error1;
 	}
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+	set_online_page_callback(&hv_online_page);
+#endif
+
 	hv_set_drvdata(dev, &dm_device);
 	/*
 	 * Initiate the hand shake with the host and negotiate
@@ -962,8 +1387,7 @@ static int balloon_probe(struct hv_device *dev,
 	ret = vmbus_sendpacket(dev->channel, &version_req,
 				sizeof(struct dm_version_request),
 				(unsigned long)NULL,
-				VM_PKT_DATA_INBAND,
-				VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+				VM_PKT_DATA_INBAND, 0);
 
 	if (ret)
 		goto probe_error2;
@@ -990,13 +1414,13 @@ static int balloon_probe(struct hv_device *dev,
 	cap_msg.hdr.trans_id = atomic_inc_return(&trans_id);
 
 	cap_msg.caps.cap_bits.balloon = 1;
+	cap_msg.caps.cap_bits.hot_add = 1;
+
 	/*
-	 * While we currently don't support hot-add,
-	 * we still advertise this capability since the
-	 * host requires that guests partcipating in the
-	 * dynamic memory protocol support hot add.
+	 * Specify our alignment requirements as it relates
+	 * memory hot-add. Specify 128MB alignment.
 	 */
-	cap_msg.caps.cap_bits.hot_add = 1;
+	cap_msg.caps.cap_bits.hot_add_alignment = 7;
 
 	/*
 	 * Currently the host does not use these
@@ -1009,8 +1433,7 @@ static int balloon_probe(struct hv_device *dev,
 	ret = vmbus_sendpacket(dev->channel, &cap_msg,
 				sizeof(struct dm_capabilities),
 				(unsigned long)NULL,
-				VM_PKT_DATA_INBAND,
-				VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED);
+				VM_PKT_DATA_INBAND, 0);
 
 	if (ret)
 		goto probe_error2;
@@ -1034,6 +1457,9 @@ static int balloon_probe(struct hv_device *dev,
 	return 0;
 
 probe_error2:
+#ifdef CONFIG_MEMORY_HOTPLUG
+	restore_online_page_callback(&hv_online_page);
+#endif
 	kthread_stop(dm_device.thread);
 
 probe_error1:
@@ -1046,13 +1472,26 @@ probe_error0:
 static int balloon_remove(struct hv_device *dev)
 {
 	struct hv_dynmem_device *dm = hv_get_drvdata(dev);
+	struct list_head *cur, *tmp;
+	struct hv_hotadd_state *has;
 
 	if (dm->num_pages_ballooned != 0)
 		pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned);
 
+	cancel_work_sync(&dm->balloon_wrk.wrk);
+	cancel_work_sync(&dm->ha_wrk.wrk);
+
 	vmbus_close(dev->channel);
 	kthread_stop(dm->thread);
 	kfree(send_buffer);
+#ifdef CONFIG_MEMORY_HOTPLUG
+	restore_online_page_callback(&hv_online_page);
+#endif
+	list_for_each_safe(cur, tmp, &dm->ha_region_list) {
+		has = list_entry(cur, struct hv_hotadd_state, list);
+		list_del(&has->list);
+		kfree(has);
+	}
 
 	return 0;
 }
@@ -1079,14 +1518,7 @@ static int __init init_balloon_drv(void)
 	return vmbus_driver_register(&balloon_drv);
 }
 
-static void exit_balloon_drv(void)
-{
-
-	vmbus_driver_unregister(&balloon_drv);
-}
-
 module_init(init_balloon_drv);
-module_exit(exit_balloon_drv);
 
 MODULE_DESCRIPTION("Hyper-V Balloon");
 MODULE_VERSION(HV_DRV_VERSION);
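For reference, the hot-add region that the new hot_add_req() computes when the host does not specify one is plain round-down/round-up arithmetic on HA_CHUNK boundaries (32768 pfns, i.e. 128MB with 4K pages). A minimal standalone C sketch of that arithmetic follows; the helper name and example values are hypothetical and not part of the patch.

#include <stdio.h>

#define HA_CHUNK (32 * 1024)	/* pfns per hot-add block: 128MB with 4K pages */

/* Hypothetical helper mirroring the arithmetic in hot_add_req() above:
 * round the start pfn down and the size up to HA_CHUNK multiples. */
static void compute_ha_region(unsigned long pg_start, unsigned long pfn_cnt,
			      unsigned long *rg_start, unsigned long *rg_sz)
{
	unsigned long region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;

	if (pfn_cnt % HA_CHUNK)
		region_size += HA_CHUNK;

	*rg_start = (pg_start / HA_CHUNK) * HA_CHUNK;
	*rg_sz = region_size;
}

int main(void)
{
	unsigned long rg_start, rg_sz;

	/* A request for 1000 pfns starting at pfn 0x12345 is covered by one 128MB region. */
	compute_ha_region(0x12345, 1000, &rg_start, &rg_sz);
	printf("region start pfn: %lu, size: %lu pfns\n", rg_start, rg_sz);
	return 0;
}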
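The new hot_add_alignment capability field encodes the guest's alignment requirement as a power of two in megabytes: a value of n advertises an alignment of 2^n MB, and the driver sets 7, i.e. 128MB, matching HA_CHUNK. A small sketch of that decoding, assuming only the encoding stated in the patch comment; the function name is made up for illustration.

#include <stdio.h>

/* Decode cap_bits.hot_add_alignment: a value of n advertises a 2^n megabyte alignment. */
static unsigned long long hot_add_alignment_bytes(unsigned int encoded)
{
	return (1ULL << encoded) * 1024 * 1024;
}

int main(void)
{
	/* The patch sets hot_add_alignment = 7: 2^7 MB = 128MB = 32768 pages of 4K. */
	printf("%llu bytes\n", hot_add_alignment_bytes(7));
	return 0;
}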
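The result-field convention documented in the comment inside the new hot_add_req() can be summarised as a small decision function. The sketch below only illustrates that comment; it is not code from the patch and the helper name is hypothetical.

#include <stdio.h>
#include <stdbool.h>

/* Illustration of the response semantics: report success (1) when pages were
 * added, or when hot-add is disabled so the host treats the failure as
 * permanent and stops retrying; report failure (0) for a transient failure. */
static int hot_add_result(unsigned long page_count, bool do_hot_add)
{
	if (page_count > 0)
		return 1;
	if (!do_hot_add)
		return 1;
	return 0;
}

int main(void)
{
	printf("%d %d %d\n",
	       hot_add_result(512, true),	/* pages added -> success */
	       hot_add_result(0, false),	/* hot-add disabled -> permanent failure, report success */
	       hot_add_result(0, true));	/* transient failure -> report failure, host may retry */
	return 0;
}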