Diffstat (limited to 'drivers/hv')
-rw-r--r--  drivers/hv/Kconfig        |   2
-rw-r--r--  drivers/hv/Makefile       |   2
-rw-r--r--  drivers/hv/channel_mgmt.c |  13
-rw-r--r--  drivers/hv/hv.c           |   5
-rw-r--r--  drivers/hv/hv_balloon.c   | 544
-rw-r--r--  drivers/hv/hv_snapshot.c  | 287
-rw-r--r--  drivers/hv/hv_util.c      |  10
-rw-r--r--  drivers/hv/ring_buffer.c  |   1
8 files changed, 802 insertions(+), 62 deletions(-)
diff --git a/drivers/hv/Kconfig b/drivers/hv/Kconfig index 64630f15f18..0403b51d20b 100644 --- a/drivers/hv/Kconfig +++ b/drivers/hv/Kconfig @@ -2,7 +2,7 @@ menu "Microsoft Hyper-V guest support" config HYPERV tristate "Microsoft Hyper-V client drivers" - depends on X86 && ACPI && PCI && X86_LOCAL_APIC + depends on X86 && ACPI && PCI && X86_LOCAL_APIC && HYPERVISOR_GUEST help Select this option to run Linux as a Hyper-V client operating system. diff --git a/drivers/hv/Makefile b/drivers/hv/Makefile index e6abfa02d8b..0a74b566118 100644 --- a/drivers/hv/Makefile +++ b/drivers/hv/Makefile @@ -5,4 +5,4 @@ obj-$(CONFIG_HYPERV_BALLOON) += hv_balloon.o hv_vmbus-y := vmbus_drv.o \ hv.o connection.o channel.o \ channel_mgmt.o ring_buffer.o -hv_utils-y := hv_util.o hv_kvp.o +hv_utils-y := hv_util.o hv_kvp.o hv_snapshot.o diff --git a/drivers/hv/channel_mgmt.c b/drivers/hv/channel_mgmt.c index ff1be167eb0..21ef68934a2 100644 --- a/drivers/hv/channel_mgmt.c +++ b/drivers/hv/channel_mgmt.c @@ -165,8 +165,19 @@ static void vmbus_process_rescind_offer(struct work_struct *work) struct vmbus_channel *channel = container_of(work, struct vmbus_channel, work); + unsigned long flags; + struct vmbus_channel_relid_released msg; vmbus_device_unregister(channel->device_obj); + memset(&msg, 0, sizeof(struct vmbus_channel_relid_released)); + msg.child_relid = channel->offermsg.child_relid; + msg.header.msgtype = CHANNELMSG_RELID_RELEASED; + vmbus_post_msg(&msg, sizeof(struct vmbus_channel_relid_released)); + + spin_lock_irqsave(&vmbus_connection.channel_lock, flags); + list_del(&channel->listentry); + spin_unlock_irqrestore(&vmbus_connection.channel_lock, flags); + free_channel(channel); } void vmbus_free_channels(void) @@ -318,7 +329,7 @@ static u32 get_vp_index(uuid_le *type_guid) return 0; } cur_cpu = (++next_vp % max_cpus); - return cur_cpu; + return hv_context.vp_index[cur_cpu]; } /* diff --git a/drivers/hv/hv.c b/drivers/hv/hv.c index 731158910c1..ae4923756d9 100644 --- a/drivers/hv/hv.c +++ b/drivers/hv/hv.c @@ -289,9 +289,8 @@ void hv_synic_init(void *arg) /* Check the version */ rdmsrl(HV_X64_MSR_SVERSION, version); - hv_context.event_dpc[cpu] = (struct tasklet_struct *) - kmalloc(sizeof(struct tasklet_struct), - GFP_ATOMIC); + hv_context.event_dpc[cpu] = kmalloc(sizeof(struct tasklet_struct), + GFP_ATOMIC); if (hv_context.event_dpc[cpu] == NULL) { pr_err("Unable to allocate event dpc\n"); goto cleanup; diff --git a/drivers/hv/hv_balloon.c b/drivers/hv/hv_balloon.c index 37873213e24..4c605c70ebf 100644 --- a/drivers/hv/hv_balloon.c +++ b/drivers/hv/hv_balloon.c @@ -117,7 +117,14 @@ union dm_caps { struct { __u64 balloon:1; __u64 hot_add:1; - __u64 reservedz:62; + /* + * To support guests that may have alignment + * limitations on hot-add, the guest can specify + * its alignment requirements; a value of n + * represents an alignment of 2^n in mega bytes. + */ + __u64 hot_add_alignment:4; + __u64 reservedz:58; } cap_bits; __u64 caps; } __packed; @@ -412,13 +419,45 @@ struct dm_info_msg { * End protocol definitions. */ -static bool hot_add; +/* + * State to manage hot adding memory into the guest. + * The range start_pfn : end_pfn specifies the range + * that the host has asked us to hot add. The range + * start_pfn : ha_end_pfn specifies the range that we have + * currently hot added. We hot add in multiples of 128M + * chunks; it is possible that we may not be able to bring + * online all the pages in the region. 
The range + * covered_start_pfn : covered_end_pfn defines the pages that can + * be brough online. + */ + +struct hv_hotadd_state { + struct list_head list; + unsigned long start_pfn; + unsigned long covered_start_pfn; + unsigned long covered_end_pfn; + unsigned long ha_end_pfn; + unsigned long end_pfn; +}; + +struct balloon_state { + __u32 num_pages; + struct work_struct wrk; +}; + +struct hot_add_wrk { + union dm_mem_page_range ha_page_range; + union dm_mem_page_range ha_region_range; + struct work_struct wrk; +}; + +static bool hot_add = true; static bool do_hot_add; /* * Delay reporting memory pressure by * the specified number of seconds. */ -static uint pressure_report_delay = 30; +static uint pressure_report_delay = 45; module_param(hot_add, bool, (S_IRUGO | S_IWUSR)); MODULE_PARM_DESC(hot_add, "If set attempt memory hot_add"); @@ -446,6 +485,7 @@ enum hv_dm_state { static __u8 recv_buffer[PAGE_SIZE]; static __u8 *send_buffer; #define PAGES_IN_2M 512 +#define HA_CHUNK (32 * 1024) struct hv_dynmem_device { struct hv_device *dev; @@ -459,7 +499,28 @@ struct hv_dynmem_device { unsigned int num_pages_ballooned; /* - * This thread handles both balloon/hot-add + * State to manage the ballooning (up) operation. + */ + struct balloon_state balloon_wrk; + + /* + * State to execute the "hot-add" operation. + */ + struct hot_add_wrk ha_wrk; + + /* + * This state tracks if the host has specified a hot-add + * region. + */ + bool host_specified_ha_region; + + /* + * State to synchronize hot-add. + */ + struct completion ol_waitevent; + bool ha_waiting; + /* + * This thread handles hot-add * requests from the host as well as notifying * the host with regards to memory pressure in * the guest. @@ -467,6 +528,11 @@ struct hv_dynmem_device { struct task_struct *thread; /* + * A list of hot-add regions. + */ + struct list_head ha_region_list; + + /* * We start with the highest version we can support * and downgrade based on the host; we save here the * next version to try. @@ -476,35 +542,358 @@ struct hv_dynmem_device { static struct hv_dynmem_device dm_device; -static void hot_add_req(struct hv_dynmem_device *dm, struct dm_hot_add *msg) +#ifdef CONFIG_MEMORY_HOTPLUG + +static void hv_bring_pgs_online(unsigned long start_pfn, unsigned long size) { + int i; - struct dm_hot_add_response resp; + for (i = 0; i < size; i++) { + struct page *pg; + pg = pfn_to_page(start_pfn + i); + __online_page_set_limits(pg); + __online_page_increment_counters(pg); + __online_page_free(pg); + } +} - if (do_hot_add) { +static void hv_mem_hot_add(unsigned long start, unsigned long size, + unsigned long pfn_count, + struct hv_hotadd_state *has) +{ + int ret = 0; + int i, nid, t; + unsigned long start_pfn; + unsigned long processed_pfn; + unsigned long total_pfn = pfn_count; + + for (i = 0; i < (size/HA_CHUNK); i++) { + start_pfn = start + (i * HA_CHUNK); + has->ha_end_pfn += HA_CHUNK; + + if (total_pfn > HA_CHUNK) { + processed_pfn = HA_CHUNK; + total_pfn -= HA_CHUNK; + } else { + processed_pfn = total_pfn; + total_pfn = 0; + } + + has->covered_end_pfn += processed_pfn; + + init_completion(&dm_device.ol_waitevent); + dm_device.ha_waiting = true; + + nid = memory_add_physaddr_to_nid(PFN_PHYS(start_pfn)); + ret = add_memory(nid, PFN_PHYS((start_pfn)), + (HA_CHUNK << PAGE_SHIFT)); - pr_info("Memory hot add not supported\n"); + if (ret) { + pr_info("hot_add memory failed error is %d\n", ret); + if (ret == -EEXIST) { + /* + * This error indicates that the error + * is not a transient failure. 
This is the + * case where the guest's physical address map + * precludes hot adding memory. Stop all further + * memory hot-add. + */ + do_hot_add = false; + } + has->ha_end_pfn -= HA_CHUNK; + has->covered_end_pfn -= processed_pfn; + break; + } /* - * Currently we do not support hot add. - * Just fail the request. + * Wait for the memory block to be onlined. */ + t = wait_for_completion_timeout(&dm_device.ol_waitevent, 5*HZ); + if (t == 0) { + pr_info("hot_add memory timedout\n"); + has->ha_end_pfn -= HA_CHUNK; + has->covered_end_pfn -= processed_pfn; + break; + } + + } + + return; +} + +static void hv_online_page(struct page *pg) +{ + struct list_head *cur; + struct hv_hotadd_state *has; + unsigned long cur_start_pgp; + unsigned long cur_end_pgp; + + if (dm_device.ha_waiting) { + dm_device.ha_waiting = false; + complete(&dm_device.ol_waitevent); + } + + list_for_each(cur, &dm_device.ha_region_list) { + has = list_entry(cur, struct hv_hotadd_state, list); + cur_start_pgp = (unsigned long) + pfn_to_page(has->covered_start_pfn); + cur_end_pgp = (unsigned long)pfn_to_page(has->covered_end_pfn); + + if (((unsigned long)pg >= cur_start_pgp) && + ((unsigned long)pg < cur_end_pgp)) { + /* + * This frame is currently backed; online the + * page. + */ + __online_page_set_limits(pg); + __online_page_increment_counters(pg); + __online_page_free(pg); + has->covered_start_pfn++; + } + } +} + +static bool pfn_covered(unsigned long start_pfn, unsigned long pfn_cnt) +{ + struct list_head *cur; + struct hv_hotadd_state *has; + unsigned long residual, new_inc; + + if (list_empty(&dm_device.ha_region_list)) + return false; + + list_for_each(cur, &dm_device.ha_region_list) { + has = list_entry(cur, struct hv_hotadd_state, list); + + /* + * If the pfn range we are dealing with is not in the current + * "hot add block", move on. + */ + if ((start_pfn >= has->end_pfn)) + continue; + /* + * If the current hot add-request extends beyond + * our current limit; extend it. + */ + if ((start_pfn + pfn_cnt) > has->end_pfn) { + residual = (start_pfn + pfn_cnt - has->end_pfn); + /* + * Extend the region by multiples of HA_CHUNK. + */ + new_inc = (residual / HA_CHUNK) * HA_CHUNK; + if (residual % HA_CHUNK) + new_inc += HA_CHUNK; + + has->end_pfn += new_inc; + } + + /* + * If the current start pfn is not where the covered_end + * is, update it. + */ + + if (has->covered_end_pfn != start_pfn) { + has->covered_end_pfn = start_pfn; + has->covered_start_pfn = start_pfn; + } + return true; + } + return false; +} + +static unsigned long handle_pg_range(unsigned long pg_start, + unsigned long pg_count) +{ + unsigned long start_pfn = pg_start; + unsigned long pfn_cnt = pg_count; + unsigned long size; + struct list_head *cur; + struct hv_hotadd_state *has; + unsigned long pgs_ol = 0; + unsigned long old_covered_state; + + if (list_empty(&dm_device.ha_region_list)) + return 0; + + list_for_each(cur, &dm_device.ha_region_list) { + has = list_entry(cur, struct hv_hotadd_state, list); + + /* + * If the pfn range we are dealing with is not in the current + * "hot add block", move on. + */ + if ((start_pfn >= has->end_pfn)) + continue; + + old_covered_state = has->covered_end_pfn; + + if (start_pfn < has->ha_end_pfn) { + /* + * This is the case where we are backing pages + * in an already hot added region. Bring + * these pages online first. 
+ */ + pgs_ol = has->ha_end_pfn - start_pfn; + if (pgs_ol > pfn_cnt) + pgs_ol = pfn_cnt; + hv_bring_pgs_online(start_pfn, pgs_ol); + has->covered_end_pfn += pgs_ol; + has->covered_start_pfn += pgs_ol; + pfn_cnt -= pgs_ol; + } + + if ((has->ha_end_pfn < has->end_pfn) && (pfn_cnt > 0)) { + /* + * We have some residual hot add range + * that needs to be hot added; hot add + * it now. Hot add a multiple of + * of HA_CHUNK that fully covers the pages + * we have. + */ + size = (has->end_pfn - has->ha_end_pfn); + if (pfn_cnt <= size) { + size = ((pfn_cnt / HA_CHUNK) * HA_CHUNK); + if (pfn_cnt % HA_CHUNK) + size += HA_CHUNK; + } else { + pfn_cnt = size; + } + hv_mem_hot_add(has->ha_end_pfn, size, pfn_cnt, has); + } + /* + * If we managed to online any pages that were given to us, + * we declare success. + */ + return has->covered_end_pfn - old_covered_state; + + } + + return 0; +} + +static unsigned long process_hot_add(unsigned long pg_start, + unsigned long pfn_cnt, + unsigned long rg_start, + unsigned long rg_size) +{ + struct hv_hotadd_state *ha_region = NULL; + + if (pfn_cnt == 0) + return 0; + + if (!dm_device.host_specified_ha_region) + if (pfn_covered(pg_start, pfn_cnt)) + goto do_pg_range; + + /* + * If the host has specified a hot-add range; deal with it first. + */ + + if (rg_size != 0) { + ha_region = kzalloc(sizeof(struct hv_hotadd_state), GFP_KERNEL); + if (!ha_region) + return 0; + + INIT_LIST_HEAD(&ha_region->list); + + list_add_tail(&ha_region->list, &dm_device.ha_region_list); + ha_region->start_pfn = rg_start; + ha_region->ha_end_pfn = rg_start; + ha_region->covered_start_pfn = pg_start; + ha_region->covered_end_pfn = pg_start; + ha_region->end_pfn = rg_start + rg_size; + } + +do_pg_range: + /* + * Process the page range specified; bringing them + * online if possible. + */ + return handle_pg_range(pg_start, pfn_cnt); +} + +#endif + +static void hot_add_req(struct work_struct *dummy) +{ + struct dm_hot_add_response resp; +#ifdef CONFIG_MEMORY_HOTPLUG + unsigned long pg_start, pfn_cnt; + unsigned long rg_start, rg_sz; +#endif + struct hv_dynmem_device *dm = &dm_device; + memset(&resp, 0, sizeof(struct dm_hot_add_response)); resp.hdr.type = DM_MEM_HOT_ADD_RESPONSE; resp.hdr.size = sizeof(struct dm_hot_add_response); resp.hdr.trans_id = atomic_inc_return(&trans_id); - resp.page_count = 0; - resp.result = 0; +#ifdef CONFIG_MEMORY_HOTPLUG + pg_start = dm->ha_wrk.ha_page_range.finfo.start_page; + pfn_cnt = dm->ha_wrk.ha_page_range.finfo.page_cnt; + + rg_start = dm->ha_wrk.ha_region_range.finfo.start_page; + rg_sz = dm->ha_wrk.ha_region_range.finfo.page_cnt; + + if ((rg_start == 0) && (!dm->host_specified_ha_region)) { + unsigned long region_size; + unsigned long region_start; + + /* + * The host has not specified the hot-add region. + * Based on the hot-add page range being specified, + * compute a hot-add region that can cover the pages + * that need to be hot-added while ensuring the alignment + * and size requirements of Linux as it relates to hot-add. + */ + region_start = pg_start; + region_size = (pfn_cnt / HA_CHUNK) * HA_CHUNK; + if (pfn_cnt % HA_CHUNK) + region_size += HA_CHUNK; + + region_start = (pg_start / HA_CHUNK) * HA_CHUNK; + + rg_start = region_start; + rg_sz = region_size; + } + + if (do_hot_add) + resp.page_count = process_hot_add(pg_start, pfn_cnt, + rg_start, rg_sz); +#endif + /* + * The result field of the response structure has the + * following semantics: + * + * 1. If all or some pages hot-added: Guest should return success. + * + * 2. 
If no pages could be hot-added: + * + * If the guest returns success, then the host + * will not attempt any further hot-add operations. This + * signifies a permanent failure. + * + * If the guest returns failure, then this failure will be + * treated as a transient failure and the host may retry the + * hot-add operation after some delay. + */ + if (resp.page_count > 0) + resp.result = 1; + else if (!do_hot_add) + resp.result = 1; + else + resp.result = 0; + + if (!do_hot_add || (resp.page_count == 0)) + pr_info("Memory hot add failed\n"); dm->state = DM_INITIALIZED; vmbus_sendpacket(dm->dev->channel, &resp, sizeof(struct dm_hot_add_response), (unsigned long)NULL, VM_PKT_DATA_INBAND, 0); - } static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg) @@ -523,7 +912,7 @@ static void process_info(struct hv_dynmem_device *dm, struct dm_info_msg *msg) } } -unsigned long compute_balloon_floor(void) +static unsigned long compute_balloon_floor(void) { unsigned long min_pages; #define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) @@ -644,6 +1033,14 @@ static int alloc_balloon_pages(struct hv_dynmem_device *dm, int num_pages, dm->num_pages_ballooned += alloc_unit; + /* + * If we allocatted 2M pages; split them so we + * can free them in any order we get. + */ + + if (alloc_unit != 1) + split_page(pg, get_order(alloc_unit << PAGE_SHIFT)); + bl_resp->range_count++; bl_resp->range_array[i].finfo.start_page = page_to_pfn(pg); @@ -657,9 +1054,9 @@ static int alloc_balloon_pages(struct hv_dynmem_device *dm, int num_pages, -static void balloon_up(struct hv_dynmem_device *dm, struct dm_balloon *req) +static void balloon_up(struct work_struct *dummy) { - int num_pages = req->num_pages; + int num_pages = dm_device.balloon_wrk.num_pages; int num_ballooned = 0; struct dm_balloon_response *bl_resp; int alloc_unit; @@ -670,9 +1067,10 @@ static void balloon_up(struct hv_dynmem_device *dm, struct dm_balloon *req) /* - * Currently, we only support 4k allocations. + * We will attempt 2M allocations. However, if we fail to + * allocate 2M chunks, we will go back to 4k allocations. 
*/ - alloc_unit = 1; + alloc_unit = 512; while (!done) { bl_resp = (struct dm_balloon_response *)send_buffer; @@ -684,14 +1082,19 @@ static void balloon_up(struct hv_dynmem_device *dm, struct dm_balloon *req) num_pages -= num_ballooned; - num_ballooned = alloc_balloon_pages(dm, num_pages, + num_ballooned = alloc_balloon_pages(&dm_device, num_pages, bl_resp, alloc_unit, &alloc_error); + if ((alloc_error) && (alloc_unit != 1)) { + alloc_unit = 1; + continue; + } + if ((alloc_error) || (num_ballooned == num_pages)) { bl_resp->more_pages = 0; done = true; - dm->state = DM_INITIALIZED; + dm_device.state = DM_INITIALIZED; } /* @@ -719,7 +1122,7 @@ static void balloon_up(struct hv_dynmem_device *dm, struct dm_balloon *req) pr_info("Balloon response failed\n"); for (i = 0; i < bl_resp->range_count; i++) - free_balloon_pages(dm, + free_balloon_pages(&dm_device, &bl_resp->range_array[i]); done = true; @@ -761,7 +1164,6 @@ static int dm_thread_func(void *dm_dev) { struct hv_dynmem_device *dm = dm_dev; int t; - unsigned long scan_start; while (!kthread_should_stop()) { t = wait_for_completion_timeout(&dm_device.config_event, 1*HZ); @@ -773,22 +1175,6 @@ static int dm_thread_func(void *dm_dev) if (t == 0) post_status(dm); - scan_start = jiffies; - switch (dm->state) { - case DM_BALLOON_UP: - balloon_up(dm, (struct dm_balloon *)recv_buffer); - break; - - case DM_HOT_ADD: - hot_add_req(dm, (struct dm_hot_add *)recv_buffer); - break; - default: - break; - } - - if (!time_in_range(jiffies, scan_start, scan_start + HZ)) - post_status(dm); - } return 0; @@ -861,6 +1247,10 @@ static void balloon_onchannelcallback(void *context) struct dm_message *dm_msg; struct dm_header *dm_hdr; struct hv_dynmem_device *dm = hv_get_drvdata(dev); + struct dm_balloon *bal_msg; + struct dm_hot_add *ha_msg; + union dm_mem_page_range *ha_pg_range; + union dm_mem_page_range *ha_region; memset(recv_buffer, 0, sizeof(recv_buffer)); vmbus_recvpacket(dev->channel, recv_buffer, @@ -882,8 +1272,12 @@ static void balloon_onchannelcallback(void *context) break; case DM_BALLOON_REQUEST: + if (dm->state == DM_BALLOON_UP) + pr_warn("Currently ballooning\n"); + bal_msg = (struct dm_balloon *)recv_buffer; dm->state = DM_BALLOON_UP; - complete(&dm->config_event); + dm_device.balloon_wrk.num_pages = bal_msg->num_pages; + schedule_work(&dm_device.balloon_wrk.wrk); break; case DM_UNBALLOON_REQUEST: @@ -893,8 +1287,31 @@ static void balloon_onchannelcallback(void *context) break; case DM_MEM_HOT_ADD_REQUEST: + if (dm->state == DM_HOT_ADD) + pr_warn("Currently hot-adding\n"); dm->state = DM_HOT_ADD; - complete(&dm->config_event); + ha_msg = (struct dm_hot_add *)recv_buffer; + if (ha_msg->hdr.size == sizeof(struct dm_hot_add)) { + /* + * This is a normal hot-add request specifying + * hot-add memory. + */ + ha_pg_range = &ha_msg->range; + dm->ha_wrk.ha_page_range = *ha_pg_range; + dm->ha_wrk.ha_region_range.page_range = 0; + } else { + /* + * Host is specifying that we first hot-add + * a region and then partially populate this + * region. 
+ */ + dm->host_specified_ha_region = true; + ha_pg_range = &ha_msg->range; + ha_region = &ha_pg_range[1]; + dm->ha_wrk.ha_page_range = *ha_pg_range; + dm->ha_wrk.ha_region_range = *ha_region; + } + schedule_work(&dm_device.ha_wrk.wrk); break; case DM_INFO_MESSAGE: @@ -937,6 +1354,10 @@ static int balloon_probe(struct hv_device *dev, dm_device.next_version = DYNMEM_PROTOCOL_VERSION_WIN7; init_completion(&dm_device.host_event); init_completion(&dm_device.config_event); + INIT_LIST_HEAD(&dm_device.ha_region_list); + INIT_WORK(&dm_device.balloon_wrk.wrk, balloon_up); + INIT_WORK(&dm_device.ha_wrk.wrk, hot_add_req); + dm_device.host_specified_ha_region = false; dm_device.thread = kthread_run(dm_thread_func, &dm_device, "hv_balloon"); @@ -945,6 +1366,10 @@ static int balloon_probe(struct hv_device *dev, goto probe_error1; } +#ifdef CONFIG_MEMORY_HOTPLUG + set_online_page_callback(&hv_online_page); +#endif + hv_set_drvdata(dev, &dm_device); /* * Initiate the hand shake with the host and negotiate @@ -962,8 +1387,7 @@ static int balloon_probe(struct hv_device *dev, ret = vmbus_sendpacket(dev->channel, &version_req, sizeof(struct dm_version_request), (unsigned long)NULL, - VM_PKT_DATA_INBAND, - VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + VM_PKT_DATA_INBAND, 0); if (ret) goto probe_error2; @@ -990,13 +1414,13 @@ static int balloon_probe(struct hv_device *dev, cap_msg.hdr.trans_id = atomic_inc_return(&trans_id); cap_msg.caps.cap_bits.balloon = 1; + cap_msg.caps.cap_bits.hot_add = 1; + /* - * While we currently don't support hot-add, - * we still advertise this capability since the - * host requires that guests partcipating in the - * dynamic memory protocol support hot add. + * Specify our alignment requirements as it relates + * memory hot-add. Specify 128MB alignment. 
*/ - cap_msg.caps.cap_bits.hot_add = 1; + cap_msg.caps.cap_bits.hot_add_alignment = 7; /* * Currently the host does not use these @@ -1009,8 +1433,7 @@ static int balloon_probe(struct hv_device *dev, ret = vmbus_sendpacket(dev->channel, &cap_msg, sizeof(struct dm_capabilities), (unsigned long)NULL, - VM_PKT_DATA_INBAND, - VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED); + VM_PKT_DATA_INBAND, 0); if (ret) goto probe_error2; @@ -1034,6 +1457,9 @@ static int balloon_probe(struct hv_device *dev, return 0; probe_error2: +#ifdef CONFIG_MEMORY_HOTPLUG + restore_online_page_callback(&hv_online_page); +#endif kthread_stop(dm_device.thread); probe_error1: @@ -1046,13 +1472,26 @@ probe_error0: static int balloon_remove(struct hv_device *dev) { struct hv_dynmem_device *dm = hv_get_drvdata(dev); + struct list_head *cur, *tmp; + struct hv_hotadd_state *has; if (dm->num_pages_ballooned != 0) pr_warn("Ballooned pages: %d\n", dm->num_pages_ballooned); + cancel_work_sync(&dm->balloon_wrk.wrk); + cancel_work_sync(&dm->ha_wrk.wrk); + vmbus_close(dev->channel); kthread_stop(dm->thread); kfree(send_buffer); +#ifdef CONFIG_MEMORY_HOTPLUG + restore_online_page_callback(&hv_online_page); +#endif + list_for_each_safe(cur, tmp, &dm->ha_region_list) { + has = list_entry(cur, struct hv_hotadd_state, list); + list_del(&has->list); + kfree(has); + } return 0; } @@ -1079,14 +1518,7 @@ static int __init init_balloon_drv(void) return vmbus_driver_register(&balloon_drv); } -static void exit_balloon_drv(void) -{ - - vmbus_driver_unregister(&balloon_drv); -} - module_init(init_balloon_drv); -module_exit(exit_balloon_drv); MODULE_DESCRIPTION("Hyper-V Balloon"); MODULE_VERSION(HV_DRV_VERSION); diff --git a/drivers/hv/hv_snapshot.c b/drivers/hv/hv_snapshot.c new file mode 100644 index 00000000000..8ad5653ce44 --- /dev/null +++ b/drivers/hv/hv_snapshot.c @@ -0,0 +1,287 @@ +/* + * An implementation of host initiated guest snapshot. + * + * + * Copyright (C) 2013, Microsoft, Inc. + * Author : K. Y. Srinivasan <kys@microsoft.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published + * by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/net.h> +#include <linux/nls.h> +#include <linux/connector.h> +#include <linux/workqueue.h> +#include <linux/hyperv.h> + + + +/* + * Global state maintained for transaction that is being processed. + * Note that only one transaction can be active at any point in time. + * + * This state is set when we receive a request from the host; we + * cleanup this state when the transaction is completed - when we respond + * to the host with the key value. + */ + +static struct { + bool active; /* transaction status - active or not */ + int recv_len; /* number of bytes received. */ + struct vmbus_channel *recv_channel; /* chn we got the request */ + u64 recv_req_id; /* request ID. 
*/ + struct hv_vss_msg *msg; /* current message */ +} vss_transaction; + + +static void vss_respond_to_host(int error); + +static struct cb_id vss_id = { CN_VSS_IDX, CN_VSS_VAL }; +static const char vss_name[] = "vss_kernel_module"; +static __u8 *recv_buffer; + +static void vss_send_op(struct work_struct *dummy); +static DECLARE_WORK(vss_send_op_work, vss_send_op); + +/* + * Callback when data is received from user mode. + */ + +static void +vss_cn_callback(struct cn_msg *msg, struct netlink_skb_parms *nsp) +{ + struct hv_vss_msg *vss_msg; + + vss_msg = (struct hv_vss_msg *)msg->data; + + if (vss_msg->vss_hdr.operation == VSS_OP_REGISTER) { + pr_info("VSS daemon registered\n"); + vss_transaction.active = false; + if (vss_transaction.recv_channel != NULL) + hv_vss_onchannelcallback(vss_transaction.recv_channel); + return; + + } + vss_respond_to_host(vss_msg->error); +} + + +static void vss_send_op(struct work_struct *dummy) +{ + int op = vss_transaction.msg->vss_hdr.operation; + struct cn_msg *msg; + struct hv_vss_msg *vss_msg; + + msg = kzalloc(sizeof(*msg) + sizeof(*vss_msg), GFP_ATOMIC); + if (!msg) + return; + + vss_msg = (struct hv_vss_msg *)msg->data; + + msg->id.idx = CN_VSS_IDX; + msg->id.val = CN_VSS_VAL; + + vss_msg->vss_hdr.operation = op; + msg->len = sizeof(struct hv_vss_msg); + + cn_netlink_send(msg, 0, GFP_ATOMIC); + kfree(msg); + + return; +} + +/* + * Send a response back to the host. + */ + +static void +vss_respond_to_host(int error) +{ + struct icmsg_hdr *icmsghdrp; + u32 buf_len; + struct vmbus_channel *channel; + u64 req_id; + + /* + * If a transaction is not active; log and return. + */ + + if (!vss_transaction.active) { + /* + * This is a spurious call! + */ + pr_warn("VSS: Transaction not active\n"); + return; + } + /* + * Copy the global state for completing the transaction. Note that + * only one transaction can be active at a time. + */ + + buf_len = vss_transaction.recv_len; + channel = vss_transaction.recv_channel; + req_id = vss_transaction.recv_req_id; + vss_transaction.active = false; + + icmsghdrp = (struct icmsg_hdr *) + &recv_buffer[sizeof(struct vmbuspipe_hdr)]; + + if (channel->onchannel_callback == NULL) + /* + * We have raced with util driver being unloaded; + * silently return. + */ + return; + + icmsghdrp->status = error; + + icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION | ICMSGHDRFLAG_RESPONSE; + + vmbus_sendpacket(channel, recv_buffer, buf_len, req_id, + VM_PKT_DATA_INBAND, 0); + +} + +/* + * This callback is invoked when we get a VSS message from the host. + * The host ensures that only one VSS transaction can be active at a time. + */ + +void hv_vss_onchannelcallback(void *context) +{ + struct vmbus_channel *channel = context; + u32 recvlen; + u64 requestid; + struct hv_vss_msg *vss_msg; + + + struct icmsg_hdr *icmsghdrp; + struct icmsg_negotiate *negop = NULL; + + if (vss_transaction.active) { + /* + * We will defer processing this callback once + * the current transaction is complete. + */ + vss_transaction.recv_channel = channel; + return; + } + + vmbus_recvpacket(channel, recv_buffer, PAGE_SIZE * 2, &recvlen, + &requestid); + + if (recvlen > 0) { + icmsghdrp = (struct icmsg_hdr *)&recv_buffer[ + sizeof(struct vmbuspipe_hdr)]; + + if (icmsghdrp->icmsgtype == ICMSGTYPE_NEGOTIATE) { + vmbus_prep_negotiate_resp(icmsghdrp, negop, + recv_buffer, MAX_SRV_VER, MAX_SRV_VER); + /* + * We currently negotiate the highest number the + * host has presented. If this version is not + * atleast 5.0, reject. 
+ */ + negop = (struct icmsg_negotiate *)&recv_buffer[ + sizeof(struct vmbuspipe_hdr) + + sizeof(struct icmsg_hdr)]; + + if (negop->icversion_data[1].major < 5) + negop->icframe_vercnt = 0; + } else { + vss_msg = (struct hv_vss_msg *)&recv_buffer[ + sizeof(struct vmbuspipe_hdr) + + sizeof(struct icmsg_hdr)]; + + /* + * Stash away this global state for completing the + * transaction; note transactions are serialized. + */ + + vss_transaction.recv_len = recvlen; + vss_transaction.recv_channel = channel; + vss_transaction.recv_req_id = requestid; + vss_transaction.active = true; + vss_transaction.msg = (struct hv_vss_msg *)vss_msg; + + switch (vss_msg->vss_hdr.operation) { + /* + * Initiate a "freeze/thaw" + * operation in the guest. + * We respond to the host once + * the operation is complete. + * + * We send the message to the + * user space daemon and the + * operation is performed in + * the daemon. + */ + case VSS_OP_FREEZE: + case VSS_OP_THAW: + schedule_work(&vss_send_op_work); + return; + + case VSS_OP_HOT_BACKUP: + vss_msg->vss_cf.flags = + VSS_HBU_NO_AUTO_RECOVERY; + vss_respond_to_host(0); + return; + + case VSS_OP_GET_DM_INFO: + vss_msg->dm_info.flags = 0; + vss_respond_to_host(0); + return; + + default: + vss_respond_to_host(0); + return; + + } + + } + + icmsghdrp->icflags = ICMSGHDRFLAG_TRANSACTION + | ICMSGHDRFLAG_RESPONSE; + + vmbus_sendpacket(channel, recv_buffer, + recvlen, requestid, + VM_PKT_DATA_INBAND, 0); + } + +} + +int +hv_vss_init(struct hv_util_service *srv) +{ + int err; + + err = cn_add_callback(&vss_id, vss_name, vss_cn_callback); + if (err) + return err; + recv_buffer = srv->recv_buffer; + + /* + * When this driver loads, the user level daemon that + * processes the host requests may not yet be running. + * Defer processing channel callbacks until the daemon + * has registered. + */ + vss_transaction.active = true; + return 0; +} + +void hv_vss_deinit(void) +{ + cn_del_callback(&vss_id); + cancel_work_sync(&vss_send_op_work); +} diff --git a/drivers/hv/hv_util.c b/drivers/hv/hv_util.c index 1d4cbd8e826..2f561c5dfe2 100644 --- a/drivers/hv/hv_util.c +++ b/drivers/hv/hv_util.c @@ -49,6 +49,12 @@ static struct hv_util_service util_kvp = { .util_deinit = hv_kvp_deinit, }; +static struct hv_util_service util_vss = { + .util_cb = hv_vss_onchannelcallback, + .util_init = hv_vss_init, + .util_deinit = hv_vss_deinit, +}; + static void perform_shutdown(struct work_struct *dummy) { orderly_poweroff(true); @@ -339,6 +345,10 @@ static const struct hv_vmbus_device_id id_table[] = { { HV_KVP_GUID, .driver_data = (unsigned long)&util_kvp }, + /* VSS GUID */ + { HV_VSS_GUID, + .driver_data = (unsigned long)&util_vss + }, { }, }; diff --git a/drivers/hv/ring_buffer.c b/drivers/hv/ring_buffer.c index cafa72ffdc3..d6fbb5772b8 100644 --- a/drivers/hv/ring_buffer.c +++ b/drivers/hv/ring_buffer.c @@ -71,6 +71,7 @@ u32 hv_end_read(struct hv_ring_buffer_info *rbi) static bool hv_need_to_signal(u32 old_write, struct hv_ring_buffer_info *rbi) { + smp_mb(); if (rbi->ring_buffer->interrupt_mask) return false; |
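For readers following the hot-add path added to hv_balloon.c above: when the host does not specify a hot-add region, hot_add_req() rounds the requested page count up to a whole number of HA_CHUNK blocks (32 * 1024 pfns, i.e. 128 MB of 4K pages) and aligns the region start down to a HA_CHUNK boundary before handing the region to process_hot_add(). Below is a minimal user-space sketch of just that arithmetic; the helper names and sample numbers are illustrative and not part of the patch.

```c
#include <stdio.h>

/* 32K pfns of 4K pages == 128 MB, mirroring HA_CHUNK in the patch. */
#define HA_CHUNK (32 * 1024UL)

/* Round a page count up to a whole number of HA_CHUNK blocks. */
static unsigned long chunk_round_up(unsigned long pfn_cnt)
{
	unsigned long size = (pfn_cnt / HA_CHUNK) * HA_CHUNK;

	if (pfn_cnt % HA_CHUNK)
		size += HA_CHUNK;
	return size;
}

/* Align a start pfn down to a HA_CHUNK boundary. */
static unsigned long chunk_align_down(unsigned long pg_start)
{
	return (pg_start / HA_CHUNK) * HA_CHUNK;
}

int main(void)
{
	/* Hypothetical request: 200000 pages starting at pfn 1050000. */
	unsigned long pg_start = 1050000, pfn_cnt = 200000;

	printf("region start: %lu, region size: %lu pfns\n",
	       chunk_align_down(pg_start), chunk_round_up(pfn_cnt));
	return 0;
}
```

hv_mem_hot_add() then feeds this rounded region to add_memory() one HA_CHUNK at a time, waiting up to five seconds for each block to come online before adding the next.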
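The reworked balloon_up() takes a similar opportunistic approach to allocation: it first tries 2 MB chunks (alloc_unit = 512 pages), falls back to single 4K pages when a 2 MB allocation fails, and splits successful 2 MB chunks with split_page() so the pages can later be freed in any order. The following is a rough user-space model of that fallback loop only, with a hypothetical try_alloc() standing in for the kernel's page allocator; it is a sketch of the control flow, not the driver code.

```c
#include <stdio.h>

/* Stand-in for the kernel allocator: pretend multi-page (2 MB)
 * allocations start failing once memory becomes fragmented. */
static int try_alloc(unsigned int pages)
{
	static int attempts;

	if (pages > 1 && attempts++ > 2)
		return 0;		/* 2 MB allocation failed */
	return 1;			/* allocation succeeded */
}

int main(void)
{
	const unsigned long request = 2048;	/* pages the host asked for */
	unsigned long remaining = request;
	unsigned long ballooned = 0;
	unsigned int alloc_unit = 512;		/* start with 2 MB chunks */

	while (remaining > 0) {
		if (remaining < alloc_unit)
			alloc_unit = 1;

		if (!try_alloc(alloc_unit)) {
			if (alloc_unit != 1) {
				/* 2 MB failed: retry the same request with
				 * 4K pages, as balloon_up() does. */
				alloc_unit = 1;
				continue;
			}
			break;	/* even 4K pages failed: report what we have */
		}
		ballooned += alloc_unit;
		remaining -= alloc_unit;
	}
	printf("ballooned %lu of %lu pages\n", ballooned, request);
	return 0;
}
```

In the patch itself the 2 MB case additionally calls split_page(pg, get_order(alloc_unit << PAGE_SHIFT)), which is what lets free_balloon_pages() release the constituent 4K pages individually, in whatever order they come back.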