diff options
Diffstat (limited to 'drivers/xen')
25 files changed, 798 insertions, 267 deletions
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 03bc471c3ee..8795480c235 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -11,7 +11,7 @@ config XEN_BALLOON config XEN_SELFBALLOONING bool "Dynamically self-balloon kernel memory to target" - depends on XEN && XEN_BALLOON && CLEANCACHE && SWAP + depends on XEN && XEN_BALLOON && CLEANCACHE && SWAP && XEN_TMEM default n help Self-ballooning dynamically balloons available kernel memory driven @@ -26,6 +26,36 @@ config XEN_SELFBALLOONING kernel boot parameter. Note that systems without a sufficiently large swap device should not enable self-ballooning. +config XEN_BALLOON_MEMORY_HOTPLUG + bool "Memory hotplug support for Xen balloon driver" + default n + depends on XEN_BALLOON && MEMORY_HOTPLUG + help + Memory hotplug support for Xen balloon driver allows expanding memory + available for the system above limit declared at system startup. + It is very useful on critical systems which require long + run without rebooting. + + Memory could be hotplugged in following steps: + + 1) dom0: xl mem-max <domU> <maxmem> + where <maxmem> is >= requested memory size, + + 2) dom0: xl mem-set <domU> <memory> + where <memory> is requested memory size; alternatively memory + could be added by writing proper value to + /sys/devices/system/xen_memory/xen_memory0/target or + /sys/devices/system/xen_memory/xen_memory0/target_kb on domU, + + 3) domU: for i in /sys/devices/system/memory/memory*/state; do \ + [ "`cat "$i"`" = offline ] && echo online > "$i"; done + + Memory could be onlined automatically on domU by adding following line to udev rules: + + SUBSYSTEM=="memory", ACTION=="add", RUN+="/bin/sh -c '[ -f /sys$devpath/state ] && echo online > /sys$devpath/state'" + + In that case step 3 should be omitted. + config XEN_SCRUB_PAGES bool "Scrub pages before returning them to system" depends on XEN_BALLOON @@ -107,16 +137,6 @@ config XEN_GRANT_DEV_ALLOC to other domains. 
This can be used to implement frontend drivers or as part of an inter-domain shared memory channel. -config XEN_PLATFORM_PCI - tristate "xen platform pci device driver" - depends on XEN_PVHVM && PCI - default m - help - Driver for the Xen PCI Platform device: it is responsible for - initializing xenbus and grant_table when running in a Xen HVM - domain. As a consequence this driver is required to run any Xen PV - frontend on Xen HVM. - config SWIOTLB_XEN def_bool y depends on PCI diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index 72bbb27d7a6..974fffdf22b 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -14,7 +14,7 @@ obj-$(CONFIG_XEN_GNTDEV) += xen-gntdev.o obj-$(CONFIG_XEN_GRANT_DEV_ALLOC) += xen-gntalloc.o obj-$(CONFIG_XENFS) += xenfs/ obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o -obj-$(CONFIG_XEN_PLATFORM_PCI) += xen-platform-pci.o +obj-$(CONFIG_XEN_PVHVM) += platform-pci.o obj-$(CONFIG_XEN_TMEM) += tmem.o obj-$(CONFIG_SWIOTLB_XEN) += swiotlb-xen.o obj-$(CONFIG_XEN_DOM0) += pci.o @@ -23,5 +23,3 @@ obj-$(CONFIG_XEN_PCIDEV_BACKEND) += xen-pciback/ xen-evtchn-y := evtchn.o xen-gntdev-y := gntdev.o xen-gntalloc-y := gntalloc.o - -xen-platform-pci-y := platform-pci.o diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index 61c0ee7aa7d..a767884a6c7 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -4,6 +4,12 @@ * Copyright (c) 2003, B Dragovic * Copyright (c) 2003-2004, M Williamson, K Fraser * Copyright (c) 2005 Dan M. Smith, IBM Corporation + * Copyright (c) 2010 Daniel Kiper + * + * Memory hotplug support was written by Daniel Kiper. Work on + * it was sponsored by Google under Google Summer of Code 2010 + * program. Jeremy Fitzhardinge from Citrix was the mentor for + * this project. 
* * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License version 2 @@ -33,6 +39,7 @@ #include <linux/kernel.h> #include <linux/sched.h> #include <linux/errno.h> +#include <linux/module.h> #include <linux/mm.h> #include <linux/bootmem.h> #include <linux/pagemap.h> @@ -40,6 +47,9 @@ #include <linux/mutex.h> #include <linux/list.h> #include <linux/gfp.h> +#include <linux/notifier.h> +#include <linux/memory.h> +#include <linux/memory_hotplug.h> #include <asm/page.h> #include <asm/pgalloc.h> @@ -193,6 +203,87 @@ static enum bp_state update_schedule(enum bp_state state) return BP_EAGAIN; } +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG +static long current_credit(void) +{ + return balloon_stats.target_pages - balloon_stats.current_pages - + balloon_stats.hotplug_pages; +} + +static bool balloon_is_inflated(void) +{ + if (balloon_stats.balloon_low || balloon_stats.balloon_high || + balloon_stats.balloon_hotplug) + return true; + else + return false; +} + +/* + * reserve_additional_memory() adds memory region of size >= credit above + * max_pfn. New region is section aligned and size is modified to be multiple + * of section size. Those features allow optimal use of address space and + * establish proper alignment when this function is called first time after + * boot (last section not fully populated at boot time contains unused memory + * pages with PG_reserved bit not set; online_pages_range() does not allow page + * onlining in whole range if first onlined page does not have PG_reserved + * bit set). Real size of added memory is established at page onlining stage. 
+ */ + +static enum bp_state reserve_additional_memory(long credit) +{ + int nid, rc; + u64 hotplug_start_paddr; + unsigned long balloon_hotplug = credit; + + hotplug_start_paddr = PFN_PHYS(SECTION_ALIGN_UP(max_pfn)); + balloon_hotplug = round_up(balloon_hotplug, PAGES_PER_SECTION); + nid = memory_add_physaddr_to_nid(hotplug_start_paddr); + + rc = add_memory(nid, hotplug_start_paddr, balloon_hotplug << PAGE_SHIFT); + + if (rc) { + pr_info("xen_balloon: %s: add_memory() failed: %i\n", __func__, rc); + return BP_EAGAIN; + } + + balloon_hotplug -= credit; + + balloon_stats.hotplug_pages += credit; + balloon_stats.balloon_hotplug = balloon_hotplug; + + return BP_DONE; +} + +static void xen_online_page(struct page *page) +{ + __online_page_set_limits(page); + + mutex_lock(&balloon_mutex); + + __balloon_append(page); + + if (balloon_stats.hotplug_pages) + --balloon_stats.hotplug_pages; + else + --balloon_stats.balloon_hotplug; + + mutex_unlock(&balloon_mutex); +} + +static int xen_memory_notifier(struct notifier_block *nb, unsigned long val, void *v) +{ + if (val == MEM_ONLINE) + schedule_delayed_work(&balloon_worker, 0); + + return NOTIFY_OK; +} + +static struct notifier_block xen_memory_nb = { + .notifier_call = xen_memory_notifier, + .priority = 0 +}; +#else static long current_credit(void) { unsigned long target = balloon_stats.target_pages; @@ -205,6 +296,21 @@ static long current_credit(void) return target - balloon_stats.current_pages; } +static bool balloon_is_inflated(void) +{ + if (balloon_stats.balloon_low || balloon_stats.balloon_high) + return true; + else + return false; +} + +static enum bp_state reserve_additional_memory(long credit) +{ + balloon_stats.target_pages = balloon_stats.current_pages; + return BP_DONE; +} +#endif /* CONFIG_XEN_BALLOON_MEMORY_HOTPLUG */ + static enum bp_state increase_reservation(unsigned long nr_pages) { int rc; @@ -216,6 +322,15 @@ static enum bp_state increase_reservation(unsigned long nr_pages) .domid = DOMID_SELF }; +#ifdef 
CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + if (!balloon_stats.balloon_low && !balloon_stats.balloon_high) { + nr_pages = min(nr_pages, balloon_stats.balloon_hotplug); + balloon_stats.hotplug_pages += nr_pages; + balloon_stats.balloon_hotplug -= nr_pages; + return BP_DONE; + } +#endif + if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); @@ -278,6 +393,15 @@ static enum bp_state decrease_reservation(unsigned long nr_pages, gfp_t gfp) .domid = DOMID_SELF }; +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + if (balloon_stats.hotplug_pages) { + nr_pages = min(nr_pages, balloon_stats.hotplug_pages); + balloon_stats.hotplug_pages -= nr_pages; + balloon_stats.balloon_hotplug += nr_pages; + return BP_DONE; + } +#endif + if (nr_pages > ARRAY_SIZE(frame_list)) nr_pages = ARRAY_SIZE(frame_list); @@ -339,8 +463,12 @@ static void balloon_process(struct work_struct *work) do { credit = current_credit(); - if (credit > 0) - state = increase_reservation(credit); + if (credit > 0) { + if (balloon_is_inflated()) + state = increase_reservation(credit); + else + state = reserve_additional_memory(credit); + } if (credit < 0) state = decrease_reservation(-credit, GFP_BALLOON); @@ -373,20 +501,24 @@ EXPORT_SYMBOL_GPL(balloon_set_new_target); * alloc_xenballooned_pages - get pages that have been ballooned out * @nr_pages: Number of pages to get * @pages: pages returned + * @highmem: highmem or lowmem pages * @return 0 on success, error otherwise */ -int alloc_xenballooned_pages(int nr_pages, struct page **pages) +int alloc_xenballooned_pages(int nr_pages, struct page **pages, bool highmem) { int pgno = 0; struct page *page; mutex_lock(&balloon_mutex); while (pgno < nr_pages) { - page = balloon_retrieve(true); - if (page) { + page = balloon_retrieve(highmem); + if (page && PageHighMem(page) == highmem) { pages[pgno++] = page; } else { enum bp_state st; - st = decrease_reservation(nr_pages - pgno, GFP_HIGHUSER); + if (page) + balloon_append(page); + st = 
decrease_reservation(nr_pages - pgno, + highmem ? GFP_HIGHUSER : GFP_USER); if (st != BP_DONE) goto out_undo; } @@ -427,17 +559,40 @@ void free_xenballooned_pages(int nr_pages, struct page **pages) } EXPORT_SYMBOL(free_xenballooned_pages); -static int __init balloon_init(void) +static void __init balloon_add_region(unsigned long start_pfn, + unsigned long pages) { unsigned long pfn, extra_pfn_end; struct page *page; + /* + * If the amount of usable memory has been limited (e.g., with + * the 'mem' command line parameter), don't add pages beyond + * this limit. + */ + extra_pfn_end = min(max_pfn, start_pfn + pages); + + for (pfn = start_pfn; pfn < extra_pfn_end; pfn++) { + page = pfn_to_page(pfn); + /* totalram_pages and totalhigh_pages do not + include the boot-time balloon extension, so + don't subtract from it. */ + __balloon_append(page); + } +} + +static int __init balloon_init(void) +{ + int i; + if (!xen_domain()) return -ENODEV; pr_info("xen/balloon: Initialising balloon driver.\n"); - balloon_stats.current_pages = xen_pv_domain() ? min(xen_start_info->nr_pages, max_pfn) : max_pfn; + balloon_stats.current_pages = xen_pv_domain() + ? min(xen_start_info->nr_pages - xen_released_pages, max_pfn) + : max_pfn; balloon_stats.target_pages = balloon_stats.current_pages; balloon_stats.balloon_low = 0; balloon_stats.balloon_high = 0; @@ -447,25 +602,22 @@ static int __init balloon_init(void) balloon_stats.retry_count = 1; balloon_stats.max_retry_count = RETRY_UNLIMITED; +#ifdef CONFIG_XEN_BALLOON_MEMORY_HOTPLUG + balloon_stats.hotplug_pages = 0; + balloon_stats.balloon_hotplug = 0; + + set_online_page_callback(&xen_online_page); + register_memory_notifier(&xen_memory_nb); +#endif + /* - * Initialise the balloon with excess memory space. We need - * to make sure we don't add memory which doesn't exist or - * logically exist. The E820 map can be trimmed to be smaller - * than the amount of physical memory due to the mem= command - * line parameter. 
And if this is a 32-bit non-HIGHMEM kernel - * on a system with memory which requires highmem to access, - * don't try to use it. + * Initialize the balloon with pages from the extra memory + * regions (see arch/x86/xen/setup.c). */ - extra_pfn_end = min(min(max_pfn, e820_end_of_ram_pfn()), - (unsigned long)PFN_DOWN(xen_extra_mem_start + xen_extra_mem_size)); - for (pfn = PFN_UP(xen_extra_mem_start); - pfn < extra_pfn_end; - pfn++) { - page = pfn_to_page(pfn); - /* totalram_pages and totalhigh_pages do not include the boot-time - balloon extension, so don't subtract from it. */ - __balloon_append(page); - } + for (i = 0; i < XEN_EXTRA_MEM_MAX_REGIONS; i++) + if (xen_extra_mem[i].size) + balloon_add_region(PFN_UP(xen_extra_mem[i].start), + PFN_DOWN(xen_extra_mem[i].size)); return 0; } diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 8876ffd0877..6e075cdd0c6 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -54,7 +54,7 @@ * This lock protects updates to the following mapping and reference-count * arrays. The lock does not need to be acquired to read the mapping tables. 
*/ -static DEFINE_SPINLOCK(irq_mapping_update_lock); +static DEFINE_MUTEX(irq_mapping_update_lock); static LIST_HEAD(xen_irq_list_head); @@ -431,7 +431,8 @@ static int __must_check xen_allocate_irq_dynamic(void) irq = irq_alloc_desc_from(first, -1); - xen_irq_init(irq); + if (irq >= 0) + xen_irq_init(irq); return irq; } @@ -630,7 +631,7 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, int irq = -1; struct physdev_irq irq_op; - spin_lock(&irq_mapping_update_lock); + mutex_lock(&irq_mapping_update_lock); irq = find_irq_by_gsi(gsi); if (irq != -1) { @@ -683,7 +684,7 @@ int xen_bind_pirq_gsi_to_irq(unsigned gsi, handle_edge_irq, name); out: - spin_unlock(&irq_mapping_update_lock); + mutex_unlock(&irq_mapping_update_lock); return irq; } @@ -709,10 +710,10 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, { int irq, ret; - spin_lock(&irq_mapping_update_lock); + mutex_lock(&irq_mapping_update_lock); irq = xen_allocate_irq_dynamic(); - if (irq == -1) + if (irq < 0) goto out; irq_set_chip_and_handler_name(irq, &xen_pirq_chip, handle_edge_irq, @@ -723,12 +724,12 @@ int xen_bind_pirq_msi_to_irq(struct pci_dev *dev, struct msi_desc *msidesc, if (ret < 0) goto error_irq; out: - spin_unlock(&irq_mapping_update_lock); + mutex_unlock(&irq_mapping_update_lock); return irq; error_irq: - spin_unlock(&irq_mapping_update_lock); + mutex_unlock(&irq_mapping_update_lock); xen_free_irq(irq); - return -1; + return ret; } #endif @@ -739,7 +740,7 @@ int xen_destroy_irq(int irq) struct irq_info *info = info_for_irq(irq); int rc = -ENOENT; - spin_lock(&irq_mapping_update_lock); + mutex_lock(&irq_mapping_update_lock); desc = irq_to_desc(irq); if (!desc) @@ -765,7 +766,7 @@ int xen_destroy_irq(int irq) xen_free_irq(irq); out: - spin_unlock(&irq_mapping_update_lock); + mutex_unlock(&irq_mapping_update_lock); return rc; } @@ -775,10 +776,10 @@ int xen_irq_from_pirq(unsigned pirq) struct irq_info *info; - spin_lock(&irq_mapping_update_lock); + 
mutex_lock(&irq_mapping_update_lock); list_for_each_entry(info, &xen_irq_list_head, list) { - if (info == NULL || info->type != IRQT_PIRQ) + if (info->type != IRQT_PIRQ) continue; irq = info->irq; if (info->u.pirq.pirq == pirq) @@ -786,7 +787,7 @@ int xen_irq_from_pirq(unsigned pirq) } irq = -1; out: - spin_unlock(&irq_mapping_update_lock); + mutex_unlock(&irq_mapping_update_lock); return irq; } @@ -801,7 +802,7 @@ int bind_evtchn_to_irq(unsigned int evtchn) { int irq; - spin_lock(&irq_mapping_update_lock); + mutex_lock(&irq_mapping_update_lock); irq = evtchn_to_irq[evtchn]; @@ -817,7 +818,7 @@ int bind_evtchn_to_irq(unsigned int evtchn) } out: - spin_unlock(&irq_mapping_update_lock); + mutex_unlock(&irq_mapping_update_lock); return irq; } @@ -828,7 +829,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) struct evtchn_bind_ipi bind_ipi; int evtchn, irq; - spin_lock(&irq_mapping_update_lock); + mutex_lock(&irq_mapping_update_lock); irq = per_cpu(ipi_to_irq, cpu)[ipi]; @@ -852,7 +853,7 @@ static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) } out: - spin_unlock(&irq_mapping_update_lock); + mutex_unlock(&irq_mapping_update_lock); return irq; } @@ -871,13 +872,34 @@ static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, return err ? 
: bind_evtchn_to_irq(bind_interdomain.local_port); } +static int find_virq(unsigned int virq, unsigned int cpu) +{ + struct evtchn_status status; + int port, rc = -ENOENT; + + memset(&status, 0, sizeof(status)); + for (port = 0; port <= NR_EVENT_CHANNELS; port++) { + status.dom = DOMID_SELF; + status.port = port; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_status, &status); + if (rc < 0) + continue; + if (status.status != EVTCHNSTAT_virq) + continue; + if (status.u.virq == virq && status.vcpu == cpu) { + rc = port; + break; + } + } + return rc; +} int bind_virq_to_irq(unsigned int virq, unsigned int cpu) { struct evtchn_bind_virq bind_virq; - int evtchn, irq; + int evtchn, irq, ret; - spin_lock(&irq_mapping_update_lock); + mutex_lock(&irq_mapping_update_lock); irq = per_cpu(virq_to_irq, cpu)[virq]; @@ -891,10 +913,16 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu) bind_virq.virq = virq; bind_virq.vcpu = cpu; - if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, - &bind_virq) != 0) - BUG(); - evtchn = bind_virq.port; + ret = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq); + if (ret == 0) + evtchn = bind_virq.port; + else { + if (ret == -EEXIST) + ret = find_virq(virq, cpu); + BUG_ON(ret < 0); + evtchn = ret; + } xen_irq_info_virq_init(cpu, irq, evtchn, virq); @@ -902,7 +930,7 @@ int bind_virq_to_irq(unsigned int virq, unsigned int cpu) } out: - spin_unlock(&irq_mapping_update_lock); + mutex_unlock(&irq_mapping_update_lock); return irq; } @@ -912,7 +940,7 @@ static void unbind_from_irq(unsigned int irq) struct evtchn_close close; int evtchn = evtchn_from_irq(irq); - spin_lock(&irq_mapping_update_lock); + mutex_lock(&irq_mapping_update_lock); if (VALID_EVTCHN(evtchn)) { close.port = evtchn; @@ -942,7 +970,7 @@ static void unbind_from_irq(unsigned int irq) xen_free_irq(irq); - spin_unlock(&irq_mapping_update_lock); + mutex_unlock(&irq_mapping_update_lock); } int bind_evtchn_to_irqhandler(unsigned int evtchn, @@ -1020,7 +1048,7 @@ int 
bind_ipi_to_irqhandler(enum ipi_vector ipi, if (irq < 0) return irq; - irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME; + irqflags |= IRQF_NO_SUSPEND | IRQF_FORCE_RESUME | IRQF_EARLY_RESUME; retval = request_irq(irq, handler, irqflags, devname, dev_id); if (retval != 0) { unbind_from_irq(irq); @@ -1278,7 +1306,7 @@ void rebind_evtchn_irq(int evtchn, int irq) will also be masked. */ disable_irq(irq); - spin_lock(&irq_mapping_update_lock); + mutex_lock(&irq_mapping_update_lock); /* After resume the irq<->evtchn mappings are all cleared out */ BUG_ON(evtchn_to_irq[evtchn] != -1); @@ -1288,7 +1316,7 @@ void rebind_evtchn_irq(int evtchn, int irq) xen_irq_info_evtchn_init(irq, evtchn); - spin_unlock(&irq_mapping_update_lock); + mutex_unlock(&irq_mapping_update_lock); /* new event channels are always bound to cpu 0 */ irq_set_affinity(irq, cpumask_of(0)); @@ -1669,6 +1697,7 @@ void __init xen_init_IRQ(void) evtchn_to_irq = kcalloc(NR_EVENT_CHANNELS, sizeof(*evtchn_to_irq), GFP_KERNEL); + BUG_ON(!evtchn_to_irq); for (i = 0; i < NR_EVENT_CHANNELS; i++) evtchn_to_irq[i] = -1; diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index 772a5b8bbf2..39871326afa 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -83,6 +83,7 @@ struct grant_map { struct ioctl_gntdev_grant_ref *grants; struct gnttab_map_grant_ref *map_ops; struct gnttab_unmap_grant_ref *unmap_ops; + struct gnttab_map_grant_ref *kmap_ops; struct page **pages; }; @@ -116,19 +117,22 @@ static struct grant_map *gntdev_alloc_map(struct gntdev_priv *priv, int count) add->grants = kzalloc(sizeof(add->grants[0]) * count, GFP_KERNEL); add->map_ops = kzalloc(sizeof(add->map_ops[0]) * count, GFP_KERNEL); add->unmap_ops = kzalloc(sizeof(add->unmap_ops[0]) * count, GFP_KERNEL); + add->kmap_ops = kzalloc(sizeof(add->kmap_ops[0]) * count, GFP_KERNEL); add->pages = kzalloc(sizeof(add->pages[0]) * count, GFP_KERNEL); if (NULL == add->grants || NULL == add->map_ops || NULL == add->unmap_ops || + NULL == 
add->kmap_ops || NULL == add->pages) goto err; - if (alloc_xenballooned_pages(count, add->pages)) + if (alloc_xenballooned_pages(count, add->pages, false /* lowmem */)) goto err; for (i = 0; i < count; i++) { add->map_ops[i].handle = -1; add->unmap_ops[i].handle = -1; + add->kmap_ops[i].handle = -1; } add->index = 0; @@ -142,6 +146,7 @@ err: kfree(add->grants); kfree(add->map_ops); kfree(add->unmap_ops); + kfree(add->kmap_ops); kfree(add); return NULL; } @@ -242,10 +247,35 @@ static int map_grant_pages(struct grant_map *map) gnttab_set_unmap_op(&map->unmap_ops[i], addr, map->flags, -1 /* handle */); } + } else { + /* + * Setup the map_ops corresponding to the pte entries pointing + * to the kernel linear addresses of the struct pages. + * These ptes are completely different from the user ptes dealt + * with find_grant_ptes. + */ + for (i = 0; i < map->count; i++) { + unsigned level; + unsigned long address = (unsigned long) + pfn_to_kaddr(page_to_pfn(map->pages[i])); + pte_t *ptep; + u64 pte_maddr = 0; + BUG_ON(PageHighMem(map->pages[i])); + + ptep = lookup_address(address, &level); + pte_maddr = arbitrary_virt_to_machine(ptep).maddr; + gnttab_set_map_op(&map->kmap_ops[i], pte_maddr, + map->flags | + GNTMAP_host_map | + GNTMAP_contains_pte, + map->grants[i].ref, + map->grants[i].domid); + } } pr_debug("map %d+%d\n", map->index, map->count); - err = gnttab_map_refs(map->map_ops, map->pages, map->count); + err = gnttab_map_refs(map->map_ops, use_ptemod ? 
map->kmap_ops : NULL, + map->pages, map->count); if (err) return err; @@ -461,13 +491,11 @@ static int gntdev_release(struct inode *inode, struct file *flip) pr_debug("priv %p\n", priv); - spin_lock(&priv->lock); while (!list_empty(&priv->maps)) { map = list_entry(priv->maps.next, struct grant_map, next); list_del(&map->next); gntdev_put_map(map); } - spin_unlock(&priv->lock); if (use_ptemod) mmu_notifier_unregister(&priv->mn, priv->mm); @@ -531,10 +559,11 @@ static long gntdev_ioctl_unmap_grant_ref(struct gntdev_priv *priv, map = gntdev_find_map_index(priv, op.index >> PAGE_SHIFT, op.count); if (map) { list_del(&map->next); - gntdev_put_map(map); err = 0; } spin_unlock(&priv->lock); + if (map) + gntdev_put_map(map); return err; } diff --git a/drivers/xen/grant-table.c b/drivers/xen/grant-table.c index 3a3dceb7063..bf1c094f4eb 100644 --- a/drivers/xen/grant-table.c +++ b/drivers/xen/grant-table.c @@ -82,7 +82,7 @@ static inline grant_ref_t *__gnttab_entry(grant_ref_t entry) static int get_free_entries(unsigned count) { unsigned long flags; - int ref, rc; + int ref, rc = 0; grant_ref_t head; spin_lock_irqsave(&gnttab_list_lock, flags); @@ -448,7 +448,8 @@ unsigned int gnttab_max_grant_frames(void) EXPORT_SYMBOL_GPL(gnttab_max_grant_frames); int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, - struct page **pages, unsigned int count) + struct gnttab_map_grant_ref *kmap_ops, + struct page **pages, unsigned int count) { int i, ret; pte_t *pte; @@ -488,8 +489,7 @@ int gnttab_map_refs(struct gnttab_map_grant_ref *map_ops, */ return -EOPNOTSUPP; } - ret = m2p_add_override(mfn, pages[i], - map_ops[i].flags & GNTMAP_contains_pte); + ret = m2p_add_override(mfn, pages[i], &kmap_ops[i]); if (ret) return ret; } diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 0b5366b5be2..ce4fa083186 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -9,6 +9,7 @@ #include <linux/stop_machine.h> #include <linux/freezer.h> #include <linux/syscore_ops.h> +#include 
<linux/export.h> #include <xen/xen.h> #include <xen/xenbus.h> diff --git a/drivers/xen/pci.c b/drivers/xen/pci.c index c4448ee5595..b84bf0b6cc3 100644 --- a/drivers/xen/pci.c +++ b/drivers/xen/pci.c @@ -18,6 +18,7 @@ */ #include <linux/pci.h> +#include <linux/acpi.h> #include <xen/xen.h> #include <xen/interface/physdev.h> #include <xen/interface/xen.h> @@ -26,26 +27,85 @@ #include <asm/xen/hypercall.h> #include "../pci/pci.h" +static bool __read_mostly pci_seg_supported = true; + static int xen_add_device(struct device *dev) { int r; struct pci_dev *pci_dev = to_pci_dev(dev); +#ifdef CONFIG_PCI_IOV + struct pci_dev *physfn = pci_dev->physfn; +#endif + + if (pci_seg_supported) { + struct physdev_pci_device_add add = { + .seg = pci_domain_nr(pci_dev->bus), + .bus = pci_dev->bus->number, + .devfn = pci_dev->devfn + }; +#ifdef CONFIG_ACPI + acpi_handle handle; +#endif + +#ifdef CONFIG_PCI_IOV + if (pci_dev->is_virtfn) { + add.flags = XEN_PCI_DEV_VIRTFN; + add.physfn.bus = physfn->bus->number; + add.physfn.devfn = physfn->devfn; + } else +#endif + if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) + add.flags = XEN_PCI_DEV_EXTFN; + +#ifdef CONFIG_ACPI + handle = DEVICE_ACPI_HANDLE(&pci_dev->dev); + if (!handle) + handle = DEVICE_ACPI_HANDLE(pci_dev->bus->bridge); +#ifdef CONFIG_PCI_IOV + if (!handle && pci_dev->is_virtfn) + handle = DEVICE_ACPI_HANDLE(physfn->bus->bridge); +#endif + if (handle) { + acpi_status status; + + do { + unsigned long long pxm; + + status = acpi_evaluate_integer(handle, "_PXM", + NULL, &pxm); + if (ACPI_SUCCESS(status)) { + add.optarr[0] = pxm; + add.flags |= XEN_PCI_DEV_PXM; + break; + } + status = acpi_get_parent(handle, &handle); + } while (ACPI_SUCCESS(status)); + } +#endif /* CONFIG_ACPI */ + + r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_add, &add); + if (r != -ENOSYS) + return r; + pci_seg_supported = false; + } + if (pci_domain_nr(pci_dev->bus)) + r = -ENOSYS; #ifdef CONFIG_PCI_IOV - if (pci_dev->is_virtfn) { + else if 
(pci_dev->is_virtfn) { struct physdev_manage_pci_ext manage_pci_ext = { .bus = pci_dev->bus->number, .devfn = pci_dev->devfn, - .is_virtfn = 1, - .physfn.bus = pci_dev->physfn->bus->number, - .physfn.devfn = pci_dev->physfn->devfn, + .is_virtfn = 1, + .physfn.bus = physfn->bus->number, + .physfn.devfn = physfn->devfn, }; r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_add_ext, &manage_pci_ext); - } else + } #endif - if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) { + else if (pci_ari_enabled(pci_dev->bus) && PCI_SLOT(pci_dev->devfn)) { struct physdev_manage_pci_ext manage_pci_ext = { .bus = pci_dev->bus->number, .devfn = pci_dev->devfn, @@ -71,13 +131,27 @@ static int xen_remove_device(struct device *dev) { int r; struct pci_dev *pci_dev = to_pci_dev(dev); - struct physdev_manage_pci manage_pci; - manage_pci.bus = pci_dev->bus->number; - manage_pci.devfn = pci_dev->devfn; + if (pci_seg_supported) { + struct physdev_pci_device device = { + .seg = pci_domain_nr(pci_dev->bus), + .bus = pci_dev->bus->number, + .devfn = pci_dev->devfn + }; - r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove, - &manage_pci); + r = HYPERVISOR_physdev_op(PHYSDEVOP_pci_device_remove, + &device); + } else if (pci_domain_nr(pci_dev->bus)) + r = -ENOSYS; + else { + struct physdev_manage_pci manage_pci = { + .bus = pci_dev->bus->number, + .devfn = pci_dev->devfn + }; + + r = HYPERVISOR_physdev_op(PHYSDEVOP_manage_pci_remove, + &manage_pci); + } return r; } @@ -96,13 +170,16 @@ static int xen_pci_notifier(struct notifier_block *nb, r = xen_remove_device(dev); break; default: - break; + return NOTIFY_DONE; } - - return r; + if (r) + dev_err(dev, "Failed to %s - passthrough or MSI/MSI-X might fail!\n", + action == BUS_NOTIFY_ADD_DEVICE ? "add" : + (action == BUS_NOTIFY_DEL_DEVICE ? 
"delete" : "?")); + return NOTIFY_OK; } -struct notifier_block device_nb = { +static struct notifier_block device_nb = { .notifier_call = xen_pci_notifier, }; diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 6e8c15a2320..8e964b91c44 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -35,9 +35,11 @@ #include <linux/bootmem.h> #include <linux/dma-mapping.h> +#include <linux/export.h> #include <xen/swiotlb-xen.h> #include <xen/page.h> #include <xen/xen-ops.h> +#include <xen/hvc-console.h> /* * Used to do a quick range check in swiotlb_tbl_unmap_single and * swiotlb_tbl_sync_single_*, to see if the memory was in fact allocated by this @@ -146,8 +148,10 @@ xen_swiotlb_fixup(void *buf, size_t size, unsigned long nslabs) void __init xen_swiotlb_init(int verbose) { unsigned long bytes; - int rc; + int rc = -ENOMEM; unsigned long nr_tbl; + char *m = NULL; + unsigned int repeat = 3; nr_tbl = swioltb_nr_tbl(); if (nr_tbl) @@ -156,16 +160,17 @@ void __init xen_swiotlb_init(int verbose) xen_io_tlb_nslabs = (64 * 1024 * 1024 >> IO_TLB_SHIFT); xen_io_tlb_nslabs = ALIGN(xen_io_tlb_nslabs, IO_TLB_SEGSIZE); } - +retry: bytes = xen_io_tlb_nslabs << IO_TLB_SHIFT; /* * Get IO TLB memory from any location. */ xen_io_tlb_start = alloc_bootmem(bytes); - if (!xen_io_tlb_start) - panic("Cannot allocate SWIOTLB buffer"); - + if (!xen_io_tlb_start) { + m = "Cannot allocate Xen-SWIOTLB buffer!\n"; + goto error; + } xen_io_tlb_end = xen_io_tlb_start + bytes; /* * And replace that memory with pages under 4GB. 
@@ -173,17 +178,28 @@ void __init xen_swiotlb_init(int verbose) rc = xen_swiotlb_fixup(xen_io_tlb_start, bytes, xen_io_tlb_nslabs); - if (rc) + if (rc) { + free_bootmem(__pa(xen_io_tlb_start), bytes); + m = "Failed to get contiguous memory for DMA from Xen!\n"\ + "You either: don't have the permissions, do not have"\ + " enough free memory under 4GB, or the hypervisor memory"\ + "is too fragmented!"; goto error; - + } start_dma_addr = xen_virt_to_bus(xen_io_tlb_start); swiotlb_init_with_tbl(xen_io_tlb_start, xen_io_tlb_nslabs, verbose); return; error: - panic("DMA(%d): Failed to exchange pages allocated for DMA with Xen! "\ - "We either don't have the permission or you do not have enough"\ - "free memory under 4GB!\n", rc); + if (repeat--) { + xen_io_tlb_nslabs = max(1024UL, /* Min is 2MB */ + (xen_io_tlb_nslabs >> 1)); + printk(KERN_INFO "Xen-SWIOTLB: Lowering to %luMB\n", + (xen_io_tlb_nslabs << IO_TLB_SHIFT) >> 20); + goto retry; + } + xen_raw_printk("%s (rc:%d)", m, rc); + panic("%s (rc:%d)", m, rc); } void * @@ -194,6 +210,8 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, int order = get_order(size); u64 dma_mask = DMA_BIT_MASK(32); unsigned long vstart; + phys_addr_t phys; + dma_addr_t dev_addr; /* * Ignore region specifiers - the kernel's ideas of @@ -209,18 +227,26 @@ xen_swiotlb_alloc_coherent(struct device *hwdev, size_t size, vstart = __get_free_pages(flags, order); ret = (void *)vstart; + if (!ret) + return ret; + if (hwdev && hwdev->coherent_dma_mask) - dma_mask = dma_alloc_coherent_mask(hwdev, flags); + dma_mask = hwdev->coherent_dma_mask; - if (ret) { + phys = virt_to_phys(ret); + dev_addr = xen_phys_to_bus(phys); + if (((dev_addr + size - 1 <= dma_mask)) && + !range_straddles_page_boundary(phys, size)) + *dma_handle = dev_addr; + else { if (xen_create_contiguous_region(vstart, order, fls64(dma_mask)) != 0) { free_pages(vstart, order); return NULL; } - memset(ret, 0, size); *dma_handle = virt_to_machine(ret).maddr; } + memset(ret, 0, 
size); return ret; } EXPORT_SYMBOL_GPL(xen_swiotlb_alloc_coherent); @@ -230,11 +256,21 @@ xen_swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, dma_addr_t dev_addr) { int order = get_order(size); + phys_addr_t phys; + u64 dma_mask = DMA_BIT_MASK(32); if (dma_release_from_coherent(hwdev, order, vaddr)) return; - xen_destroy_contiguous_region((unsigned long)vaddr, order); + if (hwdev && hwdev->coherent_dma_mask) + dma_mask = hwdev->coherent_dma_mask; + + phys = virt_to_phys(vaddr); + + if (((dev_addr + size - 1 > dma_mask)) || + range_straddles_page_boundary(phys, size)) + xen_destroy_contiguous_region((unsigned long)vaddr, order); + free_pages((unsigned long)vaddr, order); } EXPORT_SYMBOL_GPL(xen_swiotlb_free_coherent); @@ -278,9 +314,10 @@ dma_addr_t xen_swiotlb_map_page(struct device *dev, struct page *page, /* * Ensure that the address returned is DMA'ble */ - if (!dma_capable(dev, dev_addr, size)) - panic("map_single: bounce buffer is not DMA'ble"); - + if (!dma_capable(dev, dev_addr, size)) { + swiotlb_tbl_unmap_single(dev, map, size, dir); + dev_addr = 0; + } return dev_addr; } EXPORT_SYMBOL_GPL(xen_swiotlb_map_page); diff --git a/drivers/xen/xen-pciback/conf_space.c b/drivers/xen/xen-pciback/conf_space.c index a8031445d94..52fed16d870 100644 --- a/drivers/xen/xen-pciback/conf_space.c +++ b/drivers/xen/xen-pciback/conf_space.c @@ -10,12 +10,12 @@ */ #include <linux/kernel.h> +#include <linux/module.h> #include <linux/pci.h> #include "pciback.h" #include "conf_space.h" #include "conf_space_quirks.h" -#define DRV_NAME "xen-pciback" static int permissive; module_param(permissive, bool, 0644); diff --git a/drivers/xen/xen-pciback/conf_space_header.c b/drivers/xen/xen-pciback/conf_space_header.c index da3cbdfcb5d..3daf862d739 100644 --- a/drivers/xen/xen-pciback/conf_space_header.c +++ b/drivers/xen/xen-pciback/conf_space_header.c @@ -15,7 +15,6 @@ struct pci_bar_info { int which; }; -#define DRV_NAME "xen-pciback" #define is_enable_cmd(value) 
((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO)) #define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER) @@ -25,7 +24,7 @@ static int command_read(struct pci_dev *dev, int offset, u16 *value, void *data) int ret; ret = xen_pcibk_read_config_word(dev, offset, value, data); - if (!atomic_read(&dev->enable_cnt)) + if (!pci_is_enabled(dev)) return ret; for (i = 0; i < PCI_ROM_RESOURCE; i++) { @@ -187,7 +186,7 @@ static inline void read_dev_bar(struct pci_dev *dev, bar_info->val = res[pos].start | (res[pos].flags & PCI_REGION_FLAG_MASK); - bar_info->len_val = res[pos].end - res[pos].start + 1; + bar_info->len_val = resource_size(&res[pos]); } static void *bar_init(struct pci_dev *dev, int offset) diff --git a/drivers/xen/xen-pciback/conf_space_quirks.c b/drivers/xen/xen-pciback/conf_space_quirks.c index 921a889e65e..7476791cab4 100644 --- a/drivers/xen/xen-pciback/conf_space_quirks.c +++ b/drivers/xen/xen-pciback/conf_space_quirks.c @@ -12,7 +12,6 @@ #include "conf_space_quirks.h" LIST_HEAD(xen_pcibk_quirks); -#define DRV_NAME "xen-pciback" static inline const struct pci_device_id * match_one_device(const struct pci_device_id *id, const struct pci_dev *dev) { @@ -36,7 +35,7 @@ static struct xen_pcibk_config_quirk *xen_pcibk_find_quirk(struct pci_dev *dev) goto out; tmp_quirk = NULL; printk(KERN_DEBUG DRV_NAME - ":quirk didn't match any device xen_pciback knows about\n"); + ": quirk didn't match any device known\n"); out: return tmp_quirk; } diff --git a/drivers/xen/xen-pciback/passthrough.c b/drivers/xen/xen-pciback/passthrough.c index 1d32a9a42c0..828dddc360d 100644 --- a/drivers/xen/xen-pciback/passthrough.c +++ b/drivers/xen/xen-pciback/passthrough.c @@ -7,13 +7,13 @@ #include <linux/list.h> #include <linux/pci.h> -#include <linux/spinlock.h> +#include <linux/mutex.h> #include "pciback.h" struct passthrough_dev_data { /* Access to dev_list must be protected by lock */ struct list_head dev_list; - spinlock_t lock; + struct mutex lock; }; static struct pci_dev 
*__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, @@ -24,9 +24,8 @@ static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, struct passthrough_dev_data *dev_data = pdev->pci_dev_data; struct pci_dev_entry *dev_entry; struct pci_dev *dev = NULL; - unsigned long flags; - spin_lock_irqsave(&dev_data->lock, flags); + mutex_lock(&dev_data->lock); list_for_each_entry(dev_entry, &dev_data->dev_list, list) { if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus) @@ -37,7 +36,7 @@ static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, } } - spin_unlock_irqrestore(&dev_data->lock, flags); + mutex_unlock(&dev_data->lock); return dev; } @@ -48,7 +47,6 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, { struct passthrough_dev_data *dev_data = pdev->pci_dev_data; struct pci_dev_entry *dev_entry; - unsigned long flags; unsigned int domain, bus, devfn; int err; @@ -57,9 +55,9 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, return -ENOMEM; dev_entry->dev = dev; - spin_lock_irqsave(&dev_data->lock, flags); + mutex_lock(&dev_data->lock); list_add_tail(&dev_entry->list, &dev_data->dev_list); - spin_unlock_irqrestore(&dev_data->lock, flags); + mutex_unlock(&dev_data->lock); /* Publish this device. 
*/ domain = (unsigned int)pci_domain_nr(dev->bus); @@ -76,9 +74,8 @@ static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, struct passthrough_dev_data *dev_data = pdev->pci_dev_data; struct pci_dev_entry *dev_entry, *t; struct pci_dev *found_dev = NULL; - unsigned long flags; - spin_lock_irqsave(&dev_data->lock, flags); + mutex_lock(&dev_data->lock); list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) { if (dev_entry->dev == dev) { @@ -88,7 +85,7 @@ static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, } } - spin_unlock_irqrestore(&dev_data->lock, flags); + mutex_unlock(&dev_data->lock); if (found_dev) pcistub_put_pci_dev(found_dev); @@ -102,7 +99,7 @@ static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev) if (!dev_data) return -ENOMEM; - spin_lock_init(&dev_data->lock); + mutex_init(&dev_data->lock); INIT_LIST_HEAD(&dev_data->dev_list); @@ -116,14 +113,14 @@ static int __xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev, { int err = 0; struct passthrough_dev_data *dev_data = pdev->pci_dev_data; - struct pci_dev_entry *dev_entry, *e, *tmp; + struct pci_dev_entry *dev_entry, *e; struct pci_dev *dev; int found; unsigned int domain, bus; - spin_lock(&dev_data->lock); + mutex_lock(&dev_data->lock); - list_for_each_entry_safe(dev_entry, tmp, &dev_data->dev_list, list) { + list_for_each_entry(dev_entry, &dev_data->dev_list, list) { /* Only publish this device as a root if none of its * parent bridges are exported */ @@ -142,16 +139,13 @@ static int __xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev, bus = (unsigned int)dev_entry->dev->bus->number; if (!found) { - spin_unlock(&dev_data->lock); err = publish_root_cb(pdev, domain, bus); if (err) break; - spin_lock(&dev_data->lock); } } - if (!err) - spin_unlock(&dev_data->lock); + mutex_unlock(&dev_data->lock); return err; } @@ -182,7 +176,7 @@ static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, return 1; } -struct xen_pcibk_backend 
xen_pcibk_passthrough_backend = { +const struct xen_pcibk_backend xen_pcibk_passthrough_backend = { .name = "passthrough", .init = __xen_pcibk_init_devices, .free = __xen_pcibk_release_devices, diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c index aec214ac0a1..8f06e1ed028 100644 --- a/drivers/xen/xen-pciback/pci_stub.c +++ b/drivers/xen/xen-pciback/pci_stub.c @@ -21,8 +21,6 @@ #include "conf_space.h" #include "conf_space_quirks.h" -#define DRV_NAME "xen-pciback" - static char *pci_devs_to_hide; wait_queue_head_t xen_pcibk_aer_wait_queue; /*Add sem for sync AER handling and xen_pcibk remove/reconfigue ops, @@ -222,6 +220,8 @@ void pcistub_put_pci_dev(struct pci_dev *dev) } spin_unlock_irqrestore(&pcistub_devices_lock, flags); + if (WARN_ON(!found_psdev)) + return; /*hold this lock for avoiding breaking link between * pcistub and xen_pcibk when AER is in processing @@ -514,12 +514,9 @@ static void kill_domain_by_device(struct pcistub_device *psdev) int err; char nodename[PCI_NODENAME_MAX]; - if (!psdev) - dev_err(&psdev->dev->dev, - "device is NULL when do AER recovery/kill_domain\n"); + BUG_ON(!psdev); snprintf(nodename, PCI_NODENAME_MAX, "/local/domain/0/backend/pci/%d/0", psdev->pdev->xdev->otherend_id); - nodename[strlen(nodename)] = '\0'; again: err = xenbus_transaction_start(&xbt); @@ -605,7 +602,7 @@ static pci_ers_result_t common_process(struct pcistub_device *psdev, if (test_bit(_XEN_PCIF_active, (unsigned long *)&psdev->pdev->sh_info->flags)) { dev_dbg(&psdev->dev->dev, - "schedule pci_conf service in xen_pcibk\n"); + "schedule pci_conf service in " DRV_NAME "\n"); xen_pcibk_test_and_schedule_op(psdev->pdev); } @@ -995,8 +992,7 @@ out: err = count; return err; } - -DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add); +static DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add); static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf, size_t count) @@ -1015,8 +1011,7 @@ out: err = count; return 
err; } - -DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove); +static DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove); static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf) { @@ -1039,8 +1034,7 @@ static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf) return count; } - -DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL); +static DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL); static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf) { @@ -1069,8 +1063,7 @@ static ssize_t pcistub_irq_handler_show(struct device_driver *drv, char *buf) spin_unlock_irqrestore(&pcistub_devices_lock, flags); return count; } - -DRIVER_ATTR(irq_handlers, S_IRUSR, pcistub_irq_handler_show, NULL); +static DRIVER_ATTR(irq_handlers, S_IRUSR, pcistub_irq_handler_show, NULL); static ssize_t pcistub_irq_handler_switch(struct device_driver *drv, const char *buf, @@ -1106,7 +1099,8 @@ out: err = count; return err; } -DRIVER_ATTR(irq_handler_state, S_IWUSR, NULL, pcistub_irq_handler_switch); +static DRIVER_ATTR(irq_handler_state, S_IWUSR, NULL, + pcistub_irq_handler_switch); static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf, size_t count) @@ -1170,8 +1164,8 @@ out: return count; } - -DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add); +static DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, + pcistub_quirk_add); static ssize_t permissive_add(struct device_driver *drv, const char *buf, size_t count) @@ -1236,8 +1230,8 @@ static ssize_t permissive_show(struct device_driver *drv, char *buf) spin_unlock_irqrestore(&pcistub_devices_lock, flags); return count; } - -DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add); +static DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, + permissive_add); static void pcistub_exit(void) { @@ -1374,3 +1368,4 @@ module_init(xen_pcibk_init); module_exit(xen_pcibk_cleanup); MODULE_LICENSE("Dual 
BSD/GPL"); +MODULE_ALIAS("xen-backend:pci"); diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h index a0e131a8150..e9b4011c5f9 100644 --- a/drivers/xen/xen-pciback/pciback.h +++ b/drivers/xen/xen-pciback/pciback.h @@ -15,6 +15,8 @@ #include <linux/atomic.h> #include <xen/interface/io/pciif.h> +#define DRV_NAME "xen-pciback" + struct pci_dev_entry { struct list_head list; struct pci_dev *dev; @@ -27,7 +29,7 @@ struct pci_dev_entry { struct xen_pcibk_device { void *pci_dev_data; - spinlock_t dev_lock; + struct mutex dev_lock; struct xenbus_device *xdev; struct xenbus_watch be_watch; u8 be_watching; @@ -89,7 +91,7 @@ typedef int (*publish_pci_root_cb) (struct xen_pcibk_device *pdev, * passthrough - BDFs are exactly like in the host. */ struct xen_pcibk_backend { - char *name; + const char *name; int (*init)(struct xen_pcibk_device *pdev); void (*free)(struct xen_pcibk_device *pdev); int (*find)(struct pci_dev *pcidev, struct xen_pcibk_device *pdev, @@ -104,9 +106,9 @@ struct xen_pcibk_backend { unsigned int devfn); }; -extern struct xen_pcibk_backend xen_pcibk_vpci_backend; -extern struct xen_pcibk_backend xen_pcibk_passthrough_backend; -extern struct xen_pcibk_backend *xen_pcibk_backend; +extern const struct xen_pcibk_backend xen_pcibk_vpci_backend; +extern const struct xen_pcibk_backend xen_pcibk_passthrough_backend; +extern const struct xen_pcibk_backend *xen_pcibk_backend; static inline int xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, struct pci_dev *dev, @@ -116,13 +118,14 @@ static inline int xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, if (xen_pcibk_backend && xen_pcibk_backend->add) return xen_pcibk_backend->add(pdev, dev, devid, publish_cb); return -1; -}; +} + static inline void xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, struct pci_dev *dev) { if (xen_pcibk_backend && xen_pcibk_backend->free) return xen_pcibk_backend->release(pdev, dev); -}; +} static inline struct pci_dev * 
xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, unsigned int domain, @@ -131,7 +134,8 @@ xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, unsigned int domain, if (xen_pcibk_backend && xen_pcibk_backend->get) return xen_pcibk_backend->get(pdev, domain, bus, devfn); return NULL; -}; +} + /** * Add for domain0 PCIE-AER handling. Get guest domain/bus/devfn in xen_pcibk * before sending aer request to pcifront, so that guest could identify @@ -148,25 +152,29 @@ static inline int xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, return xen_pcibk_backend->find(pcidev, pdev, domain, bus, devfn); return -1; -}; +} + static inline int xen_pcibk_init_devices(struct xen_pcibk_device *pdev) { if (xen_pcibk_backend && xen_pcibk_backend->init) return xen_pcibk_backend->init(pdev); return -1; -}; +} + static inline int xen_pcibk_publish_pci_roots(struct xen_pcibk_device *pdev, publish_pci_root_cb cb) { if (xen_pcibk_backend && xen_pcibk_backend->publish) return xen_pcibk_backend->publish(pdev, cb); return -1; -}; +} + static inline void xen_pcibk_release_devices(struct xen_pcibk_device *pdev) { if (xen_pcibk_backend && xen_pcibk_backend->free) return xen_pcibk_backend->free(pdev); -}; +} + /* Handles events from front-end */ irqreturn_t xen_pcibk_handle_event(int irq, void *dev_id); void xen_pcibk_do_op(struct work_struct *data); diff --git a/drivers/xen/xen-pciback/pciback_ops.c b/drivers/xen/xen-pciback/pciback_ops.c index 8c95c3415b7..63616d7453e 100644 --- a/drivers/xen/xen-pciback/pciback_ops.c +++ b/drivers/xen/xen-pciback/pciback_ops.c @@ -10,7 +10,6 @@ #include <linux/sched.h> #include "pciback.h" -#define DRV_NAME "xen-pciback" int verbose_request; module_param(verbose_request, int, 0644); diff --git a/drivers/xen/xen-pciback/vpci.c b/drivers/xen/xen-pciback/vpci.c index 4a42cfb0959..46d140baebd 100644 --- a/drivers/xen/xen-pciback/vpci.c +++ b/drivers/xen/xen-pciback/vpci.c @@ -8,16 +8,15 @@ #include <linux/list.h> #include <linux/slab.h> #include <linux/pci.h> 
-#include <linux/spinlock.h> +#include <linux/mutex.h> #include "pciback.h" #define PCI_SLOT_MAX 32 -#define DRV_NAME "xen-pciback" struct vpci_dev_data { /* Access to dev_list must be protected by lock */ struct list_head dev_list[PCI_SLOT_MAX]; - spinlock_t lock; + struct mutex lock; }; static inline struct list_head *list_first(struct list_head *head) @@ -33,13 +32,12 @@ static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, struct pci_dev_entry *entry; struct pci_dev *dev = NULL; struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; - unsigned long flags; if (domain != 0 || bus != 0) return NULL; if (PCI_SLOT(devfn) < PCI_SLOT_MAX) { - spin_lock_irqsave(&vpci_dev->lock, flags); + mutex_lock(&vpci_dev->lock); list_for_each_entry(entry, &vpci_dev->dev_list[PCI_SLOT(devfn)], @@ -50,7 +48,7 @@ static struct pci_dev *__xen_pcibk_get_pci_dev(struct xen_pcibk_device *pdev, } } - spin_unlock_irqrestore(&vpci_dev->lock, flags); + mutex_unlock(&vpci_dev->lock); } return dev; } @@ -71,7 +69,6 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, int err = 0, slot, func = -1; struct pci_dev_entry *t, *dev_entry; struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; - unsigned long flags; if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) { err = -EFAULT; @@ -90,7 +87,7 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, dev_entry->dev = dev; - spin_lock_irqsave(&vpci_dev->lock, flags); + mutex_lock(&vpci_dev->lock); /* Keep multi-function devices together on the virtual PCI bus */ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { @@ -129,7 +126,7 @@ static int __xen_pcibk_add_pci_dev(struct xen_pcibk_device *pdev, "No more space on root virtual PCI bus"); unlock: - spin_unlock_irqrestore(&vpci_dev->lock, flags); + mutex_unlock(&vpci_dev->lock); /* Publish this device. 
*/ if (!err) @@ -145,14 +142,13 @@ static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, int slot; struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; struct pci_dev *found_dev = NULL; - unsigned long flags; - spin_lock_irqsave(&vpci_dev->lock, flags); + mutex_lock(&vpci_dev->lock); for (slot = 0; slot < PCI_SLOT_MAX; slot++) { - struct pci_dev_entry *e, *tmp; - list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot], - list) { + struct pci_dev_entry *e; + + list_for_each_entry(e, &vpci_dev->dev_list[slot], list) { if (e->dev == dev) { list_del(&e->list); found_dev = e->dev; @@ -163,7 +159,7 @@ static void __xen_pcibk_release_pci_dev(struct xen_pcibk_device *pdev, } out: - spin_unlock_irqrestore(&vpci_dev->lock, flags); + mutex_unlock(&vpci_dev->lock); if (found_dev) pcistub_put_pci_dev(found_dev); @@ -178,7 +174,7 @@ static int __xen_pcibk_init_devices(struct xen_pcibk_device *pdev) if (!vpci_dev) return -ENOMEM; - spin_lock_init(&vpci_dev->lock); + mutex_init(&vpci_dev->lock); for (slot = 0; slot < PCI_SLOT_MAX; slot++) INIT_LIST_HEAD(&vpci_dev->dev_list[slot]); @@ -222,10 +218,9 @@ static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, struct pci_dev_entry *entry; struct pci_dev *dev = NULL; struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; - unsigned long flags; int found = 0, slot; - spin_lock_irqsave(&vpci_dev->lock, flags); + mutex_lock(&vpci_dev->lock); for (slot = 0; slot < PCI_SLOT_MAX; slot++) { list_for_each_entry(entry, &vpci_dev->dev_list[slot], @@ -243,11 +238,11 @@ static int __xen_pcibk_get_pcifront_dev(struct pci_dev *pcidev, } } } - spin_unlock_irqrestore(&vpci_dev->lock, flags); + mutex_unlock(&vpci_dev->lock); return found; } -struct xen_pcibk_backend xen_pcibk_vpci_backend = { +const struct xen_pcibk_backend xen_pcibk_vpci_backend = { .name = "vpci", .init = __xen_pcibk_init_devices, .free = __xen_pcibk_release_devices, diff --git a/drivers/xen/xen-pciback/xenbus.c b/drivers/xen/xen-pciback/xenbus.c index 
206c4ce030b..075525945e3 100644 --- a/drivers/xen/xen-pciback/xenbus.c +++ b/drivers/xen/xen-pciback/xenbus.c @@ -11,10 +11,8 @@ #include <xen/xenbus.h> #include <xen/events.h> #include <asm/xen/pci.h> -#include <linux/workqueue.h> #include "pciback.h" -#define DRV_NAME "xen-pciback" #define INVALID_EVTCHN_IRQ (-1) struct workqueue_struct *xen_pcibk_wq; @@ -45,7 +43,7 @@ static struct xen_pcibk_device *alloc_pdev(struct xenbus_device *xdev) pdev->xdev = xdev; dev_set_drvdata(&xdev->dev, pdev); - spin_lock_init(&pdev->dev_lock); + mutex_init(&pdev->dev_lock); pdev->sh_info = NULL; pdev->evtchn_irq = INVALID_EVTCHN_IRQ; @@ -63,14 +61,12 @@ out: static void xen_pcibk_disconnect(struct xen_pcibk_device *pdev) { - spin_lock(&pdev->dev_lock); - + mutex_lock(&pdev->dev_lock); /* Ensure the guest can't trigger our handler before removing devices */ if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) { unbind_from_irqhandler(pdev->evtchn_irq, pdev); pdev->evtchn_irq = INVALID_EVTCHN_IRQ; } - spin_unlock(&pdev->dev_lock); /* If the driver domain started an op, make sure we complete it * before releasing the shared memory */ @@ -78,13 +74,11 @@ static void xen_pcibk_disconnect(struct xen_pcibk_device *pdev) /* Note, the workqueue does not use spinlocks at all.*/ flush_workqueue(xen_pcibk_wq); - spin_lock(&pdev->dev_lock); if (pdev->sh_info != NULL) { xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_info); pdev->sh_info = NULL; } - spin_unlock(&pdev->dev_lock); - + mutex_unlock(&pdev->dev_lock); } static void free_pdev(struct xen_pcibk_device *pdev) @@ -121,9 +115,7 @@ static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref, goto out; } - spin_lock(&pdev->dev_lock); pdev->sh_info = vaddr; - spin_unlock(&pdev->dev_lock); err = bind_interdomain_evtchn_to_irqhandler( pdev->xdev->otherend_id, remote_evtchn, xen_pcibk_handle_event, @@ -133,10 +125,7 @@ static int xen_pcibk_do_attach(struct xen_pcibk_device *pdev, int gnt_ref, "Error binding event channel to IRQ"); goto out; } 
- - spin_lock(&pdev->dev_lock); pdev->evtchn_irq = err; - spin_unlock(&pdev->dev_lock); err = 0; dev_dbg(&pdev->xdev->dev, "Attached!\n"); @@ -151,6 +140,7 @@ static int xen_pcibk_attach(struct xen_pcibk_device *pdev) char *magic = NULL; + mutex_lock(&pdev->dev_lock); /* Make sure we only do this setup once */ if (xenbus_read_driver_state(pdev->xdev->nodename) != XenbusStateInitialised) @@ -177,7 +167,7 @@ static int xen_pcibk_attach(struct xen_pcibk_device *pdev) if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) { xenbus_dev_fatal(pdev->xdev, -EFAULT, "version mismatch (%s/%s) with pcifront - " - "halting xen_pcibk", + "halting " DRV_NAME, magic, XEN_PCI_MAGIC); goto out; } @@ -195,6 +185,7 @@ static int xen_pcibk_attach(struct xen_pcibk_device *pdev) dev_dbg(&pdev->xdev->dev, "Connected? %d\n", err); out: + mutex_unlock(&pdev->dev_lock); kfree(magic); @@ -250,6 +241,7 @@ static int xen_pcibk_export_device(struct xen_pcibk_device *pdev, goto out; dev_dbg(&dev->dev, "registering for %d\n", pdev->xdev->otherend_id); + dev->dev_flags |= PCI_DEV_FLAGS_ASSIGNED; if (xen_register_device_domain_owner(dev, pdev->xdev->otherend_id) != 0) { dev_err(&dev->dev, "device has been assigned to another " \ @@ -289,6 +281,7 @@ static int xen_pcibk_remove_device(struct xen_pcibk_device *pdev, } dev_dbg(&dev->dev, "unregistering for %d\n", pdev->xdev->otherend_id); + dev->dev_flags &= ~PCI_DEV_FLAGS_ASSIGNED; xen_unregister_device_domain_owner(dev); xen_pcibk_release_pci_dev(pdev, dev); @@ -370,6 +363,7 @@ static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev) dev_dbg(&pdev->xdev->dev, "Reconfiguring device ...\n"); + mutex_lock(&pdev->dev_lock); /* Make sure we only reconfigure once */ if (xenbus_read_driver_state(pdev->xdev->nodename) != XenbusStateReconfiguring) @@ -507,6 +501,7 @@ static int xen_pcibk_reconfigure(struct xen_pcibk_device *pdev) } out: + mutex_unlock(&pdev->dev_lock); return 0; } @@ -563,6 +558,7 @@ static int xen_pcibk_setup_backend(struct 
xen_pcibk_device *pdev) char dev_str[64]; char state_str[64]; + mutex_lock(&pdev->dev_lock); /* It's possible we could get the call to setup twice, so make sure * we're not already connected. */ @@ -643,10 +639,10 @@ static int xen_pcibk_setup_backend(struct xen_pcibk_device *pdev) "Error switching to initialised state!"); out: + mutex_unlock(&pdev->dev_lock); if (!err) /* see if pcifront is already configured (if not, we'll wait) */ xen_pcibk_attach(pdev); - return err; } @@ -725,7 +721,7 @@ static struct xenbus_driver xenbus_xen_pcibk_driver = { .otherend_changed = xen_pcibk_frontend_changed, }; -struct xen_pcibk_backend *xen_pcibk_backend; +const struct xen_pcibk_backend *__read_mostly xen_pcibk_backend; int __init xen_pcibk_xenbus_register(void) { diff --git a/drivers/xen/xen-selfballoon.c b/drivers/xen/xen-selfballoon.c index 010937b5a7c..d93c70857e0 100644 --- a/drivers/xen/xen-selfballoon.c +++ b/drivers/xen/xen-selfballoon.c @@ -68,12 +68,15 @@ */ #include <linux/kernel.h> +#include <linux/bootmem.h> +#include <linux/swap.h> #include <linux/mm.h> #include <linux/mman.h> - +#include <linux/module.h> +#include <linux/workqueue.h> #include <xen/balloon.h> - #include <xen/tmem.h> +#include <xen/xen.h> /* Enable/disable with sysfs. */ static int xen_selfballooning_enabled __read_mostly; @@ -92,6 +95,15 @@ static unsigned int selfballoon_uphysteresis __read_mostly = 1; /* In HZ, controls frequency of worker invocation. */ static unsigned int selfballoon_interval __read_mostly = 5; +/* + * Minimum usable RAM in MB for selfballooning target for balloon. + * If non-zero, it is added to totalreserve_pages and self-ballooning + * will not balloon below the sum. If zero, a piecewise linear function + * is calculated as a minimum and added to totalreserve_pages. Note that + * setting this value indiscriminately may cause OOMs and crashes. 
+ */ +static unsigned int selfballoon_min_usable_mb; + static void selfballoon_process(struct work_struct *work); static DECLARE_DELAYED_WORK(selfballoon_worker, selfballoon_process); @@ -188,20 +200,23 @@ static int __init xen_selfballooning_setup(char *s) __setup("selfballooning", xen_selfballooning_setup); #endif /* CONFIG_FRONTSWAP */ +#define MB2PAGES(mb) ((mb) << (20 - PAGE_SHIFT)) + /* * Use current balloon size, the goal (vm_committed_as), and hysteresis * parameters to set a new target balloon size */ static void selfballoon_process(struct work_struct *work) { - unsigned long cur_pages, goal_pages, tgt_pages; + unsigned long cur_pages, goal_pages, tgt_pages, floor_pages; + unsigned long useful_pages; bool reset_timer = false; if (xen_selfballooning_enabled) { - cur_pages = balloon_stats.current_pages; + cur_pages = totalram_pages; tgt_pages = cur_pages; /* default is no change */ goal_pages = percpu_counter_read_positive(&vm_committed_as) + - balloon_stats.current_pages - totalram_pages; + totalreserve_pages; #ifdef CONFIG_FRONTSWAP /* allow space for frontswap pages to be repatriated */ if (frontswap_selfshrinking && frontswap_enabled) @@ -216,7 +231,26 @@ static void selfballoon_process(struct work_struct *work) ((goal_pages - cur_pages) / selfballoon_uphysteresis); /* else if cur_pages == goal_pages, no change */ - balloon_set_new_target(tgt_pages); + useful_pages = max_pfn - totalreserve_pages; + if (selfballoon_min_usable_mb != 0) + floor_pages = totalreserve_pages + + MB2PAGES(selfballoon_min_usable_mb); + /* piecewise linear function ending in ~3% slope */ + else if (useful_pages < MB2PAGES(16)) + floor_pages = max_pfn; /* not worth ballooning */ + else if (useful_pages < MB2PAGES(64)) + floor_pages = totalreserve_pages + MB2PAGES(16) + + ((useful_pages - MB2PAGES(16)) >> 1); + else if (useful_pages < MB2PAGES(512)) + floor_pages = totalreserve_pages + MB2PAGES(40) + + ((useful_pages - MB2PAGES(40)) >> 3); + else /* useful_pages >= MB2PAGES(512) */ 
+ floor_pages = totalreserve_pages + MB2PAGES(99) + + ((useful_pages - MB2PAGES(99)) >> 5); + if (tgt_pages < floor_pages) + tgt_pages = floor_pages; + balloon_set_new_target(tgt_pages + + balloon_stats.current_pages - totalram_pages); reset_timer = true; } #ifdef CONFIG_FRONTSWAP @@ -339,6 +373,31 @@ static ssize_t store_selfballoon_uphys(struct sys_device *dev, static SYSDEV_ATTR(selfballoon_uphysteresis, S_IRUGO | S_IWUSR, show_selfballoon_uphys, store_selfballoon_uphys); +SELFBALLOON_SHOW(selfballoon_min_usable_mb, "%d\n", + selfballoon_min_usable_mb); + +static ssize_t store_selfballoon_min_usable_mb(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, + size_t count) +{ + unsigned long val; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + err = strict_strtoul(buf, 10, &val); + if (err || val == 0) + return -EINVAL; + selfballoon_min_usable_mb = val; + return count; +} + +static SYSDEV_ATTR(selfballoon_min_usable_mb, S_IRUGO | S_IWUSR, + show_selfballoon_min_usable_mb, + store_selfballoon_min_usable_mb); + + #ifdef CONFIG_FRONTSWAP SELFBALLOON_SHOW(frontswap_selfshrinking, "%d\n", frontswap_selfshrinking); @@ -420,6 +479,7 @@ static struct attribute *selfballoon_attrs[] = { &attr_selfballoon_interval.attr, &attr_selfballoon_downhysteresis.attr, &attr_selfballoon_uphysteresis.attr, + &attr_selfballoon_min_usable_mb.attr, #ifdef CONFIG_FRONTSWAP &attr_frontswap_selfshrinking.attr, &attr_frontswap_hysteresis.attr, diff --git a/drivers/xen/xenbus/xenbus_client.c b/drivers/xen/xenbus/xenbus_client.c index cdacf923e07..81c3ce6b8bb 100644 --- a/drivers/xen/xenbus/xenbus_client.c +++ b/drivers/xen/xenbus/xenbus_client.c @@ -33,6 +33,7 @@ #include <linux/slab.h> #include <linux/types.h> #include <linux/vmalloc.h> +#include <linux/export.h> #include <asm/xen/hypervisor.h> #include <xen/interface/xen.h> #include <xen/interface/event_channel.h> @@ -443,7 +444,7 @@ int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void 
**vaddr) *vaddr = NULL; - area = xen_alloc_vm_area(PAGE_SIZE); + area = alloc_vm_area(PAGE_SIZE); if (!area) return -ENOMEM; @@ -453,7 +454,7 @@ int xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref, void **vaddr) BUG(); if (op.status != GNTST_okay) { - xen_free_vm_area(area); + free_vm_area(area); xenbus_dev_fatal(dev, op.status, "mapping in shared page %d from domain %d", gnt_ref, dev->otherend_id); @@ -552,7 +553,7 @@ int xenbus_unmap_ring_vfree(struct xenbus_device *dev, void *vaddr) BUG(); if (op.status == GNTST_okay) - xen_free_vm_area(area); + free_vm_area(area); else xenbus_dev_error(dev, op.status, "unmapping page at handle %d error %d", diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c index 090c61ee8fd..2eff7a6aaa2 100644 --- a/drivers/xen/xenbus/xenbus_comms.c +++ b/drivers/xen/xenbus/xenbus_comms.c @@ -212,7 +212,9 @@ int xb_init_comms(void) printk(KERN_WARNING "XENBUS response ring is not quiescent " "(%08x:%08x): fixing up\n", intf->rsp_cons, intf->rsp_prod); - intf->rsp_cons = intf->rsp_prod; + /* breaks kdump */ + if (!reset_devices) + intf->rsp_cons = intf->rsp_prod; } if (xenbus_irq) { diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index d4c7a9ffbcb..1b178c6e893 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -46,6 +46,7 @@ #include <linux/mutex.h> #include <linux/io.h> #include <linux/slab.h> +#include <linux/module.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -683,64 +684,74 @@ static int __init xenbus_probe_initcall(void) device_initcall(xenbus_probe_initcall); -static int __init xenbus_init(void) +/* Set up event channel for xenstored which is run as a local process + * (this is normally used only in dom0) + */ +static int __init xenstored_local_init(void) { int err = 0; unsigned long page = 0; + struct evtchn_alloc_unbound alloc_unbound; - DPRINTK(""); + /* Allocate Xenstore page */ + page = 
get_zeroed_page(GFP_KERNEL); + if (!page) + goto out_err; - err = -ENODEV; - if (!xen_domain()) - return err; + xen_store_mfn = xen_start_info->store_mfn = + pfn_to_mfn(virt_to_phys((void *)page) >> + PAGE_SHIFT); - /* - * Domain0 doesn't have a store_evtchn or store_mfn yet. - */ - if (xen_initial_domain()) { - struct evtchn_alloc_unbound alloc_unbound; + /* Next allocate a local port which xenstored can bind to */ + alloc_unbound.dom = DOMID_SELF; + alloc_unbound.remote_dom = DOMID_SELF; - /* Allocate Xenstore page */ - page = get_zeroed_page(GFP_KERNEL); - if (!page) - goto out_error; + err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, + &alloc_unbound); + if (err == -ENOSYS) + goto out_err; - xen_store_mfn = xen_start_info->store_mfn = - pfn_to_mfn(virt_to_phys((void *)page) >> - PAGE_SHIFT); + BUG_ON(err); + xen_store_evtchn = xen_start_info->store_evtchn = + alloc_unbound.port; - /* Next allocate a local port which xenstored can bind to */ - alloc_unbound.dom = DOMID_SELF; - alloc_unbound.remote_dom = 0; + return 0; - err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, - &alloc_unbound); - if (err == -ENOSYS) - goto out_error; + out_err: + if (page != 0) + free_page(page); + return err; +} - BUG_ON(err); - xen_store_evtchn = xen_start_info->store_evtchn = - alloc_unbound.port; +static int __init xenbus_init(void) +{ + int err = 0; - xen_store_interface = mfn_to_virt(xen_store_mfn); + if (!xen_domain()) + return -ENODEV; + + if (xen_hvm_domain()) { + uint64_t v = 0; + err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); + if (err) + goto out_error; + xen_store_evtchn = (int)v; + err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v); + if (err) + goto out_error; + xen_store_mfn = (unsigned long)v; + xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); } else { - if (xen_hvm_domain()) { - uint64_t v = 0; - err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); - if (err) - goto out_error; - xen_store_evtchn = (int)v; - err = 
hvm_get_parameter(HVM_PARAM_STORE_PFN, &v); + xen_store_evtchn = xen_start_info->store_evtchn; + xen_store_mfn = xen_start_info->store_mfn; + if (xen_store_evtchn) + xenstored_ready = 1; + else { + err = xenstored_local_init(); if (err) goto out_error; - xen_store_mfn = (unsigned long)v; - xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, PAGE_SIZE); - } else { - xen_store_evtchn = xen_start_info->store_evtchn; - xen_store_mfn = xen_start_info->store_mfn; - xen_store_interface = mfn_to_virt(xen_store_mfn); - xenstored_ready = 1; } + xen_store_interface = mfn_to_virt(xen_store_mfn); } /* Initialize the interface to xenstore. */ @@ -759,12 +770,7 @@ static int __init xenbus_init(void) proc_mkdir("xen", NULL); #endif - return 0; - out_error: - if (page != 0) - free_page(page); - return err; } diff --git a/drivers/xen/xenbus/xenbus_probe_backend.c b/drivers/xen/xenbus/xenbus_probe_backend.c index 60adf919d78..c3c7cd195c1 100644 --- a/drivers/xen/xenbus/xenbus_probe_backend.c +++ b/drivers/xen/xenbus/xenbus_probe_backend.c @@ -42,6 +42,7 @@ #include <linux/fcntl.h> #include <linux/mm.h> #include <linux/notifier.h> +#include <linux/export.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -104,8 +105,6 @@ static int xenbus_uevent_backend(struct device *dev, xdev = to_xenbus_device(dev); bus = container_of(xdev->dev.bus, struct xen_bus_type, bus); - if (xdev == NULL) - return -ENODEV; if (add_uevent_var(env, "MODALIAS=xen-backend:%s", xdev->devicetype)) return -ENOMEM; diff --git a/drivers/xen/xenbus/xenbus_probe_frontend.c b/drivers/xen/xenbus/xenbus_probe_frontend.c index ed2ba474a56..2f73195512b 100644 --- a/drivers/xen/xenbus/xenbus_probe_frontend.c +++ b/drivers/xen/xenbus/xenbus_probe_frontend.c @@ -13,6 +13,7 @@ #include <linux/kthread.h> #include <linux/mutex.h> #include <linux/io.h> +#include <linux/module.h> #include <asm/page.h> #include <asm/pgtable.h> @@ -248,10 +249,131 @@ int __xenbus_register_frontend(struct xenbus_driver *drv, } 
EXPORT_SYMBOL_GPL(__xenbus_register_frontend); +static DECLARE_WAIT_QUEUE_HEAD(backend_state_wq); +static int backend_state; + +static void xenbus_reset_backend_state_changed(struct xenbus_watch *w, + const char **v, unsigned int l) +{ + xenbus_scanf(XBT_NIL, v[XS_WATCH_PATH], "", "%i", &backend_state); + printk(KERN_DEBUG "XENBUS: backend %s %s\n", + v[XS_WATCH_PATH], xenbus_strstate(backend_state)); + wake_up(&backend_state_wq); +} + +static void xenbus_reset_wait_for_backend(char *be, int expected) +{ + long timeout; + timeout = wait_event_interruptible_timeout(backend_state_wq, + backend_state == expected, 5 * HZ); + if (timeout <= 0) + printk(KERN_INFO "XENBUS: backend %s timed out.\n", be); +} + +/* + * Reset frontend if it is in Connected or Closed state. + * Wait for backend to catch up. + * State Connected happens during kdump, Closed after kexec. + */ +static void xenbus_reset_frontend(char *fe, char *be, int be_state) +{ + struct xenbus_watch be_watch; + + printk(KERN_DEBUG "XENBUS: backend %s %s\n", + be, xenbus_strstate(be_state)); + + memset(&be_watch, 0, sizeof(be_watch)); + be_watch.node = kasprintf(GFP_NOIO | __GFP_HIGH, "%s/state", be); + if (!be_watch.node) + return; + + be_watch.callback = xenbus_reset_backend_state_changed; + backend_state = XenbusStateUnknown; + + printk(KERN_INFO "XENBUS: triggering reconnect on %s\n", be); + register_xenbus_watch(&be_watch); + + /* fall through to forward backend to state XenbusStateInitialising */ + switch (be_state) { + case XenbusStateConnected: + xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosing); + xenbus_reset_wait_for_backend(be, XenbusStateClosing); + + case XenbusStateClosing: + xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateClosed); + xenbus_reset_wait_for_backend(be, XenbusStateClosed); + + case XenbusStateClosed: + xenbus_printf(XBT_NIL, fe, "state", "%d", XenbusStateInitialising); + xenbus_reset_wait_for_backend(be, XenbusStateInitWait); + } + + 
unregister_xenbus_watch(&be_watch); + printk(KERN_INFO "XENBUS: reconnect done on %s\n", be); + kfree(be_watch.node); +} + +static void xenbus_check_frontend(char *class, char *dev) +{ + int be_state, fe_state, err; + char *backend, *frontend; + + frontend = kasprintf(GFP_NOIO | __GFP_HIGH, "device/%s/%s", class, dev); + if (!frontend) + return; + + err = xenbus_scanf(XBT_NIL, frontend, "state", "%i", &fe_state); + if (err != 1) + goto out; + + switch (fe_state) { + case XenbusStateConnected: + case XenbusStateClosed: + printk(KERN_DEBUG "XENBUS: frontend %s %s\n", + frontend, xenbus_strstate(fe_state)); + backend = xenbus_read(XBT_NIL, frontend, "backend", NULL); + if (!backend || IS_ERR(backend)) + goto out; + err = xenbus_scanf(XBT_NIL, backend, "state", "%i", &be_state); + if (err == 1) + xenbus_reset_frontend(frontend, backend, be_state); + kfree(backend); + break; + default: + break; + } +out: + kfree(frontend); +} + +static void xenbus_reset_state(void) +{ + char **devclass, **dev; + int devclass_n, dev_n; + int i, j; + + devclass = xenbus_directory(XBT_NIL, "device", "", &devclass_n); + if (IS_ERR(devclass)) + return; + + for (i = 0; i < devclass_n; i++) { + dev = xenbus_directory(XBT_NIL, "device", devclass[i], &dev_n); + if (IS_ERR(dev)) + continue; + for (j = 0; j < dev_n; j++) + xenbus_check_frontend(devclass[i], dev[j]); + kfree(dev); + } + kfree(devclass); +} + static int frontend_probe_and_watch(struct notifier_block *notifier, unsigned long event, void *data) { + /* reset devices in Connected or Closed state */ + if (xen_hvm_domain()) + xenbus_reset_state(); /* Enumerate devices in xenstore and watch for changes. 
*/ xenbus_probe_devices(&xenbus_frontend); register_xenbus_watch(&fe_watch); diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 5534690075a..b3b8f2f3ad1 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -45,6 +45,7 @@ #include <linux/module.h> #include <linux/mutex.h> #include <xen/xenbus.h> +#include <xen/xen.h> #include "xenbus_comms.h" struct xs_stored_msg { @@ -620,6 +621,15 @@ static struct xenbus_watch *find_watch(const char *token) return NULL; } +static void xs_reset_watches(void) +{ + int err; + + err = xs_error(xs_single(XBT_NIL, XS_RESET_WATCHES, "", NULL)); + if (err && err != -EEXIST) + printk(KERN_WARNING "xs_reset_watches failed: %d\n", err); +} + /* Register callback to watch this node. */ int register_xenbus_watch(struct xenbus_watch *watch) { @@ -638,8 +648,7 @@ int register_xenbus_watch(struct xenbus_watch *watch) err = xs_watch(watch->node, token); - /* Ignore errors due to multiple registration. */ - if ((err != 0) && (err != -EEXIST)) { + if (err) { spin_lock(&watches_lock); list_del(&watch->list); spin_unlock(&watches_lock); @@ -897,5 +906,9 @@ int xs_init(void) if (IS_ERR(task)) return PTR_ERR(task); + /* shutdown watches for kexec boot */ + if (xen_hvm_domain()) + xs_reset_watches(); + return 0; } |